加载中…
个人资料
  • 博客等级:
  • 博客积分:
  • 博客访问:
  • 关注人气:
  • 获赠金笔:0支
  • 赠出金笔:0支
  • 荣誉徽章:
正文 字体大小:

gensim学习之corpora.Dictionary

(2017-05-03 19:36:37)
标签:

gensim

python

corpora

dictionary

分类: python

# -*- coding: utf-8 -*-
import jieba, os
import codecs
from gensim import corpora, models, similarities
from pprint import pprint
from collections import defaultdict
import sys
import pickle

# Python 2 only: force the implicit str<->unicode codec to UTF-8 so that the
# Chinese text below can be mixed with byte strings without UnicodeDecodeError.
# Python 3 has no builtin ``reload`` and no ``sys.setdefaultencoding`` (its
# default encoding is already UTF-8), so the hack is skipped there.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')


def print_dict(dict):
    """Print every entry of a mapping on a single line.

    Each entry is rendered as ``<type of key> <key> <value>`` and the entries
    are separated by single spaces; the line ends with a newline.  An empty
    mapping prints just an empty line.

    A single-argument ``print(...)`` call is used so the function produces the
    same output under Python 2 and Python 3.

    Args:
        dict: The mapping to display.  The name shadows the builtin but is
            kept so existing callers are unaffected.
    """
    entries = [
        "%s %s %s" % (type(key), key, str(dict[key]))
        for key in dict
    ]
    print(" ".join(entries))


def test3():
    """Walk through the main attributes and methods of ``corpora.Dictionary``.

    Builds a dictionary from the tokenised documents in ``a``, prints its
    statistics, converts the extra document ``b`` to a bag-of-words without
    updating the dictionary, and finally prunes the dictionary with
    ``filter_extremes``.  Everything is reported on standard output; nothing
    is returned.

    NOTE(review): the sample token lists contain only empty strings as they
    appear here; the original tokens look like they were lost -- confirm.
    """
    a = [['', '', ''], ['', '', '']]
    b = ['', '', '', '', '']
    dictionary = corpora.Dictionary(a)

    print("########dictionary信息##########")
    print(str(dictionary))
    print("字典,{单词id,在多少文档中出现}")
    print(dictionary.dfs)  # {token id: number of documents containing it}
    print("文档数目")
    print(dictionary.num_docs)  # number of documents processed
    print("dictionary.items()")
    print_dict(dict(dictionary.items()))
    print("字典,{单词id,对应的词}")
    # {token id: token}.  gensim builds this reverse map lazily; the
    # items() call above is expected to have populated it already.
    print_dict(dictionary.id2token)
    print("字典,{词,对应的单词id}")
    print_dict(dictionary.token2id)  # {token: token id}
    print("所有词的个数")
    print(dictionary.num_pos)  # total number of tokens processed
    print("每个文件中不重复词个数的和")
    print(dictionary.num_nnz)  # sum over documents of their distinct-token counts

    print("########doc2bow##########")
    # allow_update: whether unseen tokens are added to the dictionary.
    # return_missing: additionally return the tokens not in the dictionary.
    # ``result`` is the bag-of-words of ``b``: a list of (token id, count).
    result, missing = dictionary.doc2bow(b, allow_update=False, return_missing=True)
    print("词袋b,列表[(单词id,词频)]")
    print(result)
    print("不在字典中的词及其词频,字典[(单词,词频)]")
    print_dict(missing)

    print("########bow信息##########")
    for token_id, freq in result:
        # dictionary[token_id] is the supported id -> token lookup; it rebuilds
        # the reverse map when needed instead of relying on id2token being
        # filled in by an earlier call.
        print("%s %s %s" % (token_id, dictionary[token_id], freq))

    print("########dictionary信息##########")
    # Keep only tokens found in at least ``no_below`` documents and in no more
    # than a ``no_above`` fraction of all documents, then keep at most the
    # ``keep_n`` most frequent of those.
    dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=10)
    print(dictionary.dfs)

# Run the Dictionary walkthrough as soon as the module is executed.
test3()

0

阅读 收藏 喜欢 打印 举报/Report
  

新浪BLOG意见反馈留言板 欢迎批评指正

新浪简介 | About Sina | 广告服务 | 联系我们 | 招聘信息 | 网站律师 | SINA English | 产品答疑

新浪公司 版权所有