Word Segmentation and Word Frequency Counting in Python
(2020-11-26 09:13:17)
Tags: cxd1301, 轩辕小子, word segmentation, word frequency, jieba
Category: python
# -*- coding: utf-8 -*-
import xlrd   # note: xlrd 2.0 dropped .xlsx support; this script assumes xlrd < 2.0
import jieba

# Load a custom user dictionary so domain terms are kept as single tokens
jieba.load_userdict('userdict.txt')
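# userdict.txt is assumed to follow jieba's user-dictionary format: one entry
# per line, "word [frequency] [POS tag]" with frequency and tag optional, e.g.:
#     营商环境 10 n
#     高新技术企业
# (illustrative entries; the actual file is not shown in the post)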
if __name__ == "__main__":
    # Load stop word list 1 (custom); initialize the list before the try
    # block so a failed open does not leave it undefined
    stop_words = []
    try:
        with open('stop_words.txt', 'r', encoding='utf-8') as f:
            for word in f.readlines():
                stop_words.append(word.strip('\n'))
    except OSError:
        print("Failed to load stop_words.txt")

    # Load stop word list 2 (province/city/district names), skipping duplicates
    try:
        with open('stop_words_area.txt', 'r', encoding='utf-8') as f:
            for word in f.readlines():
                w = word.strip('\n')
                if w not in stop_words:
                    stop_words.append(w)
    except OSError:
        print("Failed to load stop_words_area.txt")
    # Load the policy titles: first column of the first sheet
    file_name = 'resource.xlsx'
    xl = xlrd.open_workbook(file_name)
    table = xl.sheets()[0]
    cols = table.col_values(0)
    # Segment each title (precise mode) and drop stop words and whitespace
    word_lst = []
    for mycol in cols:
        tags = jieba.cut(mycol.strip(), cut_all=False)
        for t in tags:
            if t not in stop_words and not t.isspace():
                word_lst.append(t)
    # Count occurrences of each distinct token
    word_dict = {}
    for item in word_lst:
        if item not in word_dict:
            word_dict[item] = 1
        else:
            word_dict[item] += 1
    # Sort by frequency, highest first (a Counter-based alternative is
    # sketched after the listing)
    sorted_dict = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    print(sorted_dict)
    # Persist tokens that occur more than 10 times
    with open('segment_frequency.txt', 'w', encoding='utf-8') as out:
        for k, v in sorted_dict:
            if v > 10:
                out.write(str(k) + ' ' + str(v) + '\n')
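The manual dict-based counting and sorting above can be collapsed into collections.Counter from the standard library. A minimal sketch, using a stand-in token list in place of the word_lst built by the script:

from collections import Counter

word_lst = ['政策', '企业', '政策', '扶持', '企业', '政策']  # stand-in tokens
word_counts = Counter(word_lst)          # token -> frequency
for k, v in word_counts.most_common():   # sorted by count, descending
    print(k, v)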
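One caveat on the Excel step: xlrd removed .xlsx support in version 2.0, so xlrd.open_workbook('resource.xlsx') only works with xlrd < 2.0. In a current environment the same first column can be read with openpyxl instead; a sketch under that assumption:

from openpyxl import load_workbook

wb = load_workbook('resource.xlsx', read_only=True)
ws = wb.worksheets[0]
# Values of the first column (skipping empty cells), roughly mirroring
# table.col_values(0) in the script above
cols = [row[0] for row in ws.iter_rows(min_col=1, max_col=1, values_only=True)
        if row[0] is not None]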