python
import math
import jieba.posseg as psg

def stopword():
    # Load the stop-word list, one word per line
    stop_word_path = r'C:/Users/DELL/douban/douban/cn_stopwords.txt'
    with open(stop_word_path, encoding='utf-8') as f:
        stopword_list = [sw.strip() for sw in f]
    return stopword_list
def cut_word(sentence):
    # Use POS-tagged segmentation so each token carries .word and .flag
    seg_list = psg.cut(sentence)
    return seg_list
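# Note: plain jieba.cut() yields bare strings, which have no .word or
# .flag attributes, so word_filter() below would raise AttributeError
# on them; psg.cut() yields pair objects instead, e.g. something like
# pair('电影', 'n') under the default dictionary (exact segmentation
# may vary by jieba version).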
def word_filter(seg_list):
    stopword_list = stopword()
    filter_list = []
    for seg in seg_list:
        word = seg.word
        flag = seg.flag
        if not flag.startswith('n'):
            continue
        if word not in stopword_list and len(word) > 1:
            filter_list.append(word)
    return filter_list
def tf_value(filter_list):
    # TF: occurrence count of each word divided by the total word count
    tf_value_dict = {}
    tf_dict = {}
    for word in filter_list:
        tf_value_dict[word] = tf_value_dict.get(word, 0.0) + 1.0
    for key, value in tf_value_dict.items():
        tf_dict[key] = value / len(filter_list)
    return tf_dict
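# Worked example: for filter_list = ['电影', '文学', '电影'],
# tf_value() returns {'电影': 2/3, '文学': 1/3}.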
def load_data():
    # Treat each line of the corpus file as one document
    corpus_path = r'C:/Users/DELL/douban/douban/why.txt'
    doc_list = []
    with open(corpus_path, 'r', encoding='utf-8') as f:
        for line in f:
            content = line.strip()
            seg_list = cut_word(content)
            filter_word = word_filter(seg_list)
            doc_list.append(filter_word)
    return doc_list
def train_idf():
    doc_list = load_data()
    idf_dic = {}
    total_doc_num = len(doc_list)  # total number of documents
    # Count how many documents each word appears in
    for doc in doc_list:
        for word in set(doc):
            idf_dic[word] = idf_dic.get(word, 0.0) + 1.0
    # Convert the counts with the IDF formula
    for key, value in idf_dic.items():
        # The +1 is Laplace smoothing: it keeps the denominator non-zero
        # for new words that never appear in the corpus
        idf_dic[key] = math.log(total_doc_num / (1.0 + value))
    return idf_dic
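# Worked example: with 10 documents and a word that appears in 4 of
# them, idf = log(10 / (1 + 4)) = log(2) ≈ 0.693 (natural log).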
def tf_idf(tf_dict):
    # tf_dict maps word -> TF; train_idf() returns word -> IDF
    idf_dict = train_idf()
    tf_idf_dict = {}
    for key, value in tf_dict.items():
        # Words missing from the IDF table keep their bare TF (factor 1.0)
        tf_idf_dict[key] = value * idf_dict.get(key, 1.0)
    return tf_idf_dict
def rank(tf_idf_dict):
    # Sort by TF-IDF score (descending) and print the top keywords
    keyword_num = 10
    final_list = sorted(tf_idf_dict.items(), key=lambda x: x[1], reverse=True)
    for word, _ in final_list[:keyword_num]:
        print(word + '/', end='')
    print()
if __name__ == '__main__':
    text = '文学'
    seg_list = cut_word(text)
    filter_word = word_filter(seg_list)
    tf = tf_value(filter_word)
    tf_idf_dict = tf_idf(tf)
    rank(tf_idf_dict)
Can anyone tell me how to get this working?
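In case it helps, here is a minimal self-contained sketch of the same pipeline (score = TF × IDF, with IDF = log(D / (1 + document frequency)) as in train_idf above). The stop-word set and the three sample sentences are made up so it runs without cn_stopwords.txt or why.txt; tokenize, tf, and idf are hypothetical helper names, and jieba's exact segmentation may differ on your version:

python
import math
import jieba.posseg as psg

# Made-up stand-ins for cn_stopwords.txt and why.txt, just for illustration
STOPWORDS = {'的', '了', '是', '和'}
DOCS = [
    '我喜欢看电影和文学作品',
    '这部电影的故事情节很精彩',
    '他在图书馆读历史书籍',
]

def tokenize(text):
    # Same filter as above: nouns only, not a stop word, longer than one char
    return [p.word for p in psg.cut(text)
            if p.flag.startswith('n') and p.word not in STOPWORDS and len(p.word) > 1]

def tf(words):
    # TF: count of each word over the total word count of the document
    counts = {}
    for w in words:
        counts[w] = counts.get(w, 0) + 1
    return {w: c / len(words) for w, c in counts.items()}

def idf(docs_tokens):
    # Document frequency per word, then the smoothed IDF formula
    df = {}
    for tokens in docs_tokens:
        for w in set(tokens):
            df[w] = df.get(w, 0) + 1
    # +1 is the same Laplace smoothing as in train_idf()
    return {w: math.log(len(docs_tokens) / (1.0 + n)) for w, n in df.items()}

docs_tokens = [tokenize(d) for d in DOCS]
idf_dict = idf(docs_tokens)
# Score the words of the first document and print the top 10
scores = {w: t * idf_dict.get(w, 1.0) for w, t in tf(docs_tokens[0]).items()}
for w, s in sorted(scores.items(), key=lambda x: x[1], reverse=True)[:10]:
    print(w, round(s, 4))

With only three toy documents the absolute scores mean little; the point is the data flow: tokenize → TF per document → IDF over the whole corpus → multiply and sort.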