First, preprocess the article: strip out spaces, punctuation, and everything that is not a Chinese character.
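One minimal way to do that cleanup is a single regular-expression substitution (a sketch only; the sample text here is illustrative, and Program 1 below uses re.findall over the same character range instead):

import re

# Remove everything that is not a CJK Unified Ideograph:
# spaces, punctuation, Latin letters, digits, and so on
cleaned = re.sub(r'[^\u4e00-\u9fff]', '', '你好, world! 再见。')
print(cleaned)  # -> 你好再见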
Program 1:
import re
import os
from collections import Counter
path = r"D:\stzf"
os.chdir(path) # 修改工作路径
# Read the article text (saved from the Word document as abc.txt)
with open('abc.txt', 'r', encoding='utf-8') as file:
    content = file.read()
# Extract all Chinese characters
pattern = re.compile(r'[\u4e00-\u9fff]+')  # match contiguous runs of Chinese characters
chinese_words = pattern.findall(content)
# Debug: uncomment to inspect the extracted runs
# for i in chinese_words:
#     print(i)
# Count occurrences of individual characters
char_counter = Counter(''.join(chinese_words))
# Count each contiguous run as a rough "phrase"; for phrases of a
# configurable length, see the sketch after this program
word_counter = Counter(chinese_words)
# Print the most frequent individual characters
print('Most frequent individual characters:')
for char, count in char_counter.most_common(10):  # the 10 most frequent characters
    print(char, count)
# Print the most frequent phrases; most_common(10) alone could yield fewer
# than 10 after filtering out single characters, so keep iterating until
# 10 multi-character runs have been shown
print('Most frequent phrases:')
shown = 0
for word, count in word_counter.most_common():
    if len(word) > 1:
        print(word, count)
        shown += 1
        if shown == 10:
            break
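The comment in Program 1 says the phrase length can be set as needed. Here is a minimal sketch of one way to do that, continuing from Program 1's chinese_words (the names n and ngram_counter are my own):

n = 3  # phrase length; adjust as needed
# Slide a window of width n over each contiguous run of Chinese
# characters and count every n-character substring
ngram_counter = Counter(
    run[i:i + n]
    for run in chinese_words
    for i in range(len(run) - n + 1)
)
print(f'Most frequent {n}-character phrases:')
for phrase, count in ngram_counter.most_common(10):
    print(phrase, count)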
Program 2: using jieba for word segmentation.
First install jieba: pip install jieba --upgrade
import os
import jieba
from collections import Counter
path = r"D:\stzf"
os.chdir(path) # 修改工作路径
with open('abc.txt', 'r', encoding='utf-8') as file:
    content = file.read()
# Segment the text with jieba and keep only Chinese characters
seg_list = jieba.cut(content)
chinese_chars = []  # individual characters, for single-character counts
chinese_words = []  # multi-character words as segmented by jieba
for word in seg_list:
    for char in word:
        if '\u4e00' <= char <= '\u9fff':
            chinese_chars.append(char)
    # keep multi-character tokens made up entirely of Chinese characters,
    # so the phrase counts follow jieba's word boundaries
    if len(word) > 1 and all('\u4e00' <= c <= '\u9fff' for c in word):
        chinese_words.append(word)
# Count occurrences of individual characters
char_counter = Counter(chinese_chars)
# Count occurrences of words (jieba determines the boundaries)
word_counter = Counter(chinese_words)
# Print the most frequent individual characters
print('Most frequent individual characters:')
for char, count in char_counter.most_common(200):  # top 200 characters
    print(char, count)
# Print the most frequent words
print('Most frequent words:')
for word, count in word_counter.most_common(200):  # top 200 words
    print(word, count)
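As a side note, jieba also bundles a TF-IDF keyword extractor, which often gives cleaner results than raw frequency counts. A minimal sketch, continuing from Program 2's content (topK=20 is an illustrative choice, not from the original program):

import jieba.analyse

# extract_tags ranks words by TF-IDF against jieba's built-in corpus;
# withWeight=True returns (word, weight) pairs instead of bare words
keywords = jieba.analyse.extract_tags(content, topK=20, withWeight=True)
print('TF-IDF keywords:')
for word, weight in keywords:
    print(word, round(weight, 4))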