import random

def choose_from_hist(hist_dict):
    frequency_list = list()
    # Walk the key-value pairs
    for key, val in hist_dict.items():
        # Append the key to the list val times
        for i in range(val):
            frequency_list.append(key)
    random_res = random.choice(frequency_list)
    return random_res

for v in range(3):
    histogram_dict = {'a': 2, 'b': 1}
    res = choose_from_hist(histogram_dict)
    print(res)
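Building the expanded frequency_list copies each word once per occurrence. As a lighter alternative (my addition, not part of the book's code), random.choices can weight the keys by their counts directly:

import random

def choose_from_hist_weighted(hist_dict):
    # random.choices draws k items, weighting each key by its count,
    # without materializing the expanded list.
    keys = list(hist_dict)
    return random.choices(keys, weights=list(hist_dict.values()), k=1)[0]

print(choose_from_hist_weighted({'a': 2, 'b': 1}))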
# Full code
import string

# Characters to strip; the text also contains the curly quotes " and "
remove_symbol = string.punctuation + string.whitespace + '""'

# Read the file
def read_file(file_name):
    """
    :param file_name: path to the file
    :return: histogram
    """
    hist = dict()
    file_data = open(file_name, encoding='utf8')
    # Skip the header
    skip_header(file_data)
    # file_data now starts after the header.
    for line in file_data:
        # Stop at the footer
        if line.startswith('*** END OF THIS PROJECT'):
            break
        # Many words are joined with '-'; replace '-' with a space
        words_line = line.replace('-', ' ')
        # Split the string on whitespace
        words_list = words_line.split()
        for word in words_list:
            # Strip symbols from both ends of the word; leave the middle alone
            word = word.strip(remove_symbol)
            # Convert to lowercase
            lower_word = word.lower()
            # Count the frequency
            hist[lower_word] = hist.get(lower_word, 0) + 1
    return hist

def skip_header(file_obj):
    """
    :param file_obj: file object
    :return: None
    Reads the file object and stops at the line
    *** START OF THIS PROJECT GUTENBERG EBOOK EMMA ***
    """
    for line in file_obj:
        if line.startswith('*** START OF THIS PROJECT'):
            break

def most_common(hist):
    t = []
    for key, value in hist.items():
        t.append((value, key))
    t.sort(reverse=True)
    return t

hist = read_file('emma.txt')
t = most_common(hist)
print('The most common words are:')
for freq, word in t[:10]:
    print(word, freq, sep='\t')
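For comparison, a sketch using the standard library (my addition, not from the book): collections.Counter does the same counting and ranking, so the most_common step can be delegated to it:

from collections import Counter

# Counter accepts an existing mapping of counts; most_common returns
# (word, count) pairs sorted by count, descending.
counter = Counter(hist)
for word, freq in counter.most_common(10):
    print(word, freq, sep='\t')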
# Simplified version. Note: the header and footer still need to be removed.
import string

# Characters to strip; the text also contains the curly quotes " and "
remove_symbol = string.punctuation + string.whitespace + '""'

# Read the file
def read_file(file_name):
    hist = dict()
    file_data = open(file_name, encoding='utf8')
    # Skip the header
    skip_header(file_data)
    # file_data now starts after the header.
    for line in file_data:
        # Stop at the footer
        if line.startswith('*** END OF THIS PROJECT'):
            break
        # Many words are joined with '-'; replace '-' with a space
        words_line = line.replace('-', ' ')
        # Split the string on whitespace
        words_list = words_line.split()
        for word in words_list:
            # Strip symbols from both ends of the word; leave the middle alone
            word = word.strip(remove_symbol)
            # Convert to lowercase
            lower_word = word.lower()
            # Count the frequency
            hist[lower_word] = hist.get(lower_word, 0) + 1
    return hist

def skip_header(file_obj):
    for line in file_obj:
        if line.startswith('*** START OF THIS PROJECT'):
            break

hist = read_file('emma.txt')
# Simplified ranking
hist_list = list(hist.items())
hist_list.sort(key=lambda x: x[1], reverse=True)
print('The most common words are:')
for word, freq in hist_list[:10]:
    print(word, freq, sep='\t')
# Full code
import string

# Characters to strip; the text also contains the curly quotes " and "
remove_symbol = string.punctuation + string.whitespace + '""'

# Read the file
def read_file(file_name):
    hist = dict()
    file_data = open(file_name, encoding='utf8')
    # Skip the header
    skip_header(file_data)
    # file_data now starts after the header.
    for line in file_data:
        # Stop at the footer
        if line.startswith('*** END OF THIS PROJECT'):
            break
        # Many words are joined with '-'; replace '-' with a space
        words_line = line.replace('-', ' ')
        # Split the string on whitespace
        words_list = words_line.split()
        for word in words_list:
            # Strip symbols from both ends of the word; leave the middle alone
            word = word.strip(remove_symbol)
            # Convert to lowercase
            lower_word = word.lower()
            # Count the frequency
            hist[lower_word] = hist.get(lower_word, 0) + 1
    return hist

def skip_header(file_obj):
    for line in file_obj:
        if line.startswith('*** START OF THIS PROJECT'):
            break

def most_common(hist):
    t = []
    for key, value in hist.items():
        t.append((value, key))
    t.sort(reverse=True)
    return t

def print_most_common(hist, num=10):
    t = most_common(hist)
    print('The most common words are:')
    for freq, word in t[:num]:
        print(word, freq, sep='\t')

hist = read_file('emma.txt')
print_most_common(hist)
# print_most_common(hist, 20)

words = process_file('words')
diff = subtract(hist, words)
print("Words in the book that aren't in the word list:")
for word in diff:
    print(word, end=' ')

# My result
Words in the book that aren't in the word list:
emma austen i woodhouse a sister's remembrance taylor mr woodhouse's taylor's ...
# Full code. One difference: when calling the function to build the histogram,
# a flag controls whether to skip the header.
import string

# Characters to strip; the text also contains the curly quotes " and "
remove_symbol = string.punctuation + string.whitespace + '""'

# Read the file
def process_file(file_name, is_skip=True):
    hist = dict()
    file_data = open(file_name, encoding='utf8')
    if is_skip:
        # Skip the header; file_data then starts after it.
        skip_header(file_data)
    for line in file_data:
        # Stop at the footer
        if line.startswith('*** END OF THIS PROJECT'):
            break
        # Many words are joined with '-'; replace '-' with a space
        words_line = line.replace('-', ' ')
        # Split the string on whitespace
        words_list = words_line.split()
        for word in words_list:
            # Strip symbols from both ends of the word; leave the middle alone
            word = word.strip(remove_symbol)
            # Convert to lowercase
            lower_word = word.lower()
            # Count the frequency
            hist[lower_word] = hist.get(lower_word, 0) + 1
    return hist

def skip_header(file_obj):
    for line in file_obj:
        if line.startswith('*** START OF THIS PROJECT'):
            break

# Dictionary subtraction
def subtract(d1, d2):
    res = dict()
    for key in d1:
        if key not in d2:
            res[key] = None
    return res

# Skip the header and footer
hist = process_file('emma.txt')
# Do not skip the header
words = process_file('words.txt', False)
diff = subtract(hist, words)
print("Words in the book that aren't in the word list:")
for word in diff:
    print(word, end=' ')
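As a side note (my addition, not the book's approach): dictionary key views behave like sets, so the same difference can be computed in one line:

# Keys that appear in hist but not in words; the result is a set.
diff_keys = hist.keys() - words.keys()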
import string
import random

# Characters to strip; the text also contains the curly quotes " and "
remove_symbol = string.punctuation + string.whitespace + '""'

# Read the file
def process_file(file_name, is_skip=True):
    hist = dict()
    file_data = open(file_name, encoding='utf8')
    if is_skip:
        # Skip the header; file_data then starts after it.
        skip_header(file_data)
    for line in file_data:
        # Stop at the footer
        if line.startswith('*** END OF THIS PROJECT'):
            break
        # Many words are joined with '-'; replace '-' with a space
        words_line = line.replace('-', ' ')
        # Split the string on whitespace
        words_list = words_line.split()
        for word in words_list:
            # Strip symbols from both ends of the word; leave the middle alone
            word = word.strip(remove_symbol)
            # Convert to lowercase
            lower_word = word.lower()
            # Count the frequency
            hist[lower_word] = hist.get(lower_word, 0) + 1
    return hist

def skip_header(file_obj):
    for line in file_obj:
        if line.startswith('*** START OF THIS PROJECT'):
            break

# Build the list of cumulative counts
def make_count_list(d):
    count_list = list()
    count = 0
    for val in d.values():
        count = count + val
        count_list.append(count)
    return count_list

# Fully expanded word list
def full_list(hist):
    # Word positions start at 1; index 0 is a placeholder
    full = [' ', ]
    for k, v in hist.items():
        for i in range(v):
            full.append(k)
    return full

# Binary search for the random value
def dichotomy(tem_list, find, high_index, lower_index=0, index=None, direction=None):
    # Base case
    len_list = len(tem_list)
    if len_list == 0:
        print('not found', direction)
        # If the value is not found after cutting right, the answer is the next
        # word, so add 1; after cutting left we are still inside the current
        # word's range, so return the index unchanged.
        if direction == 'right':
            return index + 1
        if direction == 'left':
            return index
    # Middle element
    middle = len_list // 2
    index = (lower_index + high_index) // 2
    print(f'Element {middle} of the current slice, element {(lower_index + high_index) // 2} of the original list.')
    # Cut to the right -->
    if find > tem_list[middle]:
        lower = middle + 1
        # Move lower_index right by middle + 1 positions
        lower_index = lower_index + lower
        index = dichotomy(tem_list[lower:], find, high_index, lower_index, index, direction='right')
        return index
    elif find < tem_list[middle]:
        high = middle
        # Move the high bound left
        high_index -= (len_list - len(tem_list[:high]))
        index = dichotomy(tem_list[:high], find, high_index, lower_index, index, direction='left')
        return index
    else:
        print('found')
        return index

# Pick a random word
def randint_num():
    # Skip the header and footer
    res_hist = process_file('emma.txt')
    # Build the word list
    g_words_list = list(res_hist.keys())
    # Each word appears once, but each cumulative value covers a range of
    # positions; e.g. for the first word, 'produced': 15, positions 1-15
    # all map to 'produced'.
    res_count_list = make_count_list(res_hist)
    # Draw a random number starting at 1; starting at 0 would give the
    # first word one extra slot.
    fin_num = random.randint(1, res_count_list[-1])
    # Test data: replace fin_num with a number from the list
    # [15 (range 1-15), 588 (range 16-588), 1052, 1053, 1054, 1841, 2124]
    # 'produced', 'by', 'an', 'anonymous', 'volunteer', 'emma', 'jane',
    # Find which position this value falls at
    # fin_num = 588
    index = dichotomy(res_count_list, fin_num, len(res_count_list))
    print(f'Position {index} in the compact list:', g_words_list[index])
    # Check against the fully expanded list
    full = full_list(res_hist)
    print(f'Position {fin_num} in the full list:', full[fin_num])

randint_num()
"""
Position 58 in the compact list: mistress
Position 45709 in the full list: been
"""
# The author's code. One difference: his random number and stored word
# positions start at 0, while mine start at 1.
import string
import random
from bisect import bisect

def process_file(filename, skip_header):
    hist = {}
    fp = open(filename, encoding='utf8')
    if skip_header:
        skip_gutenberg_header(fp)
    for line in fp:
        if line.startswith('*** END OF THIS'):
            break
        process_line(line, hist)
    return hist

def skip_gutenberg_header(fp):
    for line in fp:
        if line.startswith('*** START OF THIS'):
            break

def process_line(line, hist):
    line = line.replace('-', ' ')
    strip_ables = string.punctuation + string.whitespace
    for word in line.split():
        word = word.strip(strip_ables)
        word = word.lower()
        hist[word] = hist.get(word, 0) + 1

def random_word(hist):
    # Word list
    words = []
    # Cumulative frequencies
    freqs = []
    # Running total of frequencies
    total_freq = 0
    # Walk the key-value pairs
    for word, freq in hist.items():
        total_freq += freq
        words.append(word)
        freqs.append(total_freq)
    # Number to look up
    x = random.randint(0, total_freq - 1)
    # Index of the matching word
    index = bisect(freqs, x)
    return words[index]

hist = process_file('158-0.txt', skip_header=True)
print(random_word(hist), end=' ')
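To see what bisect is doing with the cumulative list, here is a small check (my addition) using the same test values as my earlier block (15, 588, 1052):

from bisect import bisect

freqs = [15, 588, 1052]
# bisect returns the index of the first cumulative count greater than x,
# which is exactly the word whose range contains x when counting from 0.
print(bisect(freqs, 14))   # 0 -> first word
print(bisect(freqs, 15))   # 1 -> second word
print(bisect(freqs, 587))  # 1 -> second word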
13.8 Markov analysis
If you pick words from a book at random, you can get a feel for its vocabulary, but random picks will probably not produce a sentence:
this the small regard harriet which knightley's it most things
A sequence of random words seldom makes sense, because there is no relationship between adjacent words.
For example, in a real sentence you would expect the article 'the' to be followed by an adjective, not by a verb or an adverb.
One way to measure this kind of relationship is Markov analysis,
which characterizes, for a given sequence of words, the probability of the word that comes next.
For example, the song "Eric, the Half a Bee" begins:
Half a bee, philosophically,
Must, ipso facto, half not be.
But half the bee has got to be
Vis a vis, its entity. D'you see?
But can a bee be said to be
Or not to be an entire bee
When half the bee is not a bee
Due to some ancient injury?
Markov analysis:
1. Write a program that reads text from a file and performs Markov analysis.
The result should be a dictionary that maps prefixes to a collection of possible suffixes. The collection might be a list, a tuple, or a dictionary;
it is up to you to make an appropriate choice. You can test the program with a prefix length of 2,
but write it so that it is easy to switch to other prefix lengths. (A sketch follows after this list.)
2. Add a function to the previous program that generates random text based on the Markov analysis.
Here is an example generated from Emma with prefix length 2:
He was very clever, be it sweetness or be angry, ashamed or only amused,
at such a stroke. She had never thought of Hannah till you were never meant for me?"
"I cannot make speeches, Emma:" he soon cut it all himself.
For this example, I left the punctuation attached to each word. The result is almost syntactically correct, but not quite.
Semantically, it almost makes sense, but not quite.
What happens when you increase the prefix length? Does the random text look more meaningful?
3. Once your program is working, you might want to try a mash-up: if you combine text from two or more books,
the random text you generate will blend their vocabulary and phrases in interesting ways.
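Here is a minimal sketch of steps 1 and 2 (my own sketch, not the author's solution; it assumes the text has already been split into a list of words, and uses prefix tuples as dictionary keys):

import random

def markov_map(words, order=2):
    # Map each prefix of `order` consecutive words to the list of words
    # observed immediately after it.
    suffix_map = {}
    for i in range(len(words) - order):
        prefix = tuple(words[i:i + order])
        suffix_map.setdefault(prefix, []).append(words[i + order])
    return suffix_map

def generate(suffix_map, length=50):
    # Start from a random prefix, then repeatedly append a random suffix
    # and slide the prefix window forward by one word.
    prefix = random.choice(list(suffix_map))
    out = list(prefix)
    for _ in range(length):
        suffixes = suffix_map.get(prefix)
        if suffixes is None:
            break
        word = random.choice(suffixes)
        out.append(word)
        prefix = prefix[1:] + (word,)
    return ' '.join(out)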
Credit: this case study is based on an example from Kernighan and Pike's The Practice of Programming (Addison-Wesley, 1999).
You should attempt this exercise before you go on; then you can download my solution from:
https://raw.githubusercontent.com/AllenDowney/ThinkPython2/master/code/markov.py
You will also need:
https://raw.githubusercontent.com/AllenDowney/ThinkPython2/master/code/emma.txt