# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import os
# Score every .xlsx workbook in the current directory: for each row, compare
# the text of two columns by bag-of-words cosine similarity, label the pair,
# and write the labels and scores to a .txt file named after the workbook.

# Similarity thresholds used to label a pair of texts.
CORRECT_THRESHOLD = 0.3
PARTIAL_THRESHOLD = 0.05


def classify_similarity(similarity_value):
    """Map a cosine similarity to the label written to the report.

    Args:
        similarity_value: Cosine similarity of the two compared texts.

    Returns:
        "正确" when the value is above 0.3, "部分正确" when it is above 0.05
        (up to and including 0.3), and "错误" otherwise.
    """
    if similarity_value > CORRECT_THRESHOLD:
        return "正确"
    # No upper bound on this branch: an extra `< 0.3` test would let a value
    # of exactly 0.3 fall through to "错误".
    if similarity_value > PARTIAL_THRESHOLD:
        return "部分正确"
    return "错误"


def cell_to_text(value):
    """Return a spreadsheet cell as text, treating a missing cell as ""."""
    # Blank cells are read as NaN and numeric cells as numbers; the
    # vectorizer only accepts strings.
    if pd.isna(value):
        return ""
    return str(value)


def pair_similarity(first_text, second_text):
    """Return the cosine similarity of the term-count vectors of two texts.

    Args:
        first_text: First text to compare.
        second_text: Second text to compare.

    Returns:
        The cosine similarity of the two texts, or 0.0 when neither text
        produces any token.
    """
    # NOTE(review): CountVectorizer's default tokenizer splits on word
    # boundaries; if the cells hold unsegmented Chinese text, each run of
    # characters becomes one token -- confirm this is the intended comparison.
    try:
        term_counts = CountVectorizer().fit_transform([first_text, second_text])
    except ValueError:
        # fit_transform rejects an empty vocabulary (no tokens in either
        # text); such a pair shares no terms, so score it as 0.
        return 0.0
    return cosine_similarity(term_counts)[0][1]


def score_workbook(excel_file):
    """Score one workbook row by row and write the result next to it.

    Args:
        excel_file: Path of the .xlsx workbook to read.

    The output file has the workbook's name with a .txt extension and holds
    two lines per row: the label, then the similarity value.
    """
    data = pd.read_excel(excel_file)

    # NOTE(review): iloc positions are zero-based, so these are the 2nd and
    # 4th columns of the sheet, while the printed labels below say columns 3
    # and 5 -- confirm which columns are intended.
    column3_values = data.iloc[:, 1]
    column5_values = data.iloc[:, 3]

    print("第3列的值:")
    print(column3_values)
    print("\n第5列的值:")
    print(column5_values)
    print(len(column3_values))
    print(len(column5_values))

    res = []
    # Compare the two texts of each row; zip pairs the cells by position, so
    # the result does not depend on the DataFrame's index labels.
    for first_cell, second_cell in zip(column3_values, column5_values):
        similarity_value = pair_similarity(
            cell_to_text(first_cell), cell_to_text(second_cell)
        )
        res.append(classify_similarity(similarity_value))
        res.append(similarity_value)
        print("两段话的余弦相似度:", similarity_value)

    # splitext strips only the trailing extension; str.replace would remove
    # every ".xlsx" occurring anywhere in the name.
    txt_file = os.path.splitext(excel_file)[0] + ".txt"
    # Explicit UTF-8 so the Chinese labels can be written whatever the
    # platform's default encoding is.
    with open(txt_file, "w", encoding="utf-8") as f:
        f.writelines("%s\n" % item for item in res)
    print("内容已写入到", txt_file)


# Directory the script is run from.
current_dir = os.getcwd()
# Names of all .xlsx files in that directory.
xlsx_files = [file for file in os.listdir(current_dir) if file.endswith(".xlsx")]

for excel_file in xlsx_files:
    print(excel_file)
    score_workbook(excel_file)