```python
import os
import sys
import pandas as pd
import jieba

# Load stopwords from one or more files into a single set
def load_stopwords(filenames):
    stopwords = set()
    for filename in filenames:
        with open(filename, 'r', encoding='utf-8') as f:
            for line in f:
                stopwords.add(line.strip())
    return stopwords

# Segment Chinese text with jieba, then drop stopwords and single-character tokens
def segment_and_remove_stopwords(text, stopwords):
    words = jieba.cut(str(text))  # str() guards against NaN values read from the CSV
    filtered_words = [word for word in words if word not in stopwords and len(word) > 1]
    return ' '.join(filtered_words)

# Apply segmentation and stopword removal to the comment column
def process_comments(df, comment_column, stopwords):
    df['connected_words'] = df[comment_column].apply(lambda x: segment_and_remove_stopwords(x, stopwords))
    return df

# Main entry point: load stopwords, read the CSV, process it, and save the result
def main(input_file_path, output_file_path, comment_column, stopwords_files=None):
    # Avoid a mutable default argument
    stopwords_files = stopwords_files or []
    # Load stopwords
    stopwords = load_stopwords(stopwords_files)
    # Read the CSV file
    df = pd.read_csv(input_file_path, encoding='utf-8')
    # Process the comment data
    processed_df = process_comments(df, comment_column, stopwords)
    # Save the processed data to a new CSV file
    processed_df.to_csv(output_file_path, index=False, encoding='utf-8-sig')
    print(f"Preprocessing complete; results saved to {output_file_path}")

if __name__ == '__main__':
    input_file_path = r"D:\pycharm\爬虫案列\24.汽车之家\_0_10.csv"  # path to your CSV file
    output_file_path = 'comments_processed.csv'  # path for the output file
    comment_column = '空间'  # the CSV column that holds the comments (here the '空间' / space reviews)
    # List of stopword files; make sure they exist in your working directory
    stopwords_files = [
        r"stopwords-master\baidu_stopwords.txt",
        r"stopwords-master\cn_stopwords.txt",
        r"stopwords-master\hit_stopwords.txt",
        r"stopwords-master\scu_stopwords.txt",
        # ... other stopword files
    ]
    # Verify that every stopword file exists before processing
    for filename in stopwords_files:
        if not os.path.exists(filename):
            print(f"Stopwords file {filename} not found.")
            sys.exit(1)
    # Call the main function to process the comment data
    main(input_file_path, output_file_path, comment_column, stopwords_files)
```
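A note on the output encoding: `utf-8-sig` prepends a BOM to the file, so Excel recognizes the Chinese text correctly when the CSV is opened directly; plain `utf-8` output often shows up garbled there.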
The stopword lists are available among the blogger's uploaded resources and can be downloaded for free.
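If you want to sanity-check the segmentation before running the full pipeline, a minimal sketch like the one below prints the filtered tokens for a single comment. The inline stopword set and the sample sentence are made up for illustration; in practice you would use `load_stopwords()` with the files listed above.

```python
import jieba

# Tiny inline stopword set standing in for the files above (illustrative only)
stopwords = {'的', '非常', '而且', '很'}

sample = '后排空间非常宽敞,而且储物格的设计很实用'
tokens = [w for w in jieba.cut(sample) if w not in stopwords and len(w) > 1]
print(' '.join(tokens))  # expect tokens like: 后排 空间 宽敞 储物 设计 实用
```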