import pandas as pd
import re
import nltk
from nltk import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from gensim.corpora import Dictionary
from gensim.models import LdaModel
# Download the NLTK resources used by this script: the stop-word corpus,
# the Punkt tokenizer models (needed by word_tokenize) and the VADER
# sentiment lexicon (needed by SentimentIntensityAnalyzer).
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the Punkt tokenizer from the 'punkt_tab'
# package instead of 'punkt'; fetching both keeps word_tokenize working on
# old and new installations alike.
nltk.download('punkt_tab')
nltk.download('vader_lexicon')

# Load spaCy's small English pipeline.
nlp = spacy.load("en_core_web_sm")

# Read the input workbook into a DataFrame.
df = pd.read_excel('nltk分词处理结果第二次.xlsx')
# Text-cleaning helper.
def clean_text(text):
    """Return *text* with HTML tags removed, whitespace collapsed, lowercased.

    Args:
        text: The raw cell value. Normally a string; ``None`` and float NaN
            (which pandas uses for empty cells) are treated as empty text,
            and any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string. Runs of whitespace (including newlines) are
        replaced by a single space; leading/trailing whitespace is collapsed
        to one space but not stripped.
    """
    # Empty spreadsheet cells arrive as None or NaN; re.sub would raise
    # TypeError on them. ``text != text`` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Strip HTML tags (non-greedy, so each tag is removed individually).
    without_tags = re.sub(r'<.*?>', '', text)
    # Collapse consecutive whitespace and line breaks into one space.
    single_spaced = re.sub(r'\s+', ' ', without_tags)
    # Normalise case.
    return single_spaced.lower()
# Clean the raw text column.
df['cleaned_content'] = df['content'].apply(clean_text)

# Word-frequency analysis: tokenize every cleaned document with NLTK and
# count how often each token occurs across the whole column.
words = [
    token
    for text in df['cleaned_content']
    for token in word_tokenize(text)
]
freq_dist = FreqDist(words)
print("词频分析结果:", freq_dist.most_common(10))
# Sentiment analysis: score every cleaned document with VADER and keep the
# 'compound' entry of the returned score dictionary.
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the VADER 'compound' score for *text*."""
    return sia.polarity_scores(text)['compound']


df['sentiment_score'] = df['cleaned_content'].apply(_compound_score)
print("情感分析结果:", df['sentiment_score'])
# Score thresholds used to bucket sentiment scores.
positive_threshold = 0.5
negative_threshold = -0.5


# Classify a document by its sentiment score.
def classify_sentiment(score):
    """Map a sentiment score to a category label.

    Args:
        score: Numeric sentiment score to classify.

    Returns:
        '积极' (positive) if ``score`` is strictly above
        ``positive_threshold``, '消极' (negative) if it is strictly below
        ``negative_threshold``, and '中性' (neutral) otherwise. Scores equal
        to a threshold are therefore neutral.
    """
    if score > positive_threshold:
        return '积极'
    if score < negative_threshold:
        return '消极'
    return '中性'
# Apply the classifier to every score, storing the label in a new
# 'sentiment_category' column.
df['sentiment_category'] = df['sentiment_score'].apply(classify_sentiment)

# Show the cleaned text alongside its score and category.
report_columns = ['cleaned_content', 'sentiment_score', 'sentiment_category']
print(df[report_columns])
# Topic modelling.
# Tokenize each cleaned document with spaCy, keeping only alphabetic tokens
# that are not spaCy stop words. nlp.pipe processes the documents as a
# stream instead of invoking the pipeline once per row, and token.lower_
# avoids lowercasing every token twice.
tokens = [
    [
        token.lower_
        for token in doc
        if token.is_alpha and token.lower_ not in STOP_WORDS
    ]
    for doc in nlp.pipe(df['cleaned_content'])
]

# Build the vocabulary and the bag-of-words corpus gensim expects.
dictionary = Dictionary(tokens)
corpus = [dictionary.doc2bow(doc_tokens) for doc_tokens in tokens]

# Fit a 5-topic LDA model with 15 passes over the corpus.
# NOTE(review): no random_state is passed, so the topics may differ between
# runs — consider fixing a seed if reproducibility matters.
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=15)

# Print the five highest-weighted words of each topic.
topics = lda_model.print_topics(num_words=5)
print("主题建模结果:")
for topic in topics:
    print(topic)