Preface
This article walks through classifying positive and negative product reviews with a machine-learning algorithm, then using the trained model to make a reasonable prediction about whether a new review is positive or negative.
I. Fetching and saving the data
1. Fetching the positive reviews
```python
import requests
import time
import re
from lxml import etree

head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}

def pa():
    for j in range(1, 200):
        time.sleep(3)  # throttle requests so we don't hammer the server
        url = f"https://review.suning.com/cluster_cmmdty_review/cluster-38249278-000000012389328846-0000000000-{j}-good.htm?originalCmmdtyType=general&safp=d488778a.10004.loverRight.166"
        response = requests.get(url, headers=head)
        sn_text = response.text
        tree = etree.HTML(sn_text)
        li_list = tree.xpath("//div[@class='rv-target-item']/div")
        sn_coments = []
        for i in li_list:
            sn_coment = ''.join(i.xpath(".//p[@class='body-content']/text()"))  # xpath returns a list
            sn_coment = re.match(r".*[\u4e00-\u9fff]*.*?", sn_coment.strip()).group()  # effectively a pass-through for single-line comments
            sn_coments.append(sn_coment)
            print(sn_coment)
            sngood.write(sn_coment + '\n')
        # print(sn_coments)

if __name__ == '__main__':
    sngood = open('sngood.txt', 'w', encoding='utf8')
    pa()
    sngood.close()
```
Positive review data:
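One caveat about the scraper above: it assumes every page request succeeds and that requests detects the encoding correctly. A slightly more defensive fetch helper, as a minimal sketch (the status check, timeout, and encoding fallback are my additions, not part of the original code):

```python
import requests

def fetch_page(url, headers):
    """Fetch one review page; return its HTML, or None if the request failed."""
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:  # skip pages that did not load
        return None
    response.encoding = response.apparent_encoding  # fall back to the detected encoding
    return response.text
```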
2. Fetching the negative reviews
```python
import requests
import time
import re
from lxml import etree

head = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
}

def pa():
    for j in range(1, 3):
        time.sleep(3)  # throttle requests so we don't hammer the server
        url = f"https://review.suning.com/cluster_cmmdty_review/cluster-38249278-000000012389328846-0000000000-{j}-bad.htm?originalCmmdtyType=general&safp=d488778a.10004.loverRight.166&safpn=10009"
        response = requests.get(url, headers=head)
        sn_text = response.text
        tree = etree.HTML(sn_text)
        li_list = tree.xpath("//div[@class='rv-target-item']/div")
        sn_coments = []
        for i in li_list:
            sn_coment = ''.join(i.xpath(".//p[@class='body-content']/text()"))  # xpath returns a list
            sn_coment = re.match(r".*[\u4e00-\u9fff]*.*?", sn_coment.strip()).group()  # effectively a pass-through for single-line comments
            sn_coments.append(sn_coment)
            print(sn_coment)
            snbad.write(sn_coment + '\n')
        # print(sn_coments)

if __name__ == '__main__':
    snbad = open('snbad.txt', 'w', encoding='utf8')
    pa()
    snbad.close()
```
Negative review data:
II. Processing the data and building the model
1. Processing the data
1. Reading the data
```python
import pandas as pd

# Each line of the txt file is one comment and there is no header row,
# so name the single column 'content' explicitly.
cp_content = pd.read_csv('snbad.txt', encoding='utf8', sep='\t', header=None, names=['content'])
hp_content = pd.read_csv('sngood.txt', encoding='utf8', sep='\t', header=None, names=['content'])
```
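Optionally, empty lines and duplicate comments can be dropped before tokenizing; this cleanup step is my addition, not part of the original pipeline:

```python
# Remove blank rows and exact duplicates so repeated boilerplate reviews
# don't dominate the word counts.
cp_content = cp_content.dropna().drop_duplicates().reset_index(drop=True)
hp_content = hp_content.dropna().drop_duplicates().reset_index(drop=True)
print(len(cp_content), len(hp_content))  # sanity check on the corpus sizes
```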
2. Tokenizing the positive and negative reviews with the jieba library
```python
"""
Tokenize the negative reviews
"""
import jieba

cp_segments = []
contents = cp_content['content'].values.tolist()  # move the comments into a plain list
for content in contents:
    results = jieba.lcut(content)  # tokenize each comment
    if len(results) > 1:
        cp_segments.append(results)

cp_fc_results = pd.DataFrame({'content': cp_segments})
cp_fc_results.to_excel('snbad.xlsx', index=False)

"""
Tokenize the positive reviews
"""
hp_segments = []
contents = hp_content['content'].values.tolist()  # move the comments into a plain list
for content in contents:
    results = jieba.lcut(content)  # tokenize each comment
    if len(results) > 1:
        hp_segments.append(results)

hp_fc_results = pd.DataFrame({'content': hp_segments})
hp_fc_results.to_excel('sngood.xlsx', index=False)
```
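For intuition, `jieba.lcut` turns a Chinese sentence into a list of word tokens. A quick check (the exact segmentation may vary slightly with the jieba version and dictionary):

```python
import jieba

print(jieba.lcut('这个玩意真好,我很喜欢'))
# A plausible result: ['这个', '玩意', '真好', ',', '我', '很', '喜欢']
```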
3. Removing stop words
```python
# Load the stop-word list
stopwords = pd.read_csv('StopwordsCN.txt', encoding='utf8', engine='python', index_col=False)

# Define a function that removes stop words
def drop_stopwords(contents, stopwords):
    segments_clean = []
    for content in contents:
        line_clean = []
        for word in content:
            if word in stopwords:
                continue
            line_clean.append(word)
        segments_clean.append(line_clean)  # one token list per comment, stop words removed
    return segments_clean

# Apply the stop-word removal function
stopwords = stopwords['stopword'].tolist()  # .tolist() yields a list; .values would yield an ndarray
contents = cp_fc_results['content'].tolist()
cp_fc_contents_clean_s = drop_stopwords(contents, stopwords)
contents = hp_fc_results['content'].tolist()
hp_fc_contents_clean_s = drop_stopwords(contents, stopwords)
```
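On a toy input the function behaves as expected (the example below is mine). Note that converting the stop-word list to a set would make the `in` lookup much faster on a large corpus:

```python
toy_stopwords = {'的', '了', ','}
toy_contents = [['东西', '的', '质量', '很', '好'], ['太', '差', '了']]
print(drop_stopwords(toy_contents, toy_stopwords))
# [['东西', '质量', '很', '好'], ['太', '差']]
```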
2. Building the model
1. Converting the processed data into word vectors
- Note: the vocabulary must be built from the training data only; fitting the vectorizer on the test data would leak information into the model (see the short demo after the code block below).
```python
"""
Naive Bayes classification
"""
'''Attach a numeric label to each dataset'''
cp_train = pd.DataFrame({'segments_clean': cp_fc_contents_clean_s, 'label': 1})  # 1 = negative review
hp_train = pd.DataFrame({'segments_clean': hp_fc_contents_clean_s, 'label': 0})  # 0 = positive review
pj_train = pd.concat([cp_train, hp_train])  # stack the two frames vertically
pj_train.to_excel('pj_train.xlsx', index=False)

'''Split the data'''
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = \
    train_test_split(pj_train['segments_clean'].values, pj_train['label'].values, random_state=0)
# .values is optional; it converts the Series into a NumPy array, i.e. the column's
# values in order without the pandas index. train_test_split then shuffles the rows
# randomly (random_state=0 makes the split reproducible).

'''Convert the tokens into word vectors'''
words = []
for index in range(len(x_train)):
    words.append(' '.join(x_train[index]))
# print(words)

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer(max_features=4000, lowercase=False, ngram_range=(1, 1))
# lowercase controls whether tokens are converted to lowercase; False leaves them unchanged.
vec.fit(words)  # build the vocabulary from the training-set text only
```
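To see what `fit` does, here is a tiny demo (the strings are mine): the vectorizer learns its vocabulary from the training text, and `transform` later counts only those known words, so unseen test-time words are ignored rather than leaking into the feature space. Note that sklearn's default tokenizer keeps only tokens of two or more characters, which is why the demo uses two-character words:

```python
from sklearn.feature_extraction.text import CountVectorizer

demo = CountVectorizer(lowercase=False)
demo.fit(['质量 不错', '物流 很快'])            # vocabulary built from "training" text
print(demo.get_feature_names_out())             # ['不错' '很快' '物流' '质量']
print(demo.transform(['质量 一般']).toarray())  # [[0 0 0 1]] -- '一般' is unknown, so only '质量' is counted
```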
2. Training the model and making predictions
- Both the training data and the test data must be converted into count-vector matrices with transform before being passed to the model for training and prediction.
```python
'''Import the naive Bayes classifier'''
from sklearn.naive_bayes import MultinomialNB

x_train = vec.transform(words)
classifier = MultinomialNB(alpha=0.1)
classifier.fit(x_train, y_train)
train_predict = classifier.predict(x_train)

# Classification report on the training set
from sklearn import metrics
print(metrics.classification_report(y_train, train_predict))

# Evaluate on the test set
test_words = []
for line_index in range(len(x_test)):
    test_words.append(' '.join(x_test[line_index]))
x_test = vec.transform(test_words)
test_predict = classifier.predict(x_test)
print(metrics.classification_report(y_test, test_predict))
print(test_predict)
```
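Beyond the classification report, a confusion matrix makes the error types explicit (a small optional addition):

```python
from sklearn.metrics import confusion_matrix

# Rows are true labels (0 = positive, 1 = negative), columns are predicted labels.
print(confusion_matrix(y_test, test_predict))
```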
Output (note the heavy class imbalance: only 7 negative samples in the training split and 5 in the test split, which explains the poor precision on label 1):
```
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      1163
           1       0.21      1.00      0.35         7

    accuracy                           0.98      1170
   macro avg       0.61      0.99      0.67      1170
weighted avg       1.00      0.98      0.98      1170

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       386
           1       0.50      0.60      0.55         5

    accuracy                           0.99       391
   macro avg       0.75      0.80      0.77       391
weighted avg       0.99      0.99      0.99       391
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
```
III. Predicting whether a given comment is positive or negative
- Tokenize the comment
- Remove stop words
- Convert it into a count-vector matrix
- Run the prediction
```python
"""Classify a single comment"""
def text(comment):
    # 1. Tokenize
    a = jieba.lcut(comment)
    # 2. Remove stop words (and empty tokens)
    b = []
    for i in a:
        if i not in stopwords and len(i.strip()) > 0:
            b.append(i)
    c = []
    c.append(' '.join(b))  # join the tokens into one space-separated string and wrap it in a list
    # 3. Convert into a count-vector matrix
    c_train = vec.transform(c)
    # 4. Predict
    c_pr = classifier.predict(c_train)
    return c_pr

comment1 = '这个玩意真好,我很喜欢'
comment2 = '这个玩意太垃圾了'
print(text(comment1))
print(text(comment2))
```
Output:
- Both comments are classified correctly.
```
[0]
[1]
```
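Since the model returns 0 for a positive review and 1 for a negative one, a small wrapper (my addition) turns the prediction into a readable label:

```python
def classify(comment):
    """Map the numeric prediction from text() to a human-readable label."""
    return 'negative' if text(comment)[0] == 1 else 'positive'

print(classify('这个玩意真好,我很喜欢'))  # positive
print(classify('这个玩意太垃圾了'))      # negative
```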
Summary
Once trained, a model like this supports automated review classification, improved customer service, and personalized recommendations.