1、移除常见的中文标点符号
python
def remove_punc(text):
# 定义需要去除的中文标点符号的正则表达式
chinese_punctuation = r'[\u3000-\u303F\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF65\u2026]'
# 使用re.sub函数替换这些标点符号为空字符
text_without_punctuation = re.sub(chinese_punctuation, '', text)
return text_without_punctuation
s = "!今天:?、,》。,《》,、天气不错。。。"
ret = remove_punc(s)
print(ret)
2、敏感词过滤
python
import pandas as pd
import re

# Build a small DataFrame for testing
dct = {
    "content": [
        "您好,哎,先生您好,我这边是家装平台的客服,给您做个回访。",
        "你好,家长,你翻译给我听啊。",
    ],
    "label": [0, 1],
}
lst = ["先生", "您好", "家长", "天天"]
test_df = pd.DataFrame(dct)


def remove_sensitive_words(text, sensitive_word_list):
    """Delete every occurrence of the given sensitive words from *text*.

    Args:
        text: (str) input text.
        sensitive_word_list: (list[str]) words to remove; matching is
            case-insensitive.

    Returns:
        str: *text* with all listed words removed.
    """
    # Empty list: nothing to remove (also avoids building an empty pattern).
    if not sensitive_word_list:
        return text
    # re.escape keeps words containing regex metacharacters (., +, (, ...)
    # from being interpreted as patterns.
    pattern = "|".join(map(re.escape, sensitive_word_list))
    return re.sub(pattern, "", text, flags=re.IGNORECASE)


test_df["content_new"] = test_df["content"].apply(remove_sensitive_words, args=(lst,))
# To apply the filter to the original column instead:
# test_df["content"] = test_df["content"].apply(remove_sensitive_words, args=(lst,))
test_df.head()
3、jieba分词加载停用词和自定义分词
python
import jieba
from collections import Counter
def cut_words(text, stop_path, custom_path=None):
    """Segment *text* with jieba and drop stopwords.

    Args:
        text: (str) text to segment.
        stop_path: (str) path to the stopword file (one word per line).
        custom_path: (str) path to a custom jieba dictionary. default: None

    Returns:
        str: the remaining tokens concatenated together.
    """
    # Register the user dictionary (if any) before segmenting.
    if custom_path:
        jieba.load_userdict(custom_path)

    with open(stop_path, "r", encoding="utf-8") as fh:
        stop_set = set(fh.read().splitlines())

    tokens = jieba.lcut(text)
    kept = [tok for tok in tokens if tok not in stop_set]
    return "".join(kept)
4、文本加载并且保存DataFrame格式
原始文本数据格式如:train.txt
体验2D巅峰 倚天屠龙记十大创新概览 8
60年铁树开花形状似玉米芯(组图) 5
同步A股首秀:港股缩量回调 2
中青宝sg现场抓拍 兔子舞热辣表演 8
python
from tqdm import tqdm
import pandas as pd
def load_dataset(path, save_path="./data.csv"):
    """Load a tab-separated "content<TAB>label" text file into a DataFrame.

    Args:
        path: (str) input text file; each non-empty line is
            "content<TAB>label".
        save_path: (str) where the CSV copy is written.
            default: "./data.csv" (the previously hard-coded path).

    Returns:
        pd.DataFrame: columns "content" and "label" (labels kept as str).
    """
    contents = []
    labels = []
    with open(path, "r", encoding="UTF-8") as f:
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue  # skip blank lines
            # rsplit: only the last tab separates the label, so content
            # that itself contains a tab no longer raises ValueError.
            content, label = lin.rsplit("\t", 1)
            contents.append(content)
            labels.append(label)
    df = pd.DataFrame({"content": contents, "label": labels})
    df.to_csv(save_path, index=False)
    return df
5、保存自定义文件
python
s = "你好,stop_word1,stop_word2"
lst = s.split(",")
# "w" = write (truncate), "a" = append.
# encoding is pinned so the Chinese text round-trips on every platform
# instead of depending on the locale's default encoding.
with open("b.txt", "w", encoding="utf-8") as file:
    for item in lst:
        file.write(f"{item}\n")
保存结果
你好
stop_word1
stop_word2