在分析这个代码之前我们先看看jieba这个库

jieba是一个开源的中文分词工具库，将连续的中文字符串切分为有意义的词语

posseg = part-of-speech segmentation，即带词性标注的分词。

常见的词性标签（flag）有：n 是名词，nr 是人名，ns 是地名，v 是动词，等等，其余的可以自行查阅。接下来我就要用这个库，把这些 txt 文件中每句话里能提取出的词，分别归类到四个 txt 文件当中去。

import os

import re

import jieba.posseg as pseg

from multiprocessing import Pool, cpu_count

INPUT_DIR = "."  # Input directory; "." means the current working directory.

OUTPUT_ROOT = "output"  # Root directory for all results (relative to the current working directory).

def has_chinese(text):
    """Return True if ``text`` contains at least one Chinese character.

    Only the range U+4E00..U+9FFF is checked, which covers nearly all
    commonly used Chinese characters.

    Args:
        text: The string to inspect.

    Returns:
        True when a character in the range is found, otherwise False.
    """
    # re.search returns a match object or None; bool() maps that to True/False.
    return bool(re.search(r'[\u4e00-\u9fff]', text))

#对每一行，看https巴拉巴拉是否有这些东西

def is_url_line(line):
    """Return True if ``line`` looks like it contains a URL.

    This is a plain substring check, not a URL parser: a line matches when it
    contains any of ``http://``, ``https://``, ``www.`` or ``.com``.

    Args:
        line: A single line of text.

    Returns:
        True when one of the markers occurs in ``line``, otherwise False.
    """
    # 'http://' is included so that plain-HTTP links without 'www.' or '.com'
    # (e.g. http://example.org) are also recognized.
    url_markers = ('http://', 'https://', 'www.', '.com')
    return any(marker in line for marker in url_markers)

def process_single_file(filepath):
    """Extract tagged words from one file into its own output directory.

    Every non-empty line that contains Chinese and does not look like a URL
    is segmented with ``jieba.posseg``. Words are bucketed by their
    part-of-speech flag (``nr``, ``ns``, ``nt``, ``nz``). ASCII words of two
    or more characters found by regex on the same line are added to the
    ``英文名.txt`` bucket as well. Each bucket is written to one file, one
    word per line, without de-duplication.

    Args:
        filepath: Path of the input file. It is read as UTF-8 and
            undecodable bytes are ignored.

    Returns:
        A status string: "✅ ..." on success, or "❌ ..." carrying the
        exception message if reading, segmenting or writing failed.
    """
    # Build the output directory name from a sanitized version of the file name.
    # NOTE: only the base name is used, so two inputs with the same file name
    # in different directories map to the same output directory.
    basename = os.path.basename(filepath)

    # Keep letters, digits, '_', '-' and '.'; replace every other character
    # (e.g. / \ : * ? " < > |) with '_'.
    safe_name = "".join(
        c if c.isalnum() or c in ('_', '-', '.') else '_' for c in basename
    )

    output_dir = os.path.join(OUTPUT_ROOT, f"{safe_name}_output")

    # Creates intermediate directories as needed; an existing directory is fine.
    os.makedirs(output_dir, exist_ok=True)

    # Part-of-speech flag -> output file that collects words with that flag.
    flag_to_file = {
        'nr': "人名.txt",
        'ns': "地名.txt",
        'nt': "机构名.txt",
        'nz': "英文名.txt",
    }

    # One list per output file, in the order the files are written.
    entities = {filename: [] for filename in flag_to_file.values()}

    # Compiled once, outside the per-line loop.
    english_word_re = re.compile(r'\b[A-Za-z][A-Za-z0-9]*\b')

    try:
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            for line in f:
                line = line.strip()

                # Only keep lines that have content, are not URL-like,
                # and contain at least one Chinese character.
                if not line or is_url_line(line) or not has_chinese(line):
                    continue

                # pseg.cut yields (word, flag) pairs: the token and its
                # part-of-speech tag.
                for word, flag in pseg.cut(line):
                    word = word.strip()
                    if not word:
                        continue

                    target = flag_to_file.get(flag)
                    if target is not None:
                        entities[target].append(word)

                # Supplementary regex pass for English words (helps catch
                # brand names); single-letter matches are dropped.
                for eng in english_word_re.findall(line):
                    if len(eng) >= 2:
                        entities["英文名.txt"].append(eng)

        # Write the four files; duplicates are kept in order of appearance.
        for filename, words in entities.items():
            out_path = os.path.join(output_dir, filename)
            with open(out_path, 'w', encoding='utf-8') as f_out:
                f_out.writelines(w + '\n' for w in words)

        return f"✅ {filepath} → {output_dir}"

    except Exception as e:
        # Worker boundary: report the failure as a status string so that one
        # bad file does not abort the whole pool.map call.
        return f"❌ {filepath} 处理失败: {e}"

def main():
    """Process every file under INPUT_DIR in parallel and print the results.

    Walks INPUT_DIR recursively, skipping OUTPUT_ROOT, and hands each file to
    ``process_single_file`` in a process pool. One status line is printed per
    file, followed by a summary. Returns early if no files are found.
    """
    # Collect all input files. The output directory is pruned from the walk so
    # that results of a previous run are not picked up as input.
    output_abs = os.path.abspath(OUTPUT_ROOT)
    filepaths = []
    for root, dirs, files in os.walk(INPUT_DIR):
        # In-place edit of ``dirs`` tells os.walk not to descend into them.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != output_abs
        ]
        filepaths.extend(os.path.join(root, name) for name in files)

    if not filepaths:
        print("❌ 未找到任何文件")
        return

    os.makedirs(OUTPUT_ROOT, exist_ok=True)

    print(f"📁 共 {len(filepaths)} 个文件，开始并行处理...")

    # Up to 64 worker processes, but never more than the CPU count or the
    # number of files.
    num_workers = max(1, min(64, cpu_count(), len(filepaths)))

    # pool.map calls process_single_file on each path in a worker process and
    # returns the results in input order.
    with Pool(processes=num_workers) as pool:
        results = pool.map(process_single_file, filepaths)

    for res in results:
        print(res)

    print(f"\n✅ 所有文件处理完成！结果在 '{OUTPUT_ROOT}/' 下各子目录中")
    print("💡 后续可运行 merge_and_count.py 进行合并与词频统计")

# Run main() only when executed as a script. Under the multiprocessing
# "spawn" start method, workers re-import this module, so the guard is
# required to keep them from re-running main().
if __name__ == "__main__":
    main()

代码分析

构造输出目录名：原文件路径哈希 or 用安全文件名

替换非法字符（如 / \ : * ? " < > |）

初始化四类列表

jieba 分词 pseg.cut（line) 使用jieba.posseg对Line进行分词和词性标注，返回一个生成器 word切出的词语（字符串） flag：该词语的词性标签 entities在前面有初始化

【可选】补充正则提取英文单词（增强 brand 识别）

写入四个文件（不去重，保留原始出现）

收集所有文件