新闻数据采集与分析系统（Python 实现）

摘要：该代码实现了一个新闻数据采集与分析系统，由四个模块组成：1) Module_2 根据 URL 列表下载 HTML 网页，并从中提取新闻标题、发布时间、正文和栏目分类，写入 CSV 文件；2) Module_3 对提取的数据进行清洗，只保留字段完整的记录；3) Module_4 使用 jieba 进行中文分词和词频统计，并去除停用词、标点符号等干扰项；4) config 模块提供全局配置参数（文件路径、请求头等）。系统覆盖了从网页抓取、数据清洗到文本分析的完整流程，可对不同新闻栏目进行关键词提取和对比分析。
python 复制代码
Module_2

import csv
import time
import os
import re
import requests
from bs4 import BeautifulSoup
import config



def get_urls_by_file(filename):
    """Read the list of news URLs from a text file, one URL per line.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are skipped so that a blank separator line or a
    trailing newline does not turn into an empty URL.

    Args:
        filename: Path of the UTF-8 text file holding the URLs.

    Returns:
        list[str]: The URLs in file order.
    """
    with open(filename, encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [url for url in stripped if url]



def getHtml(index,url,download_html_base_dir,timeout=10):
    """Download one page and save it as ``<index>_<last URL segment>``.

    Args:
        index: Sequence number used as the file-name prefix.
        url: Address of the page to download.
        download_html_base_dir: Directory the file is written into.
        timeout: Seconds ``requests`` waits for the server. Without a
            timeout a stalled connection would block the download loop
            indefinitely.
    """
    # Request headers shared by the whole project live in the config module.
    headers = config.HEADERS
    response = requests.get(url, headers=headers, timeout=timeout)
    response.encoding = 'utf-8'
    html = response.text
    # The last URL segment may be empty (URL ending in "/") or contain
    # characters that are not allowed in file names, e.g. "?" on Windows.
    name = re.sub(r'[\\/:*?"<>|]', '_', url.split('/')[-1]) or 'index.html'
    filepath = os.path.join(download_html_base_dir, str(index) + "_" + name)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(html)
    print(filepath, "saved")



def main(): # download every HTML page
    """Download each URL listed in ``config.FILENAME`` into the HTML directory."""
    url_file = config.FILENAME
    html_dir = config.DOWNLOAD_HTML_BASE_DIR

    config.create_dir(html_dir)

    # Saved files are numbered from 1; pause one second after each download.
    for number, url in enumerate(get_urls_by_file(url_file), start=1):
        getHtml(number, url, html_dir)
        time.sleep(1)
    print("all done")

def get_files_by_path(path):
    """Return ``path`` joined with every entry found directly inside it.

    A ``path`` that does not exist yields an empty list.
    """
    if os.path.exists(path):
        return [os.path.join(path, entry) for entry in os.listdir(path)]
    return []

def worker(file):
    """Extract one news record from a saved HTML file.

    Args:
        file: Path of a UTF-8 encoded HTML file.

    Returns:
        list[str]: ``[title, publish time, body text, category]``. Title and
        category are the first two ``--``-separated parts of the ``<title>``
        text; the publish time is ``""`` when its ``<div>`` is absent.
        An empty list is returned when the page has no ``<title>``, the
        title has no ``--`` separator, or the body ``<div>`` is missing, so
        the caller can skip the file instead of crashing.
    """
    with open(file,'r',encoding='utf-8') as f:
        html = f.read()
    soup = BeautifulSoup(html,'html.parser')

    title = soup.find('title')
    if title is None:
        return []
    titles = title.text.split('--')
    if len(titles) < 2:
        return []

    body_div = soup.find('div',class_='rm_txt_con cf')
    if body_div is None:
        return []

    # The publish time is the text before the first "|" of this div.
    time_div = soup.find('div',class_='col-1-1 fl')
    if time_div is not None:
        time_ = time_div.text.strip().split('|')[0].strip()
    else:
        time_ = ""

    context = body_div.text.replace('\n','')
    return [titles[0], time_, context, titles[1]]
def write_csv(index,list_,path,encoding='utf-8'):
    """Append one record to a CSV file as ``index,field1,field2,...``.

    The row is written with the ``csv`` module, so a field that contains a
    comma or a double quote is quoted instead of silently adding extra
    columns. Rows without such characters are written exactly as a plain
    comma join would write them.

    Args:
        index: Record number written as the first column.
        list_: The remaining column values.
        path: CSV file to append to; it is created when missing.
        encoding: Text encoding of the file.
    """
    with open(path,'a',encoding=encoding) as f:
        # lineterminator='\n' keeps the line ending of a plain f.write(line + '\n')
        # instead of the csv module's default '\r\n'.
        csv.writer(f, lineterminator='\n').writerow([index, *list_])
    print(index,"saved")


def main1(): # extract the records and write the CSV
    """Rebuild ``config.CSV_PATH`` from every file in the HTML directory."""
    html_files = get_files_by_path(config.DOWNLOAD_HTML_BASE_DIR)
    csv_path = config.CSV_PATH

    # Remove the output of a previous run before new rows are written.
    if os.path.exists(csv_path):
        os.remove(csv_path)

    for number, file in enumerate(html_files, start=1):
        record = worker(file)
        if not record:
            continue
        write_csv(number, record, csv_path)
    print("all done")



if __name__ == '__main__':
    # main() # download all HTML pages
    main1()  # extract the records and write the CSV


Module_3


from bs4 import BeautifulSoup

import config
import os


def csv_to_rescsv(csv_path,rescsv_path):
    """Copy the complete records of ``csv_path`` into ``rescsv_path``.

    A line is kept only when it splits on ``,`` into exactly five fields
    and none of them is empty. Checking the field count as well as the
    emptiness rejects rows such as ``1,T,,bo,dy,cat`` that have five
    non-empty values but shifted columns.

    Args:
        csv_path: UTF-8 CSV file with the raw records.
        rescsv_path: Output file; it is overwritten.

    Returns:
        list[int]: ``[lines read, lines written]``.

    Raises:
        SystemExit: With status 1 when ``csv_path`` does not exist.
    """
    if not os.path.exists(csv_path):
        print('原始数据不存在')
        # Same effect as exit(1), without relying on the helper that the
        # site module injects into builtins.
        raise SystemExit(1)

    csv_cnt = 0
    rescsv_cnt = 0
    with open(csv_path,'r',encoding='utf-8') as f, \
            open(rescsv_path,'w',encoding='utf-8') as res_f:
        for line in f:
            csv_cnt += 1
            fields = line.strip().split(',')
            if len(fields) == 5 and all(fields):
                rescsv_cnt += 1
                res_f.write(line)

    return [csv_cnt,rescsv_cnt]






def main():
    """Filter ``config.CSV_PATH`` into ``config.RES_CSV_PATH`` and print both line counts."""
    counts = csv_to_rescsv(config.CSV_PATH, config.RES_CSV_PATH)
    total, kept = counts
    print('csv_cnt:', total)
    print('rescsv_cnt:', kept)


# Run the cleaning step only when this module is executed as a script.
if __name__ == '__main__':
    main()



Module_4

import config
import jieba


# Chinese stop-word list (extended).
# A stray '' literal used to sit in front of '1' without a comma; Python
# concatenated the two into '1', so dropping it leaves the set unchanged.
chinese_stopwords = {
    # ASCII digits
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',

    # Function words
    '的', '地', '得', '了', '着', '过', '之', '乎', '者', '也', '焉', '哉',

    # Personal pronouns
    '我', '你', '他', '她', '它', '我们', '你们', '他们', '她们', '它们',
    '咱', '咱们', '俺', '俺们', '您', '各位', '大家', '自己', '自身',

    # Demonstrative pronouns
    '这', '那', '这个', '那个', '这些', '那些', '这样', '那样', '这里', '那里',
    '这边', '那边', '这么', '那么', '这样', '那样', '这会儿', '那会儿',

    # Interrogative pronouns
    '谁', '什么', '哪', '哪里', '哪儿', '几时', '何时', '怎么', '怎样', '怎么样',
    '为什么', '为何', '多少', '几', '多么',

    # Prepositions
    '在', '于', '从', '自', '打', '由', '朝', '向', '往', '到', '至', '对于', '关于',
    '至于', '按照', '依照', '根据', '通过', '经过', '由于', '因为', '为了', '为着',
    '除了', '除开', '除去',

    # Conjunctions
    '和', '与', '跟', '同', '及', '以及', '或', '或者', '还是', '而且', '并且',
    '不但', '不仅', '虽然', '但是', '可是', '然而', '不过', '只是', '除非',
    '既然', '如果', '假如', '假若', '要是', '即使', '哪怕', '不论', '不管',
    '因为', '所以', '因此', '于是', '然后', '接着', '那么',

    # Adverbs
    '很', '非常', '十分', '特别', '极其', '最', '太', '更', '更加', '越', '越来越',
    '都', '全', '全都', '总', '总是', '一直', '一向', '从来', '永远', '始终',
    '就', '才', '刚', '刚刚', '已经', '曾经', '正在', '在', '将', '将要',
    '立刻', '马上', '顿时', '忽然', '突然', '偶尔', '有时', '常常', '经常',
    '往往', '一直', '一再', '再三', '屡次', '依然', '仍然', '还是', '果然',
    '居然', '竟然', '简直', '几乎', '差不多', '大概', '大约', '或许', '也许',
    '一定', '必定', '必然', '必须', '的确', '确实', '实在', '其实', '当然',

    # Particles
    '吗', '呢', '吧', '啊', '呀', '哇', '啦', '诶', '噢', '哦', '哼', '嗯',
    '罢了', '而已', '的话', '来看', '来说', '来讲', '起见', '一般', '一样',

    # Numerals and quantifiers
    '一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '百', '千', '万', '亿',
    '零', '半', '几', '多少', '若干', '第一', '第二', '第三', '首先', '其次', '最后',

    # Common verbs that carry little meaning
    '是', '有', '没有', '无', '做', '作', '搞', '弄', '干', '搞', '进行', '开展',
    '实施', '实行', '执行', '完成', '结束', '开始', '继续', '停止',

    # Position words
    '上', '下', '左', '右', '前', '后', '里', '外', '中', '内', '间', '旁',
    '上面', '下面', '左边', '右边', '前面', '后面', '里面', '外面', '中间',

    # Time words
    '现在', '目前', '当前', '今天', '明天', '昨天', '前天', '后天', '今年', '明年',
    '去年', '上午', '下午', '晚上', '早晨', '中午', '傍晚', '时候', '时间', '时刻',
    '时期', '时代', '年代', '岁月', '光阴',

    # Other high-frequency words
    '就', '都', '还', '又', '再', '也', '却', '倒', '偏', '竟', '可', '真',
    '好', '大', '小', '多', '少', '长', '短', '高', '低', '远', '近', '重', '轻',
    '可以', '能够', '应该', '应当', '必须', '需要', '愿意', '想要', '希望',

    # Modern colloquial / internet usage (optional)
    '这个', '那个', '什么', '怎么', '为什么', '然后', '所以', '因为', '但是',
    '不过', '其实', '当然', '确实', '真的', '好像', '似乎', '可能', '应该',
}

# Chinese punctuation marks and related symbol strings that jieba_worker
# treats as noise. Repeated entries are harmless because this is a set.
chinese_punctuation = {
    '，', '。', '！', '？', '；', '：', '「', '」', '『', '』', '《', '》', '（', '）',
    '【', '】', '｛', '｝', '［', '］', '〈', '〉', '〝', '〞', '〿', '--', '---', '......',
    '......', '......', '·', '～', '＠', '＃', '￥', '％', '＆', '＊', '＋', '－', '／',
    '＝', '＾', '＿', '｀', '｜', '￣', '﹏', '﹋', '﹌', '﹍', '﹎', '﹏', '﹟',
    '﹠', '﹡', '﹢', '﹣', '﹤', '﹥', '﹦', '﹨', '﹩', '﹪', '﹫', 'ﹰ', 'ﹱ', 'ﹲ',
    'ﹳ', 'ﹴ', 'ﹶ', 'ﹷ', 'ﹸ', 'ﹹ', 'ﹺ', 'ﹻ', 'ﹼ', 'ﹽ', 'ﹾ', 'ﹿ', '﹁', '﹂',
    '﹃', '﹄', '﹅', '﹆', '﹇', '﹈', '﹉', '﹊', '﹋', '﹌', '﹍', '﹎', '﹏'
}

# English (ASCII) punctuation marks and special characters.
english_punctuation = {
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/',
    ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~'
}

# Mathematical symbols (some ASCII operators overlap with english_punctuation).
math_symbols = {
    '+', '-', '*', '/', '=', '≠', '≈', '≡', '≤', '≥', '<', '>', '±', '∓', '×', '÷',
    '√', '∞', '∝', '∫', '∬', '∭', '∮', '∯', '∰', '∇', '∂', '∆', '∑', '∏', '∐',
    '∧', '∨', '∩', '∪', '∈', '∉', '⊂', '⊃', '⊆', '⊇', '⊄', '⊅', '⊈', '⊉', '∀',
    '∃', '∄', '∅', 'ℵ', 'ℶ', 'ℷ', 'ℸ'
}

# Full-width characters: punctuation, digits, and upper/lower-case Latin letters.
full_width_chars = {
    '！', '＂', '＃', '＄', '％', '＆', '＇', '（', '）', '＊', '＋', '，', '－', '．', '／',
    '：', '；', '＜', '＝', '＞', '？', '＠', '［', '＼', '］', '＾', '＿', '｀', '｛', '｜', '｝', '～',
    '０', '１', '２', '３', '４', '５', '６', '７', '８', '９',
    'Ａ', 'Ｂ', 'Ｃ', 'Ｄ', 'Ｅ', 'Ｆ', 'Ｇ', 'Ｈ', 'Ｉ', 'Ｊ', 'Ｋ', 'Ｌ', 'Ｍ',
    'Ｎ', 'Ｏ', 'Ｐ', 'Ｑ', 'Ｒ', 'Ｓ', 'Ｔ', 'Ｕ', 'Ｖ', 'Ｗ', 'Ｘ', 'Ｙ', 'Ｚ',
    'ａ', 'ｂ', 'ｃ', 'ｄ', 'ｅ', 'ｆ', 'ｇ', 'ｈ', 'ｉ', 'ｊ', 'ｋ', 'ｌ', 'ｍ',
    'ｎ', 'ｏ', 'ｐ', 'ｑ', 'ｒ', 'ｓ', 'ｔ', 'ｕ', 'ｖ', 'ｗ', 'ｘ', 'ｙ', 'ｚ'
}

# Whitespace characters.
whitespace_chars = {
    ' ', '\t', '\n', '\r', '\v', '\f',
    '\u2000', '\u2001', '\u2002', '\u2003', '\u2004', '\u2005',  # assorted Unicode spaces
    '\u2006', '\u2007', '\u2008', '\u2009', '\u200A',
    '\u2028', '\u2029', '\u3000'  # line separator, paragraph separator, full-width space
}

def get_columns(path):
    """Return the distinct category names stored in the fifth CSV column.

    Lines with fewer than five comma-separated fields (for example a blank
    line) are skipped instead of raising ``IndexError``.

    Args:
        path: UTF-8 CSV file whose fifth column is the category.

    Returns:
        list[str]: The category names, sorted so that the order is the same
        on every run.
    """
    columns = set()
    with open(path,'r',encoding='utf-8') as f:
        for line in f:
            fields = line.split(',')
            if len(fields) < 5:
                continue
            columns.add(fields[4].strip())
    return sorted(columns)


def get_context(path,column):
    """Concatenate the body text of every record that belongs to ``column``.

    Each line is split once; the fifth field is the category and the fourth
    field is the body. Lines with fewer than five fields (for example a
    blank line) are skipped instead of raising ``IndexError``.

    Args:
        path: UTF-8 CSV file to read.
        column: Category name to select.

    Returns:
        str: The stripped bodies of the matching records joined without a
        separator; ``""`` when nothing matches.
    """
    bodies = []
    with open(path,'r',encoding='utf-8') as f:
        for line in f:
            fields = line.split(',')
            if len(fields) < 5:
                continue
            if fields[4].strip() == column:
                bodies.append(fields[3].strip())
    return "".join(bodies)


def jieba_worker(context, top_n=20):
    """Return the most frequent multi-character words of ``context``.

    The text is segmented first and every filter is applied to whole
    tokens. Deleting stop words from the raw text with ``str.replace``
    would also cut them out of longer words (removing '中' turns '中国'
    into '国') and would glue the remaining characters together before
    segmentation.

    A token is ignored when it

    * is at most one character long,
    * is listed in ``chinese_stopwords`` or in the custom list below, or
    * consists only of digits, punctuation, symbols, full-width characters
      or whitespace.

    The resulting ranking is also printed as a ``word : count`` table.

    Args:
        context: Text to analyse.
        top_n: Maximum number of words to return.

    Returns:
        list[str]: Up to ``top_n`` words, most frequent first. The list is
        shorter when the text has fewer distinct words.
    """
    # Extra words excluded from the ranking in addition to the stop words.
    users=['、','"','"','发展','分享','创新','责编','文化','乡村','产业','提升','服务','部','活动','推动']
    ignored_words = chinese_stopwords.union(users)

    # Characters that never make a meaningful word on their own. The
    # multi-character entries of the sets are harmless here because the
    # membership test below is done per character.
    noise_chars = set().union(
        chinese_punctuation,
        english_punctuation,
        math_symbols,
        full_width_chars,
        whitespace_chars,
    )
    noise_chars.update('0123456789、')

    counts = {}
    for word in jieba.lcut(context):
        if len(word) <= 1 or word in ignored_words:
            continue
        if all(ch in noise_chars for ch in word):
            continue
        counts[word] = counts.get(word, 0) + 1

    # sorted() is stable, so equally frequent words keep the order in which
    # they first appeared. Slicing avoids an IndexError on short texts.
    ranking = sorted(counts.items(), key=lambda item: item[1], reverse=True)[:top_n]
    for word, count in ranking:
        print(f'{word:<10}:{count:>5}')
    return [word for word, _ in ranking]


def main():
    """Print the top words of every category and the words categories share.

    For each category found in ``config.RES_CSV_PATH`` the category name
    and its word ranking are printed. Afterwards, for each category in
    turn, every top word that also appears in the ranking of at least one
    other category is printed. This works for any number of categories;
    with exactly three it prints the same lines as comparing the three
    rankings pairwise.
    """
    csv_path = config.RES_CSV_PATH
    top_words = []
    for column in get_columns(csv_path):
        context = get_context(csv_path, column)
        print(column)
        top_words.append(jieba_worker(context))

    for position, words in enumerate(top_words):
        others = top_words[:position] + top_words[position + 1:]
        for word in words:
            if any(word in other for other in others):
                print(word)





# Run the word-frequency analysis only when this module is executed as a script.
if __name__ == '__main__':
    main()




config


import os

FILENAME = r"./data/urls.txt" # location of the text file that lists the news URLs

# Global request headers shared by several functions
HEADERS ={
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36 Edg/142.0.0.0",
    "Accept":"image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8"
}

# The paths below are built with os.path.join so that they use the separator
# of the running platform. On Windows the values are the same as the former
# ".\\data\\..." literals; on POSIX a literal backslash would be part of the
# file name rather than a directory separator.
DOWNLOAD_HTML_BASE_DIR = os.path.join(".", "data", "html")

CSV_PATH = os.path.join(".", "data", "data.csv")

RES_CSV_PATH = os.path.join(".", "data", "res_data.csv")

# Directory-creation helper
def create_dir(path):
    """Create the directory ``path`` together with any missing parents.

    Calling it for a directory that already exists is a no-op.
    ``exist_ok=True`` replaces a separate existence check, so another
    process creating the directory at the same moment cannot make the
    call fail.

    Args:
        path: Directory to create.
    """
    os.makedirs(path, exist_ok=True)