Python爬虫使用示例-古诗词摘录

一、分析需求

目标地址:

https://www.sou-yun.cn/Query.aspx?type=poem1&id=×××××

二、提取诗句

python 复制代码
import os
import re
import requests
import parsel

# Fetch one poem page and print each of its sentences on its own line.
#
# Alternative entry point (author index page), kept for reference:
# https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie
url = 'https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
# Send a browser-like User-Agent header with the request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}

# The timeout keeps the script from hanging forever on an unresponsive server;
# raise_for_status() turns an HTTP error response into an explicit failure
# instead of silently printing nothing.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
html_content = response.text

# Capture the inner HTML of every <div class='poemSentence' ...> element.
# re.DOTALL lets a captured sentence span several lines.
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)<\/div>", html_content, re.DOTALL)

# Clean up and print the extracted sentences.
for sentence in poem_sentences:
    # Remove any HTML tags nested inside the sentence.
    clean_sentence = re.sub(r"<.*?>", "", sentence).strip()
    if clean_sentence:  # skip sentences that are empty after cleaning
        print(clean_sentence)

三、其他信息

提取所需的全部信息:标题(title)、作者(author)和诗句(sentences)。

python 复制代码
import os
import re
import requests
import parsel

# Fetch one poem page and print its title, author and sentences.
#
# Alternative entry point (author index page), kept for reference:
# https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie
url = 'https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
# Send a browser-like User-Agent header with the request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}

# The timeout keeps the script from hanging forever on an unresponsive server;
# raise_for_status() reports an HTTP error response explicitly.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
html_content = response.text

# Extract the title and author. Groups 1 and 2 are two consecutive spans that
# together make up the title; group 3 is the content of the poemAuthor span.
title_match = re.search(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)<\/span><\/span>\s*<span[^>]*>(.*?)<\/span>\s*<span class='poemAuthor'[^>]*>(.*?)<\/span>", html_content)
if title_match is None:
    # Without this guard the prints below would fail with a NameError,
    # because title/author would never have been assigned.
    raise SystemExit("在页面中找不到诗的标题或作者。")
title = title_match.group(1) + title_match.group(2)  # join the two title parts
author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # drop tags around the author

# Capture the inner HTML of every <div class='poemSentence' ...> element.
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)<\/div>", html_content, re.DOTALL)

# Clean up and print the extracted information.
print("标题:", title)
print("作者:", author)
print("诗句:")

for sentence in poem_sentences:
    # Remove any HTML tags nested inside the sentence.
    clean_sentence = re.sub(r"<.*?>", "", sentence).strip()
    if clean_sentence:  # skip sentences that are empty after cleaning
        print(clean_sentence)

微调格式

python 复制代码
import os
import re
import requests
import parsel

# Fetch one poem page and print it as a "《 title》 (author)" heading followed
# by one sentence per line.
#
# Alternative entry point (author index page), kept for reference:
# https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie
url = 'https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
# Send a browser-like User-Agent header with the request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}

# The timeout keeps the script from hanging forever on an unresponsive server;
# raise_for_status() reports an HTTP error response explicitly.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
html_content = response.text

# Extract the title and author. Groups 1 and 2 are two consecutive spans that
# together make up the title; group 3 is the content of the poemAuthor span.
title_match = re.search(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)<\/span><\/span>\s*<span[^>]*>(.*?)<\/span>\s*<span class='poemAuthor'[^>]*>(.*?)<\/span>", html_content)
if title_match is None:
    # Without this guard the print below would fail with a NameError,
    # because title/author would never have been assigned.
    raise SystemExit("在页面中找不到诗的标题或作者。")
title = title_match.group(1) + title_match.group(2)  # join the two title parts
author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # drop tags around the author

# Capture the inner HTML of every <div class='poemSentence' ...> element.
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)<\/div>", html_content, re.DOTALL)

# Print the heading, then the cleaned sentences.
print(f"《 {title}》 ({author})")

for sentence in poem_sentences:
    # Remove any HTML tags nested inside the sentence.
    clean_sentence = re.sub(r"<.*?>", "", sentence).strip()
    if clean_sentence:  # skip sentences that are empty after cleaning
        print(clean_sentence)

四、保存文档

将单首诗歌保存到 txt 文件中。

python 复制代码
import os
import re
import requests
import parsel


# Fetch one poem page, print it, and save the same text to poem.txt.
#
# Alternative entry point (author index page), kept for reference:
# https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie
url = 'https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
# Send a browser-like User-Agent header with the request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}

# The timeout keeps the script from hanging forever on an unresponsive server;
# raise_for_status() reports an HTTP error response explicitly.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()
html_content = response.text

# Extract the title and author. Groups 1 and 2 are two consecutive spans that
# together make up the title; group 3 is the content of the poemAuthor span.
title_match = re.search(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)<\/span><\/span>\s*<span[^>]*>(.*?)<\/span>\s*<span class='poemAuthor'[^>]*>(.*?)<\/span>", html_content)
if title_match is None:
    # Without this guard the code below would fail with a NameError,
    # because title/author would never have been assigned.
    raise SystemExit("在页面中找不到诗的标题或作者。")
title = title_match.group(1) + title_match.group(2)  # join the two title parts
author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # drop tags around the author

# Capture the inner HTML of every <div class='poemSentence' ...> element.
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)<\/div>", html_content, re.DOTALL)

# Collect the heading and the cleaned sentences, echoing each line as we go.
header_line = f"《 {title}》 ({author})"
print(header_line)
lines = [header_line]

for sentence in poem_sentences:
    # Remove any HTML tags nested inside the sentence.
    clean_sentence = re.sub(r"<.*?>", "", sentence).strip()
    if clean_sentence:  # skip sentences that are empty after cleaning
        lines.append(clean_sentence)
        print(clean_sentence)

# One line per entry, each terminated by a newline.
output = "\n".join(lines) + "\n"

# Write the result to a text file (overwrites any existing poem.txt).
with open('poem.txt', 'w', encoding='utf-8') as file:
    file.write(output)

print("信息已保存到 poem.txt")

五、多首继续

按 id 连续遍历抓取多首诗。注意:抓取到的内容不一定都符合要求,因为所需诗歌的 id 暂时无法从页面中获取(受页面内容结构限制)。


python 复制代码
import os
import re
import requests
import parsel


# Download a consecutive range of poem pages and append every poem that can
# be parsed to a single text file, separated by divider lines.

# Path of the file that collects all poems.
output_file_path = 'all_poems.txt'

# Loop-invariant request headers: send a browser-like User-Agent.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}

# Patterns are compiled once, outside the loop.
# Title pattern: groups 1 and 2 are two consecutive spans that together make
# up the title; group 3 is the content of the poemAuthor span.
title_pattern = re.compile(
    r"<span class='bold'><span class='wordLink'[^>]*>(.*?)<\/span><\/span>\s*<span[^>]*>(.*?)<\/span>\s*<span class='poemAuthor'[^>]*>(.*?)<\/span>")
# Sentence pattern: inner HTML of each poemSentence div; DOTALL lets a
# captured sentence span several lines.
sentence_pattern = re.compile(r"<div class='poemSentence'[^>]*>(.*?)<\/div>", re.DOTALL)
tag_pattern = re.compile(r"<.*?>")

# Create the target file, or empty it if it already exists ('w' truncates).
with open(output_file_path, 'w', encoding='utf-8'):
    pass

# Download each poem: ids 36647 through 36847 inclusive.
for poem_id in range(36647, 36848):
    url = f'https://www.sou-yun.cn/Query.aspx?type=poem1&id={poem_id}'

    # A timeout prevents one stalled request from blocking the whole crawl,
    # and a network error on a single id is reported and skipped rather than
    # aborting the remaining downloads.
    try:
        response = requests.get(url=url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"无法获取ID {poem_id} 的页面: {exc}")
        continue

    if response.status_code != 200:
        print(f"无法获取ID {poem_id} 的页面,状态码: {response.status_code}")
        continue

    html_content = response.text

    # Extract the title and author; skip pages where they cannot be found.
    title_match = title_pattern.search(html_content)
    if title_match is None:
        print(f"在ID {poem_id} 的页面中找不到诗的标题或作者。")
        continue

    title = title_match.group(1) + title_match.group(2)  # join the two title parts
    author = tag_pattern.sub("", title_match.group(3)).strip()  # drop tags around the author

    # Heading first, then every sentence that is non-empty after tag removal.
    lines = [f"《 {title}》 ({author})"]
    for sentence in sentence_pattern.findall(html_content):
        clean_sentence = tag_pattern.sub("", sentence).strip()
        if clean_sentence:
            lines.append(clean_sentence)

    # One line per entry, followed by a divider that separates poems.
    output = "\n".join(lines) + "\n"
    output += "\n" + "=" * 50 + "\n\n"

    # Append per poem so that already-downloaded poems are on disk even if a
    # later iteration fails.
    with open(output_file_path, 'a', encoding='utf-8') as file:
        file.write(output)

    print(f"信息已保存到 {output_file_path}")
        

运行结果:

相关推荐
喵叔哟13 分钟前
重构代码中引入外部方法和引入本地扩展的区别
java·开发语言·重构
尘浮生19 分钟前
Java项目实战II基于微信小程序的电影院买票选座系统(开发文档+数据库+源码)
java·开发语言·数据库·微信小程序·小程序·maven·intellij-idea
hopetomorrow33 分钟前
学习路之PHP--使用GROUP BY 发生错误 SELECT list is not in GROUP BY clause .......... 解决
开发语言·学习·php
小牛itbull43 分钟前
ReactPress vs VuePress vs WordPress
开发语言·javascript·reactpress
请叫我欧皇i1 小时前
html本地离线引入vant和vue2(详细步骤)
开发语言·前端·javascript
nuclear20111 小时前
使用Python 在Excel中创建和取消数据分组 - 详解
python·excel数据分组·创建excel分组·excel分类汇总·excel嵌套分组·excel大纲级别·取消excel分组
躺平的花卷1 小时前
Python爬虫案例八:抓取597招聘网信息并用xlutils进行excel数据的保存
爬虫·excel
闲暇部落1 小时前
‌Kotlin中的?.和!!主要区别
android·开发语言·kotlin
GIS瞧葩菜1 小时前
局部修改3dtiles子模型的位置。
开发语言·javascript·ecmascript
chnming19871 小时前
STL关联式容器之set
开发语言·c++