趣笔阁 Scraper Experiment
Parses the page structure with BeautifulSoup, crawls the pages of a given novel, and saves each chapter's content to a .txt file.
Possible improvements (to be updated):
1. Anti-scraping countermeasures (see the sketch below)
2. Multithreading (see the sketch below)
3. Saving as Markdown for nicer formatting (see the sketch after the code)
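As a rough idea of what points 1 and 2 might look like, here is a minimal sketch. The user-agent pool, delay range, worker count, and the `fetch_chapter` helper are all illustrative assumptions, not part of the scraper below:

```python
import random
import time
from concurrent.futures import ThreadPoolExecutor

import requests

# Illustrative assumption: a small pool of user-agent strings to rotate through
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
]

def polite_get(url):
    # Point 1: a random user-agent and a randomized delay make the traffic
    # look less mechanical than a fixed header and a fixed 1s sleep
    header = {"user-agent": random.choice(USER_AGENTS)}
    time.sleep(random.uniform(1, 3))
    return requests.get(url, headers=header, timeout=10)

def fetch_chapter(url):
    # Hypothetical per-chapter worker; the real logic lives in getConcent below
    response = polite_get(url)
    response.encoding = response.apparent_encoding
    return response.text

# Point 2: a thread pool fetches several chapter pages concurrently
urls = ["https://www.3bqg.cc/book/152484/1.html"]  # example chapter URL
with ThreadPoolExecutor(max_workers=4) as pool:
    pages = list(pool.map(fetch_chapter, urls))
```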
```python
import os
import re
import time

import requests
from bs4 import BeautifulSoup
import markdownify  # used in getConcent to preview the Markdown conversion

def getHtml(url, param=None, encoding=None):
    # Fetch the HTML text at the given url
    try:
        # Build the request headers
        header = {
            'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
        }
        # Send the request
        html = requests.get(url, headers=header, timeout=10)
        # Pick an encoding: use the detected one unless the caller supplied one
        if encoding is None:
            encoding = html.apparent_encoding
        html.encoding = encoding
        # Keep the decoded page text
        content = html.text
        # Close the connection
        html.close()
        # Sleep 1s so we do not hit the site too fast
        time.sleep(1)
        # Return the page content
        return content
    except requests.RequestException as e:
        print(f"Request failed: {e}")
        return None  # None signals failure

def getLink(content):
    soup = BeautifulSoup(content, "html.parser")
    res = []
    # The chapter list lives in <dd> tags
    titles = soup.find_all('dd')
    # Extract the link text (chapter title) from each entry
    for title in titles:
        a_tag = title.find('a')
        if a_tag:
            name = a_tag.text
            res.append(name)
    print(res)
    return res

def save(name, passage_content, path):
    # Build the file path, using the chapter title as the file name
    file_path = os.path.join(path, f"{name}.txt")
    # Collapse runs of whitespace into newlines and write the text out
    with open(file_path, 'w', encoding='utf-8') as file:
        for content in passage_content:
            # Replace 2+ consecutive whitespace characters with a newline
            formatted_content = re.sub(r'\s{2,}', '\n', content.text)
            file.write(formatted_content + "\n")  # extra newline keeps paragraphs apart
    print(f"Chapter {name} saved")

def saveImg(title, imgLink, path):
    # Make sure the output directory exists, then build the file path
    os.makedirs(path, exist_ok=True)
    file_path = os.path.join(path, f"{title}.jpg")
    response = requests.get(imgLink, timeout=10)
    # Write the raw image bytes straight to disk
    if response.status_code == 200:
        with open(file_path, 'wb') as file:
            file.write(response.content)
        print(f"Image {title} saved")
    else:
        print(f"Failed to save image {title}")

def getMain(content):
    soup = BeautifulSoup(content, "html.parser")
    name = ''
    img = ''
    titles = soup.find_all('div', attrs={"class": "info"})
    # Pull the book title and cover image URL from the info block
    for title in titles:
        img_tag = title.find('img')
        img = img_tag['src']
        name = img_tag['alt']
    return name, img

def getConcent(root, titles, path):
    # Chapter titles look like "第N章 ..."; N is also the page number in the URL
    pat = re.compile(r'第(.*?)章')
    for title in titles:
        res = pat.search(title)
        if res:
            print(res.groups())
            page = res.group(1)
            url = root + page + ".html"
            content = getHtml(url)
            if content is None:
                continue  # skip chapters whose pages failed to load
            soup = BeautifulSoup(content, "html.parser")
            passage_content = soup.find_all("div", attrs={"id": "chaptercontent"})
            for item in passage_content:
                print(item.text)
                # Debug preview: convert the tag's HTML to Markdown
                print(markdownify.markdownify(str(item)))
            save(title, passage_content, path)

if __name__ == "__main__":
    try:
        # Index page of the target novel
        url = "https://www.3bqg.cc/book/152484/"
        # Output directory
        path = "./novel/"
        # Fetch the index page source
        content = getHtml(url)
        # Book title and cover image URL
        title, img = getMain(content)
        saveImg(title, img, path + title)
        # Chapter title list
        titles = getLink(content)
        getConcent(url, titles, path + title)
    except Exception as e:
        print(f"Error while running: {e}")
```