Python Web Scraping in Practice: Building a General-Purpose Crawler for Huaben Novel Network (话本小说网)
1. Introduction
Huaben Novel Network (话本小说网, ihuaben.com) is a light-novel writing and publishing platform. This article walks through building a general-purpose Python crawler that scrapes novel content from the site. The tutorial is for learning and exchange only; please comply with applicable laws and regulations and with the site's robots protocol.
2. Environment Setup
2.1 Installing the required libraries
```bash
pip install requests beautifulsoup4
```
2.2 Library overview
- requests: sends HTTP requests and fetches page content
- BeautifulSoup4: parses HTML and extracts the data we need
- re (standard library): regular expressions for string matching
- urllib.parse (standard library): joins URLs (see the short example after this list)
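Chapter links on the directory page are protocol-relative (they start with //), so urljoin is a convenient way to turn them into absolute URLs. A minimal sketch of that behaviour, using a sample href in the format the site uses:

```python
from urllib.parse import urljoin

base_url = "https://www.ihuaben.com"
# The site uses protocol-relative chapter links ("//www.ihuaben.com/...");
# urljoin resolves them against the scheme of the base URL.
href = "//www.ihuaben.com/book/3462644/27488837.html"
print(urljoin(base_url, href))
# -> https://www.ihuaben.com/book/3462644/27488837.html
```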
3. Site Structure Analysis
3.1 Level 1 page (book table of contents)
URL format: https://www.ihuaben.com/book/{book_id}.html
Key HTML structure:
```html
<p>
  <span class="text-muted number">39</span>
  <span class="chapterTitle">
    <a href="//www.ihuaben.com/book/3462644/27488837.html"
       title="真假长老的关系">真假长老的关系</a>
  </span>
</p>
```
Fields to extract:
- number: chapter index
- href: chapter link
- title: chapter title
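As a quick check, here is a minimal sketch that parses the snippet above with BeautifulSoup and pulls out those three fields (the HTML string is the example from this section, not a live response):

```python
from bs4 import BeautifulSoup

html = '''
<p>
  <span class="text-muted number">39</span>
  <span class="chapterTitle">
    <a href="//www.ihuaben.com/book/3462644/27488837.html"
       title="真假长老的关系">真假长老的关系</a>
  </span>
</p>
'''

soup = BeautifulSoup(html, 'html.parser')
p = soup.find('p')
number = p.find('span', class_='number').text.strip()    # chapter index
link = p.find('span', class_='chapterTitle').find('a')
href = link.get('href')                                   # chapter link (protocol-relative)
title = link.get('title', link.text.strip())              # chapter title
print(number, href, title)
```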
3.2 Level 2 page (chapter content)
URL format: https://www.ihuaben.com/book/{book_id}/{chapter_id}.html
Key HTML structure:
```html
<!-- chapter title -->
<h1>找到真的加多长老</h1>
<!-- chapter content -->
<div id="contentsource">
  <p><span><a href="/juese/作者大大">作者大大</a></span>正文内容...</p>
  <p><i>宣墨(阿部多瑞)</i>对话内容...</p>
</div>
```
Fields to extract:
- h1: chapter title
- the p tags inside div#contentsource: chapter paragraphs
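A matching sketch for the content page, again run against the example snippet rather than a live page:

```python
from bs4 import BeautifulSoup

html = '''
<h1>找到真的加多长老</h1>
<div id="contentsource">
  <p><span><a href="/juese/作者大大">作者大大</a></span>正文内容...</p>
  <p><i>宣墨(阿部多瑞)</i>对话内容...</p>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
title = soup.find('h1').text.strip()                       # chapter title
content_div = soup.find('div', id='contentsource')
paragraphs = [p.get_text().strip() for p in content_div.find_all('p')]
print(title)
print('\n'.join(paragraphs))
```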
4. Crawler Code Design
4.1 Crawler class structure
```python
import requests
from bs4 import BeautifulSoup
import time
import re
from urllib.parse import urljoin


class HuabenSpider:
    def __init__(self, book_url):
        self.book_url = book_url                   # URL of the book's table-of-contents page
        self.base_url = "https://www.ihuaben.com"
        self.headers = {                           # request headers that mimic a browser
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
        self.book_info = {}                        # book metadata
        self.chapters = []                         # chapter list
```
4.2 Fetching book metadata
```python
    def get_book_info(self):
        """Fetch basic book information."""
        response = requests.get(self.book_url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Book title: the h1 tag with class "text-danger"
        book_name_elem = soup.find('h1', class_='text-danger')
        if book_name_elem:
            self.book_info['book_name'] = book_name_elem.text.strip()
        # Author: a link whose href contains /user/<id>
        author_elem = soup.find('a', href=re.compile(r'/user/\d+'))
        if author_elem:
            self.book_info['author'] = author_elem.text.strip()
```
4.3 Fetching the chapter list
```python
    def get_chapter_list(self):
        """Collect all chapters from the table-of-contents page."""
        response = requests.get(self.book_url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Walk every <p> tag and keep the ones that look like chapter entries
        for p in soup.find_all('p'):
            number_span = p.find('span', class_='number')
            title_span = p.find('span', class_='chapterTitle')
            if number_span and title_span:
                chapter_num = number_span.text.strip()
                chapter_link = title_span.find('a')
                if chapter_link:
                    # Build the absolute chapter URL
                    chapter_url = urljoin(self.base_url, chapter_link.get('href'))
                    chapter_title = chapter_link.get('title', chapter_link.text.strip())
                    self.chapters.append({
                        'number': chapter_num,
                        'title': chapter_title,
                        'url': chapter_url
                    })
```
4.4 Fetching chapter content
```python
    def get_chapter_content(self, chapter_url):
        """Fetch the text of a single chapter."""
        response = requests.get(chapter_url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Locate the content container
        content_div = soup.find('div', id='contentsource')
        if not content_div:
            return None
        # Extract every paragraph
        paragraphs = content_div.find_all('p')
        content_lines = []
        for p in paragraphs:
            # Handle role links (hrefs starting with /juese/)
            role_links = p.find_all('a', href=re.compile(r'/juese/'))
            p_text = ''
            # If the paragraph is dialogue, prefix it with the speaker's name
            for link in role_links:
                role_name = link.text.strip()
                p_text += role_name + ':'
            # Append the plain text of the paragraph
            # (get_text() still includes the role name; the full version in
            # section 10 strips the duplicate)
            text_content = p.get_text()
            if text_content:
                p_text += text_content
            content_lines.append(p_text)
        return '\n\n'.join(content_lines)
```
4.5 Content cleaning
```python
    def clean_content(self, text):
        """Strip leftover HTML tags and collapse excess whitespace."""
        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)
        # Collapse runs of blank lines into a single blank line
        text = re.sub(r'\n\s*\n', '\n\n', text)
        # Trim leading/trailing whitespace
        return text.strip()
```
4.6 Saving to a Markdown file
```python
    def save_to_markdown(self, filename=None):
        """Write every chapter into a single Markdown file."""
        if not filename:
            safe_name = re.sub(r'[\\/*?:"<>|]', '',
                               self.book_info.get('book_name', 'novel'))
            filename = f"{safe_name}.md"
        with open(filename, 'w', encoding='utf-8') as f:
            # Book header
            f.write(f"# {self.book_info.get('book_name', 'Unknown')}\n\n")
            f.write(f"**Author:** {self.book_info.get('author', 'Unknown')}\n\n")
            f.write(f"**Chapters:** {len(self.chapters)}\n\n---\n\n")
            # Write the chapters one by one
            for i, chapter in enumerate(self.chapters, 1):
                print(f"Fetching chapter {i}: {chapter['title']}")
                content = self.get_chapter_content(chapter['url'])
                content = self.clean_content(content) if content else "Failed to fetch content"
                # Chapter headings use ##
                f.write(f"## Chapter {chapter['number']}: {chapter['title']}\n\n")
                f.write(content)
                f.write("\n\n---\n\n")
                time.sleep(1)  # throttle requests
```
4.7 Main entry point
```python
    def run(self):
        """Run the crawler end to end."""
        print("Starting crawl...")
        self.get_book_info()
        self.get_chapter_list()
        if self.chapters:
            self.save_to_markdown()
        else:
            print("No chapters found")
        print("Crawl finished!")
```
5. Usage Example
```python
def main():
    # Target book URL
    book_url = "https://www.ihuaben.com/book/3462644.html"
    # Create the crawler instance
    spider = HuabenSpider(book_url)
    # Run it
    spider.run()


if __name__ == "__main__":
    main()
```
6. Key Field Summary
6.1 Level 1 page (table of contents)
| Field | CSS selector | Description |
|---|---|---|
| Book title | h1.text-danger | Title of the book |
| Author | a[href*="/user/"] | Text of the author link |
| Chapter number | span.number | Chapter index |
| Chapter title | span.chapterTitle a | Chapter name |
| Chapter link | href attribute of span.chapterTitle a | Relative path |
6.2 Level 2 page (chapter content)
| Field | CSS selector | Description |
|---|---|---|
| Chapter title | h1 | Title of the current chapter |
| Chapter content | div#contentsource p | Paragraph text |
| Role name | a[href*="/juese/"] | Dialogue speaker |
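The same selectors can also be used directly with BeautifulSoup's select()/select_one(), a compact alternative to the find()/find_all() calls used in the code above. A minimal sketch, assuming soup already holds a parsed directory or chapter page:

```python
# Level 1 page (soup parsed from the table-of-contents HTML)
book_name = soup.select_one('h1.text-danger')
author = soup.select_one('a[href*="/user/"]')
chapter_links = soup.select('span.chapterTitle a')

# Level 2 page (soup parsed from a chapter HTML)
chapter_title = soup.select_one('h1')
paragraphs = soup.select('div#contentsource p')
roles = soup.select('a[href*="/juese/"]')
```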
7. Notes and Caveats
- Rate limiting: use time.sleep() to add delays and avoid putting load on the server
- User-Agent spoofing: set a sensible request header so requests look like they come from a browser
- Exception handling: wrap network calls in try-except to handle request failures (a retry sketch follows this list)
- Encoding: save files with UTF-8 encoding
- URL joining: use urljoin to resolve relative paths
- File naming: strip illegal characters from filenames
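For the exception-handling point, one possible approach is a small wrapper around requests.get with a timeout and a couple of retries. This is only a sketch of the idea; the helper name fetch and the retry/back-off values are illustrative, not part of the original code:

```python
import time
import requests

def fetch(url, headers, retries=3, timeout=10):
    """Fetch a URL with a timeout and simple retry logic (illustrative helper)."""
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()      # raise on 4xx/5xx responses
            return response
        except requests.RequestException as e:
            print(f"Request failed ({attempt}/{retries}): {url}, error: {e}")
            time.sleep(2 * attempt)          # back off before retrying
    return None
```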
8. Possible Extensions
- Add multithreading to speed up crawling (see the sketch after this list)
- Implement resumable crawls so an interrupted run can pick up where it left off
- Support rotating proxy IPs
- Store the scraped data in a database
- Build a GUI to make the tool easier to use
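As an illustration of the multithreading idea in the first item, chapter pages could be fetched concurrently with concurrent.futures. This is a sketch only: it assumes a HuabenSpider instance whose chapter list is already populated, and the worker count of 4 is arbitrary.

```python
from concurrent.futures import ThreadPoolExecutor

def fetch_all_chapters(spider, max_workers=4):
    """Fetch every chapter's content concurrently (illustrative sketch).

    Returns (chapter, content) tuples in the original chapter order.
    """
    def worker(chapter):
        return chapter, spider.get_chapter_content(chapter['url'])

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        return list(executor.map(worker, spider.chapters))
```

Note that concurrent requests bypass the time.sleep(1) throttling used in the save methods, so keep the worker count low and stay within whatever request rate the site tolerates.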
9. Disclaimer
This tutorial is provided solely for learning Python web-scraping techniques. Do not use it for commercial purposes or large-scale crawling. Respect the site's copyright, obey robots.txt, and keep the request rate reasonable.
10. Complete Code
```python
import requests
from bs4 import BeautifulSoup
import time
import os
import re
from urllib.parse import urljoin


class HuabenSpider:
    def __init__(self, book_url):
        self.book_url = book_url
        self.base_url = "https://www.ihuaben.com"
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.book_info = {}
        self.chapters = []

    def get_book_info(self):
        """Fetch basic book information."""
        print(f"Fetching book info: {self.book_url}")
        response = requests.get(self.book_url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Book title
        book_name_elem = soup.find('h1', class_='text-danger')
        if book_name_elem:
            self.book_info['book_name'] = book_name_elem.text.strip()
        # Author
        author_elem = soup.find('a', href=re.compile(r'/user/\d+'))
        if author_elem:
            self.book_info['author'] = author_elem.text.strip()
        print(f"Title: {self.book_info.get('book_name')}")
        print(f"Author: {self.book_info.get('author')}")

    def get_chapter_list(self):
        """Collect all chapters from the table-of-contents page."""
        print("Fetching chapter list...")
        response = requests.get(self.book_url, headers=self.headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Every chapter entry lives in a <p> tag
        chapter_list = soup.find_all('p')
        for p in chapter_list:
            number_span = p.find('span', class_='number')
            title_span = p.find('span', class_='chapterTitle')
            if number_span and title_span:
                chapter_num = number_span.text.strip()
                chapter_link = title_span.find('a')
                if chapter_link:
                    chapter_url = urljoin(self.base_url, chapter_link.get('href'))
                    chapter_title = chapter_link.get('title', chapter_link.text.strip())
                    # Extract the chapter_id from the URL
                    chapter_id_match = re.search(r'/(\d+)\.html$', chapter_url)
                    chapter_id = chapter_id_match.group(1) if chapter_id_match else None
                    self.chapters.append({
                        'number': chapter_num,
                        'title': chapter_title,
                        'url': chapter_url,
                        'id': chapter_id
                    })
        print(f"Found {len(self.chapters)} chapters")
        return self.chapters

    def clean_content(self, text):
        """Strip leftover HTML tags and collapse excess whitespace."""
        # Remove HTML tags
        text = re.sub(r'<[^>]+>', '', text)
        # Collapse runs of blank lines into a single blank line
        text = re.sub(r'\n\s*\n', '\n\n', text)
        # Trim leading/trailing whitespace
        text = text.strip()
        return text

    def get_chapter_content(self, chapter_url):
        """Fetch the text of a single chapter."""
        try:
            response = requests.get(chapter_url, headers=self.headers)
            soup = BeautifulSoup(response.text, 'html.parser')
            # Chapter title
            title_elem = soup.find('h1')
            chapter_title = title_elem.text.strip() if title_elem else "Unknown title"
            # Content container
            content_div = soup.find('div', id='contentsource')
            if not content_div:
                return None
            # Extract every paragraph
            paragraphs = content_div.find_all('p')
            content_lines = []
            for p in paragraphs:
                # Handle dialogue and role links
                p_text = ''
                # Check whether the paragraph contains role links
                role_links = p.find_all('a', href=re.compile(r'/juese/'))
                if role_links:
                    for link in role_links:
                        role_name = link.text.strip()
                        # Replace the link with the role name plus a colon
                        if not p_text:
                            p_text = role_name + ':'
                        else:
                            p_text = p_text + role_name + ':'
                # Plain text of the paragraph
                text_content = p.get_text()
                if text_content:
                    # If a role name is already present, append only the remaining text
                    if p_text:
                        # Drop the role names that get_text() duplicated
                        for role in role_links:
                            role_name = role.text.strip()
                            text_content = text_content.replace(role_name, '', 1)
                        p_text = p_text + text_content.strip()
                    else:
                        p_text = text_content.strip()
                if p_text:
                    content_lines.append(p_text)
            content = '\n\n'.join(content_lines)
            return self.clean_content(content)
        except Exception as e:
            print(f"Failed to fetch chapter: {chapter_url}, error: {e}")
            return None

    def save_to_markdown(self, filename=None):
        """Write every chapter into a single Markdown file; chapter headings use ##."""
        if not filename:
            # File name pattern: <book name>.md
            safe_name = re.sub(r'[\\/*?:"<>|]', '', self.book_info.get('book_name', 'novel'))
            filename = f"{safe_name}.md"
        elif not filename.endswith('.md'):
            filename = filename + '.md'
        print(f"Saving to Markdown file: {filename}")
        with open(filename, 'w', encoding='utf-8') as f:
            # Book header (Markdown)
            f.write(f"# {self.book_info.get('book_name', 'Unknown')}\n\n")
            f.write(f"**Author:** {self.book_info.get('author', 'Unknown')}\n\n")
            f.write(f"**Chapters:** {len(self.chapters)}\n\n")
            f.write("---\n\n")
            # Table of contents
            f.write("## Table of Contents\n\n")
            for i, chapter in enumerate(self.chapters, 1):
                # Each entry links to an anchor further down the file
                anchor = f"chapter-{i}"
                f.write(f"{i}. [{chapter['title']}](#{anchor})\n")
            f.write("\n---\n\n")
            # Write the chapters one by one
            for i, chapter in enumerate(self.chapters, 1):
                print(f"Fetching chapter {i}/{len(self.chapters)}: {chapter['title']}")
                content = self.get_chapter_content(chapter['url'])
                if content:
                    # Chapter heading uses ## and carries the anchor
                    anchor = f"chapter-{i}"
                    f.write(f'## <span id="{anchor}">Chapter {chapter["number"]}: {chapter["title"]}</span>\n\n')
                    f.write(content)
                    f.write("\n\n---\n\n")
                else:
                    f.write(f'## Chapter {chapter["number"]}: {chapter["title"]} (failed)\n\n')
                    f.write("> Failed to fetch content\n\n---\n\n")
                # Throttle requests
                time.sleep(1)
        print(f"Done! File: {filename}")

    def save_to_txt(self, filename=None):
        """Write every chapter into a single TXT file (alternative format)."""
        if not filename:
            safe_name = re.sub(r'[\\/*?:"<>|]', '', self.book_info.get('book_name', 'novel'))
            filename = f"{safe_name}.txt"
        elif not filename.endswith('.txt'):
            filename = filename + '.txt'
        print(f"Saving to TXT file: {filename}")
        with open(filename, 'w', encoding='utf-8') as f:
            # Book header
            f.write(f"Title: {self.book_info.get('book_name', 'Unknown')}\n")
            f.write(f"Author: {self.book_info.get('author', 'Unknown')}\n")
            f.write(f"Chapters: {len(self.chapters)}\n")
            f.write("=" * 50 + "\n\n")
            # Write the chapters one by one
            for i, chapter in enumerate(self.chapters, 1):
                print(f"Fetching chapter {i}/{len(self.chapters)}: {chapter['title']}")
                content = self.get_chapter_content(chapter['url'])
                if content:
                    f.write(f"Chapter {chapter['number']}: {chapter['title']}\n")
                    f.write("-" * 30 + "\n")
                    f.write(content)
                    f.write("\n\n" + "=" * 50 + "\n\n")
                else:
                    f.write(f"Chapter {chapter['number']}: {chapter['title']} (failed)\n\n")
                time.sleep(1)
        print(f"Done! File: {filename}")

    def save_to_separate_markdown(self, output_dir="chapters"):
        """Write each chapter to its own Markdown file."""
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        print(f"Saving chapters to directory: {output_dir}")
        # Book info / index file
        info_file = os.path.join(output_dir, "00_README.md")
        with open(info_file, 'w', encoding='utf-8') as f:
            f.write(f"# {self.book_info.get('book_name', 'Unknown')}\n\n")
            f.write(f"**Author:** {self.book_info.get('author', 'Unknown')}\n\n")
            f.write(f"**Chapters:** {len(self.chapters)}\n\n")
            f.write("## Chapter List\n\n")
            for i, chapter in enumerate(self.chapters, 1):
                # Link to the sanitized file name used below
                safe_title = re.sub(r'[\\/*?:"<>|]', '', chapter['title'])
                f.write(f"{i}. [{chapter['title']}]({i:03d}_{safe_title}.md)\n")
        for i, chapter in enumerate(self.chapters, 1):
            print(f"Fetching chapter {i}/{len(self.chapters)}: {chapter['title']}")
            content = self.get_chapter_content(chapter['url'])
            if content:
                # Build a file-system-safe name
                safe_title = re.sub(r'[\\/*?:"<>|]', '', chapter['title'])
                filename = f"{output_dir}/{i:03d}_{safe_title}.md"
                with open(filename, 'w', encoding='utf-8') as f:
                    f.write(f"# Chapter {chapter['number']}: {chapter['title']}\n\n")
                    f.write(f"*Back to [table of contents](00_README.md)*\n\n")
                    f.write("---\n\n")
                    f.write(content)
                print(f"Saved: {filename}")
            else:
                print(f"Failed: {chapter['title']}")
            time.sleep(1)
        print(f"All chapters saved! Directory: {output_dir}")

    def run(self, format='md'):
        """Run the crawler.

        format: 'md', 'txt', or 'separate'
        """
        print("Starting crawl...")
        self.get_book_info()
        self.get_chapter_list()
        if self.chapters:
            if format == 'md':
                self.save_to_markdown()
            elif format == 'txt':
                self.save_to_txt()
            elif format == 'separate':
                self.save_to_separate_markdown()
            else:
                print(f"Unknown format: {format}")
        else:
            print("No chapters found")
        print("Crawl finished!")


def main():
    # Book URL
    book_url = "https://www.ihuaben.com/book/3462644.html"
    # Create the crawler instance and run it
    spider = HuabenSpider(book_url)
    # Output format options:
    # single Markdown file (default)
    # spider.run(format='md')
    # single TXT file
    # spider.run(format='txt')
    # one file per chapter
    # spider.run(format='separate')
    spider.run(format='md')  # default: single Markdown file


if __name__ == "__main__":
    main()
```
11. Results

虽然前面有3张重复,但后面的完整到结尾,删去即可
With the steps above we have completed a general-purpose crawler for Huaben Novel Network. The code is cleanly structured and easy to understand and extend, which makes it a good hands-on project for learning Python web scraping.