1 Introduction
BeautifulSoup is a Python library for parsing HTML and XML documents. It provides a flexible and convenient way to navigate, search, and modify the parse tree. BeautifulSoup simplifies web scraping, letting developers easily parse page content and extract the data they need.
2 Getting Started
The first step is to install BeautifulSoup, which can be done with pip:
pip install beautifulsoup4
Then you can import the BeautifulSoup class and create a BeautifulSoup object to parse an HTML document:
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>测试页面</title></head>
<body>
<p class="title">标题</p>
<p class="story">这是一个故事。</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
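Once parsing succeeds, the soup object can be queried right away. A minimal sketch (assuming the html_doc and soup defined above) of inspecting the tree:
# Quick checks against the soup object built above
print(soup.title)           # <title>测试页面</title>
print(soup.title.string)    # 测试页面
print(soup.p['class'])      # ['title']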
3 Node Selectors
BeautifulSoup provides several ways to select HTML nodes. For example, you can use the find() or find_all() methods to look up nodes by tag name, attributes, or text content:
# Find the first <p> tag
p_tag = soup.find('p')
# Find all <p> tags
p_tags = soup.find_all('p')
# Find the <p> tag whose class is "title"
title_p = soup.find('p', class_='title')
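Attribute-based searches are not limited to class_; find() and find_all() also accept an attrs dictionary, and find_all() can take a list of tag names. A brief sketch against the same soup:
# Search by arbitrary attributes via an attrs dict
story_p = soup.find('p', attrs={'class': 'story'})
# Pass a list to match any of several tag names
head_or_title = soup.find_all(['head', 'title'])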
4 Nested Selection with Tag Objects
You can use dot notation (.) to access the child nodes of a Tag object:
# Access the text inside the <p> tag
text = title_p.string
# Access the child nodes of the <p> tag
children = title_p.children
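Dot access can also be chained directly from the soup object, drilling into the first matching tag at each level; a small sketch using the document parsed earlier:
# Navigate nested tags with attribute-style access
print(soup.body.p)               # first <p> inside <body>
print(soup.head.title.string)    # 测试页面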
5 Relational Selection
Relational selection means selecting elements through the relationships between parent, child, and sibling nodes. BeautifulSoup provides attributes such as .parent, .children, .next_sibling, and .previous_sibling to access these relationships:
# Get the parent node of the <p> tag
parent_tag = title_p.parent
# Iterate over all child nodes of the <p> tag
for child in title_p.children:
    print(child)
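The sibling attributes mentioned above work the same way; note that the whitespace between tags is itself a text node, so next_sibling is applied twice here to skip over it. A minimal sketch:
# Move from the "title" <p> to the "story" <p>; the newline between them is a sibling node
story_p = title_p.next_sibling.next_sibling
print(story_p)
# And back again
print(story_p.previous_sibling.previous_sibling)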
6 Method Selectors
BeautifulSoup also provides several ways to filter and select nodes, such as matching by text content, regular expressions, or lambda functions:
import re
# Find the <p> tag that contains specific text
p_with_text = soup.find('p', text='标题')
# Find all <p> tags whose text matches a regular expression
p_with_pattern = soup.find_all('p', text=re.compile('故事'))
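find_all() also accepts any callable that takes a tag and returns True or False, which is what the lambda-based filtering mentioned above refers to. A brief sketch:
# A callable filter: <p> tags that carry a class attribute
p_with_class = soup.find_all(lambda tag: tag.name == 'p' and tag.has_attr('class'))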
7 CSS Selectors
BeautifulSoup also supports selecting nodes with CSS selector syntax:
# Use a CSS selector to select the <p> tag with class "title"
title_p_css = soup.select_one('p.title')
# Select all <p> tags
p_tags_css = soup.select('p')
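select() also understands descendant and attribute selectors; a couple of illustrative queries against the same document (a sketch, not an exhaustive list):
# Descendant selector: <p> tags anywhere inside <body>
body_paragraphs = soup.select('body p')
# Attribute selector: <p> tags whose class attribute is "story"
story_paragraphs = soup.select('p[class="story"]')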
8 Examples
Example 1: Extracting the Page Title
from bs4 import BeautifulSoup
import requests
url = 'http://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
title_tag = soup.find('title')
if title_tag:
    print("Page title:", title_tag.string)
Example 2: Extracting Page Links
from bs4 import BeautifulSoup
import requests
url = 'http://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')  # Find all <a> tags (links)
for link in links:
    href = link.get('href')  # Get the link's href attribute
    print("Link:", href)
Example 3: Extracting Table Data
from bs4 import BeautifulSoup
html_doc = """
<table>
<tr>
<th>姓名</th>
<th>年龄</th>
</tr>
<tr>
<td>张三</td>
<td>25</td>
</tr>
<tr>
<td>李四</td>
<td>30</td>
</tr>
</table>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
table = soup.find('table')
rows = table.find_all('tr')
for row in rows:
    cells = row.find_all(['th', 'td'])  # Find header and data cells
    for cell in cells:
        print(cell.get_text(), end='\t')  # Print the cell content, separated by tabs
    print()  # New line
Below are some additional BeautifulSoup code examples, covering extracting information from web pages, modifying HTML content, and using CSS selectors.
Example 4: Extracting Data with CSS Selectors
from bs4 import BeautifulSoup
import requests
url = 'https://example.com/somepage'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Use a CSS selector to extract all image links
img_links = soup.select('img[src]')
for img in img_links:
    print(img['src'])
# Use a CSS selector to extract elements with a specific class
elements_with_class = soup.select('.class-name')
for element in elements_with_class:
    print(element.get_text())
Example 5: Extracting Data from a Specific Table
from bs4 import BeautifulSoup
html_doc = """
<table id="data-table">
<tr>
<th>Name</th>
<th>Age</th>
</tr>
<tr>
<td>Alice</td>
<td>28</td>
</tr>
<tr>
<td>Bob</td>
<td>35</td>
</tr>
</table>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# Extract all rows from the table with id "data-table"
table = soup.find('table', {'id': 'data-table'})
rows = table.find_all('tr')
# Iterate over the rows and extract the data
for row in rows:
    cols = row.find_all(['th', 'td'])
    cols = [ele.text.strip() for ele in cols]
    print(cols)
Example 6: Modifying HTML Content
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>Test Page</title></head>
<body>
<p class="title">Old Title</p>
<p class="story">Old Story</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# Change the title text
soup.title.string = 'New Title'
# Change the text of the <p> tag whose class is "title"
soup.find('p', class_='title').string = 'New Title Text'
# Add a new <p> tag
new_p = soup.new_tag('p')
new_p.string = 'This is a new paragraph.'
soup.body.append(new_p)
# Print the modified HTML
print(soup.prettify())
Example 7: Handling Nested HTML Structures
from bs4 import BeautifulSoup
html_doc = """
<div class="container">
<div class="item">
<h2>Item 1</h2>
<p>Description for item 1</p>
</div>
<div class="item">
<h2>Item 2</h2>
<p>Description for item 2</p>
</div>
</div>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# Extract all divs with class "item" and process their contents
items = soup.find_all('div', class_='item')
for item in items:
    header = item.find('h2').text
    description = item.find('p').text
    print(f"Header: {header}")
    print(f"Description: {description}")
    print("-" * 20)
9 Scraping a Novel from a Biquge Site
The script below walks the table of contents of a novel on biqukan8.cc, downloads every chapter, and saves each one as a text file. The commented-out block near the top shows an optional tunnel-proxy configuration.
#!/usr/bin/env python
from bs4 import BeautifulSoup
import requests
import os
import logging
from fake_useragent import UserAgent
# Tunnel domain:port
# tunnel = "r250.kdltps.com:15818"
# Username/password authentication
# username = "t19754578624032"
# password = "hemrc89p"
# proxies = {
#     "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
#     "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
# }
# Whitelist authentication (the whitelist must be configured in advance)
# proxies = {
#     # "http": "http://%(proxy)s/" % {"proxy": tunnel},
#     "https": "http://%(proxy)s/" % {"proxy": tunnel}
# }
# # Target page to visit
# target_url = "https://dev.kdlapi.com/testproxy"
# # Send the request through the tunnel proxy
# response = requests.get(target_url, proxies=proxies)
# # Get the page content
# if response.status_code == 200:
#     print(response.text)  # Do not use keep-alive connection reuse (it prevents the tunnel from rotating IPs)
local_save_path = 'C:/Users/EA/Desktop/10-爬虫篇-第十次直播/Code/novel/'
ua = UserAgent()
headers = {
    "User-Agent": ua.random
}
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s -%(levelname)s: %(message)s')
url = 'https://www.biqukan8.cc/0_790/'
url_list = []
name_list = []
flag_name = ''
# print(url_list)
# print(name_list)
def novel_content(url, name):
    # Download one chapter page and extract the chapter text
    txt_response = requests.get(url=url, headers=headers)
    # print(txt_response.text)
    txts_soup = BeautifulSoup(str(txt_response.text), "lxml")
    txts = txts_soup.find_all(id='content', class_='showtxt')
    # print(type(list(enumerate(txt))))
    # print(len(list(enumerate(txts))))  # 1, which shows the chapter body is one whole block
    text_soup = BeautifulSoup(str(txts), 'lxml')
    text = text_soup.div.text
    file_write(name, text)

def file_write(name, text):
    # Save one chapter to <local_save_path>/<novel_name>/<chapter name>.txt
    directory_path = local_save_path + novel_name
    if os.path.exists(directory_path):
        print(f"Directory '{directory_path}' already exists!")
    else:
        # Create the directory
        os.mkdir(directory_path)
        print(f"Directory '{directory_path}' has been created!")
    # Write the chapter content we just fetched into the file
    write_flag = True
    name_path = os.path.join(directory_path, f"{name}.txt")
    with open(name_path, "a+", encoding='utf-8') as file:
        for each in text:
            # Stop writing once the trailing link text (starting with 'h' as in "http") is reached
            if each == 'h':
                write_flag = False
            if write_flag == True and each != '':
                file.write(each)
        file.write('\n\n')
response = requests.get(url, headers=headers)
response.encoding = 'gbk'
# logging.info(response.text)
soup = BeautifulSoup(response.text, "lxml")
chapters = soup.find_all('div', class_='listmain')
# logging.info(chapters)
download_soup = BeautifulSoup(str(chapters), "lxml")
# logging.info(download_soup.contents)
# Novel title
novel_name = str(download_soup.dl.dt).split("》")[0][5:]
# print(str(novel_name).split("》")[0][5:])
# Heading that marks the start of the main text, e.g. 《元尊》正文卷
flag_name = "《" + novel_name + "》" + "正文卷"
# logging.info(flag_name)
begin_flag = False
for child in download_soup.dl.children:
    if child != '\n':
        # Start collecting chapter links once the main-text heading is reached
        if child.string == u"%s" % flag_name:
            begin_flag = True
        if begin_flag == True and child.a != None:
            download_url = "https://www.biqukan8.cc/" + child.a.get("href")
            download_name = child.a.string
            # print(download_url, download_name)
            url_list.append(download_url)
            name_list.append(download_name)
# Use the zip function to pair the chapter URLs with the chapter names
combined_list = zip(url_list, name_list)
for item1, item2 in combined_list:
    novel_content(item1, item2)