1 Introduction
BeautifulSoup is a Python library for parsing HTML and XML documents. It provides a flexible and convenient way to navigate, search, and modify the parse tree. BeautifulSoup simplifies web scraping, letting developers easily parse page content and extract the data they need.
2 Getting Started
The first step is to install BeautifulSoup, which can be done with pip:
pip install beautifulsoup4
Then you can import the BeautifulSoup class and create a BeautifulSoup object to parse an HTML document:
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>测试页面</title></head>
<body>
<p class="title">标题</p>
<p class="story">这是一个故事。</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
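Once parsing succeeds, the soup object can be queried right away. A minimal sketch (assuming the html_doc and soup defined above) of inspecting the tree:
# Quick checks against the soup object built above
print(soup.title)           # <title>测试页面</title>
print(soup.title.string)    # 测试页面
print(soup.p['class'])      # ['title']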
3 Node Selectors
BeautifulSoup provides several ways to select HTML nodes. For example, you can use the find() or find_all() methods to look up nodes by tag name, attributes, or text content:
# Find the first <p> tag
p_tag = soup.find('p')
# Find all <p> tags
p_tags = soup.find_all('p')
# Find the <p> tag whose class is "title"
title_p = soup.find('p', class_='title')
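Attribute-based searches are not limited to class_; find() and find_all() also accept an attrs dictionary, and find_all() can take a list of tag names. A brief sketch against the same soup:
# Search by arbitrary attributes via an attrs dict
story_p = soup.find('p', attrs={'class': 'story'})
# Pass a list to match any of several tag names
head_or_title = soup.find_all(['head', 'title'])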
4 Nested Selection with Tag Objects
You can use dot notation (.) to access the child nodes of a Tag object:
# Access the text inside the <p> tag
text = title_p.string
# Access the child nodes of the <p> tag
children = title_p.children
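Dot access can also be chained directly from the soup object, drilling into the first matching tag at each level; a small sketch using the document parsed earlier:
# Navigate nested tags with attribute-style access
print(soup.body.p)               # first <p> inside <body>
print(soup.head.title.string)    # 测试页面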
5 Relational Selection
Relational selection means selecting elements through the relationships between parent, child, and sibling nodes. BeautifulSoup provides attributes such as .parent, .children, .next_sibling, and .previous_sibling to access these relationships:
# Get the parent node of the <p> tag
parent_tag = title_p.parent
# Iterate over all child nodes of the <p> tag
for child in title_p.children:
    print(child)
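The sibling attributes mentioned above work the same way; note that the whitespace between tags is itself a text node, so next_sibling is applied twice here to skip over it. A minimal sketch:
# Move from the "title" <p> to the "story" <p>; the newline between them is a sibling node
story_p = title_p.next_sibling.next_sibling
print(story_p)
# And back again
print(story_p.previous_sibling.previous_sibling)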
6 Method Selectors
BeautifulSoup also provides several ways to filter and select nodes, such as matching by text content, regular expressions, or lambda functions:
import re
# Find the <p> tag that contains specific text
p_with_text = soup.find('p', text='标题')
# Find all <p> tags whose text matches a regular expression
p_with_pattern = soup.find_all('p', text=re.compile('故事'))
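find_all() also accepts any callable that takes a tag and returns True or False, which is what the lambda-based filtering mentioned above refers to. A brief sketch:
# A callable filter: <p> tags that carry a class attribute
p_with_class = soup.find_all(lambda tag: tag.name == 'p' and tag.has_attr('class'))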
7 CSS Selectors
BeautifulSoup also supports selecting nodes with CSS selector syntax:
# Use a CSS selector to select the <p> tag with class "title"
title_p_css = soup.select_one('p.title')
# Select all <p> tags
p_tags_css = soup.select('p')
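select() also understands descendant and attribute selectors; a couple of illustrative queries against the same document (a sketch, not an exhaustive list):
# Descendant selector: <p> tags anywhere inside <body>
body_paragraphs = soup.select('body p')
# Attribute selector: <p> tags whose class attribute is "story"
story_paragraphs = soup.select('p[class="story"]')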
8 Examples
Example 1: Extracting the Page Title
from bs4 import BeautifulSoup
import requests
url = 'http://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
title_tag = soup.find('title')
if title_tag:
    print("Page title:", title_tag.string)
Example 2: Extracting Page Links
from bs4 import BeautifulSoup
import requests
url = 'http://example.com'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')  # Find all <a> tags (links)
for link in links:
    href = link.get('href')  # Get the link's href attribute
    print("Link:", href)
Example 3: Extracting Table Data
from bs4 import BeautifulSoup
html_doc = """
<table>
<tr>
<th>姓名</th>
<th>年龄</th>
</tr>
<tr>
<td>张三</td>
<td>25</td>
</tr>
<tr>
<td>李四</td>
<td>30</td>
</tr>
</table>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
table = soup.find('table')
rows = table.find_all('tr')
for row in rows:
    cells = row.find_all(['th', 'td'])  # Find header and data cells
    for cell in cells:
        print(cell.get_text(), end='\t')  # Print the cell content, separated by tabs
    print()  # New line
Below are some additional BeautifulSoup code examples, covering extracting information from web pages, modifying HTML content, and using CSS selectors.
Example 4: Extracting Data with CSS Selectors
from bs4 import BeautifulSoup
import requests
url = 'https://example.com/somepage'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
# Use a CSS selector to extract all image links
img_links = soup.select('img[src]')
for img in img_links:
    print(img['src'])
# Use a CSS selector to extract elements with a specific class
elements_with_class = soup.select('.class-name')
for element in elements_with_class:
    print(element.get_text())
Example 5: Extracting Data from a Specific Table
from bs4 import BeautifulSoup
html_doc = """
<table id="data-table">
<tr>
<th>Name</th>
<th>Age</th>
</tr>
<tr>
<td>Alice</td>
<td>28</td>
</tr>
<tr>
<td>Bob</td>
<td>35</td>
</tr>
</table>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# Extract all rows from the table with id "data-table"
table = soup.find('table', {'id': 'data-table'})
rows = table.find_all('tr')
# Iterate over the rows and extract the data
for row in rows:
    cols = row.find_all(['th', 'td'])
    cols = [ele.text.strip() for ele in cols]
    print(cols)
Example 6: Modifying HTML Content
from bs4 import BeautifulSoup
html_doc = """
<html><head><title>Test Page</title></head>
<body>
<p class="title">Old Title</p>
<p class="story">Old Story</p>
</body>
</html>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# Change the title text
soup.title.string = 'New Title'
# Change the text of the <p> tag whose class is "title"
soup.find('p', class_='title').string = 'New Title Text'
# Add a new <p> tag
new_p = soup.new_tag('p')
new_p.string = 'This is a new paragraph.'
soup.body.append(new_p)
# Print the modified HTML
print(soup.prettify())
Example 7: Handling Nested HTML Structures
from bs4 import BeautifulSoup
html_doc = """
<div class="container">
<div class="item">
<h2>Item 1</h2>
<p>Description for item 1</p>
</div>
<div class="item">
<h2>Item 2</h2>
<p>Description for item 2</p>
</div>
</div>
"""
soup = BeautifulSoup(html_doc, 'html.parser')
# Extract all divs with class "item" and process their contents
items = soup.find_all('div', class_='item')
for item in items:
    header = item.find('h2').text
    description = item.find('p').text
    print(f"Header: {header}")
    print(f"Description: {description}")
    print("-" * 20)
9 Scraping a Novel from a Biquge Site
The script below walks the table of contents of a novel on biqukan8.cc, downloads every chapter, and saves each one as a text file. The commented-out block near the top shows an optional tunnel-proxy configuration.
#!/usr/bin/env python
from bs4 import BeautifulSoup
import requests
import os
import logging
from fake_useragent import UserAgent
# Tunnel domain:port
# tunnel = "r250.kdltps.com:15818"
# Username/password authentication
# username = "t19754578624032"
# password = "hemrc89p"
# proxies = {
#     "http": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel},
#     "https": "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": username, "pwd": password, "proxy": tunnel}
# }
# Whitelist authentication (the whitelist must be configured in advance)
# proxies = {
#     # "http": "http://%(proxy)s/" % {"proxy": tunnel},
#     "https": "http://%(proxy)s/" % {"proxy": tunnel}
# }
# # Target page to visit
# target_url = "https://dev.kdlapi.com/testproxy"
# # Send the request through the tunnel proxy
# response = requests.get(target_url, proxies=proxies)
# # Get the page content
# if response.status_code == 200:
#     print(response.text)  # Do not use keep-alive connection reuse (it prevents the tunnel from rotating IPs)
local_save_path = 'C:/Users/EA/Desktop/10-爬虫篇-第十次直播/Code/novel/'
ua = UserAgent()
headers = {
    "User-Agent": ua.random
}
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s -%(levelname)s: %(message)s')
url = 'https://www.biqukan8.cc/0_790/'
url_list = []
name_list = []
flag_name = ''
# print(url_list)
# print(name_list)
def novel_content(url, name):
    # Download one chapter page and extract the chapter text
    txt_response = requests.get(url=url, headers=headers)
    # print(txt_response.text)
    txts_soup = BeautifulSoup(str(txt_response.text), "lxml")
    txts = txts_soup.find_all(id='content', class_='showtxt')
    # print(type(list(enumerate(txt))))
    # print(len(list(enumerate(txts))))  # 1, which shows the chapter body is one whole block
    text_soup = BeautifulSoup(str(txts), 'lxml')
    text = text_soup.div.text
    file_write(name, text)

def file_write(name, text):
    # Save one chapter to <local_save_path>/<novel_name>/<chapter name>.txt
    directory_path = local_save_path + novel_name
    if os.path.exists(directory_path):
        print(f"Directory '{directory_path}' already exists!")
    else:
        # Create the directory
        os.mkdir(directory_path)
        print(f"Directory '{directory_path}' has been created!")
    # Write the chapter content we just fetched into the file
    write_flag = True
    name_path = os.path.join(directory_path, f"{name}.txt")
    with open(name_path, "a+", encoding='utf-8') as file:
        for each in text:
            # Stop writing once the trailing link text (starting with 'h' as in "http") is reached
            if each == 'h':
                write_flag = False
            if write_flag == True and each != '':
                file.write(each)
        file.write('\n\n')
response = requests.get(url, headers=headers)
response.encoding = 'gbk'
# logging.info(response.text)
soup = BeautifulSoup(response.text, "lxml")
chapters = soup.find_all('div', class_='listmain')
# logging.info(chapters)
download_soup = BeautifulSoup(str(chapters), "lxml")
# logging.info(download_soup.contents)
# Novel title
novel_name = str(download_soup.dl.dt).split("》")[0][5:]
# print(str(novel_name).split("》")[0][5:])
# Heading that marks the start of the main text, e.g. 《元尊》正文卷
flag_name = "《" + novel_name + "》" + "正文卷"
# logging.info(flag_name)
begin_flag = False
for child in download_soup.dl.children:
    if child != '\n':
        # Start collecting chapter links once the main-text heading is reached
        if child.string == u"%s" % flag_name:
            begin_flag = True
        if begin_flag == True and child.a != None:
            download_url = "https://www.biqukan8.cc/" + child.a.get("href")
            download_name = child.a.string
            # print(download_url, download_name)
            url_list.append(download_url)
            name_list.append(download_name)
# Use the zip function to pair the chapter URLs with the chapter names
combined_list = zip(url_list, name_list)
for item1, item2 in combined_list:
    novel_content(item1, item2)