看小说？笔趣阁？你是怎么爬取的？

笔趣阁小说爬虫源代码开源

具体运用的知识：

requests库中的get请求
正则表达式的匹配规则
parsel的Selector.css的解析
进制数的转换
数据保存

具体怎么查看和解析网页，我就不过多解释了，不过可以私信问我，我会教小伙伴怎么完成。其实我也挺想知道：实习生该如何就业？怎样才能找到一份好工作？面临着就业问题和期末考试等诸多问题，自己也忘记了创作、更新，在此对你们说声抱歉。生活中各种繁琐的事情让我焦头烂额，对未来感到迷茫，对憧憬的未来也没有信心。刚开始面试的时候，自己还是个什么都不懂而且紧张的新人。我也想在这个行业脱颖而出，也知道在现在这个就业环境下，几乎都不招实习岗位。能在一个岗位上体现自己的价值确实很重要，当然我也会不断精进自己，表达方面还有待提升，会好好珍惜生活中的每时每刻。希望我这个小博主能给你们带来启发。大家一起加油。

代码块：

python 复制代码

"""
requests:数据请求
parsel:数据解析
tqdm:下载进度条
pandas:输出格式好看些
selenium 可以模拟人的行为去操作浏览器
"""
import keyboard

"""
可爬取
https://www.bigee.cc/book/941/ 
不可爬取
https://www.beqege.com/61467/
"""

import parsel
import re
import binascii
import requests
import pandas as pd
# 显示进度条
from tqdm import tqdm


# 打包
# pyinstaller -F 文件名.py

def get_Html(url, cookies, headers):
    """Fetch a page with an HTTP GET request and return its body as text.

    Args:
        url: Address of the page to download.
        cookies: Cookies forwarded to ``requests.get``.
        headers: HTTP request headers forwarded to ``requests.get``.

    Returns:
        The decoded response body, or the sentinel string "请求失败" when the
        request fails (connection error, timeout after 30 seconds, or a
        4xx/5xx status code).
    """
    try:
        response = requests.get(url=url, cookies=cookies, headers=headers, timeout=30)
        # Turn 4xx/5xx answers into an HTTPError so they share the failure path.
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are expected here. Anything else
        # (Ctrl+C, programming errors) must stay visible instead of being
        # silently converted into the sentinel.
        return "请求失败"
    return response.text


"""
    selector.css()
    class选择器：属性前面加个.
    id选择器：属性前面加个#
    getall()：将查询到的结果转换为python列表，所有数据
    get()：将查询到的第一个结果转换为str类型
    ::text :提取出文字
"""


def replace_question_mark(strings):
    """Return *strings* with every ASCII question mark ('?') removed.

    Input that contains no '?' is handed back unchanged.
    """
    return strings.replace('?', '') if '?' in strings else strings


"""
https://www.bigee.cc/s?q=%E6%B7%B1%E7%A9%BA%E5%BD%BC%E5%B2%B8
https://www.bigee.cc/s?q=%E4%BB%99%E9%80%86
"""


def getData(html):
    """Extract the search results (title, author, book id) from a response.

    The three fields are pulled out of the response text with regular
    expressions. Titles and authors are JSON string bodies (typically
    ``\\uXXXX`` escapes), so they are decoded with ``json.loads``. That also
    handles surrogate pairs, ``\\/`` and text that already contains raw
    non-ASCII characters, all of which a ``unicode_escape`` decode garbles.

    Args:
        html: Text of the search response.

    Returns:
        A list of dicts with the keys "书名" (title), "作者" (author) and
        "书ID" (book id) when titles were found; the string "1" when no title
        was found; the string "获取失败" when *html* is not text or a field
        cannot be decoded.
    """
    import json  # local import: only needed to decode JSON string escapes

    try:
        # Book titles.
        names = re.findall(r'"articlename":"(.*?)","a', html)
        # Numeric id taken from the escaped "\/book\/<id>\/" path.
        book_ids = re.findall(r'url_list":"\\/book\\/(.*?)\\/","', html)
        # Author names.
        authors = re.findall(r'"author":"(.*?)","i', html)

        if not names:
            # Sentinel understood by the caller as "nothing found".
            return "1"

        return [
            {
                # Re-wrap the captured body in quotes so it is a complete
                # JSON string literal; strict=False tolerates raw control
                # characters inside it.
                "书名": json.loads(f'"{name}"', strict=False),
                "作者": json.loads(f'"{author}"', strict=False),
                "书ID": book_id,
            }
            for name, author, book_id in zip(names, authors, book_ids)
        ]
    except (TypeError, ValueError):
        # TypeError: html is not a string. ValueError: malformed escape
        # sequence (json.JSONDecodeError is a ValueError subclass).
        return "获取失败"


def get_one_chapter(html, cookies, headers, file_path):
    """Download every chapter linked from a book index page into text files.

    Each chapter is saved as ``<title>.txt`` (UTF-8) inside *file_path*. A
    chapter whose page cannot be fetched, or that has no title element, is
    reported and skipped so one bad page does not abort the whole download.

    Args:
        html: Text of the book's index page.
        cookies: Cookies sent with every chapter request.
        headers: HTTP headers sent with every chapter request.
        file_path: Directory that receives the chapter files. It is created
            when missing; a trailing separator is optional.
    """
    import os  # local import: used only for path handling in this function

    # Grab the contents of every <dd> element, then the chapter links in them.
    dd_blocks = re.findall(r'<dd>(.*?)</dd>', html, re.DOTALL)
    chapter_paths = re.findall(r'href ="(.*?)">', ''.join(dd_blocks))

    if file_path:
        os.makedirs(file_path, exist_ok=True)

    for chapter_path in chapter_paths:
        book_url = f'https://www.bigee.cc{chapter_path}'
        # get_Html applies the timeout and status check and returns a
        # sentinel instead of raising.
        page = get_Html(book_url, cookies, headers)
        if page == "请求失败":
            print(f"{book_url} 爬取失败")
            continue

        selector = parsel.Selector(page)
        raw_title = selector.css('.content h1::text').get()  # first match or None
        if raw_title is None:
            # Without a title there is no file name to write to.
            print(f"{book_url} 爬取失败")
            continue
        title = replace_question_mark(raw_title)

        # All text nodes below #chaptercontent, one per line.
        book_content = '\n'.join(selector.css('#chaptercontent ::text').getall())

        # mode='w' overwrites an existing file of the same name; the with
        # block closes the file, so no explicit close() is needed.
        target = os.path.join(file_path, f'{title}.txt')
        with open(target, mode='w', encoding='utf-8') as f:
            f.write(book_content)
        print(f"{title} 爬取成功")


def main():
    """Run the interactive search-and-download loop.

    Repeatedly asks for a novel name, prints the matching books as a table,
    asks which row to download and where to store it, then downloads every
    chapter of that book. Entering "0" as the novel name ends the loop.
    Invalid input and failed requests are reported, and the loop returns to
    the search prompt instead of crashing.
    """
    from urllib.parse import quote  # local import: encodes the search keyword

    cookies = {
        'Hm_lvt_985c57aa6304c183e46daae6878b243b': '1718978766,1719023702',
        'hm': 'bb293a4c202fd7e635bdf44e601d3c27',
        'Hm_lpvt_985c57aa6304c183e46daae6878b243b': '1719025617',
    }

    # Cookies travel through the separate `cookies` dict, not a raw header.
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'zh-CN,zh;q=0.9',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'priority': 'u=0, i',
        'referer': 'https://www.bigee.cc/s?q=%E6%B7%B1%E7%A9%BA%E5%BD%BC%E5%B2%B8',
        'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    }

    while True:
        book_name = input("输入你想爬取的小说(输入0即可退出):")
        if book_name == "0":
            break

        # Percent-encode the keyword so characters such as '&', '#' or spaces
        # cannot break the query string.
        book_name_url = f"https://www.bigee.cc/user/search.html?q={quote(book_name)}"
        html_s = get_Html(book_name_url, cookies, headers)
        books = getData(html_s)

        # getData returns a string sentinel ("1" / "获取失败") instead of a
        # list when nothing usable was found.
        if not isinstance(books, list) or not books:
            print("没有找到你想搜索的内容 / 没有这本书的数据 ... ...")
            continue

        # A DataFrame prints the results as an aligned, row-numbered table.
        print(pd.DataFrame(books))

        try:
            key_num = int(input("输入小说序号："))
            book_id = books[key_num]['书ID']
        except (ValueError, IndexError):
            print("序号无效，请重新搜索")
            continue

        # Index page of the chosen book, e.g. https://www.bigee.cc/book/941/
        chapter_url = f"https://www.bigee.cc/book/{book_id}/"
        html_c = get_Html(chapter_url, cookies, headers)
        if html_c == "请求失败":
            print("章节目录请求失败")
            continue

        file_path = input("输入你想存储的位置(如：E:/Hui/爬虫/自修案例/新：笔趣阁小说爬取/)：")
        get_one_chapter(html_c, cookies, headers, file_path)


# Start the interactive crawler only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()