Python 爬虫：爬取小说

python 复制代码
# 导入BeautifulSoup
from bs4 import BeautifulSoup as bf
from fastapi import FastAPI,Form,File
import time
import random
import requests
import traceback

# FastAPI application object; only its metadata (title, description, version)
# is set here.
# NOTE(review): no routes are registered on `app` in the visible code and
# start() does not use it -- confirm whether it is still needed.
app = FastAPI(
    title='爬虫',
    description='regex web: https://regexr-cn.com/  \n  eg : <a href="https://www.zbytb.com/s-zb-.*?</a>  \n eg : <a href="[./].*?</a>',
    version='1.0.0')

# Pool of request-header dicts, one per User-Agent string; getHeader() picks
# a random entry from this list.
headers = [
    {"User-Agent": agent}
    for agent in (
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11",
        "Opera/9.25 (Windows NT 5.1; U; en)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12",
        "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9",
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    )
]

# Candidate proxies that getProxy() chooses from; the list is empty in this file.
proxys = []

def wait(seconds=0.2):
    """Block the calling thread for a short, configurable delay.

    Args:
        seconds: How long to sleep, in seconds. Defaults to 0.2, the value
            that used to be hard-coded, so a bare ``wait()`` behaves as before.
    """
    time.sleep(seconds)

def getHeader():
    """Return one entry of the module-level ``headers`` list, chosen at random."""
    picked = random.choice(headers)
    return picked

def getProxy():
    """Return a random entry from the module-level ``proxys`` list.

    Returns:
        One element of ``proxys``, or ``None`` when the list is empty.
    """
    # random.choice() raises IndexError on an empty sequence, so an empty
    # pool is reported as None ("no proxy configured") instead of crashing.
    if not proxys:
        return None
    return random.choice(proxys)

def parseUrl(url):
    """Strip a single leading ``./`` from a URL or path string.

    Args:
        url: The URL or path to normalise.

    Returns:
        ``url`` without its leading ``./``; the string is returned unchanged
        when it does not start with ``./``.
    """
    # Slice the prefix off rather than using str.replace(): replace() would
    # also delete every later "./", including the tail of "../" segments.
    if url.startswith('./'):
        return url[2:]
    return url

def _fetch_paragraph_texts(url, container_id, timeout=10):
    """Download ``url`` and return the text of every <p> inside the <div> whose id is ``container_id``."""
    response = requests.get(url, headers=getHeader(), timeout=timeout)
    # Let requests guess the charset from the body before decoding the text.
    response.encoding = response.apparent_encoding
    page = bf(response.text, 'html.parser')
    # [0] raises IndexError when the container is missing; start() reports it.
    container = page.find_all('div', id=container_id)[0]
    return [p.text for p in container.find_all('p')]


def start():
    """Scrape the novel's chapter index and append every chapter to a text file.

    Fetches the index page, takes every <a> inside the <div id="content_1">,
    and for each one writes the link text as a title line followed by the
    paragraphs of the chapter page and of its ``_2.html`` second page.
    Output is appended to ``C://Users//admin//Desktop//777.txt``.

    Any exception stops the run; its traceback is printed and the function
    returns normally. The output file is opened with ``with`` so it is closed
    on every path, and it is only opened after the index has been fetched and
    parsed, so an early failure no longer hits an unbound file handle.
    """
    request_timeout = 10  # seconds; without it a stalled server blocks forever
    try:
        list_html = requests.get('https://www.xjwxsw.com/xsmulu/27614204/', headers=getHeader(), timeout=request_timeout)
        list_html.encoding = list_html.apparent_encoding
        list_obj = bf(list_html.text, 'html.parser')
        atags = list_obj.find_all('div', id='content_1')[0].find_all('a')
        with open('C://Users//admin//Desktop//777.txt', "a", encoding='utf-8') as f:
            for atag in atags:
                title = atag.text
                print(title)
                f.write(title)
                f.write("\n")
                href1 = 'https://www.xjwxsw.com'+atag.get('href')
                # Second page of a chapter: "<name>.html" -> "<name>_2.html".
                href2 = href1.split('.html')[0]+'_2.html'

                for page_url in (href1, href2):
                    for text in _fetch_paragraph_texts(page_url, 'booktxt', request_timeout):
                        f.write(text)
                        f.write("\n")
    except Exception:
        # Top-level boundary of the script: report the failure and stop.
        traceback.print_exc()
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == '__main__':
    start()
python 复制代码
# 导入BeautifulSoup
from bs4 import BeautifulSoup as bf
from fastapi import FastAPI,Form,File
import time
import random
import requests
import traceback

# FastAPI application object; only its metadata (title, description, version)
# is set here.
# NOTE(review): no routes are registered on `app` in the visible code and
# start() does not use it -- confirm whether it is still needed.
app = FastAPI(
    title='爬虫',
    description='regex web: https://regexr-cn.com/  \n  eg : <a href="https://www.zbytb.com/s-zb-.*?</a>  \n eg : <a href="[./].*?</a>',
    version='1.0.0')

# Pool of request-header dicts, one per User-Agent string; getHeader() picks
# a random entry from this list.
headers = [
    {"User-Agent": agent}
    for agent in (
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
        "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11",
        "Opera/9.25 (Windows NT 5.1; U; en)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12",
        "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9",
        "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
        "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 ",
    )
]

# Candidate proxies that getProxy() chooses from; the list is empty in this file.
proxys = []

def wait(seconds=0.2):
    """Block the calling thread for a short, configurable delay.

    Args:
        seconds: How long to sleep, in seconds. Defaults to 0.2, the value
            that used to be hard-coded, so a bare ``wait()`` behaves as before.
    """
    time.sleep(seconds)

def getHeader():
    """Return one entry of the module-level ``headers`` list, chosen at random."""
    picked = random.choice(headers)
    return picked

def getProxy():
    """Return a random entry from the module-level ``proxys`` list.

    Returns:
        One element of ``proxys``, or ``None`` when the list is empty.
    """
    # random.choice() raises IndexError on an empty sequence, so an empty
    # pool is reported as None ("no proxy configured") instead of crashing.
    if not proxys:
        return None
    return random.choice(proxys)

def parseUrl(url):
    """Strip a single leading ``./`` from a URL or path string.

    Args:
        url: The URL or path to normalise.

    Returns:
        ``url`` without its leading ``./``; the string is returned unchanged
        when it does not start with ``./``.
    """
    # Slice the prefix off rather than using str.replace(): replace() would
    # also delete every later "./", including the tail of "../" segments.
    if url.startswith('./'):
        return url[2:]
    return url

def _fetch_paragraph_texts(url, container_id, timeout=10):
    """Download ``url`` and return the text of every <p> inside the <div> whose id is ``container_id``."""
    response = requests.get(url, headers=getHeader(), timeout=timeout)
    # Let requests guess the charset from the body before decoding the text.
    response.encoding = response.apparent_encoding
    page = bf(response.text, 'html.parser')
    # [0] raises IndexError when the container is missing; start() reports it.
    container = page.find_all('div', id=container_id)[0]
    return [p.text for p in container.find_all('p')]


def start():
    """Scrape the novel's chapter list and append every chapter to a text file.

    Fetches the book page, takes every <a> inside the <ul id="chapterList">,
    and for each one writes the link text as a title line followed by the
    paragraphs found in the chapter page's <div id="TextContent">.
    Output is appended to ``C://Users//admin//Desktop//123.txt``.

    Any exception stops the run; its traceback is printed and the function
    returns normally. The output file is opened with ``with`` so it is closed
    on every path, and it is only opened after the chapter list has been
    fetched and parsed, so an early failure no longer hits an unbound file
    handle.
    """
    request_timeout = 10  # seconds; without it a stalled server blocks forever
    try:
        list_html = requests.get('https://www.uuks5.com/book/766295/', headers=getHeader(), timeout=request_timeout)
        list_html.encoding = list_html.apparent_encoding
        list_obj = bf(list_html.text, 'html.parser')
        atags = list_obj.find_all('ul', id='chapterList')[0].find_all('a')
        with open('C://Users//admin//Desktop//123.txt', "a", encoding='utf-8') as f:
            for atag in atags:
                title = atag.text
                print(title)
                f.write(title)
                f.write("\n")
                href1 = 'https://www.uuks5.com/'+atag.get('href')

                for text in _fetch_paragraph_texts(href1, 'TextContent', request_timeout):
                    f.write(text)
                    f.write("\n")
    except Exception:
        # Top-level boundary of the script: report the failure and stop.
        traceback.print_exc()
# Run the scrape only when this file is executed as a script, not on import.
if __name__ == '__main__':
    start()
相关推荐
小白学大数据14 小时前
Selenium+Python 爬虫:动态加载头条问答爬取
爬虫·python·selenium
搂着猫睡的小鱼鱼16 小时前
从选型到落地:京东评论爬虫开发历程(含反爬应对与经验总结)
爬虫
祭曦念16 小时前
越权漏洞的克星!用爬虫自动化检测平行越权/垂直越权漏洞
爬虫·安全·自动化
qq_2837200518 小时前
Python 爬虫实战:从入门到精通,爬取某站数据
爬虫·逆向·反爬虫
深蓝电商API18 小时前
反爬虫对抗策略在海淘场景的应用
爬虫·海淘·反爬
tang7778919 小时前
小红书平台用什么代理IP?数据采集IP封禁解决方法
数据库·爬虫·python·网络协议·ip
亿牛云爬虫专家19 小时前
学术文献爬虫 OOM 崩溃与 403 风暴
爬虫·rust·爬虫代理·403·oom killer·学术文献·403 forbidden
嫂子的姐夫1 天前
33-补环境介绍
爬虫·js逆向·逆向
轩轩分享AI1 天前
DeepSeek、Kimi、笔灵谁最好用?5款网文作者亲测的AI写作神器横评
人工智能·ai·ai写作·小说写作·小说·小说干货
ZC跨境爬虫1 天前
Python异步IO详解:原理、应用场景与实战指南(高并发爬虫首选)
爬虫·python·算法·自动化