一、Overview
This post collects the crawler examples I wrote earlier so they are easy to look up; the code has been bundled into functions and the individual functions are not explained in detail.
二、Code
import time
from concurrent.futures import ThreadPoolExecutor
import requests
import re
import csv
from bs4 import BeautifulSoup
import os
from lxml import etree
def top250():
    # re example 1
for a in range(0,250,25):
url="https://movie.douban.com/top250?start={0}".format(a)
#print(url)
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
}
        resp = requests.get(url, headers=headers)  # GET with a UA header to get past the basic anti-crawler check
page_content=resp.text
obj=re.compile(r'<li>.*?<em class="">(?P<ranking>.*?)</em>'
r'.*?<span class="title">(?P<name>.*?)</span>.*?'
r'.*?<br>(?P<year>.*?) .*?'
r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
r'.*?<span>(?P<number>.*?)</span>'
,re.S)
result= obj.finditer(page_content)
f=open("DoubanTop250.csv",mode="a")
csvwriter=csv.writer(f)
for i in result:
# print("排名:{0}; 电影名:{1}; 年份:{2}; 评分:{3} ".format(
# i.group("ranking"),
# i.group("name"),
# i.group("year").strip(),
# i.group("score"))
# )
dic=i.groupdict()
dic['year']=dic['year'].strip()
csvwriter.writerow(dic.values())
f.close()
        time.sleep(2)  # delay between pages to avoid the anti-crawler check, otherwise the IP gets banned
        print("Collected {0} records".format(a+25))
def MovieDownload():
    # re example 2
domain ="https://dytt89.com/"
    resp = requests.get(domain, verify=False)  # verify=False skips SSL certificate verification for this site
    resp.encoding = 'gbk'  # GBK encoding (Chinese national standard)
#print(resp.text)
f = open("Dytt2022新片精品电影下载地址.csv", mode="a")
csvwriter = csv.writer(f)
obj1=re.compile(r'2022新片精品.*?'
r'<ul>(?P<ul>.*?)</ul>',re.S)
obj2=re.compile(r'''<li><a href='(?P<href>.*?)' title="''',re.S)
obj3=re.compile(r'<div class="title_all"><h1>(?P<movie>.*?)</h1></div>.*?'
r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf">'
r'<a href="(?P<download>.*?)">',re.S)
child_href_list=[]
result1=obj1.finditer(resp.text)
    for i in result1:  # layer 1: grab the relevant <ul> block from the home page
ul=i.group('ul')
#print(ul)
#time.sleep(1)
result2=obj2.finditer(ul)
        for j in result2:  # layer 2: extract the link to each detail page
#time.sleep(1)
            child_href = domain + j.group('href').strip("/")  # join the domain with the child page's specific part
child_href_list.append(child_href)
k=0
for href in child_href_list:
child_resp=requests.get(href,verify=False)
child_resp.encoding='gbk'
        result3 = obj3.search(child_resp.text)  # pull the title and download link we need from the detail page
#print(result3.group('movie'))
#print(result3.group('download'))
dic = result3.groupdict()
csvwriter.writerow(dic.values())
        k = k + 1  # counter
        print("Collected {0} movies so far".format(k))
f.close()
def VegetableValue():
    # POST example
url = "http://www.xinfadi.com.cn/getPriceData.html"
    # the page source itself contains no data, so open the dev tools (F12) / packet capture and use the URL that actually returns the data
head = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
}
f = open("菜价.csv", mode="a")
csvwriter = csv.writer(f)
    data = {'limit': 3}  # number of records to request
    # difference between GET and POST for this endpoint:
    # GET only returns whatever the current page happens to expose
    # POST lets the request body (data) control how many records come back (a small sketch after this function varies the limit)
#resp = requests.get(url, headers=head).json()
resp = requests.post(url, headers=head, data=data).json()
lis = resp.get('list')
for i in lis:
name = i.get("prodName")
low_price = i.get("lowPrice")
high_price = i.get("highPrice")
average_price = i.get("avgPrice")
producing_area = i.get("place")
unit = i.get("unitInfo")
date = i.get("pubDate")
csvwriter.writerow([name,low_price,high_price,average_price,producing_area,unit,date])
f.close()
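# The comments above note that the POST body controls how many records come back; a tiny
# sketch (never called above) that just varies 'limit' and prints how many rows the endpoint
# returned, assuming the endpoint honors larger limit values.
def count_price_rows(limit=20):
    url = "http://www.xinfadi.com.cn/getPriceData.html"
    resp = requests.post(url, data={"limit": limit}).json()
    rows = resp.get("list") or []
    print(f"asked for {limit}, got {len(rows)} rows")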
def CatchPicture(url):
    # bs4 example
url_download = "https://pic.netbian.com/"
resp = requests.get(url)
resp.encoding="gbk"
#print(resp.text)
main_page=BeautifulSoup(resp.text,"html.parser")
alist=main_page.find("div",class_="slist").find_all("a")
#print(alist)
for a in alist:
#print(a.get('href'))
        href = url_download + a.get('href').strip("/")  # join the domain with the child page's specific part
#print(href)
child_page_resp=requests.get(href)
child_page_resp.encoding='gbk'
child_page_text=child_page_resp.text
child_page=BeautifulSoup(child_page_text,"html.parser")
        img = child_page.find("div", class_="photo").find("img")
        img_name = img.get("title")
#print(img.get("src"))
src=url_download+img.get("src").strip("/")
#print(src)
#print(img_name)
img_resp=requests.get(src)
        img = img_resp.content  # raw bytes of the image
with open("img2/"+img_name+".jpeg",mode="wb")as f:
f.write(img)
        print(img_name + " downloaded!")
#break
        time.sleep(0.5)  # necessary delay to avoid anti-crawler blocking
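# CatchPicture writes into "img2/..." and assumes the folder already exists. A small sketch of
# a hypothetical helper (not called above) that creates the output folders first, using the os
# module already imported at the top, so the open(..., "wb") calls do not fail on a fresh machine.
def ensure_output_dirs():
    for folder in ("img2", "videos", "novels"):
        os.makedirs(folder, exist_ok=True)  # no error if the folder is already there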
def Xpath():
    # for some reason the live site kept returning empty lists here, while a locally written html file is easy to query (see the etree.HTML sketch after this function)
tree = etree.parse('file:///C:/Users/86183/Desktop/1.html')
    r1 = tree.xpath('/html/body/div[2]/p')  # walk straight down the node tree from the root
# /html/body/div[2]/p[1]
for div in r1:
# /html/body/div[2]/p[1]
a = div.xpath('./text()')
print(a)
    # basic format for calling xpath from the browser console: $x("xpath expression"); a valid expression returns matches, an invalid one returns nothing
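# One possible reason the live site returned empty lists in Xpath(): etree.parse() uses a strict
# XML parser by default, and real web pages are rarely well-formed XML. A sketch of the usual
# alternative for fetched pages, building the tree with etree.HTML() instead; the URL and the
# xpath expression here are placeholders for illustration only.
def xpath_from_response():
    resp = requests.get("https://example.com/")
    tree = etree.HTML(resp.text)            # lenient HTML parser instead of strict XML
    titles = tree.xpath("//title/text()")   # relative search from anywhere in the tree
    print(titles)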
def Video():
    # bypassing pearvideo's Referer hotlink protection
    url = "https://www.pearvideo.com/video_1733893"  # page of the video to fetch
    contId = url.split("_")[1]  # gives 1733893
resp = requests.get(url)
resp.encoding="utf-8"
#print(resp.text)
main_page = BeautifulSoup(resp.text,"html.parser")
title = main_page.find("div",class_="box-left clear-mar").find("h1").text
#print(title)
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
        # Referer anti-hotlinking: the server traces where the request came from; the access order has to be 1->2->3,
        # so a Referer header is added to simulate that; a direct 1->3 request is rejected
        ,"Referer": url
}
    video_status = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}&mrd=0.5623242368176109"
    resp = requests.get(video_status, headers=header)
#print(resp.text)
dic = resp.json()
#print(dic)
srcUrl = dic["videoInfo"]["videos"]['srcUrl']
    systemTime = dic['systemTime']  # e.g. systemTime: 1660186591481
    # fake: https://video.pearvideo.com/mp4/adshort/20210701/1660186531481-15708981_adpkg-ad_hd.mp4
    # real: https://video.pearvideo.com/mp4/adshort/20210701/cont-1733893-15708981_adpkg-ad_hd.mp4
    srcUrl_true = srcUrl.replace(systemTime, f"cont-{contId}")  # compare the two links and swap the useless timestamp part of the fake one for the real prefix
#print(srcUrl_true)
with open("videos/"+title+".mp4",mode= "wb")as f:
f.write(requests.get(srcUrl_true).content)
    print(title + " downloaded!")
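# Video() reads the whole file into memory via .content before writing; for bigger files a
# streamed download is gentler. A sketch using requests' stream mode (save_stream is a
# hypothetical helper, not called anywhere above).
def save_stream(src_url, path, headers=None):
    with requests.get(src_url, headers=headers, stream=True) as resp:
        with open(path, mode="wb") as f:
            for chunk in resp.iter_content(chunk_size=1024 * 64):
                if chunk:
                    f.write(chunk)  # write the file piece by piece instead of all at once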
def aiodownload(cid,title,book):
url = f"https://www.23qb.com/book/{cid}.html"
page = 1
with open(f"novels/{book}.txt", mode="a+") as f:
f.write("\n")
f.write("\n"+title+"\n")
f.write("\n")
        while True:
            resp = requests.get(url).text
            page_thing = BeautifulSoup(resp.replace('\ufffd', ''), "html.parser")
            lists = page_thing.find_all("div", class_="read-content")
            has_next = False
            for texts in lists:
                text = texts.find_all('p')
                del text[-1]  # drop the trailing site-notice paragraph
                # "(继续下一页)" marks a chapter that continues on another page
                has_next = (text[-1].string == "(继续下一页)")
                if has_next:
                    del text[-1]
                    page = page + 1
                    url = f"https://www.23qb.com/book/{cid}_{page}.html"
                for line in text:
                    txt = line.string
                    try:
                        f.write(txt + "\n")
                    except Exception:
                        f.write("!!!!!!!!" + "\n")  # placeholder for lines that cannot be written
            if not has_next:
                break  # no "(继续下一页)" marker, so this chapter is finished
    print(title + " downloaded")
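# Despite its name, aiodownload() above is plain synchronous requests code. If the chapter pages
# were fetched asynchronously instead, the usual shape with aiohttp looks roughly like the sketch
# below; aiohttp is an extra dependency and fetch_chapter_html() is a hypothetical helper, and the
# parsing/writing logic from aiodownload() would still be needed on top of it.
async def fetch_chapter_html(cid):
    import aiohttp  # imported here so the rest of the script runs even without aiohttp installed
    url = f"https://www.23qb.com/book/{cid}.html"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()  # raw chapter HTML, still to be parsed as in aiodownload()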
def getCatalog(url):
resp = requests.get(url)
#print(resp.text)
obj1 = re.compile(r'<meta property="og:novel:book_name" content="(?P<book>.*?)"/>.*?'
r'<ul class="chaw_c" id="chapterList">(?P<url>.*?)</ul>',re.S)
obj2 = re.compile(r'<li><a href="/book/(?P<c_id>.*?).html">(?P<name>.*?).</a></li>',re.S)
main_page = resp.text
result = obj1.finditer(main_page)
for i in result:
ul = i.group('url')
book = i.group("book")
#print(ul)
result2= obj2.finditer(ul)
for ii in result2:
cid=ii.group("c_id")
title = ii.group("name")
aiodownload(cid,title,book)
    print(book + " download complete!")
if __name__ == '__main__':
#top250()
#MovieDownload()
#VegetableValue()
    # CatchPicture main section
'''
start_time = time.time()
    with ThreadPoolExecutor(10) as t:  # only 10 threads are opened here; this could be more
for i in range (2,119):
t.submit(CatchPicture,f"https://pic.netbian.com/4kdongman/index_{i}.html")
            time.sleep(1)  # with multiple threads keep a delay so the anti-crawler does not ban the IP (this site has already banned four of my IPs)
            print(f"Page {i} downloaded")  # if the console only ever prints this line, the IP has been blocked by the site's anti-crawler
    print("All downloads finished")
end_time = time.time()
    print('Total time:', round(end_time - start_time, 2), 'seconds')
'''
# Xpath()
    #Video()
    # novel download
start_time = time.time()
b_id = "116418" #input("输入你想下载的书的id:")#"60218","27309","4286","719","189697"
url =f"https://www.23qb.com/book/{b_id}/"
getCatalog(url)
end_time = time.time()
print("下载时间为:", round(end_time - start_time, 2), '秒')