一、Overview
This post collects the crawler examples I wrote earlier so they are easy to look up; the code has been bundled into functions and the individual functions are not explained in detail.
二、Code
import time
from concurrent.futures import ThreadPoolExecutor
import requests
import re
import csv
from bs4 import BeautifulSoup
import os
from lxml import etree
def top250():
    # re example 1
for a in range(0,250,25):
url="https://movie.douban.com/top250?start={0}".format(a)
#print(url)
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
}
        resp = requests.get(url, headers=headers)  # GET with a UA header to get past the basic anti-crawler check
page_content=resp.text
obj=re.compile(r'<li>.*?<em class="">(?P<ranking>.*?)</em>'
r'.*?<span class="title">(?P<name>.*?)</span>.*?'
r'.*?<br>(?P<year>.*?) .*?'
r'.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>'
r'.*?<span>(?P<number>.*?)</span>'
,re.S)
result= obj.finditer(page_content)
f=open("DoubanTop250.csv",mode="a")
csvwriter=csv.writer(f)
for i in result:
# print("排名:{0}; 电影名:{1}; 年份:{2}; 评分:{3} ".format(
# i.group("ranking"),
# i.group("name"),
# i.group("year").strip(),
# i.group("score"))
# )
dic=i.groupdict()
dic['year']=dic['year'].strip()
csvwriter.writerow(dic.values())
f.close()
        time.sleep(2)  # delay between pages to avoid the anti-crawler check, otherwise the IP gets banned
        print("Collected {0} records".format(a+25))
def MovieDownload():
    # re example 2
domain ="https://dytt89.com/"
    resp = requests.get(domain, verify=False)  # verify=False skips SSL certificate verification for this site
    resp.encoding = 'gbk'  # GBK encoding (Chinese national standard)
#print(resp.text)
f = open("Dytt2022新片精品电影下载地址.csv", mode="a")
csvwriter = csv.writer(f)
obj1=re.compile(r'2022新片精品.*?'
r'<ul>(?P<ul>.*?)</ul>',re.S)
obj2=re.compile(r'''<li><a href='(?P<href>.*?)' title="''',re.S)
obj3=re.compile(r'<div class="title_all"><h1>(?P<movie>.*?)</h1></div>.*?'
r'<td style="WORD-WRAP: break-word" bgcolor="#fdfddf">'
r'<a href="(?P<download>.*?)">',re.S)
child_href_list=[]
result1=obj1.finditer(resp.text)
    for i in result1:  # layer 1: grab the relevant <ul> block from the home page
ul=i.group('ul')
#print(ul)
#time.sleep(1)
result2=obj2.finditer(ul)
        for j in result2:  # layer 2: extract the link to each detail page
#time.sleep(1)
            child_href = domain + j.group('href').strip("/")  # join the domain with the child page's specific part
child_href_list.append(child_href)
k=0
for href in child_href_list:
child_resp=requests.get(href,verify=False)
child_resp.encoding='gbk'
        result3 = obj3.search(child_resp.text)  # pull the title and download link we need from the detail page
#print(result3.group('movie'))
#print(result3.group('download'))
dic = result3.groupdict()
csvwriter.writerow(dic.values())
        k = k + 1  # counter
        print("Collected {0} movies so far".format(k))
f.close()
def VegetableValue():
    # POST example
url = "http://www.xinfadi.com.cn/getPriceData.html"
    # the page source itself contains no data, so open the dev tools (F12) / packet capture and use the URL that actually returns the data
head = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
}
f = open("菜价.csv", mode="a")
csvwriter = csv.writer(f)
    data = {'limit': 3}  # number of records to request
    # difference between GET and POST for this endpoint:
    # GET only returns whatever the current page happens to expose
    # POST lets the request body (data) control how many records come back (a small sketch after this function varies the limit)
#resp = requests.get(url, headers=head).json()
resp = requests.post(url, headers=head, data=data).json()
lis = resp.get('list')
for i in lis:
name = i.get("prodName")
low_price = i.get("lowPrice")
high_price = i.get("highPrice")
average_price = i.get("avgPrice")
producing_area = i.get("place")
unit = i.get("unitInfo")
date = i.get("pubDate")
csvwriter.writerow([name,low_price,high_price,average_price,producing_area,unit,date])
f.close()
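# The comments above note that the POST body controls how many records come back; a tiny
# sketch (never called above) that just varies 'limit' and prints how many rows the endpoint
# returned, assuming the endpoint honors larger limit values.
def count_price_rows(limit=20):
    url = "http://www.xinfadi.com.cn/getPriceData.html"
    resp = requests.post(url, data={"limit": limit}).json()
    rows = resp.get("list") or []
    print(f"asked for {limit}, got {len(rows)} rows")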
def CatchPicture(url):
    # bs4 example
url_download = "https://pic.netbian.com/"
resp = requests.get(url)
resp.encoding="gbk"
#print(resp.text)
main_page=BeautifulSoup(resp.text,"html.parser")
alist=main_page.find("div",class_="slist").find_all("a")
#print(alist)
for a in alist:
#print(a.get('href'))
        href = url_download + a.get('href').strip("/")  # join the domain with the child page's specific part
#print(href)
child_page_resp=requests.get(href)
child_page_resp.encoding='gbk'
child_page_text=child_page_resp.text
child_page=BeautifulSoup(child_page_text,"html.parser")
        img = child_page.find("div", class_="photo").find("img")
        img_name = img.get("title")
#print(img.get("src"))
src=url_download+img.get("src").strip("/")
#print(src)
#print(img_name)
img_resp=requests.get(src)
        img = img_resp.content  # raw bytes of the image
with open("img2/"+img_name+".jpeg",mode="wb")as f:
f.write(img)
        print(img_name + " downloaded!")
#break
        time.sleep(0.5)  # necessary delay to avoid anti-crawler blocking
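# CatchPicture writes into "img2/..." and assumes the folder already exists. A small sketch of
# a hypothetical helper (not called above) that creates the output folders first, using the os
# module already imported at the top, so the open(..., "wb") calls do not fail on a fresh machine.
def ensure_output_dirs():
    for folder in ("img2", "videos", "novels"):
        os.makedirs(folder, exist_ok=True)  # no error if the folder is already there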
def Xpath():
    # for some reason the live site kept returning empty lists here, while a locally written html file is easy to query (see the etree.HTML sketch after this function)
tree = etree.parse('file:///C:/Users/86183/Desktop/1.html')
    r1 = tree.xpath('/html/body/div[2]/p')  # walk straight down the node tree from the root
# /html/body/div[2]/p[1]
for div in r1:
# /html/body/div[2]/p[1]
a = div.xpath('./text()')
print(a)
    # basic format for calling xpath from the browser console: $x("xpath expression"); a valid expression returns matches, an invalid one returns nothing
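# One possible reason the live site returned empty lists in Xpath(): etree.parse() uses a strict
# XML parser by default, and real web pages are rarely well-formed XML. A sketch of the usual
# alternative for fetched pages, building the tree with etree.HTML() instead; the URL and the
# xpath expression here are placeholders for illustration only.
def xpath_from_response():
    resp = requests.get("https://example.com/")
    tree = etree.HTML(resp.text)            # lenient HTML parser instead of strict XML
    titles = tree.xpath("//title/text()")   # relative search from anywhere in the tree
    print(titles)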
def Video():
    # bypassing pearvideo's Referer hotlink protection
    url = "https://www.pearvideo.com/video_1733893"  # page of the video to fetch
    contId = url.split("_")[1]  # gives 1733893
resp = requests.get(url)
resp.encoding="utf-8"
#print(resp.text)
main_page = BeautifulSoup(resp.text,"html.parser")
title = main_page.find("div",class_="box-left clear-mar").find("h1").text
#print(title)
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 SLBrowser/8.0.0.3161 SLBChan/25"
        # Referer anti-hotlinking: the server traces where the request came from; the access order has to be 1->2->3,
        # so a Referer header is added to simulate that; a direct 1->3 request is rejected
        ,"Referer": url
}
    video_status = f"https://www.pearvideo.com/videoStatus.jsp?contId={contId}&mrd=0.5623242368176109"
    resp = requests.get(video_status, headers=header)
#print(resp.text)
dic = resp.json()
#print(dic)
srcUrl = dic["videoInfo"]["videos"]['srcUrl']
    systemTime = dic['systemTime']  # e.g. systemTime: 1660186591481
    # fake: https://video.pearvideo.com/mp4/adshort/20210701/1660186531481-15708981_adpkg-ad_hd.mp4
    # real: https://video.pearvideo.com/mp4/adshort/20210701/cont-1733893-15708981_adpkg-ad_hd.mp4
    srcUrl_true = srcUrl.replace(systemTime, f"cont-{contId}")  # compare the two links and swap the useless timestamp part of the fake one for the real prefix
#print(srcUrl_true)
with open("videos/"+title+".mp4",mode= "wb")as f:
f.write(requests.get(srcUrl_true).content)
    print(title + " downloaded!")
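# Video() reads the whole file into memory via .content before writing; for bigger files a
# streamed download is gentler. A sketch using requests' stream mode (save_stream is a
# hypothetical helper, not called anywhere above).
def save_stream(src_url, path, headers=None):
    with requests.get(src_url, headers=headers, stream=True) as resp:
        with open(path, mode="wb") as f:
            for chunk in resp.iter_content(chunk_size=1024 * 64):
                if chunk:
                    f.write(chunk)  # write the file piece by piece instead of all at once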
def aiodownload(cid,title,book):
url = f"https://www.23qb.com/book/{cid}.html"
page = 1
with open(f"novels/{book}.txt", mode="a+") as f:
f.write("\n")
f.write("\n"+title+"\n")
f.write("\n")
        while True:
            resp = requests.get(url).text
            page_thing = BeautifulSoup(resp.replace('\ufffd', ''), "html.parser")
            lists = page_thing.find_all("div", class_="read-content")
            has_next = False
            for texts in lists:
                text = texts.find_all('p')
                del text[-1]  # drop the trailing site-notice paragraph
                # "(继续下一页)" marks a chapter that continues on another page
                has_next = (text[-1].string == "(继续下一页)")
                if has_next:
                    del text[-1]
                    page = page + 1
                    url = f"https://www.23qb.com/book/{cid}_{page}.html"
                for line in text:
                    txt = line.string
                    try:
                        f.write(txt + "\n")
                    except Exception:
                        f.write("!!!!!!!!" + "\n")  # placeholder for lines that cannot be written
            if not has_next:
                break  # no "(继续下一页)" marker, so this chapter is finished
    print(title + " downloaded")
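# Despite its name, aiodownload() above is plain synchronous requests code. If the chapter pages
# were fetched asynchronously instead, the usual shape with aiohttp looks roughly like the sketch
# below; aiohttp is an extra dependency and fetch_chapter_html() is a hypothetical helper, and the
# parsing/writing logic from aiodownload() would still be needed on top of it.
async def fetch_chapter_html(cid):
    import aiohttp  # imported here so the rest of the script runs even without aiohttp installed
    url = f"https://www.23qb.com/book/{cid}.html"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return await resp.text()  # raw chapter HTML, still to be parsed as in aiodownload()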
def getCatalog(url):
resp = requests.get(url)
#print(resp.text)
obj1 = re.compile(r'<meta property="og:novel:book_name" content="(?P<book>.*?)"/>.*?'
r'<ul class="chaw_c" id="chapterList">(?P<url>.*?)</ul>',re.S)
obj2 = re.compile(r'<li><a href="/book/(?P<c_id>.*?).html">(?P<name>.*?).</a></li>',re.S)
main_page = resp.text
result = obj1.finditer(main_page)
for i in result:
ul = i.group('url')
book = i.group("book")
#print(ul)
result2= obj2.finditer(ul)
for ii in result2:
cid=ii.group("c_id")
title = ii.group("name")
aiodownload(cid,title,book)
    print(book + " download complete!")
if __name__ == '__main__':
#top250()
#MovieDownload()
#VegetableValue()
    # CatchPicture main section
'''
start_time = time.time()
    with ThreadPoolExecutor(10) as t:  # only 10 threads are opened here; this could be more
for i in range (2,119):
t.submit(CatchPicture,f"https://pic.netbian.com/4kdongman/index_{i}.html")
            time.sleep(1)  # with multiple threads keep a delay so the anti-crawler does not ban the IP (this site has already banned four of my IPs)
            print(f"Page {i} downloaded")  # if the console only ever prints this line, the IP has been blocked by the site's anti-crawler
    print("All downloads finished")
end_time = time.time()
    print('Total time:', round(end_time - start_time, 2), 'seconds')
'''
# Xpath()
    #Video()
    # novel download
start_time = time.time()
b_id = "116418" #input("输入你想下载的书的id:")#"60218","27309","4286","719","189697"
url =f"https://www.23qb.com/book/{b_id}/"
getCatalog(url)
end_time = time.time()
print("下载时间为:", round(end_time - start_time, 2), '秒')