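# Web-paste residue removed here: the code-fence language tag ("python") and
# the page's "copy code" button label. Neither was part of the program.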
"""
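Multithreaded image scraper for Duitang search results.

Endpoint: https://www.duitang.com/napi/blogv2/list/by_search/?

Duitang page analysis:
Results are loaded via Ajax. The `after_id` parameter controls which images
and which page are loaded; it starts at zero, and the Ajax data for the next
page is prefetched in advance.
The images on the first page cover after_id 0 through 120, and the next page
is prefetched with after_id 124.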
"""
import time
from urllib.parse import urlencode
import requests
import re
from threading import Thread
from queue import Queue
import json
import os
class ThreadFetchUrl(Thread):
    """Worker thread that fetches result pages and queues image records.

    Each worker repeatedly takes a URL from ``url_queue``, requests it,
    parses the JSON response and puts one ``(photo_id, photo_url)`` tuple on
    ``img_data_queue`` for every entry in ``data.object_list``.
    """

    # Seconds to wait on the server before a request is abandoned, so a
    # stalled connection cannot hang the worker forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, url_queue, img_data_queue, headers):
        """Store the shared queues and the HTTP headers.

        Args:
            url_queue: Queue of result-page URLs to fetch.
            img_data_queue: Queue that receives ``(id, path)`` tuples.
            headers: HTTP headers passed to ``requests.get``.
        """
        super().__init__()
        self.url_queue = url_queue
        self.headers = headers
        self.img_data_queue = img_data_queue

    @staticmethod
    def _extract_photos(payload):
        """Return ``[(id, path), ...]`` from a decoded response.

        Args:
            payload: Decoded JSON object containing ``data.object_list``,
                where every entry has a ``photo`` dict with ``id`` and
                ``path`` keys.

        Raises:
            KeyError, TypeError: If the payload does not have that shape.
        """
        return [
            (entry['photo']['id'], entry['photo']['path'])
            for entry in payload['data']['object_list']
        ]

    def run(self):
        """Drain ``url_queue``, pushing image records to ``img_data_queue``.

        A page that cannot be fetched or parsed is reported and skipped so
        the worker keeps processing the remaining URLs.
        """
        from queue import Empty

        while True:
            # get_nowait() instead of "if not empty(): get()": with several
            # workers, another thread may take the last URL between the
            # check and the get(), which would block this thread forever.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break

            try:
                body = requests.get(
                    url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                ).text
                photos = self._extract_photos(json.loads(body))
            except (requests.RequestException, ValueError,
                    KeyError, TypeError) as exc:
                # Network failure, invalid JSON (ValueError) or an
                # unexpected response shape: skip this page only.
                print(f'请求失败,已跳过: {url} ({exc})')
                continue

            for photo in photos:
                self.img_data_queue.put(photo)

        print('url_queue已空,线程结束')
class ThreadSaveImg(Thread):
    """Worker thread that downloads queued images and writes them to disk.

    Each worker takes ``(id, href)`` tuples from ``img_data_queue``,
    downloads ``href`` and stores the bytes in ``path`` as
    ``<id>.<extension>``. The worker stops once the queue has stayed empty
    for ``timeout`` seconds.
    """

    # Seconds to wait on the server before a download is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self, img_data_queue, path, timeout=3):
        """Store the image queue, target directory and idle timeout.

        Args:
            img_data_queue: Queue yielding ``(id, href)`` tuples.
            path: Directory the images are written to.
            timeout: Seconds to wait for a new queue item before the worker
                stops. ``None`` waits indefinitely.
        """
        super().__init__()
        self.path = path
        self.img_data_queue = img_data_queue
        self.queue_timeout = timeout

    def _target_path(self, img_id, href):
        """Return the file path for an image.

        The extension is whatever follows the last ``.`` in ``href``.
        ``os.path.join`` is used so the result is correct whether or not
        ``self.path`` ends with a separator.
        """
        extension = href.split('.')[-1]
        return os.path.join(self.path, f'{img_id}.{extension}')

    def run(self):
        """Download and save images until the queue stays empty.

        A failed download or write is reported and skipped so a single bad
        image does not terminate the worker.
        """
        from queue import Empty

        while True:
            # Catch only queue.Empty: a bare except would also swallow
            # KeyboardInterrupt and hide real bugs such as a malformed item.
            try:
                img_id, href = self.img_data_queue.get(
                    timeout=self.queue_timeout
                )
            except Empty:
                print('等待超时,线程停止!')
                break

            try:
                img_data = requests.get(
                    href, timeout=self.REQUEST_TIMEOUT
                ).content
                with open(self._target_path(img_id, href), 'wb') as f:
                    f.write(img_data)
            except (requests.RequestException, OSError) as exc:
                print(f'图片{img_id},保存失败: {exc}')
                continue

            print(f'图片{img_id},保存成功!')
class ImageDuitang(ThreadFetchUrl, ThreadSaveImg):
    """Interactive driver: builds the URL queue and starts the workers.

    NOTE(review): ``__init__`` never calls ``super().__init__()``, so the
    inherited ``Thread`` state is uninitialised and this object must not be
    started as a thread. None of this class's own methods use inherited
    behaviour; the base classes are kept only so the class hierarchy stays
    unchanged for existing callers.
    """

    # ``after_id`` advances in steps of 24, and 5 such steps are requested
    # for every page the user asks for.
    AFTER_ID_STEP = 24
    REQUESTS_PER_PAGE = 5

    # Number of worker threads started by main().
    FETCH_THREADS = 10
    SAVE_THREADS = 30

    def __init__(self):
        """Set up the endpoint, headers, shared queues and output directory."""
        self.url_prefix = 'https://www.duitang.com/napi/blogv2/list/by_search/?'
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 Edg/116.0.1938.69",
        }
        self.url_queue = Queue()
        self.img_data_queue = Queue()
        self.path = './duitang1/'
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.path, exist_ok=True)

    def urlenqueue(self, page_num, kwd):
        """Queue the search URLs for the first ``page_num`` pages.

        Args:
            page_num: Number of pages to cover; each page contributes
                ``REQUESTS_PER_PAGE`` URLs.
            kwd: Search keyword sent as the ``kw`` parameter.
        """
        last_offset = self.AFTER_ID_STEP * self.REQUESTS_PER_PAGE * page_num
        for offset in range(0, last_offset, self.AFTER_ID_STEP):
            params = {
                'kw': str(kwd),
                'after_id': offset,  # urlencode() converts it to text
                'type': 'feed',
                'include_fields': 'top_comments,is_root, source_link, item, buyable, root_id, status, like_count, like_id, sender, album, reply_count, favorite_blog_id',
                '_type': '',
            }
            self.url_queue.put(self.url_prefix + urlencode(params))

    def main(self):
        """Prompt for keyword and page count, then start all workers.

        The worker threads are started but not joined, so this method
        returns while downloads may still be in progress.
        """
        kwd = input('请输入数据关键字:')
        page_num = int(input('请输入要抓取前几页:'))
        self.urlenqueue(page_num, kwd)
        for _ in range(self.FETCH_THREADS):
            fetcher = ThreadFetchUrl(
                self.url_queue, self.img_data_queue, self.headers
            )
            fetcher.start()
        for _ in range(self.SAVE_THREADS):
            saver = ThreadSaveImg(self.img_data_queue, self.path)
            saver.start()
if __name__ == '__main__':
    # Run the interactive scraper. main() only starts the worker threads, so
    # the message below is printed while downloads may still be running.
    scraper = ImageDuitang()
    scraper.main()
    print('\n&&&&&&&&&&主线程已结束&&&&&&&&&&\n')