这是一个用于批量下载漫画图片的多线程爬虫程序,主要针对特定网站的漫画《寄宿日记》实现自动抓取。
但是因为目标网站有反爬机制,且受防火墙管制,可能需要挂上代理才能稳定访问,所以我写了代理的相关功能,不用的话把相关代码注释掉即可。
```python
# 20240425 多线程版主程序
import sys
sys.path.append('/home/aistudio/external-libraries')
import os
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm # 导入tqdm库
import threading
import concurrent.futures
from datetime import datetime
import retrying
# Downloads a single image; any request/IO failure propagates as an exception
# so the retry wrapper below can re-attempt it.
def download_image(image_url, image_path, page_number, image_filename):
    """Download one image to ``image_path``, skipping it if already present.

    Args:
        image_url: Direct URL of the image to fetch.
        image_path: Local destination path for the image file.
        page_number: Page index, used only in log output.
        image_filename: File name, used only in log output.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    if os.path.exists(image_path):
        print(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} 图片 {image_filename} 已存在,跳过下载。")
        return

    # verify=False: the target site's TLS chain is not trusted in this
    # environment. Add proxies=proxies here if the proxy block is enabled.
    image_response = requests.get(image_url, stream=True, headers=headers, verify=False)
    image_response.raise_for_status()

    # Stream to disk in 1 KiB chunks so large images never sit fully in memory.
    with open(image_path, 'wb') as file:
        for chunk in image_response.iter_content(1024):
            file.write(chunk)

    # BUGFIX: .replace(...) used to sit OUTSIDE the f-string braces, so it was
    # printed literally instead of being applied to the path.
    print(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} 第{page_number}页,图片 {image_filename} 已保存到 {image_path.replace('downloaded_images', '')}。")
# Retry shim: re-attempts a failed single-image download.
@retrying.retry(stop_max_attempt_number=3, wait_fixed=1000)
def download_image_with_retry(image_url, image_path, page_number, image_filename):
    """Delegate to download_image, retried up to 3 times with a 1 s pause."""
    return download_image(
        image_url=image_url,
        image_path=image_path,
        page_number=page_number,
        image_filename=image_filename,
    )
def download_images_from_page(url, page_number, output_dir):
    """Scrape one chapter page and download every matching image on it.

    Args:
        url: URL of the chapter page to scrape.
        page_number: Index of the page, used for the per-page folder name.
        output_dir: Root directory that holds the per-page subfolders.
    """
    # Fetch and parse the page (add proxies=proxies here if a proxy is needed).
    resp = requests.get(url, headers=headers, verify=False)
    resp.raise_for_status()
    dom = BeautifulSoup(resp.text, 'html.parser')

    # Only <img> tags whose id follows the 'image-<number>' pattern are wanted.
    tags = dom.find_all('img', {'id': lambda x: x and x.startswith('image-')})

    # One subfolder per page.
    page_dir = os.path.join(output_dir, f"page_{page_number}")
    os.makedirs(page_dir, exist_ok=True)

    for tag in tags:
        image_url = tag['src']
        image_id = tag['id']
        # id is assumed to look like 'image-<number>' -> '<number>_image.jpg'.
        image_filename = f"{image_id.split('-')[1]}_image.jpg"
        image_path = os.path.join(page_dir, image_filename)
        try:
            download_image_with_retry(image_url, image_path, page_number, image_filename)
        except Exception as e:
            # Log and keep going: one broken image must not abort the page.
            print(f"下载图片时发生错误: {e}")
            log_to_file(output_dir, f"下载图片时发生错误: {e}")

    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"{stamp} 第 {page_number} 页的所有图片下载完成。")
    log_to_file(output_dir, f"第 {page_number} 页的所有图片下载完成。")
def log_to_file(output_dir, message):
    """Append a timestamped message to ``log.txt`` inside ``output_dir``.

    Args:
        output_dir: Directory containing (or that will contain) log.txt.
        message: Text to record after the timestamp.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_path = os.path.join(output_dir, "log.txt")
    # encoding='utf-8' explicitly: messages contain Chinese text, and the
    # platform default encoding (e.g. GBK/cp1252 on Windows) could raise
    # UnicodeEncodeError or write mojibake.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"{timestamp} - {message}\n")
def main():
    """Entry point: fan page downloads out across a small thread pool."""
    started = datetime.now()
    print(f"开始时间: {started.strftime('%Y-%m-%d %H:%M:%S')}")

    projectname = '寄宿日记'
    target_url_base = "https://bakamh.com/manga/%e5%af%84%e5%ae%bf%e6%97%a5%e8%ae%b0/c-"
    start_page, end_page = 1, 15
    total_pages = end_page - start_page + 1

    output_dir = os.path.join('downloaded_images', projectname)
    os.makedirs(output_dir, exist_ok=True)

    # Cap concurrency at 5 worker threads; one task per chapter page.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        pending = [
            executor.submit(
                download_images_from_page,
                f"{target_url_base}{page_number}/",
                page_number,
                output_dir,
            )
            for page_number in range(start_page, end_page + 1)
        ]

        failed_tasks = []
        with tqdm(total=total_pages, desc="下载进度") as progress:
            # Harvest results as tasks finish, recording any that failed.
            for fut in concurrent.futures.as_completed(pending):
                try:
                    fut.result()
                except Exception as e:
                    print(f"下载页面时发生错误: {e}")
                    log_to_file(output_dir, f"下载页面时发生错误: {e}")
                    failed_tasks.append(fut)
                finally:
                    progress.update(1)  # one tick per completed PAGE task

    if failed_tasks:
        print("存在失败的任务,已记录并可选择进一步处理。")
        log_to_file(output_dir, f"存在失败的任务,已记录并可选择进一步处理。")

    finished = datetime.now()
    print(f"结束时间: {finished.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"总用时: {finished - started}")
    log_to_file(output_dir, f"总用时: {finished - started}")
# HTTP headers read as a module-level global by the request helpers above.
# BUGFIX: previously defined only under the __main__ guard, so importing this
# module and calling the helpers raised NameError; defined unconditionally now.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Optional proxy support: uncomment and pass proxies=proxies to the
# requests.get calls above when the site must be reached through a proxy.
# proxy_ip = "117.42.94.115"
# proxy_port = 21287
# proxies = {
#     "http": f"http://{proxy_ip}:{proxy_port}",
#     # "https": f"https://{proxy_ip}:{proxy_port}",
# }

if __name__ == '__main__':
    main()