这是一个用于批量下载漫画图片的多线程爬虫程序,主要针对特定网站的漫画《寄宿日记》实现自动抓取。
但是因为目标网站有反爬机制,且受防火墙管制,可能需要挂上代理才能稳定访问,所以我写了代理的相关功能,不用的话把相关代码注释掉即可。
```python
# 20240425 多线程版主程序
import sys
sys.path.append('/home/aistudio/external-libraries')
import os
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm # 导入tqdm库
import threading
import concurrent.futures
from datetime import datetime
import retrying
# Downloads a single image; any request/IO failure propagates as an exception
# so the retry wrapper below can re-attempt it.
def download_image(image_url, image_path, page_number, image_filename):
    """Download one image to ``image_path``, skipping it if already present.

    Args:
        image_url: Direct URL of the image to fetch.
        image_path: Local destination path for the image file.
        page_number: Page index, used only in log output.
        image_filename: File name, used only in log output.

    Raises:
        requests.HTTPError: if the server responds with an error status.
    """
    if os.path.exists(image_path):
        print(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} 图片 {image_filename} 已存在,跳过下载。")
        return

    # verify=False: the target site's TLS chain is not trusted in this
    # environment. Add proxies=proxies here if the proxy block is enabled.
    image_response = requests.get(image_url, stream=True, headers=headers, verify=False)
    image_response.raise_for_status()

    # Stream to disk in 1 KiB chunks so large images never sit fully in memory.
    with open(image_path, 'wb') as file:
        for chunk in image_response.iter_content(1024):
            file.write(chunk)

    # BUGFIX: .replace(...) used to sit OUTSIDE the f-string braces, so it was
    # printed literally instead of being applied to the path.
    print(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} 第{page_number}页,图片 {image_filename} 已保存到 {image_path.replace('downloaded_images', '')}。")
# Retry shim: re-attempts a failed single-image download.
@retrying.retry(stop_max_attempt_number=3, wait_fixed=1000)
def download_image_with_retry(image_url, image_path, page_number, image_filename):
    """Delegate to download_image, retried up to 3 times with a 1 s pause."""
    return download_image(
        image_url=image_url,
        image_path=image_path,
        page_number=page_number,
        image_filename=image_filename,
    )
def download_images_from_page(url, page_number, output_dir):
    """Scrape one chapter page and download every matching image on it.

    Args:
        url: URL of the chapter page to scrape.
        page_number: Index of the page, used for the per-page folder name.
        output_dir: Root directory that holds the per-page subfolders.
    """
    # Fetch and parse the page (add proxies=proxies here if a proxy is needed).
    resp = requests.get(url, headers=headers, verify=False)
    resp.raise_for_status()
    dom = BeautifulSoup(resp.text, 'html.parser')

    # Only <img> tags whose id follows the 'image-<number>' pattern are wanted.
    tags = dom.find_all('img', {'id': lambda x: x and x.startswith('image-')})

    # One subfolder per page.
    page_dir = os.path.join(output_dir, f"page_{page_number}")
    os.makedirs(page_dir, exist_ok=True)

    for tag in tags:
        image_url = tag['src']
        image_id = tag['id']
        # id is assumed to look like 'image-<number>' -> '<number>_image.jpg'.
        image_filename = f"{image_id.split('-')[1]}_image.jpg"
        image_path = os.path.join(page_dir, image_filename)
        try:
            download_image_with_retry(image_url, image_path, page_number, image_filename)
        except Exception as e:
            # Log and keep going: one broken image must not abort the page.
            print(f"下载图片时发生错误: {e}")
            log_to_file(output_dir, f"下载图片时发生错误: {e}")

    stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"{stamp} 第 {page_number} 页的所有图片下载完成。")
    log_to_file(output_dir, f"第 {page_number} 页的所有图片下载完成。")
def log_to_file(output_dir, message):
    """Append a timestamped message to ``log.txt`` inside ``output_dir``.

    Args:
        output_dir: Directory containing (or that will contain) log.txt.
        message: Text to record after the timestamp.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    log_path = os.path.join(output_dir, "log.txt")
    # encoding='utf-8' explicitly: messages contain Chinese text, and the
    # platform default encoding (e.g. GBK/cp1252 on Windows) could raise
    # UnicodeEncodeError or write mojibake.
    with open(log_path, "a", encoding="utf-8") as f:
        f.write(f"{timestamp} - {message}\n")
def main():
    """Entry point: fan page downloads out across a small thread pool."""
    started = datetime.now()
    print(f"开始时间: {started.strftime('%Y-%m-%d %H:%M:%S')}")

    projectname = '寄宿日记'
    target_url_base = "https://bakamh.com/manga/%e5%af%84%e5%ae%bf%e6%97%a5%e8%ae%b0/c-"
    start_page, end_page = 1, 15
    total_pages = end_page - start_page + 1

    output_dir = os.path.join('downloaded_images', projectname)
    os.makedirs(output_dir, exist_ok=True)

    # Cap concurrency at 5 worker threads; one task per chapter page.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        pending = [
            executor.submit(
                download_images_from_page,
                f"{target_url_base}{page_number}/",
                page_number,
                output_dir,
            )
            for page_number in range(start_page, end_page + 1)
        ]

        failed_tasks = []
        with tqdm(total=total_pages, desc="下载进度") as progress:
            # Harvest results as tasks finish, recording any that failed.
            for fut in concurrent.futures.as_completed(pending):
                try:
                    fut.result()
                except Exception as e:
                    print(f"下载页面时发生错误: {e}")
                    log_to_file(output_dir, f"下载页面时发生错误: {e}")
                    failed_tasks.append(fut)
                finally:
                    progress.update(1)  # one tick per completed PAGE task

    if failed_tasks:
        print("存在失败的任务,已记录并可选择进一步处理。")
        log_to_file(output_dir, f"存在失败的任务,已记录并可选择进一步处理。")

    finished = datetime.now()
    print(f"结束时间: {finished.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"总用时: {finished - started}")
    log_to_file(output_dir, f"总用时: {finished - started}")
# HTTP headers read as a module-level global by the request helpers above.
# BUGFIX: previously defined only under the __main__ guard, so importing this
# module and calling the helpers raised NameError; defined unconditionally now.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

# Optional proxy support: uncomment and pass proxies=proxies to the
# requests.get calls above when the site must be reached through a proxy.
# proxy_ip = "117.42.94.115"
# proxy_port = 21287
# proxies = {
#     "http": f"http://{proxy_ip}:{proxy_port}",
#     # "https": f"https://{proxy_ip}:{proxy_port}",
# }

if __name__ == '__main__':
    main()