批量爬取百度图片关键词搜索得到的图片

将搜索关键字写到keywords.txt中，设置从哪行开始搜索爬取
调整尺寸，做到统一
脚本而已，用的BaiduSpider
python 复制代码
from PIL import Image
import os
import requests
from baiduspider import BaiduSpider
from requests.exceptions import Timeout
import time
# 定义起始行索引    
start_query_index = 16

# 读取关键词列表，从指定行开始
with open('ketwords.txt', 'r', encoding='utf-8') as file:
    for _ in range(start_query_index - 1):
        next(file)
    queries = file.read().splitlines()

save_dir = "downloaded_images"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# 循环处理每一个查询词
for query_index, query in enumerate(queries, start=start_query_index):
    print(f"Processing query {query_index}: {query}")
    results = BaiduSpider().search_pic(query=query)
    
    query_save_dir = os.path.join(save_dir, str(query_index))
    if not os.path.exists(query_save_dir):
        os.makedirs(query_save_dir)
    
    # 对于每个查询，重置文件计数器
    file_counter = 1
    
    for image_index, result in enumerate(results):
        url = result.url
        print(f"Downloading image {image_index + 1} for query {query_index}: {url}")
        try:
            response = requests.get(url, stream=True, timeout=5, verify=False)  # 注意这里仍然使用verify=False
            if response.status_code == 200:
                # 使用固定的临时文件名
                temp_file_path = os.path.join(query_save_dir, 'temp.jpg')
                
                with open(temp_file_path, 'wb') as file:
                    for chunk in response.iter_content(1024):
                        file.write(chunk)
                
                print(f"Image downloaded temporarily.")
                
                # 加载图片并调整尺寸
                img = Image.open(temp_file_path)
                img_resized = img.resize((1920, 1080), Image.Resampling.LANCZOS)  # 使用Image.Resampling.LANCZOS
                
                # 生成最终文件名
                file_name = f'image_{file_counter}.jpg'
                final_file_path = os.path.join(query_save_dir, file_name)
                
                # 保存调整尺寸后的图片到最终文件名
                img_resized.save(final_file_path)
                
                # 清理临时文件
                os.remove(temp_file_path)
                
                print(f"Image {file_counter} processed and saved successfully.")
                
                file_counter += 1  # 文件计数器递增
            else:
                print(f"Failed to download image {image_index + 1} for query {query_index}. Status code: {response.status_code}")
        except Timeout:
            print(f"Request timed out after 5 seconds for image {image_index + 1} of query {query_index}. Skipping...")
        except Exception as e:
            print(f"An error occurred while downloading image {image_index + 1} for query {query_index}: {str(e)}")
    
    time.sleep(60)