- 将搜索关键字写入 keywords.txt(每行一个关键字),并通过 start_query_index 设置从第几行开始搜索爬取
- 将下载的图片统一调整为 1920×1080 的尺寸
- 这只是一个简单的脚本,基于 BaiduSpider 实现
python
from PIL import Image
import os
import requests
from baiduspider import BaiduSpider
from requests.exceptions import Timeout
import time
# 1-based line number in the keywords file at which crawling starts.
start_query_index = 16
# Root output directory; every query gets a sub-directory named after its
# line number in the keywords file.
save_dir = "downloaded_images"

# The usage notes ask for "keywords.txt"; earlier revisions of this script
# opened the misspelled "ketwords.txt", which is still accepted as a fallback.
KEYWORDS_FILE = 'keywords.txt'
LEGACY_KEYWORDS_FILE = 'ketwords.txt'
# Every saved image is resized to exactly this (width, height).
TARGET_SIZE = (1920, 1080)
# Seconds passed to requests as the per-request timeout.
REQUEST_TIMEOUT = 5
DOWNLOAD_CHUNK_SIZE = 1024
# Pause between two consecutive queries, in seconds.
QUERY_PAUSE_SECONDS = 60
# Pixel modes kept as-is when saving JPEG; any other mode is converted to RGB.
JPEG_MODES = ('L', 'RGB', 'CMYK')


def load_queries(path, start_index):
    """Read search keywords from ``path``, starting at line ``start_index``.

    Args:
        path: Text file holding one keyword per line (UTF-8, optional BOM).
        start_index: 1-based number of the first line to use.

    Returns:
        A list of ``(line_number, keyword)`` tuples. Blank lines are skipped
        but still counted, so ``line_number`` always matches the file. The
        list is empty when the file has fewer than ``start_index`` lines.

    Raises:
        OSError: If ``path`` cannot be opened.
    """
    # 'utf-8-sig' decodes plain UTF-8 unchanged and drops a leading BOM, which
    # would otherwise become part of the first keyword.
    with open(path, 'r', encoding='utf-8-sig') as keywords_file:
        lines = keywords_file.read().splitlines()
    skipped = max(start_index - 1, 0)
    numbered = enumerate(lines[skipped:], start=start_index)
    return [(number, line.strip()) for number, line in numbered if line.strip()]


def fetch_and_resize(url, temp_path, final_path):
    """Download ``url``, resize it to ``TARGET_SIZE`` and save it.

    The raw download is written to ``temp_path`` first; that file is removed
    again before returning, whether or not the image could be processed.

    Args:
        url: Address of the image to download.
        temp_path: Scratch file used for the raw download.
        final_path: Destination of the resized image.

    Returns:
        The HTTP status code of the response. ``final_path`` is written only
        when the status is 200.

    Raises:
        requests.exceptions.Timeout: If the request times out.
        Exception: Any other download, decode or save error is propagated.
    """
    try:
        # NOTE(security): verify=False disables TLS certificate validation.
        # Kept because the original script opted into it deliberately.
        with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT,
                          verify=False) as response:
            if response.status_code != 200:
                return response.status_code
            with open(temp_path, 'wb') as temp_file:
                for chunk in response.iter_content(DOWNLOAD_CHUNK_SIZE):
                    temp_file.write(chunk)
        print("Image downloaded temporarily.")

        with Image.open(temp_path) as source:
            # JPEG cannot store alpha or palette data; saving such an image
            # directly raises OSError, so normalise to RGB first.
            if source.mode in JPEG_MODES:
                image = source
            else:
                image = source.convert('RGB')
            resized = image.resize(TARGET_SIZE, Image.Resampling.LANCZOS)
        resized.save(final_path)
        return 200
    finally:
        # Never leave a (possibly partial) download behind.
        if os.path.exists(temp_path):
            os.remove(temp_path)


def process_query(spider, query_index, query):
    """Search images for one keyword and store them under ``save_dir``.

    Images are saved as ``<save_dir>/<query_index>/image_<n>.jpg`` where ``n``
    counts only the images that were stored successfully. A failure on one
    image is reported and the loop moves on to the next result.

    Args:
        spider: Object exposing ``search_pic(query=...)`` (a ``BaiduSpider``).
        query_index: Line number of the keyword; used as directory name.
        query: The search keyword.
    """
    print(f"Processing query {query_index}: {query}")
    results = spider.search_pic(query=query)

    query_save_dir = os.path.join(save_dir, str(query_index))
    os.makedirs(query_save_dir, exist_ok=True)
    # One fixed scratch file per query directory.
    temp_file_path = os.path.join(query_save_dir, 'temp.jpg')

    # Numbering restarts at 1 for every query.
    file_counter = 1
    for image_number, result in enumerate(results, start=1):
        url = result.url
        print(f"Downloading image {image_number} for query {query_index}: {url}")
        final_file_path = os.path.join(query_save_dir, f'image_{file_counter}.jpg')
        try:
            status_code = fetch_and_resize(url, temp_file_path, final_file_path)
        except Timeout:
            print(f"Request timed out after 5 seconds for image {image_number} of query {query_index}. Skipping...")
        except Exception as e:
            # Best effort: one broken image must not abort the whole crawl.
            print(f"An error occurred while downloading image {image_number} for query {query_index}: {str(e)}")
        else:
            if status_code == 200:
                print(f"Image {file_counter} processed and saved successfully.")
                file_counter += 1
            else:
                print(f"Failed to download image {image_number} for query {query_index}. Status code: {status_code}")


def main():
    """Crawl images for every keyword from ``start_query_index`` onward."""
    keywords_path = KEYWORDS_FILE
    if not os.path.exists(keywords_path) and os.path.exists(LEGACY_KEYWORDS_FILE):
        keywords_path = LEGACY_KEYWORDS_FILE
    queries = load_queries(keywords_path, start_query_index)

    os.makedirs(save_dir, exist_ok=True)

    spider = BaiduSpider()
    for position, (query_index, query) in enumerate(queries):
        # NOTE(review): the original sleep(60) is assumed to be a per-query
        # pause (its indentation was lost) -- confirm. It is applied between
        # queries only, so the script does not idle after the last one.
        if position:
            time.sleep(QUERY_PAUSE_SECONDS)
        process_query(spider, query_index, query)


if __name__ == "__main__":
    main()