Baidu image search pages are loaded dynamically (via AJAX): the initial page contains only the first batch of images, and further batches are fetched from a backend API as you scroll or page through the results. The crawler therefore constructs that API URL directly and sends GET requests to obtain the response data.
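Before writing the full crawler, it is worth probing the endpoint once to confirm the response shape. The minimal sketch below assumes only what the script further down relies on: the `acjson` interface returns JSON whose `data` array holds one dict per image, with thumbnails under `thumbURL`. This is observed behavior, not a documented API, so parameter and field names may change; the keyword is an arbitrary example.

```python
import json
import requests

# Minimal probe of the acjson endpoint. pn = start index, rn = page size;
# word/queryWord carry the search term (parameter names as observed, not documented).
params = {
    "tn": "resultjson_com",
    "ipn": "rj",
    "word": "风景",       # example keyword ("landscape")
    "queryWord": "风景",
    "ie": "utf-8",
    "fp": "result",
    "pn": 0,
    "rn": 30,
}
headers = {"User-Agent": "Mozilla/5.0"}
resp = requests.get("https://image.baidu.com/search/acjson",
                    params=params, headers=headers)
# Strip a possible BOM before parsing, as in the full script below
data = json.loads(resp.text.replace("\ufeff", ""))
print([item.get("thumbURL") for item in data.get("data", []) if item])
```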
```python
import requests
import os
import time
import json
import random
import urllib.parse
def crawl_baidu_images(keyword, num_images=50):
    # Create the save directory
    save_dir = f"./{keyword}_images"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # URL-encode the keyword
    encoded_keyword = urllib.parse.quote(keyword)

    # Headers that disguise the request as a browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36 Edg/140.0.0.0",
    }

    downloaded = 0
    pn = 0   # start index of the current page
    rn = 30  # number of results per page
    # Reuse one session so cookies from the warm-up request persist
    session = requests.Session()
    session.headers.update(headers)
    session.get("https://image.baidu.com")  # warm-up request to pick up cookies

    while downloaded < num_images:
        # Build the API URL (pn/rn control pagination; the trailing
        # millisecond timestamp acts as a cache-buster)
        url = (
            f"https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
            f"&ct=201326592&is=&fp=result&queryWord={encoded_keyword}&cl=2&lm=-1"
            f"&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=0&hd=&latest=&copyright="
            f"&word={encoded_keyword}&s=&se=&tab=&width=&height=&face=0&istype=2"
            f"&qc=&nc=1&fr=&expermode=&nojc=&pn={pn}&rn={rn}&gsm=1e"
            f"&{int(time.time() * 1000)}="
        )
        # Baidu also exposes a shorter, newer variant of this endpoint, e.g.:
        # https://image.baidu.com/search/acjson?tn=resultjson_com&word=...&ie=utf-8&fp=result&pn=30&rn=30&nojc=0&gsm=1e&newReq=1
        try:
            response = session.get(url)
            # Parse the JSON (a BOM sometimes has to be stripped first)
            text = response.text.replace("\ufeff", "")
            data = json.loads(text)
            if "data" not in data:
                print("No more data available.")
                break
            for item in data["data"]:
                if downloaded >= num_images:
                    break
                # The last entry is often an empty dict; skip anything
                # without a thumbnail URL
                img_url = item.get("thumbURL")
                if not img_url:
                    continue
                # Download the image
                try:
                    img_resp = session.get(img_url, timeout=10)
                    img_resp.raise_for_status()
                    file_path = os.path.join(save_dir, f"image_{downloaded + 1}.jpg")
                    with open(file_path, "wb") as f:
                        f.write(img_resp.content)
                    print(f"Downloaded: {file_path}")
                    downloaded += 1
                except Exception as e:
                    print(f"Failed to download {img_url}: {e}")
        except Exception as e:
            print(f"Request failed: {e}")

        # Next page
        pn += rn
        time.sleep(1 + random.uniform(0, 1))  # random delay to avoid getting blocked

    print(f"Total downloaded: {downloaded}")
# Example call
if __name__ == "__main__":
    keyword = input("Enter keyword: ")  # e.g. "美女"
    crawl_baidu_images(keyword, num_images=100)
```
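One weakness of the script above is that it saves whatever bytes the thumbnail server returns, even when that is an HTML error page rather than an image. A small hedged refinement, assuming the server sets a standard `Content-Type` header (which is not guaranteed), is to check it before writing the file; `save_if_image` below is a hypothetical helper that could replace the inner download block:

```python
import requests

def save_if_image(session: requests.Session, img_url: str, file_path: str) -> bool:
    """Download img_url and save it only if the response looks like an image.

    Hypothetical helper; assumes the server sets a standard Content-Type
    header. Returns True when a file was written.
    """
    resp = session.get(img_url, timeout=10)
    resp.raise_for_status()
    if not resp.headers.get("Content-Type", "").startswith("image/"):
        return False
    with open(file_path, "wb") as f:
        f.write(resp.content)
    return True
```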