豆瓣图片外链目前无法直接在前端 <img> 标签中使用:即使加上 referrerpolicy="no-referrer"(例如 <img :src="item.cover" referrerpolicy="no-referrer" rel="v:image" alt=""/>),这种方式也是无效的。因此需要用下面的脚本把封面图片下载到本地,并把本地文件名写回数据库的 image 字段。
python
import random
import pymysql
import requests
import os
import time
import logging
from urllib.parse import urlparse
from datetime import datetime
# Logging setup: records at INFO and above are written both to a UTF-8
# encoded log file in the working directory and to the console stream.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[
        logging.FileHandler('image_download.log', encoding='utf-8'),
        logging.StreamHandler(),
    ],
)
class DoubanImageDownloader:
    """
    Download movie cover images to a local directory and store the local
    file name in the ``image`` column of the ``tb_sys_movie`` table.

    Rows are selected when ``image`` is empty and ``cover`` holds a URL.
    Requests are sent with a ``movie.douban.com`` Referer and a browser
    User-Agent; a request without Referer is the last fallback.
    """

    # Anything smaller than this many bytes is treated as an invalid image.
    _MIN_IMAGE_BYTES = 1024

    # Browser User-Agent strings; the first one is the session default and
    # the whole tuple is the pool used by rotate_user_agent().
    _USER_AGENTS = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
    )

    def __init__(self, db_config, image_save_path='./public'):
        """
        Initialise the downloader.

        :param db_config: keyword arguments passed to ``pymysql.connect``
        :param image_save_path: local directory images are saved into
            (created if missing)
        """
        self.db_config = db_config
        self.image_save_path = image_save_path
        self.conn = None
        self.cursor = None
        # Make sure the target directory exists before any download starts.
        os.makedirs(self.image_save_path, exist_ok=True)
        # One shared session with browser-like default headers.
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': self._USER_AGENTS[0],
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
        })

    def connect_db(self):
        """
        Open the database connection and a dict cursor.

        :raises Exception: re-raises whatever ``pymysql.connect`` raised,
            after logging it
        """
        try:
            self.conn = pymysql.connect(**self.db_config)
            self.cursor = self.conn.cursor(pymysql.cursors.DictCursor)
            logging.info("数据库连接成功")
        except Exception as e:
            logging.error(f"数据库连接失败: {e}")
            raise

    def close_db(self):
        """Close cursor and connection (if open) and forget both handles."""
        try:
            if self.cursor:
                self.cursor.close()
        finally:
            # Close the connection even when closing the cursor failed, and
            # reset both attributes so a second call is a harmless no-op.
            try:
                if self.conn:
                    self.conn.close()
            finally:
                self.cursor = None
                self.conn = None
        logging.info("数据库连接已关闭")

    def get_empty_image_records(self, limit=20, exclude_ids=None):
        """
        Fetch rows whose ``image`` is empty and whose ``cover`` is set.

        :param limit: maximum number of rows to return
        :param exclude_ids: optional collection of row ids to leave out
            (used to skip rows that already failed in the current run);
            every id becomes one bound parameter of a ``NOT IN`` clause
        :return: list of row dicts with ``id``, ``name`` and ``cover``;
            an empty list when nothing matches or the query failed
        """
        try:
            sql = """
                SELECT id, name, cover
                FROM tb_sys_movie
                WHERE (image IS NULL OR image = '')
                  AND cover IS NOT NULL
                  AND cover != ''
            """
            params = []
            if exclude_ids:
                # Only "%s" placeholders are interpolated into the SQL text;
                # the id values themselves are bound by the driver.
                placeholders = ', '.join(['%s'] * len(exclude_ids))
                sql += f" AND id NOT IN ({placeholders})"
                params.extend(exclude_ids)
            sql += " LIMIT %s"
            params.append(limit)
            self.cursor.execute(sql, tuple(params))
            records = self.cursor.fetchall()
            logging.info(f"获取到{len(records)}条image为空的记录")
            return records
        except Exception as e:
            # Best effort: callers treat an empty list as "nothing to do".
            logging.error(f"查询数据库失败: {e}")
            return []

    @staticmethod
    def _local_filename(image_url, movie_id):
        """Build ``<movie_id>_<basename of the URL path>``, defaulting the extension to .webp."""
        filename = os.path.basename(urlparse(image_url).path)
        if '.' not in filename:
            filename = f"{filename}.webp"
        return f"{movie_id}_{filename}"

    @staticmethod
    def _candidate_urls(image_url):
        """Return the original URL followed by its .jpg/.webp twin and the ``/m/`` path variant."""
        urls_to_try = [image_url]
        lowered = image_url.lower()
        if lowered.endswith('.jpg'):
            webp_url = image_url[:-4] + '.webp'
            urls_to_try.append(webp_url)
            logging.info(f"添加备用URL: {webp_url}")
        elif lowered.endswith('.webp'):
            jpg_url = image_url[:-5] + '.jpg'
            urls_to_try.append(jpg_url)
            logging.info(f"添加备用URL: {jpg_url}")
        if '/s_ratio_poster/' in image_url:
            high_quality_url = image_url.replace('/s_ratio_poster/', '/m/')
            if high_quality_url not in urls_to_try:
                urls_to_try.append(high_quality_url)
                logging.info(f"添加高质量备用URL: {high_quality_url}")
        return urls_to_try

    def _save_stream(self, response, local_path):
        """Stream the body to ``<local_path>.part`` and move it into place only if it is large enough; return the byte count."""
        tmp_path = f"{local_path}.part"
        try:
            with open(tmp_path, 'wb') as fh:
                for chunk in response.iter_content(chunk_size=8192):
                    fh.write(chunk)
            size = os.path.getsize(tmp_path)
            if size >= self._MIN_IMAGE_BYTES:
                # The final name only ever appears for a complete file, so
                # the "already exists" shortcut cannot pick up a partial one.
                os.replace(tmp_path, local_path)
            return size
        finally:
            # Left over when the stream broke or the file was too small.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

    def _fetch_to_file(self, url, headers, local_path, timeout, require_image=False):
        """Issue one GET and try to store the body; return ``(status_code, saved_size or None)``."""
        response = self.session.get(url, headers=headers, timeout=timeout, stream=True)
        try:
            status = response.status_code
            if status != 200:
                return status, None
            content_type = response.headers.get('content-type', '').lower()
            if 'image' not in content_type:
                if require_image:
                    return status, None
                # A 200 with a non-image type may be an anti-hotlink page;
                # reject it outright only when it is declared to be tiny.
                content_length = response.headers.get('content-length')
                if (content_length and content_length.isdigit()
                        and int(content_length) < self._MIN_IMAGE_BYTES):
                    logging.warning(f"返回内容太小({content_length} bytes),可能不是图片: {content_type}")
                    return status, None
                logging.warning(f"返回内容类型不是图片但继续尝试: {content_type}")
            size = self._save_stream(response, local_path)
            if size < self._MIN_IMAGE_BYTES:
                logging.warning(f"下载的文件太小({size} bytes),删除重试")
                return status, None
            return status, size
        finally:
            # Streamed responses hold their connection until closed.
            response.close()

    def download_image(self, image_url, movie_id, movie_name):
        """
        Download one image, trying alternative URLs and header sets.

        Order of attempts: every candidate URL with a Douban Referer (a 403
        triggers one retry of the same URL with a different User-Agent),
        then the original URL once more without Referer.

        :param image_url: image URL
        :param movie_id: movie id, used as prefix of the local file name
        :param movie_name: movie name, used in log messages only
        :return: local file name (relative to the save directory), or
            ``None`` when every attempt failed
        """
        try:
            local_filename = self._local_filename(image_url, movie_id)
            local_path = os.path.join(self.image_save_path, local_filename)
            # Already downloaded earlier: nothing to fetch.
            if os.path.exists(local_path):
                logging.info(f"图片已存在: {local_filename}")
                return local_filename

            urls_to_try = self._candidate_urls(image_url)
            total = len(urls_to_try)
            # Referer is what the anti-hotlinking check looks at.
            base_headers = {
                'Referer': 'https://movie.douban.com/',
                'User-Agent': self.session.headers['User-Agent'],
            }
            for attempt, current_url in enumerate(urls_to_try, 1):
                logging.info(
                    f"尝试 {attempt}/{total}: 下载图片 {movie_name} (ID: {movie_id}) - {current_url}")
                try:
                    status, size = self._fetch_to_file(
                        current_url, base_headers, local_path, timeout=15)
                    if status == 403:
                        logging.warning("访问被拒绝(403),尝试更换User-Agent...")
                        self.rotate_user_agent()
                        base_headers['User-Agent'] = self.session.headers['User-Agent']
                        # Retry the same URL once with the new User-Agent.
                        status, size = self._fetch_to_file(
                            current_url, base_headers, local_path, timeout=15)
                except requests.exceptions.RequestException as e:
                    logging.warning(f"请求失败: {e}")
                    status, size = None, None

                if size is not None:
                    logging.info(f"图片下载成功: {local_filename} ({size} bytes)")
                    return local_filename
                if status == 404:
                    logging.warning(f"图片不存在(404): {current_url}")
                elif status is not None and status != 200:
                    logging.warning(f"下载失败,状态码: {status}")
                # Short pause between attempts.
                if attempt < total:
                    time.sleep(1)

            # Last resort: the original URL without a Referer header.
            logging.info("所有带Referer的尝试失败,尝试无Referer请求...")
            try:
                no_referer_headers = {
                    'User-Agent': self.session.headers['User-Agent'],
                }
                _, size = self._fetch_to_file(
                    image_url, no_referer_headers, local_path,
                    timeout=10, require_image=True)
                if size is not None:
                    logging.info(f"无Referer方式下载成功: {local_filename}")
                    return local_filename
            except Exception as e:
                logging.warning(f"无Referer尝试也失败: {e}")
            logging.error(f"所有尝试都失败,无法下载图片: {image_url}")
            return None
        except Exception as e:
            # Boundary catch: callers rely on None meaning "failed".
            logging.error(f"下载图片时发生错误: {e}")
            return None

    def rotate_user_agent(self):
        """Switch the session to a randomly chosen User-Agent that differs from the current one."""
        current = self.session.headers.get('User-Agent')
        choices = [ua for ua in self._USER_AGENTS if ua != current]
        new_ua = random.choice(choices)
        self.session.headers.update({'User-Agent': new_ua})
        logging.info(f"更换User-Agent: {new_ua[:50]}...")

    def update_image_field(self, movie_id, local_image_path):
        """
        Write the local image name into the ``image`` column of one row.

        :param movie_id: id of the row to update
        :param local_image_path: value to store
        :return: ``True`` when the update was committed, else ``False``
        """
        try:
            sql = "UPDATE tb_sys_movie SET image = %s WHERE id = %s"
            self.cursor.execute(sql, (local_image_path, movie_id))
            self.conn.commit()
            logging.info(f"数据库更新成功: ID {movie_id} -> {local_image_path}")
            return True
        except Exception as e:
            logging.error(f"数据库更新失败: {e}")
            try:
                self.conn.rollback()
            except Exception as rollback_error:
                # A dead connection must not abort the rest of the batch.
                logging.error(f"数据库回滚失败: {rollback_error}")
            return False

    def process_movies(self, limit=20, delay=2, exclude_ids=None):
        """
        Download and record images for one batch of rows.

        :param limit: batch size
        :param delay: seconds to sleep after each row
        :param exclude_ids: optional ids to skip when selecting the batch
        :return: list of row ids whose download or database update failed
        """
        failed_ids = []
        try:
            records = self.get_empty_image_records(limit, exclude_ids=exclude_ids)
            if not records:
                logging.info("没有需要处理的记录")
                return failed_ids
            success_count = 0
            for record in records:
                movie_id = record['id']
                movie_name = record['name']
                cover_url = record['cover']
                logging.info(f"开始处理: {movie_name} (ID: {movie_id})")
                local_path = self.download_image(cover_url, movie_id, movie_name)
                if local_path and self.update_image_field(movie_id, local_path):
                    success_count += 1
                else:
                    failed_ids.append(movie_id)
                # Throttle requests to reduce the risk of being blocked.
                time.sleep(delay)
            logging.info(f"处理完成! 成功: {success_count}, 失败: {len(failed_ids)}")
        except Exception as e:
            logging.error(f"处理过程中发生错误: {e}")
        return failed_ids

    def run(self, batch_size=20, delay=2, max_batches=None):
        """
        Connect, process batches until nothing processable is left, disconnect.

        Rows that fail are remembered for the rest of this call and excluded
        from later batches; otherwise the same failing rows would be selected
        again forever when ``max_batches`` is ``None``.

        :param batch_size: rows per batch
        :param delay: seconds between rows; three times this between batches
        :param max_batches: maximum number of batches, ``None`` for no limit
        """
        self.connect_db()
        failed_ids = set()
        try:
            batch_count = 0
            while max_batches is None or batch_count < max_batches:
                batch_count += 1
                logging.info(f"开始第 {batch_count} 批处理")
                batch_failed = self.process_movies(
                    limit=batch_size, delay=delay, exclude_ids=failed_ids)
                failed_ids.update(batch_failed or ())
                # Anything left that has not already failed in this run?
                remaining = self.get_empty_image_records(1, exclude_ids=failed_ids)
                if not remaining:
                    logging.info("所有数据已处理完成")
                    break
                # Pause between batches.
                if max_batches is None or batch_count < max_batches:
                    wait_time = delay * 3
                    logging.info(f"等待 {wait_time} 秒后开始下一批...")
                    time.sleep(wait_time)
        finally:
            if failed_ids:
                logging.warning(f"本次运行有 {len(failed_ids)} 条记录处理失败,已跳过")
            self.close_db()
def main():
    """
    Build the database configuration and run the downloader over all rows.

    Every setting can be supplied through an environment variable so that
    credentials do not have to be edited into this file; the literal
    defaults below are placeholders and are used when a variable is unset:

    ``DOUBAN_DB_HOST``, ``DOUBAN_DB_PORT``, ``DOUBAN_DB_USER``,
    ``DOUBAN_DB_PASSWORD``, ``DOUBAN_DB_NAME``, ``DOUBAN_IMAGE_DIR``.

    :raises ValueError: if ``DOUBAN_DB_PORT`` is set but is not an integer
    """
    # Database configuration (replace the defaults or set the variables).
    db_config = {
        'host': os.environ.get('DOUBAN_DB_HOST', 'localhost'),
        'port': int(os.environ.get('DOUBAN_DB_PORT', '3306')),
        'user': os.environ.get('DOUBAN_DB_USER', 'root'),  # database user name
        'password': os.environ.get('DOUBAN_DB_PASSWORD', '123456'),  # database password
        'database': os.environ.get('DOUBAN_DB_NAME', 'XXXX'),  # database name
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor,
    }
    # Directory the images are saved into; adjust to the project layout
    # (for a Vue project this is usually somewhere under ``public``).
    image_save_path = os.environ.get('DOUBAN_IMAGE_DIR', 'D://myfilemapping/XXXX/file/')

    downloader = DoubanImageDownloader(db_config, image_save_path)

    # For a first trial run, call run() with a small batch_size (e.g. 5),
    # a longer delay (e.g. 3 seconds) and max_batches=1 instead.
    # The call below processes every pending row.
    downloader.run(batch_size=20, delay=2, max_batches=None)
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()