Python Web Scraping: Downloading Douban Movie Images to Local Storage

Douban image URLs currently cannot be hotlinked directly from a frontend <img> tag; even <img :src="item.cover" referrerpolicy="no-referrer" rel="v:image" alt=""/> does not work.
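
You can verify the hotlink protection yourself before running the full downloader: the same cover URL behaves differently depending on the Referer and User-Agent sent with the request. The following is a minimal probe sketch (Python); the cover URL is a placeholder to replace with a value from your own cover field, and the exact response codes may vary by image host:

import requests

# Hypothetical Douban cover URL; substitute one taken from your own cover field.
url = 'https://img1.doubanio.com/view/photo/s_ratio_poster/public/p1234567890.jpg'

browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Referer': 'https://movie.douban.com/',
}

# Compare what the CDN returns with and without the browser-like headers.
plain = requests.get(url, timeout=10)
with_headers = requests.get(url, headers=browser_headers, timeout=10)
print(plain.status_code, plain.headers.get('content-type'))
print(with_headers.status_code, with_headers.headers.get('content-type'))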

The full downloader script is below (Python). It selects rows from tb_sys_movie whose image field is empty, downloads each cover with a Douban Referer and a browser User-Agent, and writes the local filename back to the image column:
import logging
import os
import random
import time
from urllib.parse import urlparse

import pymysql
import requests

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('image_download.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)


class DoubanImageDownloader:
    def __init__(self, db_config, image_save_path='./public'):
        """
        初始化下载器
        :param db_config: 数据库配置字典
        :param image_save_path: 图片保存的本地路径
        """
        self.db_config = db_config
        self.image_save_path = image_save_path
        self.conn = None
        self.cursor = None

        # Create the directory for saved images
        os.makedirs(self.image_save_path, exist_ok=True)

        # Create a requests session with browser-like headers
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
        })

    def connect_db(self):
        """连接数据库"""
        try:
            self.conn = pymysql.connect(**self.db_config)
            self.cursor = self.conn.cursor(pymysql.cursors.DictCursor)
            logging.info("数据库连接成功")
        except Exception as e:
            logging.error(f"数据库连接失败: {e}")
            raise

    def close_db(self):
        """关闭数据库连接"""
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()
            logging.info("数据库连接已关闭")

    def get_empty_image_records(self, limit=20):
        """
        获取image字段为空的记录
        :param limit: 每次获取的记录数
        :return: 记录列表
        """
        try:
            sql = """
                SELECT id, name, cover 
                FROM tb_sys_movie 
                WHERE (image IS NULL OR image = '') 
                AND cover IS NOT NULL 
                AND cover != ''
                LIMIT %s
            """
            self.cursor.execute(sql, (limit,))
            records = self.cursor.fetchall()
            logging.info(f"获取到{len(records)}条image为空的记录")
            return records
        except Exception as e:
            logging.error(f"查询数据库失败: {e}")
            return []

    def download_image(self, image_url, movie_id, movie_name):
        """
        下载单张图片,支持重试和多种URL格式
        :param image_url: 图片URL
        :param movie_id: 电影ID
        :param movie_name: 电影名称(用于日志)
        :return: 本地图片路径,失败返回None
        """
        try:
            # Extract the filename from the URL
            parsed_url = urlparse(image_url)
            filename = os.path.basename(parsed_url.path)

            # If the filename has no extension, append .webp
            if '.' not in filename:
                filename = f"{filename}.webp"

            # Local save path
            local_filename = f"{movie_id}_{filename}"
            local_path = os.path.join(self.image_save_path, local_filename)

            # If the file already exists, return it directly
            if os.path.exists(local_path):
                logging.info(f"Image already exists: {local_filename}")
                return local_filename

            # Build the list of URLs to try
            urls_to_try = [image_url]

            # If the URL ends with .jpg, also queue a .webp version
            if image_url.lower().endswith('.jpg'):
                webp_url = image_url[:-4] + '.webp'
                urls_to_try.append(webp_url)
                logging.info(f"Added fallback URL: {webp_url}")

            # If the URL ends with .webp, also queue a .jpg version
            elif image_url.lower().endswith('.webp'):
                jpg_url = image_url[:-5] + '.jpg'
                urls_to_try.append(jpg_url)
                logging.info(f"Added fallback URL: {jpg_url}")

            # If the URL uses the small-poster path, also try swapping it for a larger size
            if '/s_ratio_poster/' in image_url:
                # Try to fetch a higher-quality image
                high_quality_url = image_url.replace('/s_ratio_poster/', '/m/')
                if high_quality_url not in urls_to_try:
                    urls_to_try.append(high_quality_url)
                    logging.info(f"Added higher-quality fallback URL: {high_quality_url}")

            # Base request headers (the Douban Referer is needed to pass the hotlink check)
            base_headers = {
                'Referer': 'https://movie.douban.com/',
                'User-Agent': self.session.headers['User-Agent']
            }

            # Try each URL in turn
            for attempt, current_url in enumerate(urls_to_try, 1):
                try:
                    logging.info(
                        f"Attempt {attempt}/{len(urls_to_try)}: downloading image for {movie_name} (ID: {movie_id}) - {current_url}")

                    # Send the request
                    response = self.session.get(
                        current_url,
                        headers=base_headers,
                        timeout=15,
                        stream=True
                    )

                    if response.status_code == 200:
                        # Check the Content-Type to make sure the response is an image
                        content_type = response.headers.get('content-type', '').lower()

                        # A 200 response whose Content-Type is not an image may be an anti-hotlink page
                        if 'image' not in content_type:
                            # Check the response size; a very small body is probably not an image
                            content_length = response.headers.get('content-length')
                            if content_length and int(content_length) < 1024:  # less than 1 KB
                                logging.warning(f"Response too small ({content_length} bytes), probably not an image: {content_type}")
                                continue
                            else:
                                logging.warning(f"返回内容类型不是图片但继续尝试: {content_type}")

                        # Save the image
                        with open(local_path, 'wb') as f:
                            for chunk in response.iter_content(chunk_size=8192):
                                f.write(chunk)

                        # Validate the downloaded file; anything under 1 KB is treated as invalid
                        file_size = os.path.getsize(local_path)
                        if file_size < 1024:  # smaller than 1 KB is probably invalid
                            os.remove(local_path)
                            logging.warning(f"Downloaded file too small ({file_size} bytes), deleted it and retrying")
                            continue

                        logging.info(f"Image downloaded successfully: {local_filename} ({file_size} bytes)")
                        return local_filename

                    elif response.status_code == 403:
                        logging.warning(f"访问被拒绝(403),尝试更换User-Agent...")
                        # 更换User-Agent再试
                        self.rotate_user_agent()
                        base_headers['User-Agent'] = self.session.headers['User-Agent']

                    elif response.status_code == 404:
                        logging.warning(f"图片不存在(404): {current_url}")
                        continue

                    else:
                        logging.warning(f"下载失败,状态码: {response.status_code}")

                except requests.exceptions.RequestException as e:
                    logging.warning(f"请求失败: {e}")
                    continue

                # Brief pause between attempts
                if attempt < len(urls_to_try):
                    time.sleep(1)

            # If every URL failed, make one last attempt without a Referer
            logging.info("All attempts with a Referer failed, trying without a Referer...")
            try:
                no_referer_headers = {
                    'User-Agent': self.session.headers['User-Agent']
                }
                response = self.session.get(image_url, headers=no_referer_headers, timeout=10)

                if response.status_code == 200:
                    # Make sure the response is an image
                    content_type = response.headers.get('content-type', '')
                    if 'image' in content_type:
                        with open(local_path, 'wb') as f:
                            f.write(response.content)

                        file_size = os.path.getsize(local_path)
                        if file_size >= 1024:
                            logging.info(f"无Referer方式下载成功: {local_filename}")
                            return f"{local_filename}"
            except Exception as e:
                logging.warning(f"无Referer尝试也失败: {e}")

            logging.error(f"所有尝试都失败,无法下载图片: {image_url}")
            return None

        except Exception as e:
            logging.error(f"下载图片时发生错误: {e}")
            return None

    def rotate_user_agent(self):
        """
        轮换User-Agent
        """
        user_agents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/121.0',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
        ]
        new_ua = random.choice(user_agents)
        self.session.headers.update({'User-Agent': new_ua})
        logging.info(f"更换User-Agent: {new_ua[:50]}...")
        
    def update_image_field(self, movie_id, local_image_path):
        """
        更新数据库中的image字段
        :param movie_id: 电影ID
        :param local_image_path: 本地图片路径
        :return: 是否成功
        """
        try:
            sql = "UPDATE tb_sys_movie SET image = %s WHERE id = %s"
            self.cursor.execute(sql, (local_image_path, movie_id))
            self.conn.commit()
            logging.info(f"数据库更新成功: ID {movie_id} -> {local_image_path}")
            return True
        except Exception as e:
            logging.error(f"数据库更新失败: {e}")
            self.conn.rollback()
            return False

    def process_movies(self, limit=20, delay=2):
        """
        处理电影图片下载
        :param limit: 每批处理数量
        :param delay: 请求延迟(秒)
        """
        try:
            # Fetch the records that need processing
            records = self.get_empty_image_records(limit)

            if not records:
                logging.info("没有需要处理的记录")
                return

            success_count = 0
            fail_count = 0

            for record in records:
                movie_id = record['id']
                movie_name = record['name']
                cover_url = record['cover']

                logging.info(f"开始处理: {movie_name} (ID: {movie_id})")

                # Download the image
                local_path = self.download_image(cover_url, movie_id, movie_name)

                if local_path:
                    # Update the database
                    if self.update_image_field(movie_id, local_path):
                        success_count += 1
                    else:
                        fail_count += 1
                else:
                    fail_count += 1

                # Pause between requests to avoid getting blocked
                time.sleep(delay)

            logging.info(f"处理完成! 成功: {success_count}, 失败: {fail_count}")

        except Exception as e:
            logging.error(f"处理过程中发生错误: {e}")

    def run(self, batch_size=20, delay=2, max_batches=None):
        """
        运行主程序
        :param batch_size: 每批处理数量
        :param delay: 请求延迟
        :param max_batches: 最大批次数,None表示无限
        """
        self.connect_db()

        try:
            batch_count = 0
            while max_batches is None or batch_count < max_batches:
                batch_count += 1
                logging.info(f"开始第 {batch_count} 批处理")

                self.process_movies(limit=batch_size, delay=delay)

                # Check whether there is more data to process
                remaining = self.get_empty_image_records(1)
                if not remaining:
                    logging.info("所有数据已处理完成")
                    break

                # Pause between batches
                if max_batches is None or batch_count < max_batches:
                    wait_time = delay * 3
                    logging.info(f"等待 {wait_time} 秒后开始下一批...")
                    time.sleep(wait_time)

        finally:
            self.close_db()


def main():
    # Database configuration (replace with your actual settings)
    db_config = {
        'host': 'localhost',
        'port': 3306,
        'user': 'root',  # replace with your database user
        'password': '123456',  # replace with your database password
        'database': 'XXXX',  # replace with your database name
        'charset': 'utf8mb4',
        'cursorclass': pymysql.cursors.DictCursor
    }

    # Image save path (adjust to your project structure)
    # For a Vue project this is usually somewhere under public
    image_save_path = 'D://myfilemapping/XXXX/file/'

    # Create the downloader instance
    downloader = DoubanImageDownloader(db_config, image_save_path)

    # Run a small test first
    # batch_size: 5 records per batch (for testing)
    # delay: 3 seconds between requests
    # max_batches: 1 (process only one batch while testing)
    # downloader.run(batch_size=5, delay=3, max_batches=1)

    # Once the test succeeds, process all records
    downloader.run(batch_size=20, delay=2, max_batches=None)


if __name__ == "__main__":
    main()
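
Once the covers are saved locally and the image column holds the filename, the frontend no longer needs the Douban URL at all. Assuming, for example, that the save directory is exposed as static files under a /file/ prefix (adjust to however you actually serve the folder), the tag from the beginning of the article becomes <img :src="'/file/' + item.image" alt=""/> and loads without any hotlink issues.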