DrissionPage的学习

目录

查看店铺内商品上新情况

python 复制代码
from DrissionPage import ChromiumPage
from DataRecorder import Recorder
from datetime import datetime, timedelta
import pandas as pd
import os
import time

def crawl_jd(url, r, good_name):
    """Crawl a JD shop listing page by page and hand each product to a recorder.

    For every listing page the function scrolls down nine times, collects all
    ``li.jSubObject`` items, and for each item reads the link and title from
    ``div.jDesc/a`` and the price text from ``div.jdPrice``. Items whose URL
    was already seen, or whose title does not contain ``good_name``, are
    skipped. Each accepted item is passed to ``r.add_data`` as
    ``(url, title, price)``.

    Crawling stops when a page yields no new item, when no "下一页" button is
    found, or when clicking it fails. ``r.record()`` is always called in the
    ``finally`` clause. The browser page is not closed by this function.

    Args:
        url: Address of the first listing page to open.
        r: Recorder-like object; only ``add_data(tuple)`` and ``record()`` are
            called on it here.
        good_name: Substring that a product title must contain to be kept.

    Returns:
        None. Errors in the main loop are printed, not raised.
    """
    page = ChromiumPage()
    crawled_urls = set()  # URLs already recorded; used for de-duplication

    try:
        page.get(url, show_errmsg=True)
        page.wait(2) 
        print(f"已进入目标页面:{url}")

        while True:
            # Scroll in 1000-pixel steps with a pause after each step;
            # presumably this triggers lazy loading of the item list.
            for _ in range(9):
                page.scroll.down(pixel=1000)
                page.wait(3)
            page.wait(2)
            
            items = page.eles('xpath://li[@class="jSubObject"]')
            current_page_item_count = len(items)
            print(f"\n当前页商品总数:{current_page_item_count}")
            new_crawled = 0  # number of items newly recorded from this page

            for item in items:
                try:
                    info_url = item.ele('xpath:.//div[@class="jDesc"]/a').link
                    if info_url in crawled_urls:
                        print(f"→ 商品已重复,跳过:{info_url[:50]}...") 
                        continue

                    title = item.ele('xpath:.//div[@class="jDesc"]/a').text.strip()
                    if good_name not in title: # drop the recommended items shown at the bottom of the page
                        continue
                    # The result is discarded; presumably this lookup only waits
                    # (up to 3 s) for the price node to render — TODO confirm.
                    item.ele('xpath:.//span[@class="jdNum"]', timeout=3)
                    price = item.ele('xpath:.//div[@class="jdPrice"]').text.replace(' ', '')
                    # Keep only the part before the first "." (drops the decimals).
                    price = price.split('.')[0] if '.' in price else price

                    total_count = len(crawled_urls) + 1 
                    print(f"→ 已抓取{total_count}个商品:链接:{info_url[:50]}... | 标题:{title[:30]}... | 价格:{price}")
                    r.add_data((info_url, title, price))
                    crawled_urls.add(info_url)
                    new_crawled += 1

                except Exception as e:
                    print(f"→ 单个商品提取失败:{str(e)[:80]}...")  # keep only the first 80 characters of the error message
                    continue

            # Loop exit: no new item on this page means the last page was
            # reached (the same page got loaded again).
            if new_crawled == 0:
                print(f"\n当前页无新商品(新抓取数:{new_crawled}),已爬取到最后一页")
                break

            try:
                next_btn = page('下一页', timeout=2)  # locate the next-page button by its text
                if next_btn:
                    print(f"\n当前页新抓取{new_crawled}个商品,准备进入下一页")
                    next_btn.click()
                    # NOTE(review): the original comment said "wait until the page is
                    # fully loaded", but the call is wait.load_start() — confirm intent.
                    page.wait.load_start()
                    page.wait(2)  # extra wait so the new page's initial data can render
                else:
                    print(f"\n未找到下一页按钮,停止爬取")
                    break
            except Exception as e:
                print(f"\n点击下一页失败:{str(e)[:80]}...,停止爬取")
                break

    except Exception as main_e:
        print(f"\n爬取主逻辑异常:{str(main_e)}")
    finally:
        print(f"\n爬取结束!总计抓取商品数量:{len(crawled_urls)}")
        r.record()
        print(f"数据已保存到Excel文件")
        # Closing the browser (to release resources and avoid leftover
        # processes) is disabled in the original:
#         page.close()
#         print("浏览器已关闭")

def compare_two_days_products(today_excel_path, prev_day_excel_path, save_new_path=None):
    """Compare today's crawl result with the previous day's and report new products.

    A product counts as new when its ``url`` appears in today's workbook but
    not in the previous day's. When the previous-day workbook does not exist,
    every product of today is treated as new and the previous-day count is 0.

    Args:
        today_excel_path: Workbook with the columns ``url``, ``标题``, ``价格``.
        prev_day_excel_path: Previous-day workbook; only ``url`` is read.
        save_new_path: Accepted for interface compatibility; not used.

    Returns:
        ``(new_product_count, prev_day_count, today_count)``, or
        ``(-1, -1, -1)`` when reading or comparing fails.
    """
    try:
        today_df = pd.read_excel(today_excel_path, usecols=["url", "标题", "价格"])
        today_df = today_df.dropna(subset=["url"])  # ignore rows without a URL
        print(f"\n✅ 成功读取当天数据:{today_excel_path}")
        print(f"   - 当天商品总数:{len(today_df)}")

        if not os.path.exists(prev_day_excel_path):
            print(f"❌ 前一天文件不存在:{prev_day_excel_path},无法对比(视为首次爬取,所有商品都是上新)")
            new_products_df = today_df
            # No baseline: report 0 so the return below never touches an
            # undefined previous-day frame.
            prev_count = 0
        else:
            prev_df = pd.read_excel(prev_day_excel_path, usecols=["url"])
            prev_df = prev_df.dropna(subset=["url"])
            prev_urls = set(prev_df["url"])
            prev_count = len(prev_df)
            print(f"✅ 成功读取前一天数据:{prev_day_excel_path}")
            print(f"   - 前一天商品总数:{prev_count}")
            # "~" negates the mask: keep rows whose URL was NOT seen yesterday.
            new_products_df = today_df[~today_df["url"].isin(prev_urls)]

        new_product_count = len(new_products_df)
        print(f"\n📊 对比结果:当天上新商品数量 = {new_product_count}")

        if new_product_count > 0:
            print("\n🔍 上新商品示例:")
            # Number the rows 1..n (the DataFrame index has gaps after
            # filtering); str() keeps a missing title from breaking the slice.
            rows = zip(new_products_df["标题"], new_products_df["url"])
            for seq, (title, url) in enumerate(rows, start=1):
                print(f"   {seq}. 标题:{str(title)[:30]} | URL:{str(url)[:50]}")
        else:
            print("❌ 当天无上新商品")
        return new_product_count, prev_count, len(today_df)

    except Exception as e:
        print(f"\n❌ 对比失败:{str(e)}")
        return -1, -1, -1

def get_excel_paths(brand_dir):
    """Return today's and yesterday's workbook paths inside ``brand_dir``.

    The directory is created when it does not exist. File names have the form
    ``<MMDD>_data.xlsx`` based on the local date.

    NOTE(review): the name carries no year, so a file left over from the same
    calendar day of an earlier year would be picked up as well.

    Args:
        brand_dir: Output directory of one brand, with or without a trailing
            path separator.

    Returns:
        ``(today_excel_path, prev_day_excel_path)`` as strings.
    """
    today = datetime.now()
    prev_day = today - timedelta(days=1)
    today_str = today.strftime("%m%d")
    prev_day_str = prev_day.strftime("%m%d")

    os.makedirs(brand_dir, exist_ok=True)

    # Drop trailing separators so "dir/" does not turn into "dir//file".
    base_dir = str(brand_dir).rstrip("/\\")
    today_excel_path = f"{base_dir}/{today_str}_data.xlsx"
    prev_day_excel_path = f"{base_dir}/{prev_day_str}_data.xlsx"

    return today_excel_path, prev_day_excel_path

def run_brand_crawl_and_compare(brand_dir, brand_name, crawl_url):
    """Crawl one brand into today's workbook, then compare it with yesterday's.

    Args:
        brand_dir: Output directory of the brand.
        brand_name: Name of the form ``"<brand>-<keyword>"``; the part after
            the first "-" is passed to the crawler as the title filter.
        crawl_url: Address of the shop listing to crawl.

    Returns:
        The tuple returned by ``compare_two_days_products``.

    Raises:
        Exception: when an existing workbook for today cannot be deleted.
    """
    today_path, prev_path = get_excel_paths(brand_dir)

    # A workbook left over from an earlier run today is removed first, so
    # the new data is not mixed with the old (duplicates / format clashes).
    stale_workbook_exists = os.path.exists(today_path)
    if stale_workbook_exists:
        try:
            os.remove(today_path)
            print(f"⚠️  检测到当天Excel文件已存在,已删除:{today_path}")
        except Exception as err:
            print(f"❌ 删除已存在的Excel文件失败:{err}")
            raise Exception(f"文件删除失败,无法继续爬取:{err}")

    recorder = Recorder(today_path)
    recorder.set.head(('url', '标题', '价格'))

    title_keyword = brand_name.split("-")[1]
    crawl_jd(url=crawl_url, r=recorder, good_name=title_keyword)

    comparison = compare_two_days_products(
        today_excel_path=today_path,
        prev_day_excel_path=prev_path,
    )
    return comparison

if __name__ == "__main__":
    # Per-brand configuration: key = "<brand>-<keyword>",
    # value = (output directory, shop search URL).
    BRANDS_CONFIG = {
        "小米-手机": (
            "E:/crawls/小米-手机/",
            "https://mall.jd.com/view_search-442829-1000004123-1000004123-0-0-0-0-1-1-60.html?keyword=手机"
        ),
        "华为-手机": (
            "E:/crawls/华为-手机/",
            "https://mall.jd.com/view_search-466323-1000004259-1000004259-0-0-0-0-1-1-60.html?keyword=手机"
        ),
    }
    counts = []
    brand_names = []
    for brand_name, (brand_dir, crawl_url) in BRANDS_CONFIG.items():
        print("\n" + "=" * 50)
        print(f"开始处理【{brand_name}】")
        print("=" * 50)
        new_product_count, prev_count, today_count = run_brand_crawl_and_compare(
            brand_dir=brand_dir, brand_name=brand_name, crawl_url=crawl_url
        )
        brand_names.append(brand_name)
        counts.append([new_product_count, prev_count, today_count])
        time.sleep(10)  # pause between brands

    print("\n" + "=" * 50)
    print("所有品牌处理完成!")
    print("=" * 50)

    print("\n" + "=" * 80)
    print(f"【各品牌数据汇总】- 生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 80)
    # Table header
    print(f"{'品牌名称':<15} | {'上新商品数':<10} | {'前一天商品数':<12} | {'今天商品数':<10}")
    print("-" * 80)
    for brand, brand_counts in zip(brand_names, counts):
        # The comparison step signals failure with -1 in every field, so all
        # three columns show "N/A" for -1 (and for None).
        new_count, prev_count, today_count = (
            "N/A" if value is None or value == -1 else value
            for value in brand_counts
        )
        print(f"{brand:<15} | {new_count:<10} | {prev_count:<12} | {today_count:<10}")
    print("=" * 80)

输出:

bash 复制代码
==================================================
开始处理【小米-手机】
==================================================
已进入目标页面:https://mall.jd.com/view_search-442829-1000004123-1000004123-0-0-0-0-1-1-60.html?keyword=手机

当前页商品总数:70
→ 已抓取1个商品:链接:https://item.jd.com/100200730249.html... | 标题:小米(MI)REDMI Note15 Pro 天玑7400-... | 价格:¥1399
→ 已抓取2个商品:链接:https://item.jd.com/100067438552.html... | 标题:小米Redmi Note13 5G 国家补贴 1亿像素 超细... | 价格:¥799
→ 已抓取3个商品:链接:https://item.jd.com/100129016845.html... | 标题:小米REDMI K80 国家补贴 第三代骁龙8 6550mA... | 价格:¥2099
→ 已抓取4个商品:链接:https://item.jd.com/100071383534.html... | 标题:小米14 徕卡光学镜头 光影猎人900 国家补贴 第三代骁龙... | 价格:¥2699
→ 已抓取5个商品:链接:https://item.jd.com/100071390038.html... | 标题:小米14 徕卡光学镜头 光影猎人900 国家补贴 第三代骁龙... | 价格:¥2999
→ 已抓取6个商品:链接:https://item.jd.com/100158171899.html... | 标题:小米(MI)【国家补贴】REDMI Turbo 4 Pro ... | 价格:¥1799
→ 已抓取7个商品:链接:https://item.jd.com/100068892989.html... | 标题:小米 Redmi Note13Pro 骁龙7S 国家补贴 新... | 价格:¥1129
→ 已抓取8个商品:链接:https://item.jd.com/100157830510.html... | 标题:小米 REDMI K80 Pro 国家补贴 骁龙8至尊版 全... | 价格:¥3399
...
爬取结束!总计抓取商品数量:83
E:\crawls\小米-手机\0911_data.xlsx 开始写入文件,切勿关闭进程。
E:\crawls\小米-手机\0911_data.xlsx 写入文件结束。
数据已保存到Excel文件

✅ 成功读取当天数据:E:/crawls/小米-手机//0911_data.xlsx
   - 当天商品总数:83
❌ 前一天文件不存在:E:/crawls/小米-手机//0910_data.xlsx,无法对比(视为首次爬取,所有商品都是上新)
...
==================================================
所有品牌处理完成!
==================================================

================================================================================
【各品牌数据汇总】- 生成时间:2025-09-11 17:59:53
================================================================================
品牌名称            | 上新商品数      | 前一天商品数       | 今天商品数
--------------------------------------------------------------------------------
小米-手机           | N/A        | -1           | -1
华为-手机           | N/A        | -1           | -1
================================================================================

仅对比前后两天的商品上新情况(不重新爬取)

python 复制代码
from datetime import datetime, timedelta
import pandas as pd
import os
import time

def get_excel_paths(brand_dir):
    """Return today's and yesterday's workbook paths inside ``brand_dir``.

    File names have the form ``<MMDD>_data.xlsx`` based on the local date.
    Nothing is created on disk.

    NOTE(review): the name carries no year, so a file left over from the same
    calendar day of an earlier year would be picked up as well.

    Args:
        brand_dir: Directory of one brand, with or without a trailing path
            separator.

    Returns:
        ``(today_excel_path, prev_day_excel_path)`` as strings.
    """
    today = datetime.now()
    prev_day = today - timedelta(days=1)
    today_str = today.strftime("%m%d")
    prev_day_str = prev_day.strftime("%m%d")

    # Drop trailing separators so "dir/" does not turn into "dir//file".
    base_dir = str(brand_dir).rstrip("/\\")
    today_excel_path = f"{base_dir}/{today_str}_data.xlsx"
    prev_day_excel_path = f"{base_dir}/{prev_day_str}_data.xlsx"

    return today_excel_path, prev_day_excel_path

def compare_two_days_products(today_excel_path, prev_day_excel_path, save_new_path=None):
    """Compare today's crawl result with the previous day's and report new products.

    A product counts as new when its ``url`` appears in today's workbook but
    not in the previous day's. When the previous-day workbook does not exist,
    every product of today is treated as new and the previous-day count is 0.

    Args:
        today_excel_path: Workbook with the columns ``url``, ``标题``, ``价格``.
        prev_day_excel_path: Previous-day workbook; only ``url`` is read.
        save_new_path: Accepted for interface compatibility; not used.

    Returns:
        ``(new_product_count, prev_day_count, today_count)``, or
        ``(-1, -1, -1)`` when reading or comparing fails.
    """
    try:
        today_df = pd.read_excel(today_excel_path, usecols=["url", "标题", "价格"])
        today_df = today_df.dropna(subset=["url"])  # ignore rows without a URL
        print(f"\n✅ 成功读取当天数据:{today_excel_path}")
        print(f"   - 当天商品总数:{len(today_df)}")

        if not os.path.exists(prev_day_excel_path):
            print(f"❌ 前一天文件不存在:{prev_day_excel_path},无法对比(视为首次爬取,所有商品都是上新)")
            new_products_df = today_df
            # No baseline: report 0 so the return below never touches an
            # undefined previous-day frame.
            prev_count = 0
        else:
            prev_df = pd.read_excel(prev_day_excel_path, usecols=["url"])
            prev_df = prev_df.dropna(subset=["url"])
            prev_urls = set(prev_df["url"])
            prev_count = len(prev_df)
            print(f"✅ 成功读取前一天数据:{prev_day_excel_path}")
            print(f"   - 前一天商品总数:{prev_count}")

            # "~" negates the mask: keep rows whose URL was NOT seen yesterday.
            new_products_df = today_df[~today_df["url"].isin(prev_urls)]

        new_product_count = len(new_products_df)
        print(f"\n📊 对比结果:当天上新商品数量 = {new_product_count}")

        if new_product_count > 0:
            print("\n🔍 上新商品示例:")
            # Number the rows 1..n (the DataFrame index has gaps after
            # filtering); str() keeps a missing title from breaking the slice.
            rows = zip(new_products_df["标题"], new_products_df["url"])
            for seq, (title, url) in enumerate(rows, start=1):
                print(f"   {seq}. 标题:{str(title)[:30]} | URL:{str(url)[:50]}")
        else:
            print("❌ 当天无上新商品")

        return new_product_count, prev_count, len(today_df)

    except Exception as e:
        print(f"\n❌ 对比失败:{str(e)}")
        return -1, -1, -1

if __name__ == "__main__":
    # Central multi-brand configuration: key = brand name,
    # value = (brand directory, crawl URL). The URL is not used by this
    # compare-only script.
    BRANDS_CONFIG = {
        "小米-手机": (
            "E:/crawls/小米-手机/",
            "https://mall.jd.com/view_search-442829-1000004123-1000004123-0-0-0-0-1-1-60.html?keyword=手机"
        ),
        "华为-手机": (
            "E:/crawls/华为-手机/",
            "https://mall.jd.com/view_search-466323-1000004259-1000004259-0-0-0-0-1-1-60.html?keyword=手机"
        ),
    }
    separator = "=" * 50
    for brand_name, brand_settings in BRANDS_CONFIG.items():
        brand_dir = brand_settings[0]
        print("\n" + separator)
        print(f"开始处理【{brand_name}】")
        print(separator)
        today_path, prev_path = get_excel_paths(brand_dir)
        compare_two_days_products(today_excel_path=today_path, prev_day_excel_path=prev_path)

查看机票价格涨降情况

python 复制代码
from DrissionPage.errors import ElementNotFoundError
import time
import random
from datetime import datetime
from DrissionPage import ChromiumPage

class Highlight:
    """ANSI escape sequences used to colour console output."""

    # Red: marks a price drop or a price rise.
    RED = "\x1b[91m"
    # Green: marks an unchanged price.
    GREEN = "\x1b[92m"
    # Ends the highlight started by one of the colours above.
    END = "\x1b[0m"


# Flights to monitor. Each entry starts with its benchmark and historical
# lowest price equal to the initial price; the monitor loop updates
# "current_benchmark" and "history_lowest" in place.
TARGET_FLIGHTS = [
    {
        "plane_no": plane_no,
        "ori_price": initial_price,
        "current_benchmark": initial_price,
        "history_lowest": initial_price,
        "depart_time": departs,
        "arrive_time": arrives,
    }
    for plane_no, initial_price, departs, arrives in (
        ("CZ3952 波音787(大)", 1660, "20:55", "23:50"),
        ("HU7250 波音737(中)", 1700, "17:55", "21:00"),
        ("CZ6528 C919(中)", 1860, "17:15", "21:10"),
        ("CZ3696 空客A320(中)", 2040, "19:55", "22:45"),
    )
]

def monitor_multi_flights():
    """Poll the flight list page and report price changes of the watched flights.

    Each round scrolls the page, reads flight number and price from every
    ``div.flight-box`` container, and compares the price of each entry in
    ``TARGET_FLIGHTS`` with that entry's ``current_benchmark`` (the price
    seen in the previous round). Drops and rises are printed with the amount
    of change since the previous round; ``current_benchmark`` and
    ``history_lowest`` are updated in place. The page is then reloaded after
    a random wait of 5-9 minutes.

    The loop has no exit condition of its own. It ends when an exception
    escapes; the ``finally`` clause then closes the page and prints the
    lowest price recorded for every watched flight.

    Returns:
        None.
    """
    page = ChromiumPage()
    base_url = "https://flights.ctrip.com/online/list/oneway-tyn-can"

    def build_url():
        # A fresh millisecond timestamp in "ct" makes every request URL differ.
        current_ct = str(int(datetime.now().timestamp() * 1000))
        return f"{base_url}?_=1&depdate=2025-10-07&cabin=Y_S_C_F&ct={current_ct}"

    try:
        url = build_url()
        page.get(url, show_errmsg=True)
        print(f"已进入目标页面:{url}")
        print(f"当前监测航班总数:{len(TARGET_FLIGHTS)} → {[f['plane_no'] for f in TARGET_FLIGHTS]}\n")
        while True:
            # Scroll so that all flight entries get loaded.
            page.wait(5)
            for _ in range(10):
                page.scroll.down()
                page.wait(1)
            page.wait(2)

            # All flight containers currently on the page.
            items = page.eles('xpath://div[@class="flight-box"]')
            current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"{'='*80}")
            print(f"{current_time} - 页面共加载 {len(items)} 个航班容器 | 【监测中:最低价格追踪】")
            print(f"{'='*80}")

            # Flight number text -> price parsed in this round.
            flight_price_map = {}
            for item in items:
                try:
                    current_plane_no = item.ele('xpath:.//span[@class="plane-No"]').text
                    price_text = item.ele('xpath:.//span[@class="price"]').text
                    current_price = int(price_text.split("¥")[1])
                    flight_price_map[current_plane_no] = current_price
                except ElementNotFoundError:
                    # Container without flight number or price: skip silently.
                    continue
                except Exception as e:
                    print(f"⚠️  解析单个航班容器出错:{str(e)}")
                    continue

            for flight in TARGET_FLIGHTS:
                plane_no = flight["plane_no"]
                ori_price = flight["ori_price"]
                current_benchmark = flight["current_benchmark"]
                history_lowest = flight["history_lowest"]
                depart_time = flight["depart_time"]
                arrive_time = flight["arrive_time"]

                if plane_no not in flight_price_map:
                    print(f"❌ 未找到航班:{plane_no}(可能已下架或页面未加载)")
                    continue

                current_price = flight_price_map[plane_no]
                print(f"\n📅 航班:{plane_no}")
                print(f"   出发时间:{depart_time} | 到达时间:{arrive_time} | 初始价格:¥{ori_price} | 当前价格:¥{current_price} | 历史最低:¥{history_lowest}")

                # The change is measured against the benchmark used in the
                # comparison, not the initial price; otherwise a rise after
                # an earlier drop would print a negative amount.
                if current_price < current_benchmark:
                    # Price drop: highlighted in red.
                    drop_amount = current_benchmark - current_price
                    print(f"   {Highlight.RED}✅ 降价了!下降金额:¥{drop_amount}{Highlight.END}")
                    flight["current_benchmark"] = current_price

                elif current_price > current_benchmark:
                    # Price rise: highlighted in red.
                    rise_amount = current_price - current_benchmark
                    print(f"   {Highlight.RED}⚠️  涨价了!上涨金额:¥{rise_amount}{Highlight.END}")
                    flight["current_benchmark"] = current_price

                else:
                    # Unchanged price: shown in green.
                    print(f"   {Highlight.GREEN}ℹ️  价格未变{Highlight.END}")

                if current_price < history_lowest:
                    flight["history_lowest"] = current_price  # new historical low
                    print(f"   {Highlight.RED}🔥 刷新历史最低价格!新历史最低:¥{current_price}{Highlight.END}")

            # Wait a random 5-9 minutes before the next round.
            wait_minutes = random.randint(5, 9)
            wait_seconds = wait_minutes * 60
            print(f"\n{'-'*60}")
            print(f"当前轮监测结束,将在 {wait_minutes} 分钟后刷新页面...")
            print(f"{'-'*60}\n")
            time.sleep(wait_seconds)

            # Reload through a new URL instead of page.refresh(), so that
            # consecutive requests do not look identical.
            page.get(build_url())
            # Wait (at most 15 s) for the flight containers to load.
            page.wait.eles_loaded('xpath://div[contains(@class, "flight-box")]', timeout=15)

    except Exception as e:
        print(f"\n❌ 监测主逻辑出错:{str(e)}")
    finally:
        page.close()
        print("\n浏览器已关闭,监测结束")
        print(f"\n{'='*80}")
        print(f"监测结束 | 各航班最终历史最低价格:")
        for f in TARGET_FLIGHTS:
            print(f"  - {f['plane_no']}:¥{f['history_lowest']}")
        print(f"{'='*80}")

# Script entry point: start the monitoring loop only when run directly.
if __name__ == "__main__":
    monitor_multi_flights()

输出为

bash 复制代码
已进入目标页面:https://flights.ctrip.com/online/list/oneway-tyn-can?_=1&depdate=2025-10-07&cabin=Y_S_C_F&ct=1757552041927
当前监测航班总数:4 → ['CZ3952 波音787(大)', 'HU7250 波音737(中)', 'CZ6528 C919(中)', 'CZ3696 空客A320(中)']

================================================================================
2025-09-11 08:54:20 - 页面共加载 25 个航班容器 | 【监测中:最低价格追踪】
================================================================================

📅 航班:CZ3952 波音787(大)
   出发时间:20:55 | 到达时间:23:50 | 初始价格:¥1660 | 当前价格:¥1660 | 历史最低:¥1660
   ℹ️  价格未变

📅 航班:HU7250 波音737(中)
   出发时间:17:55 | 到达时间:21:00 | 初始价格:¥1700 | 当前价格:¥1700 | 历史最低:¥1700
   ℹ️  价格未变

📅 航班:CZ6528 C919(中)
   出发时间:17:15 | 到达时间:21:10 | 初始价格:¥1860 | 当前价格:¥1860 | 历史最低:¥1860
   ℹ️  价格未变

📅 航班:CZ3696 空客A320(中)
   出发时间:19:55 | 到达时间:22:45 | 初始价格:¥2040 | 当前价格:¥2040 | 历史最低:¥2040
   ℹ️  价格未变

------------------------------------------------------------
当前轮监测结束,将在 9 分钟后刷新页面...
------------------------------------------------------------
相关推荐
橙子家2 小时前
浏览器缓存之【基础键值存储】:Local storage 和 Session storage
前端
程序员龙叔4 小时前
编写高质量 Skill 系列 -- 如何设计需求分析与用例生成的 SKILL
自动化测试·软件测试·python·软件测试工程师·接口测试·性能测试·skill·ai测试
星星在线4 小时前
MusicFree:一个「All in One」的个人音乐服务器,让听歌回归简单
前端·后端
IT_陈寒5 小时前
Redis的SETNX并发问题让我加了三天班
前端·人工智能·后端
demo007x5 小时前
Docling 文档转换以及技术架构分析
前端·后端·程序员
京东云开发者6 小时前
京东市民服务又“上新”!这次是黑龙江“龙易办”
前端
袋鱼不重7 小时前
我的神奇同事,AI 用多了居然写了个 Open In Codex
前端·后端·ai编程
用户8356290780517 小时前
使用 Python 操作 Word 内容控件
后端·python
通信小呆呆7 小时前
当算法有了“五感”:多模态数据融合如何向人体感官协同学习?
人工智能·学习·算法·机器学习·机器人
Fireworks7 小时前
深入vue3源码解读 -- 1、响应式的基础概念
前端