DrissionPage的学习

目录

查看店铺内商品上新情况

python 复制代码
from DrissionPage import ChromiumPage
from DataRecorder import Recorder
from datetime import datetime, timedelta
import pandas as pd
import os
import time

def crawl_jd(url, r, good_name):
    """Crawl every product (url, title, price) from a JD shop search page.

    Scrolls each result page to force lazy-loaded items to render, filters
    out the unrelated "recommended" items appended at the bottom, records
    each new product through ``r``, and follows the "next page" button until
    a page yields no unseen products.

    Args:
        url: JD shop search URL to open first.
        r: DataRecorder ``Recorder``; receives (url, title, price) tuples.
        good_name: keyword that must appear in a product title for it to be
            kept (e.g. "手机").
    """
    page = ChromiumPage()
    crawled_urls = set()  # product URLs already recorded — cross-page de-dup

    try:
        page.get(url, show_errmsg=True)
        page.wait(2) 
        print(f"已进入目标页面:{url}")

        while True:
            # Scroll in steps so the page's lazy loading fetches every item.
            for _ in range(9):
                page.scroll.down(pixel=1000)
                page.wait(3)
            page.wait(2)
            
            items = page.eles('xpath://li[@class="jSubObject"]')
            current_page_item_count = len(items)
            print(f"\n当前页商品总数:{current_page_item_count}")
            new_crawled = 0  # products first seen on this page

            for item in items:
                try:
                    info_url = item.ele('xpath:.//div[@class="jDesc"]/a').link
                    if info_url in crawled_urls:
                        print(f"→ 商品已重复,跳过:{info_url[:50]}...") 
                        continue

                    title = item.ele('xpath:.//div[@class="jDesc"]/a').text.strip()
                    if good_name not in title: # drop the "recommended" items at the bottom of the page
                        continue
                    # Wait (up to 3 s) for the price element to exist before reading it.
                    item.ele('xpath:.//span[@class="jdNum"]', timeout=3)
                    price = item.ele('xpath:.//div[@class="jdPrice"]').text.replace(' ', '')
                    # Keep only the integer part, e.g. "¥799.00" -> "¥799".
                    price = price.split('.')[0] if '.' in price else price

                    total_count = len(crawled_urls) + 1 
                    print(f"→ 已抓取{total_count}个商品:链接:{info_url[:50]}... | 标题:{title[:30]}... | 价格:{price}")
                    r.add_data((info_url, title, price))
                    crawled_urls.add(info_url)
                    new_crawled += 1

                except Exception as e:
                    print(f"→ 单个商品提取失败:{str(e)[:80]}...")  # truncate the error text to 80 chars
                    continue

            # Stop condition: a page with no unseen products means the last
            # page is being re-served.
            if new_crawled == 0:
                print(f"\n当前页无新商品(新抓取数:{new_crawled}),已爬取到最后一页")
                break

            try:
                next_btn = page('下一页', timeout=2)  # locate the "next page" button by its text
                if next_btn:
                    print(f"\n当前页新抓取{new_crawled}个商品,准备进入下一页")
                    next_btn.click()
                    page.wait.load_start()  # wait for the new page to start loading
                    page.wait(2)  # extra wait so the new page's first items render
                else:
                    print(f"\n未找到下一页按钮,停止爬取")
                    break
            except Exception as e:
                print(f"\n点击下一页失败:{str(e)[:80]}...,停止爬取")
                break

    except Exception as main_e:
        print(f"\n爬取主逻辑异常:{str(main_e)}")
    finally:
        print(f"\n爬取结束!总计抓取商品数量:{len(crawled_urls)}")
        r.record()  # flush buffered rows to the Excel file
        print(f"数据已保存到Excel文件")
        # Close the browser to release resources (avoid leftover processes).
#         page.close()
#         print("浏览器已关闭")

def compare_two_days_products(today_excel_path, prev_day_excel_path, save_new_path=None):
    """Compare today's crawled products against the previous day's.

    A product whose URL appears today but not yesterday counts as newly
    listed. If yesterday's file is missing (first crawl), every product
    today counts as new.

    Args:
        today_excel_path: today's Excel file (columns: url, 标题, 价格).
        prev_day_excel_path: previous day's Excel file (only ``url`` is read).
        save_new_path: unused; kept for interface compatibility.

    Returns:
        (new_product_count, prev_day_count, today_count) on success,
        (-1, -1, -1) on any error.
    """
    try:
        today_df = pd.read_excel(today_excel_path, usecols=["url", "标题", "价格"])
        today_df = today_df.dropna(subset=["url"])  # drop rows with an empty url
        print(f"\n✅ 成功读取当天数据:{today_excel_path}")
        print(f"   - 当天商品总数:{len(today_df)}")

        # Read the previous day's data (only the url column is needed).
        if not os.path.exists(prev_day_excel_path):
            print(f"❌ 前一天文件不存在:{prev_day_excel_path},无法对比(视为首次爬取,所有商品都是上新)")
            new_products_df = today_df
            # BUG FIX: prev_df was undefined on this branch, so the final
            # `len(prev_df)` raised NameError and the broad except turned a
            # successful first crawl into (-1, -1, -1).
            prev_count = 0
        else:
            prev_df = pd.read_excel(prev_day_excel_path, usecols=["url"])
            prev_df = prev_df.dropna(subset=["url"])
            prev_urls = set(prev_df["url"])
            print(f"✅ 成功读取前一天数据:{prev_day_excel_path}")
            print(f"   - 前一天商品总数:{len(prev_df)}")
            new_products_df = today_df[~today_df["url"].isin(prev_urls)]  # ~ means "not contained"
            prev_count = len(prev_df)

        new_product_count = len(new_products_df)
        print(f"\n📊 对比结果:当天上新商品数量 = {new_product_count}")

        if new_product_count > 0:
            print("\n🔍 上新商品示例:")
            for idx, row in new_products_df.iterrows():
                print(f"   {idx+1}. 标题:{row['标题'][:30]} | URL:{row['url'][:50]}")
        else:
            print("❌ 当天无上新商品")
        return new_product_count, prev_count, len(today_df)

    except Exception as e:
        print(f"\n❌ 对比失败:{str(e)}")
        return -1, -1, -1

def get_excel_paths(brand_dir):
    """Build today's and yesterday's Excel paths under *brand_dir*.

    Ensures *brand_dir* exists. File names use the MMDD date stamp,
    e.g. ``0911_data.xlsx``.

    Returns:
        (today_excel_path, prev_day_excel_path)
    """
    os.makedirs(brand_dir, exist_ok=True)

    now = datetime.now()
    yesterday = now - timedelta(days=1)
    today_path = f"{brand_dir}/{now.strftime('%m%d')}_data.xlsx"
    prev_path = f"{brand_dir}/{yesterday.strftime('%m%d')}_data.xlsx"
    return today_path, prev_path

def run_brand_crawl_and_compare(brand_dir, brand_name, crawl_url):
    """Crawl one brand's shop, save today's data, then diff against yesterday.

    Returns the (new_count, prev_count, today_count) tuple produced by
    compare_two_days_products.
    """
    today_path, prev_path = get_excel_paths(brand_dir)

    # A leftover file for today would cause duplicated/conflicting rows,
    # so remove it before recording.
    if os.path.exists(today_path):
        try:
            os.remove(today_path)
        except Exception as e:
            print(f"❌ 删除已存在的Excel文件失败:{str(e)}")
            raise Exception(f"文件删除失败,无法继续爬取:{str(e)}")
        print(f"⚠️  检测到当天Excel文件已存在,已删除:{today_path}")

    recorder = Recorder(today_path)
    recorder.set.head(('url', '标题', '价格'))

    # The part after "-" in the brand key (e.g. "小米-手机" -> "手机") is the
    # keyword that product titles must contain.
    keyword = brand_name.split("-")[1]
    crawl_jd(url=crawl_url, r=recorder, good_name=keyword)

    return compare_two_days_products(today_excel_path=today_path,
                                     prev_day_excel_path=prev_path)

if __name__ == "__main__":
    # Brand registry: key = "<brand>-<keyword>", value = (data dir, shop search URL).
    BRANDS_CONFIG = {
        "小米-手机": (
            "E:/crawls/小米-手机/",
            "https://mall.jd.com/view_search-442829-1000004123-1000004123-0-0-0-0-1-1-60.html?keyword=手机"
        ),
        "华为-手机": (
            "E:/crawls/华为-手机/",
            "https://mall.jd.com/view_search-466323-1000004259-1000004259-0-0-0-0-1-1-60.html?keyword=手机"
        ),
    }
    counts = []
    brand_names = []
    for brand_name, (brand_dir, crawl_url) in BRANDS_CONFIG.items():
        print(f"\n" + "="*50)
        print(f"开始处理【{brand_name}】")
        print("="*50)
        new_product_count, prev_count, today_count = run_brand_crawl_and_compare(brand_dir=brand_dir,  brand_name=brand_name, crawl_url=crawl_url)
        brand_names.append(brand_name)
        counts.append([new_product_count, prev_count, today_count])
        time.sleep(10)  # brief pause between brands to avoid hammering the site

    print(f"\n" + "="*50)
    print("所有品牌处理完成!")
    print("="*50)

    print(f"\n" + "="*80)
    print(f"【各品牌数据汇总】- 生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("="*80)
    # Table header.
    print(f"{'品牌名称':<15} | {'上新商品数':<10} | {'前一天商品数':<12} | {'今天商品数':<10}")
    print("-"*80)
    for brand, (new_count, prev_count, today_count) in zip(brand_names, counts):
        # BUG FIX: compare_two_days_products signals failure with -1, never
        # None, so the old `is not None` checks let -1 leak into the report
        # (it printed "-1" in the 前一天/今天 columns). Treat -1 as N/A for
        # all three columns.
        new_count = new_count if new_count != -1 else "N/A"
        prev_count = prev_count if prev_count != -1 else "N/A"
        today_count = today_count if today_count != -1 else "N/A"
        print(f"{brand:<15} | {new_count:<10} | {prev_count:<12} | {today_count:<10}")
    print("="*80)

输出:

bash 复制代码
==================================================
开始处理【小米-手机】
==================================================
已进入目标页面:https://mall.jd.com/view_search-442829-1000004123-1000004123-0-0-0-0-1-1-60.html?keyword=手机

当前页商品总数:70
→ 已抓取1个商品:链接:https://item.jd.com/100200730249.html... | 标题:小米(MI)REDMI Note15 Pro 天玑7400-... | 价格:¥1399
→ 已抓取2个商品:链接:https://item.jd.com/100067438552.html... | 标题:小米Redmi Note13 5G 国家补贴 1亿像素 超细... | 价格:¥799
→ 已抓取3个商品:链接:https://item.jd.com/100129016845.html... | 标题:小米REDMI K80 国家补贴 第三代骁龙8 6550mA... | 价格:¥2099
→ 已抓取4个商品:链接:https://item.jd.com/100071383534.html... | 标题:小米14 徕卡光学镜头 光影猎人900 国家补贴 第三代骁龙... | 价格:¥2699
→ 已抓取5个商品:链接:https://item.jd.com/100071390038.html... | 标题:小米14 徕卡光学镜头 光影猎人900 国家补贴 第三代骁龙... | 价格:¥2999
→ 已抓取6个商品:链接:https://item.jd.com/100158171899.html... | 标题:小米(MI)【国家补贴】REDMI Turbo 4 Pro ... | 价格:¥1799
→ 已抓取7个商品:链接:https://item.jd.com/100068892989.html... | 标题:小米 Redmi Note13Pro 骁龙7S 国家补贴 新... | 价格:¥1129
→ 已抓取8个商品:链接:https://item.jd.com/100157830510.html... | 标题:小米 REDMI K80 Pro 国家补贴 骁龙8至尊版 全... | 价格:¥3399
...
爬取结束!总计抓取商品数量:83
E:\crawls\小米-手机\0911_data.xlsx 开始写入文件,切勿关闭进程。
E:\crawls\小米-手机\0911_data.xlsx 写入文件结束。
数据已保存到Excel文件

✅ 成功读取当天数据:E:/crawls/小米-手机//0911_data.xlsx
   - 当天商品总数:83
❌ 前一天文件不存在:E:/crawls/小米-手机//0910_data.xlsx,无法对比(视为首次爬取,所有商品都是上新)
...
==================================================
所有品牌处理完成!
==================================================

================================================================================
【各品牌数据汇总】- 生成时间:2025-09-11 17:59:53
================================================================================
品牌名称            | 上新商品数      | 前一天商品数       | 今天商品数
--------------------------------------------------------------------------------
小米-手机           | N/A        | -1           | -1
华为-手机           | N/A        | -1           | -1
================================================================================

只对比前后两天的商品上新情况(不含爬取部分)

python 复制代码
from datetime import datetime, timedelta
import pandas as pd
import os
import time

def get_excel_paths(brand_dir):
    """Return (today_path, prev_day_path) Excel paths under *brand_dir*.

    File names carry an MMDD stamp, e.g. ``0911_data.xlsx``.
    """
    now = datetime.now()
    yesterday = now - timedelta(days=1)
    make = lambda stamp: f"{brand_dir}/{stamp}_data.xlsx"  # shared name template
    return make(now.strftime("%m%d")), make(yesterday.strftime("%m%d"))

def compare_two_days_products(today_excel_path, prev_day_excel_path, save_new_path=None):
    """Compare today's crawled products against the previous day's.

    A product whose URL appears today but not yesterday counts as newly
    listed; a missing previous-day file means every product today is new.

    Args:
        today_excel_path: today's Excel file (columns: url, 标题, 价格).
        prev_day_excel_path: previous day's Excel file (only ``url`` is read).
        save_new_path: unused; kept for interface compatibility.

    Returns:
        (new_product_count, prev_day_count, today_count) on success,
        (-1, -1, -1) on any error.
    """
    try:
        today_df = pd.read_excel(today_excel_path, usecols=["url", "标题", "价格"])
        today_df = today_df.dropna(subset=["url"])  # drop rows with an empty url
        print(f"\n✅ 成功读取当天数据:{today_excel_path}")
        print(f"   - 当天商品总数:{len(today_df)}")

        if not os.path.exists(prev_day_excel_path):
            print(f"❌ 前一天文件不存在:{prev_day_excel_path},无法对比(视为首次爬取,所有商品都是上新)")
            new_products_df = today_df
            # BUG FIX: prev_df was undefined on this branch, so the final
            # `len(prev_df)` raised NameError and the broad except turned a
            # successful first crawl into (-1, -1, -1).
            prev_count = 0
        else:
            prev_df = pd.read_excel(prev_day_excel_path, usecols=["url"])
            prev_df = prev_df.dropna(subset=["url"])
            prev_urls = set(prev_df["url"])
            print(f"✅ 成功读取前一天数据:{prev_day_excel_path}")
            print(f"   - 前一天商品总数:{len(prev_df)}")

            new_products_df = today_df[~today_df["url"].isin(prev_urls)]  # ~ means "not contained"
            prev_count = len(prev_df)

        new_product_count = len(new_products_df)
        print(f"\n📊 对比结果:当天上新商品数量 = {new_product_count}")

        if new_product_count > 0:
            print("\n🔍 上新商品示例:")
            for idx, row in new_products_df.iterrows():
                print(f"   {idx+1}. 标题:{row['标题'][:30]} | URL:{row['url'][:50]}")
        else:
            print("❌ 当天无上新商品")

        return new_product_count, prev_count, len(today_df)

    except Exception as e:
        print(f"\n❌ 对比失败:{str(e)}")
        return -1, -1, -1

if __name__ == "__main__":
    # Brand registry: key = brand name, value = (brand directory, crawl URL).
    BRANDS_CONFIG = {
        "小米-手机": (
            "E:/crawls/小米-手机/",
            "https://mall.jd.com/view_search-442829-1000004123-1000004123-0-0-0-0-1-1-60.html?keyword=手机"
        ),
        "华为-手机": (
            "E:/crawls/华为-手机/",
            "https://mall.jd.com/view_search-466323-1000004259-1000004259-0-0-0-0-1-1-60.html?keyword=手机"
        ),
    }
    banner = "=" * 50
    for brand_name, (brand_dir, crawl_url) in BRANDS_CONFIG.items():
        print("\n" + banner)
        print(f"开始处理【{brand_name}】")
        print(banner)
        paths = get_excel_paths(brand_dir)
        compare_two_days_products(today_excel_path=paths[0],
                                  prev_day_excel_path=paths[1])

查看机票价格涨降情况

python 复制代码
from DrissionPage.errors import ElementNotFoundError
import time
import random
from datetime import datetime
from DrissionPage import ChromiumPage

class Highlight:
    """ANSI escape sequences for colourising terminal output."""

    RED = '\033[91m'    # red: a price change (drop or rise)
    GREEN = '\033[92m'  # green: price unchanged
    END = '\033[0m'     # reset all attributes


# Flights to watch. Per-entry fields:
#   plane_no          — exact flight label as shown on the page (lookup key)
#   ori_price         — price when monitoring started
#   current_benchmark — last price seen; updated whenever the price changes
#   history_lowest    — lowest price observed so far
#   depart_time / arrive_time — display only
TARGET_FLIGHTS = [
    {"plane_no": "CZ3952 波音787(大)", "ori_price": 1660, "current_benchmark": 1660, "history_lowest": 1660, "depart_time": "20:55", "arrive_time":"23:50"},
    {"plane_no": "HU7250 波音737(中)", "ori_price": 1700, "current_benchmark": 1700, "history_lowest": 1700, "depart_time": "17:55", "arrive_time":"21:00"},
    {"plane_no": "CZ6528 C919(中)", "ori_price": 1860, "current_benchmark": 1860, "history_lowest": 1860, "depart_time": "17:15", "arrive_time":"21:10"},
    {"plane_no": "CZ3696 空客A320(中)", "ori_price": 2040, "current_benchmark": 2040, "history_lowest": 2040, "depart_time": "19:55", "arrive_time":"22:45"}
]

def monitor_multi_flights():
    """Continuously monitor Ctrip prices for the flights in TARGET_FLIGHTS.

    Each round: load the one-way flight list, scrape a {flight-no: price}
    map, compare every tracked flight against its rolling benchmark
    (``current_benchmark``), highlight drops/rises, track the historical
    low, then wait a random 5-9 minutes and reload with a fresh ``ct``
    timestamp. Runs until an exception or Ctrl+C; always closes the browser
    and prints a per-flight lowest-price summary on exit.
    """
    page = ChromiumPage()
    base_url = "https://flights.ctrip.com/online/list/oneway-tyn-can"

    try:
        # ct is a cache-busting millisecond timestamp.
        current_ct = str(int(datetime.now().timestamp() * 1000))
        url = f"{base_url}?_=1&depdate=2025-10-07&cabin=Y_S_C_F&ct={current_ct}"
        page.get(url, show_errmsg=True)
        print(f"已进入目标页面:{url}")
        print(f"当前监测航班总数:{len(TARGET_FLIGHTS)} → {[f['plane_no'] for f in TARGET_FLIGHTS]}\n")
        while True:
            # Scroll to make sure every flight card is lazily loaded.
            page.wait(5)
            for _ in range(10):
                page.scroll.down()
                page.wait(1)
            page.wait(2)

            # Collect all flight containers currently on the page.
            items = page.eles('xpath://div[@class="flight-box"]')
            current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"{'='*80}")
            print(f"{current_time} - 页面共加载 {len(items)} 个航班容器 | 【监测中:最低价格追踪】")
            print(f"{'='*80}")

            # Build flight number -> current price for this round.
            flight_price_map = {}
            for item in items:
                try:
                    current_plane_no = item.ele('xpath:.//span[@class="plane-No"]').text
                    price_text = item.ele('xpath:.//span[@class="price"]').text
                    current_price = int(price_text.split("¥")[1])
                    flight_price_map[current_plane_no] = current_price
                except ElementNotFoundError:
                    continue  # container without number/price — skip silently
                except Exception as e:
                    print(f"⚠️  解析单个航班容器出错:{str(e)}")
                    continue

            for flight in TARGET_FLIGHTS:
                plane_no = flight["plane_no"]
                ori_price = flight["ori_price"]
                current_benchmark = flight["current_benchmark"]
                history_lowest = flight["history_lowest"]
                depart_time = flight["depart_time"]
                arrive_time = flight["arrive_time"]

                if plane_no not in flight_price_map:
                    print(f"❌ 未找到航班:{plane_no}(可能已下架或页面未加载)")
                    continue

                current_price = flight_price_map[plane_no]
                print(f"\n📅 航班:{plane_no}")
                print(f"   出发时间:{depart_time} | 到达时间:{arrive_time} | 初始价格:¥{ori_price} | 当前价格:¥{current_price} | 历史最低:¥{history_lowest}")

                # Price dropped since last round: red highlight.
                if current_price < current_benchmark:
                    # BUG FIX: measure the change against the rolling benchmark,
                    # not ori_price — once the benchmark has moved, the old
                    # formula reported a wrong (possibly negative) amount.
                    drop_amount = current_benchmark - current_price
                    print(f"   {Highlight.RED}✅ 降价了!下降金额:¥{drop_amount}{Highlight.END}")
                    flight["current_benchmark"] = current_price

                # Price rose since last round: red highlight.
                elif current_price > current_benchmark:
                    rise_amount = current_price - current_benchmark  # BUG FIX: was computed vs ori_price
                    print(f"   {Highlight.RED}⚠️  涨价了!上涨金额:¥{rise_amount}{Highlight.END}")
                    flight["current_benchmark"] = current_price

                # Price unchanged: green.
                else:
                    print(f"   {Highlight.GREEN}ℹ️  价格未变{Highlight.END}")

                if current_price < history_lowest:
                    flight["history_lowest"] = current_price  # new all-time low
                    print(f"   {Highlight.RED}🔥 刷新历史最低价格!新历史最低:¥{current_price}{Highlight.END}")

            # Random 5-9 minute pause before the next round.
            wait_minutes = random.randint(5, 9)
            wait_seconds = wait_minutes * 60
            print(f"\n{'-'*60}")
            print(f"当前轮监测结束,将在 {wait_minutes} 分钟后刷新页面...")
            print(f"{'-'*60}\n")
            time.sleep(wait_seconds)

            # Reload via a fresh URL (not page.refresh()) to avoid a
            # repeated-request fingerprint.
            current_ct = str(int(datetime.now().timestamp() * 1000))
            new_url = f"{base_url}?_=1&depdate=2025-10-07&cabin=Y_S_C_F&ct={current_ct}"
            page.get(new_url)
            # Wait (up to 15 s) for the flight cards to load.
            page.wait.eles_loaded('xpath://div[contains(@class, "flight-box")]', timeout=15)

    except Exception as e:
        print(f"\n❌ 监测主逻辑出错:{str(e)}")
    finally:
        page.close()
        print("\n浏览器已关闭,监测结束")
        print(f"\n{'='*80}")
        print(f"监测结束 | 各航班最终历史最低价格:")
        for f in TARGET_FLIGHTS:
            print(f"  - {f['plane_no']}:¥{f['history_lowest']}")
        print(f"{'='*80}")

if __name__ == "__main__":
    monitor_multi_flights()  # runs until interrupted (e.g. Ctrl+C)

输出为

bash 复制代码
已进入目标页面:https://flights.ctrip.com/online/list/oneway-tyn-can?_=1&depdate=2025-10-07&cabin=Y_S_C_F&ct=1757552041927
当前监测航班总数:4 → ['CZ3952 波音787(大)', 'HU7250 波音737(中)', 'CZ6528 C919(中)', 'CZ3696 空客A320(中)']

================================================================================
2025-09-11 08:54:20 - 页面共加载 25 个航班容器 | 【监测中:最低价格追踪】
================================================================================

📅 航班:CZ3952 波音787(大)
   出发时间:20:55 | 到达时间:23:50 | 初始价格:¥1660 | 当前价格:¥1660 | 历史最低:¥1660
   ℹ️  价格未变

📅 航班:HU7250 波音737(中)
   出发时间:17:55 | 到达时间:21:00 | 初始价格:¥1700 | 当前价格:¥1700 | 历史最低:¥1700
   ℹ️  价格未变

📅 航班:CZ6528 C919(中)
   出发时间:17:15 | 到达时间:21:10 | 初始价格:¥1860 | 当前价格:¥1860 | 历史最低:¥1860
   ℹ️  价格未变

📅 航班:CZ3696 空客A320(中)
   出发时间:19:55 | 到达时间:22:45 | 初始价格:¥2040 | 当前价格:¥2040 | 历史最低:¥2040
   ℹ️  价格未变

------------------------------------------------------------
当前轮监测结束,将在 9 分钟后刷新页面...
------------------------------------------------------------
相关推荐
阿加犀智能2 小时前
使用Langchain生成本地rag知识库并搭载大模型
服务器·python·langchain
木木子99992 小时前
行业学习【电商】:直播电商的去头部化、矩阵号?
学习
huabuyu2 小时前
将 Markdown 转为 AST:实现思路与实战解析
前端
前端Hardy2 小时前
惊艳同事的 Canvas 事件流程图,这篇教会你
前端·javascript·css
哔哩哔哩技术2 小时前
KMP on iOS 深度工程化:模块化、并发编译与 98% 增量构建加速
前端
朱自清的诗.2 小时前
使用python脚本储存mosquito服务器数据到sqlite
python·单片机·sqlite·esp32
神仙别闹2 小时前
基于 Vue+SQLite3开发吉他谱推荐网站
前端·vue.js·sqlite
xiao-xiang2 小时前
Django的session机制
python·django
Async Cipher2 小时前
CSS 居中
前端·css·css3