DrissionPage的学习

目录

查看店铺内商品上新情况

python 复制代码
from DrissionPage import ChromiumPage
from DataRecorder import Recorder
from datetime import datetime, timedelta
import pandas as pd
import os
import time

def crawl_jd(url, r, good_name):
    """Crawl every product (url, title, price) from a JD shop search page.

    Scrolls each result page to force lazy-loaded items to render, filters
    out the unrelated "recommended" items appended at the bottom, records
    each new product through ``r``, and follows the "next page" button until
    a page yields no unseen products.

    Args:
        url: JD shop search URL to open first.
        r: DataRecorder ``Recorder``; receives (url, title, price) tuples.
        good_name: keyword that must appear in a product title for it to be
            kept (e.g. "手机").
    """
    page = ChromiumPage()
    crawled_urls = set()  # product URLs already recorded — cross-page de-dup

    try:
        page.get(url, show_errmsg=True)
        page.wait(2) 
        print(f"已进入目标页面:{url}")

        while True:
            # Scroll in steps so the page's lazy loading fetches every item.
            for _ in range(9):
                page.scroll.down(pixel=1000)
                page.wait(3)
            page.wait(2)
            
            items = page.eles('xpath://li[@class="jSubObject"]')
            current_page_item_count = len(items)
            print(f"\n当前页商品总数:{current_page_item_count}")
            new_crawled = 0  # products first seen on this page

            for item in items:
                try:
                    info_url = item.ele('xpath:.//div[@class="jDesc"]/a').link
                    if info_url in crawled_urls:
                        print(f"→ 商品已重复,跳过:{info_url[:50]}...") 
                        continue

                    title = item.ele('xpath:.//div[@class="jDesc"]/a').text.strip()
                    if good_name not in title: # drop the "recommended" items at the bottom of the page
                        continue
                    # Wait (up to 3 s) for the price element to exist before reading it.
                    item.ele('xpath:.//span[@class="jdNum"]', timeout=3)
                    price = item.ele('xpath:.//div[@class="jdPrice"]').text.replace(' ', '')
                    # Keep only the integer part, e.g. "¥799.00" -> "¥799".
                    price = price.split('.')[0] if '.' in price else price

                    total_count = len(crawled_urls) + 1 
                    print(f"→ 已抓取{total_count}个商品:链接:{info_url[:50]}... | 标题:{title[:30]}... | 价格:{price}")
                    r.add_data((info_url, title, price))
                    crawled_urls.add(info_url)
                    new_crawled += 1

                except Exception as e:
                    print(f"→ 单个商品提取失败:{str(e)[:80]}...")  # truncate the error text to 80 chars
                    continue

            # Stop condition: a page with no unseen products means the last
            # page is being re-served.
            if new_crawled == 0:
                print(f"\n当前页无新商品(新抓取数:{new_crawled}),已爬取到最后一页")
                break

            try:
                next_btn = page('下一页', timeout=2)  # locate the "next page" button by its text
                if next_btn:
                    print(f"\n当前页新抓取{new_crawled}个商品,准备进入下一页")
                    next_btn.click()
                    page.wait.load_start()  # wait for the new page to start loading
                    page.wait(2)  # extra wait so the new page's first items render
                else:
                    print(f"\n未找到下一页按钮,停止爬取")
                    break
            except Exception as e:
                print(f"\n点击下一页失败:{str(e)[:80]}...,停止爬取")
                break

    except Exception as main_e:
        print(f"\n爬取主逻辑异常:{str(main_e)}")
    finally:
        print(f"\n爬取结束!总计抓取商品数量:{len(crawled_urls)}")
        r.record()  # flush buffered rows to the Excel file
        print(f"数据已保存到Excel文件")
        # Close the browser to release resources (avoid leftover processes).
#         page.close()
#         print("浏览器已关闭")

def compare_two_days_products(today_excel_path, prev_day_excel_path, save_new_path=None):
    """Compare today's crawled products against the previous day's.

    A product whose URL appears today but not yesterday counts as newly
    listed. If yesterday's file is missing (first crawl), every product
    today counts as new.

    Args:
        today_excel_path: today's Excel file (columns: url, 标题, 价格).
        prev_day_excel_path: previous day's Excel file (only ``url`` is read).
        save_new_path: unused; kept for interface compatibility.

    Returns:
        (new_product_count, prev_day_count, today_count) on success,
        (-1, -1, -1) on any error.
    """
    try:
        today_df = pd.read_excel(today_excel_path, usecols=["url", "标题", "价格"])
        today_df = today_df.dropna(subset=["url"])  # drop rows with an empty url
        print(f"\n✅ 成功读取当天数据:{today_excel_path}")
        print(f"   - 当天商品总数:{len(today_df)}")

        # Read the previous day's data (only the url column is needed).
        if not os.path.exists(prev_day_excel_path):
            print(f"❌ 前一天文件不存在:{prev_day_excel_path},无法对比(视为首次爬取,所有商品都是上新)")
            new_products_df = today_df
            # BUG FIX: prev_df was undefined on this branch, so the final
            # `len(prev_df)` raised NameError and the broad except turned a
            # successful first crawl into (-1, -1, -1).
            prev_count = 0
        else:
            prev_df = pd.read_excel(prev_day_excel_path, usecols=["url"])
            prev_df = prev_df.dropna(subset=["url"])
            prev_urls = set(prev_df["url"])
            print(f"✅ 成功读取前一天数据:{prev_day_excel_path}")
            print(f"   - 前一天商品总数:{len(prev_df)}")
            new_products_df = today_df[~today_df["url"].isin(prev_urls)]  # ~ means "not contained"
            prev_count = len(prev_df)

        new_product_count = len(new_products_df)
        print(f"\n📊 对比结果:当天上新商品数量 = {new_product_count}")

        if new_product_count > 0:
            print("\n🔍 上新商品示例:")
            for idx, row in new_products_df.iterrows():
                print(f"   {idx+1}. 标题:{row['标题'][:30]} | URL:{row['url'][:50]}")
        else:
            print("❌ 当天无上新商品")
        return new_product_count, prev_count, len(today_df)

    except Exception as e:
        print(f"\n❌ 对比失败:{str(e)}")
        return -1, -1, -1

def get_excel_paths(brand_dir):
    """Build today's and yesterday's Excel paths under *brand_dir*.

    Ensures *brand_dir* exists. File names use the MMDD date stamp,
    e.g. ``0911_data.xlsx``.

    Returns:
        (today_excel_path, prev_day_excel_path)
    """
    os.makedirs(brand_dir, exist_ok=True)

    now = datetime.now()
    yesterday = now - timedelta(days=1)
    today_path = f"{brand_dir}/{now.strftime('%m%d')}_data.xlsx"
    prev_path = f"{brand_dir}/{yesterday.strftime('%m%d')}_data.xlsx"
    return today_path, prev_path

def run_brand_crawl_and_compare(brand_dir, brand_name, crawl_url):
    """Crawl one brand's shop, save today's data, then diff against yesterday.

    Returns the (new_count, prev_count, today_count) tuple produced by
    compare_two_days_products.
    """
    today_path, prev_path = get_excel_paths(brand_dir)

    # A leftover file for today would cause duplicated/conflicting rows,
    # so remove it before recording.
    if os.path.exists(today_path):
        try:
            os.remove(today_path)
        except Exception as e:
            print(f"❌ 删除已存在的Excel文件失败:{str(e)}")
            raise Exception(f"文件删除失败,无法继续爬取:{str(e)}")
        print(f"⚠️  检测到当天Excel文件已存在,已删除:{today_path}")

    recorder = Recorder(today_path)
    recorder.set.head(('url', '标题', '价格'))

    # The part after "-" in the brand key (e.g. "小米-手机" -> "手机") is the
    # keyword that product titles must contain.
    keyword = brand_name.split("-")[1]
    crawl_jd(url=crawl_url, r=recorder, good_name=keyword)

    return compare_two_days_products(today_excel_path=today_path,
                                     prev_day_excel_path=prev_path)

if __name__ == "__main__":
    # Brand registry: key = "<brand>-<keyword>", value = (data dir, shop search URL).
    BRANDS_CONFIG = {
        "小米-手机": (
            "E:/crawls/小米-手机/",
            "https://mall.jd.com/view_search-442829-1000004123-1000004123-0-0-0-0-1-1-60.html?keyword=手机"
        ),
        "华为-手机": (
            "E:/crawls/华为-手机/",
            "https://mall.jd.com/view_search-466323-1000004259-1000004259-0-0-0-0-1-1-60.html?keyword=手机"
        ),
    }
    counts = []
    brand_names = []
    for brand_name, (brand_dir, crawl_url) in BRANDS_CONFIG.items():
        print(f"\n" + "="*50)
        print(f"开始处理【{brand_name}】")
        print("="*50)
        new_product_count, prev_count, today_count = run_brand_crawl_and_compare(brand_dir=brand_dir,  brand_name=brand_name, crawl_url=crawl_url)
        brand_names.append(brand_name)
        counts.append([new_product_count, prev_count, today_count])
        time.sleep(10)  # brief pause between brands to avoid hammering the site

    print(f"\n" + "="*50)
    print("所有品牌处理完成!")
    print("="*50)

    print(f"\n" + "="*80)
    print(f"【各品牌数据汇总】- 生成时间:{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("="*80)
    # Table header.
    print(f"{'品牌名称':<15} | {'上新商品数':<10} | {'前一天商品数':<12} | {'今天商品数':<10}")
    print("-"*80)
    for brand, (new_count, prev_count, today_count) in zip(brand_names, counts):
        # BUG FIX: compare_two_days_products signals failure with -1, never
        # None, so the old `is not None` checks let -1 leak into the report
        # (it printed "-1" in the 前一天/今天 columns). Treat -1 as N/A for
        # all three columns.
        new_count = new_count if new_count != -1 else "N/A"
        prev_count = prev_count if prev_count != -1 else "N/A"
        today_count = today_count if today_count != -1 else "N/A"
        print(f"{brand:<15} | {new_count:<10} | {prev_count:<12} | {today_count:<10}")
    print("="*80)

输出:

bash 复制代码
==================================================
开始处理【小米-手机】
==================================================
已进入目标页面:https://mall.jd.com/view_search-442829-1000004123-1000004123-0-0-0-0-1-1-60.html?keyword=手机

当前页商品总数:70
→ 已抓取1个商品:链接:https://item.jd.com/100200730249.html... | 标题:小米(MI)REDMI Note15 Pro 天玑7400-... | 价格:¥1399
→ 已抓取2个商品:链接:https://item.jd.com/100067438552.html... | 标题:小米Redmi Note13 5G 国家补贴 1亿像素 超细... | 价格:¥799
→ 已抓取3个商品:链接:https://item.jd.com/100129016845.html... | 标题:小米REDMI K80 国家补贴 第三代骁龙8 6550mA... | 价格:¥2099
→ 已抓取4个商品:链接:https://item.jd.com/100071383534.html... | 标题:小米14 徕卡光学镜头 光影猎人900 国家补贴 第三代骁龙... | 价格:¥2699
→ 已抓取5个商品:链接:https://item.jd.com/100071390038.html... | 标题:小米14 徕卡光学镜头 光影猎人900 国家补贴 第三代骁龙... | 价格:¥2999
→ 已抓取6个商品:链接:https://item.jd.com/100158171899.html... | 标题:小米(MI)【国家补贴】REDMI Turbo 4 Pro ... | 价格:¥1799
→ 已抓取7个商品:链接:https://item.jd.com/100068892989.html... | 标题:小米 Redmi Note13Pro 骁龙7S 国家补贴 新... | 价格:¥1129
→ 已抓取8个商品:链接:https://item.jd.com/100157830510.html... | 标题:小米 REDMI K80 Pro 国家补贴 骁龙8至尊版 全... | 价格:¥3399
...
爬取结束!总计抓取商品数量:83
E:\crawls\小米-手机\0911_data.xlsx 开始写入文件,切勿关闭进程。
E:\crawls\小米-手机\0911_data.xlsx 写入文件结束。
数据已保存到Excel文件

✅ 成功读取当天数据:E:/crawls/小米-手机//0911_data.xlsx
   - 当天商品总数:83
❌ 前一天文件不存在:E:/crawls/小米-手机//0910_data.xlsx,无法对比(视为首次爬取,所有商品都是上新)
...
==================================================
所有品牌处理完成!
==================================================

================================================================================
【各品牌数据汇总】- 生成时间:2025-09-11 17:59:53
================================================================================
品牌名称            | 上新商品数      | 前一天商品数       | 今天商品数
--------------------------------------------------------------------------------
小米-手机           | N/A        | -1           | -1
华为-手机           | N/A        | -1           | -1
================================================================================

只对比前后两天的商品上新情况(不含爬取部分)

python 复制代码
from datetime import datetime, timedelta
import pandas as pd
import os
import time

def get_excel_paths(brand_dir):
    """Return (today_path, prev_day_path) Excel paths under *brand_dir*.

    File names carry an MMDD stamp, e.g. ``0911_data.xlsx``.
    """
    now = datetime.now()
    yesterday = now - timedelta(days=1)
    make = lambda stamp: f"{brand_dir}/{stamp}_data.xlsx"  # shared name template
    return make(now.strftime("%m%d")), make(yesterday.strftime("%m%d"))

def compare_two_days_products(today_excel_path, prev_day_excel_path, save_new_path=None):
    """Compare today's crawled products against the previous day's.

    A product whose URL appears today but not yesterday counts as newly
    listed; a missing previous-day file means every product today is new.

    Args:
        today_excel_path: today's Excel file (columns: url, 标题, 价格).
        prev_day_excel_path: previous day's Excel file (only ``url`` is read).
        save_new_path: unused; kept for interface compatibility.

    Returns:
        (new_product_count, prev_day_count, today_count) on success,
        (-1, -1, -1) on any error.
    """
    try:
        today_df = pd.read_excel(today_excel_path, usecols=["url", "标题", "价格"])
        today_df = today_df.dropna(subset=["url"])  # drop rows with an empty url
        print(f"\n✅ 成功读取当天数据:{today_excel_path}")
        print(f"   - 当天商品总数:{len(today_df)}")

        if not os.path.exists(prev_day_excel_path):
            print(f"❌ 前一天文件不存在:{prev_day_excel_path},无法对比(视为首次爬取,所有商品都是上新)")
            new_products_df = today_df
            # BUG FIX: prev_df was undefined on this branch, so the final
            # `len(prev_df)` raised NameError and the broad except turned a
            # successful first crawl into (-1, -1, -1).
            prev_count = 0
        else:
            prev_df = pd.read_excel(prev_day_excel_path, usecols=["url"])
            prev_df = prev_df.dropna(subset=["url"])
            prev_urls = set(prev_df["url"])
            print(f"✅ 成功读取前一天数据:{prev_day_excel_path}")
            print(f"   - 前一天商品总数:{len(prev_df)}")

            new_products_df = today_df[~today_df["url"].isin(prev_urls)]  # ~ means "not contained"
            prev_count = len(prev_df)

        new_product_count = len(new_products_df)
        print(f"\n📊 对比结果:当天上新商品数量 = {new_product_count}")

        if new_product_count > 0:
            print("\n🔍 上新商品示例:")
            for idx, row in new_products_df.iterrows():
                print(f"   {idx+1}. 标题:{row['标题'][:30]} | URL:{row['url'][:50]}")
        else:
            print("❌ 当天无上新商品")

        return new_product_count, prev_count, len(today_df)

    except Exception as e:
        print(f"\n❌ 对比失败:{str(e)}")
        return -1, -1, -1

if __name__ == "__main__":
    # Brand registry: key = brand name, value = (brand directory, crawl URL).
    BRANDS_CONFIG = {
        "小米-手机": (
            "E:/crawls/小米-手机/",
            "https://mall.jd.com/view_search-442829-1000004123-1000004123-0-0-0-0-1-1-60.html?keyword=手机"
        ),
        "华为-手机": (
            "E:/crawls/华为-手机/",
            "https://mall.jd.com/view_search-466323-1000004259-1000004259-0-0-0-0-1-1-60.html?keyword=手机"
        ),
    }
    banner = "=" * 50
    for brand_name, (brand_dir, crawl_url) in BRANDS_CONFIG.items():
        print("\n" + banner)
        print(f"开始处理【{brand_name}】")
        print(banner)
        paths = get_excel_paths(brand_dir)
        compare_two_days_products(today_excel_path=paths[0],
                                  prev_day_excel_path=paths[1])

查看机票价格涨降情况

python 复制代码
from DrissionPage.errors import ElementNotFoundError
import time
import random
from datetime import datetime
from DrissionPage import ChromiumPage

class Highlight:
    """ANSI escape sequences for colourising terminal output."""

    RED = '\033[91m'    # red: a price change (drop or rise)
    GREEN = '\033[92m'  # green: price unchanged
    END = '\033[0m'     # reset all attributes


# Flights to watch. Per-entry fields:
#   plane_no          — exact flight label as shown on the page (lookup key)
#   ori_price         — price when monitoring started
#   current_benchmark — last price seen; updated whenever the price changes
#   history_lowest    — lowest price observed so far
#   depart_time / arrive_time — display only
TARGET_FLIGHTS = [
    {"plane_no": "CZ3952 波音787(大)", "ori_price": 1660, "current_benchmark": 1660, "history_lowest": 1660, "depart_time": "20:55", "arrive_time":"23:50"},
    {"plane_no": "HU7250 波音737(中)", "ori_price": 1700, "current_benchmark": 1700, "history_lowest": 1700, "depart_time": "17:55", "arrive_time":"21:00"},
    {"plane_no": "CZ6528 C919(中)", "ori_price": 1860, "current_benchmark": 1860, "history_lowest": 1860, "depart_time": "17:15", "arrive_time":"21:10"},
    {"plane_no": "CZ3696 空客A320(中)", "ori_price": 2040, "current_benchmark": 2040, "history_lowest": 2040, "depart_time": "19:55", "arrive_time":"22:45"}
]

def monitor_multi_flights():
    """Continuously monitor Ctrip prices for the flights in TARGET_FLIGHTS.

    Each round: load the one-way flight list, scrape a {flight-no: price}
    map, compare every tracked flight against its rolling benchmark
    (``current_benchmark``), highlight drops/rises, track the historical
    low, then wait a random 5-9 minutes and reload with a fresh ``ct``
    timestamp. Runs until an exception or Ctrl+C; always closes the browser
    and prints a per-flight lowest-price summary on exit.
    """
    page = ChromiumPage()
    base_url = "https://flights.ctrip.com/online/list/oneway-tyn-can"

    try:
        # ct is a cache-busting millisecond timestamp.
        current_ct = str(int(datetime.now().timestamp() * 1000))
        url = f"{base_url}?_=1&depdate=2025-10-07&cabin=Y_S_C_F&ct={current_ct}"
        page.get(url, show_errmsg=True)
        print(f"已进入目标页面:{url}")
        print(f"当前监测航班总数:{len(TARGET_FLIGHTS)} → {[f['plane_no'] for f in TARGET_FLIGHTS]}\n")
        while True:
            # Scroll to make sure every flight card is lazily loaded.
            page.wait(5)
            for _ in range(10):
                page.scroll.down()
                page.wait(1)
            page.wait(2)

            # Collect all flight containers currently on the page.
            items = page.eles('xpath://div[@class="flight-box"]')
            current_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            print(f"{'='*80}")
            print(f"{current_time} - 页面共加载 {len(items)} 个航班容器 | 【监测中:最低价格追踪】")
            print(f"{'='*80}")

            # Build flight number -> current price for this round.
            flight_price_map = {}
            for item in items:
                try:
                    current_plane_no = item.ele('xpath:.//span[@class="plane-No"]').text
                    price_text = item.ele('xpath:.//span[@class="price"]').text
                    current_price = int(price_text.split("¥")[1])
                    flight_price_map[current_plane_no] = current_price
                except ElementNotFoundError:
                    continue  # container without number/price — skip silently
                except Exception as e:
                    print(f"⚠️  解析单个航班容器出错:{str(e)}")
                    continue

            for flight in TARGET_FLIGHTS:
                plane_no = flight["plane_no"]
                ori_price = flight["ori_price"]
                current_benchmark = flight["current_benchmark"]
                history_lowest = flight["history_lowest"]
                depart_time = flight["depart_time"]
                arrive_time = flight["arrive_time"]

                if plane_no not in flight_price_map:
                    print(f"❌ 未找到航班:{plane_no}(可能已下架或页面未加载)")
                    continue

                current_price = flight_price_map[plane_no]
                print(f"\n📅 航班:{plane_no}")
                print(f"   出发时间:{depart_time} | 到达时间:{arrive_time} | 初始价格:¥{ori_price} | 当前价格:¥{current_price} | 历史最低:¥{history_lowest}")

                # Price dropped since last round: red highlight.
                if current_price < current_benchmark:
                    # BUG FIX: measure the change against the rolling benchmark,
                    # not ori_price — once the benchmark has moved, the old
                    # formula reported a wrong (possibly negative) amount.
                    drop_amount = current_benchmark - current_price
                    print(f"   {Highlight.RED}✅ 降价了!下降金额:¥{drop_amount}{Highlight.END}")
                    flight["current_benchmark"] = current_price

                # Price rose since last round: red highlight.
                elif current_price > current_benchmark:
                    rise_amount = current_price - current_benchmark  # BUG FIX: was computed vs ori_price
                    print(f"   {Highlight.RED}⚠️  涨价了!上涨金额:¥{rise_amount}{Highlight.END}")
                    flight["current_benchmark"] = current_price

                # Price unchanged: green.
                else:
                    print(f"   {Highlight.GREEN}ℹ️  价格未变{Highlight.END}")

                if current_price < history_lowest:
                    flight["history_lowest"] = current_price  # new all-time low
                    print(f"   {Highlight.RED}🔥 刷新历史最低价格!新历史最低:¥{current_price}{Highlight.END}")

            # Random 5-9 minute pause before the next round.
            wait_minutes = random.randint(5, 9)
            wait_seconds = wait_minutes * 60
            print(f"\n{'-'*60}")
            print(f"当前轮监测结束,将在 {wait_minutes} 分钟后刷新页面...")
            print(f"{'-'*60}\n")
            time.sleep(wait_seconds)

            # Reload via a fresh URL (not page.refresh()) to avoid a
            # repeated-request fingerprint.
            current_ct = str(int(datetime.now().timestamp() * 1000))
            new_url = f"{base_url}?_=1&depdate=2025-10-07&cabin=Y_S_C_F&ct={current_ct}"
            page.get(new_url)
            # Wait (up to 15 s) for the flight cards to load.
            page.wait.eles_loaded('xpath://div[contains(@class, "flight-box")]', timeout=15)

    except Exception as e:
        print(f"\n❌ 监测主逻辑出错:{str(e)}")
    finally:
        page.close()
        print("\n浏览器已关闭,监测结束")
        print(f"\n{'='*80}")
        print(f"监测结束 | 各航班最终历史最低价格:")
        for f in TARGET_FLIGHTS:
            print(f"  - {f['plane_no']}:¥{f['history_lowest']}")
        print(f"{'='*80}")

if __name__ == "__main__":
    monitor_multi_flights()  # runs until interrupted (e.g. Ctrl+C)

输出为

bash 复制代码
已进入目标页面:https://flights.ctrip.com/online/list/oneway-tyn-can?_=1&depdate=2025-10-07&cabin=Y_S_C_F&ct=1757552041927
当前监测航班总数:4 → ['CZ3952 波音787(大)', 'HU7250 波音737(中)', 'CZ6528 C919(中)', 'CZ3696 空客A320(中)']

================================================================================
2025-09-11 08:54:20 - 页面共加载 25 个航班容器 | 【监测中:最低价格追踪】
================================================================================

📅 航班:CZ3952 波音787(大)
   出发时间:20:55 | 到达时间:23:50 | 初始价格:¥1660 | 当前价格:¥1660 | 历史最低:¥1660
   ℹ️  价格未变

📅 航班:HU7250 波音737(中)
   出发时间:17:55 | 到达时间:21:00 | 初始价格:¥1700 | 当前价格:¥1700 | 历史最低:¥1700
   ℹ️  价格未变

📅 航班:CZ6528 C919(中)
   出发时间:17:15 | 到达时间:21:10 | 初始价格:¥1860 | 当前价格:¥1860 | 历史最低:¥1860
   ℹ️  价格未变

📅 航班:CZ3696 空客A320(中)
   出发时间:19:55 | 到达时间:22:45 | 初始价格:¥2040 | 当前价格:¥2040 | 历史最低:¥2040
   ℹ️  价格未变

------------------------------------------------------------
当前轮监测结束,将在 9 分钟后刷新页面...
------------------------------------------------------------
相关推荐
阿加犀智能2 小时前
使用Langchain生成本地rag知识库并搭载大模型
服务器·python·langchain
木木子99992 小时前
行业学习【电商】:直播电商的去头部化、矩阵号?
学习
huabuyu2 小时前
将 Markdown 转为 AST:实现思路与实战解析
前端
前端Hardy2 小时前
惊艳同事的 Canvas 事件流程图,这篇教会你
前端·javascript·css
哔哩哔哩技术2 小时前
KMP on iOS 深度工程化:模块化、并发编译与 98% 增量构建加速
前端
朱自清的诗.2 小时前
使用python脚本储存mosquito服务器数据到sqlite
python·单片机·sqlite·esp32
神仙别闹2 小时前
基于 Vue+SQLite3开发吉他谱推荐网站
前端·vue.js·sqlite
xiao-xiang2 小时前
Django的session机制
python·django
Async Cipher2 小时前
CSS 居中
前端·css·css3