基于selenium库的爬虫实战：京东手机数据爬取

1、实验目标

实现京东平台的模拟登录；
爬取手机类商品的基本信息与详情数据；
将数据保存为结构化文件（CSV格式）；
实现多页爬取，确保数据完整性。

2、实验思路与过程

访问京东页面，发现通过搜索页面进入商品页面需要登录，所以本实验采用selenuim库驱动webdiver进行登录；

进行登录时，除了账号密码，还需完成滑块验证码，鉴于京东反爬较为严格，试错机会少，本实验采用人为干预完成滑块验证码；

登录成功后，进入搜索页面，遍历搜索结果的每一个商品，点击进入商品页面；

进入商品页面，通过xpath定位并获取详细信息，完成数据获取后关闭页面回到搜索页面；

第一页商品数据获取完成后，通过xpath定位并点击下一页，继续爬取；

最后保存所有数据为csv格式。

3、代码实现

python 复制代码

import time
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import csv
jd_login_url = "https://passport.jd.com/uc/login"
jd_firstpage_url = "https://www.jd.com"

class JDMobileSpider:
    def __init__(self):
        #login_cookies = self.get_cookies()
        self.driver = None
        self.data = []
        self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
                        'Connection': 'keep-alive',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                        #'Cookie': login_cookies
                        }
        

    def init_edge_browser(self,url):
        """初始化Edge浏览器驱动"""
        try:
            # 设置Edge浏览器选项
            edge_options = Options()
            
            # 常用配置选项
            edge_options.add_argument('--disable-blink-features=AutomationControlled')
            edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
            edge_options.add_experimental_option('useAutomationExtension', False)
            edge_options.add_argument('--no-sandbox')
            edge_options.add_argument('--disable-dev-shm-usage')
            # 初始化Edge浏览器
            self.driver = webdriver.Edge(options=edge_options)
            # 隐藏自动化特征
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
            # 设置窗口大小
            self.driver.set_window_size(1400, 900)
            print("Edge浏览器初始化成功")
            time.sleep(1)
            print("正在打开京东登录页面...")
            self.driver.get(url)
            time.sleep(3)
            return True

        except Exception as e:
            print(f"Edge浏览器初始化失败: {e}")
            return False
        
    def jd_login(self, username, password):
        """京东账号登录(使用Edge浏览器)"""
        try:
            # 等待页面加载并切换到账户登录
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.ID, "loginname"))
            )
            
            # 尝试点击账户登录标签
            #try:
            #    account_login = self.driver.find_element(By.XPATH, "//*[@id='loginsubmit']")
            #    account_login.click()
            #    time.sleep(3)
            #except:
            #    print("未找到账户登录标签，可能已是账户登录模式")
            
            # 输入用户名
            print("正在输入用户名...")
            user_input = self.driver.find_element(By.ID, "loginname")
            user_input.clear()
            user_input.send_keys(username)
            time.sleep(3)
            
            # 输入密码
            print("正在输入密码...")
            pwd_input = self.driver.find_element(By.ID, "nloginpwd")
            pwd_input.clear()
            pwd_input.send_keys(password)
            time.sleep(3)
            
            # 点击登录按钮
            print("正在点击登录按钮...")
            submit_btn = self.driver.find_element(By.ID, "loginsubmit")
            submit_btn.click()
            
            # 等待登录结果
            time.sleep(5)
            
            # 处理滑块验证
            try:
                # 设定一个等待时间，检查滑块验证码元素是否出现
                print("正在处理特殊验证码...")
                wait = WebDriverWait(self.driver, 5) # 等待最多10秒
                slider = wait.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div/div/div/div[2]/span[1]"))) # 使用京东滑块验证码的相关类名:cite[1]
    
                if slider:
                    print("检测到滑块验证码，请手动完成滑动...")
                    input("完成后，请在控制台按回车键继续...") # 程序在此暂停，等待用户操作:cite[3]
        
            except Exception as e:
            # 如果在设定时间内没有找到滑块元素，可能直接登录成功或出现了其他情况
                print("未检测到滑块验证码，或验证码形式有变化:", e)

            # 验证登录是否成功
            # 手动滑动完成后，检查登录是否成功
            try:
            #等待页面跳转并检查用户昵称元素
                WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, "nickname")) # 登录成功后会出现昵称元素
                )
                print("登录成功！")
               
    
            except Exception as e:
                print("登录可能失败，请检查:", e)
                
        except Exception as e:
            print("登录可能失败1,请检查:", e)


    def get_mobile_data(self, pages=3):
        """
        获取手机数据（每页约40个商品，3页约120个确保获取前60畅销）
        
        Args:
            keyword: 搜索关键词
            pages: 爬取页数
        """
        try:
            all_products_labels = ["商品名称","价格","评论数","商品详情"]
            all_products = []
            for page in range(1, pages + 1):
                jd_search_url = f"https://re.jd.com/search?keyword=%E4%BA%AC%E4%B8%9C%E6%89%8B%E6%9C%BA%E8%87%AA%E8%90%A5&page={page}&ad_od=3&re_dcp=21Sm2D2ZOw&traffic_source=1004&bd_vid=9286869643b8e0c2&cu=true&utm_source=haosou-search&utm_medium=cpc&utm_campaign=t_262767352_haosousearch&utm_term=63857745620_0_9998f764b1f84bdd80ee784885202003"
                print(f"正在爬取第{page}页数据...")
                
                # 京东搜索URL
                self.driver.get(jd_search_url)
                original_window = self.driver.current_window_handle
                time.sleep(5)
                
                # 模拟滚动加载所有商品
                #self.scroll_page()
                for i in range(5):
                    self.driver.execute_script('window.scrollBy(0, 100)')
                    time.sleep(1)
                # 解析当前页商品
                j = 0
                print("获取商品数据中...")
                products = self.driver.find_elements(By.XPATH, "//*[@id='shop_list']/li")
                for product in products:
                    try:
                        price = product.find_element(By.XPATH, ".//*[@class='price']").text.strip()
                        name = product.find_element(By.XPATH, ".//*[@class='commodity_tit']").text.strip()
                        comment_count = product.find_element(By.XPATH, ".//*[@class='comment']").text.strip()
                        print(f"第{j+1}个商品:")
                        print(f"名字: {name}, 价格: {price}, 评论数: {comment_count}")
                        time.sleep(3)
                    except Exception as e:
                        print("未找到商品元素,跳过此商品")
                        continue
                    try:
                        print("正在获取商品链接...")
                        link_element = product.find_element(By.XPATH, ".//div/div[1]/a")
                        link_element.click()
                        time.sleep(6)
                        #link = link_element.get_attribute("href")
                    except Exception as e:
                        print(f"未找到商品链接: {e}")

                    WebDriverWait(self.driver, 10).until(EC.number_of_windows_to_be(2))
                    windows = self.driver.window_handles
                    for window in windows:
                        if window != original_window:
                            self.driver.switch_to.window(window)
                            print("切换到新窗口:", self.driver.current_url)
                            break
                    try:
                        details = []
                        print("正在获取手机详情数据...")
                        time.sleep(5)
                        #self.check_if_need_login()
                        WebDriverWait(self.driver, 10).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "nickname")) #登录成功后会出现昵称元素
                        )
                        self.driver.execute_script('window.scrollBy(0, 200)')
                        time.sleep(3)
                        #input("等待继续执行，请在控制台按回车键继续...") # 程序在此暂停，等待用户操作
                        infs = self.driver.find_element(By.XPATH, "//*[@id='detail']/div[2]/div[3]/div[2]")
                        inf_labels = infs.find_elements(By.XPATH, ".//*[@class='flex-center']")
                        inf_values = infs.find_elements(By.XPATH, ".//*[@class='adaptive']")
                        for inf_label, inf_value in zip(inf_labels, inf_values):
                            inf_label_n = inf_label.text.strip()
                            inf_value_n = inf_value.text.strip()
                            details.append((inf_label_n, ':', inf_value_n))
                            #print(f"{inf_label_n}: {inf_value_n}")
                        time.sleep(3)
                        print("手机详情数据获取完毕！")
                    except Exception as e:
                        print(f"获取手机详情数据失败: {e}")
                    #details = self.get_mobile_detail()
                    print("正在返回上一页...")
                    time.sleep(3)
                    self.driver.close()
                    details = '|'.join([str(detail) for detail in details])
                    infmation = [name, price, comment_count, details]
                    one_product = dict(zip(all_products_labels, infmation))
                    all_products.append(one_product)
                    #print(f"第{j+1}个手机详情数据:")
                    #print(all_products[j])
                    time.sleep(3)
                    self.driver.switch_to.window(original_window)
                    j = j + 1
                #self.next_page()
                # 避免频繁请求
                time.sleep(5)
            
            self.save_data(all_products, all_products_labels)
            time.sleep(3)
            self.driver.quit()
        except Exception as e:
            print(f"获取手机数据失败: {e}")


    def save_data(self, data, labels):
        """
        保存数据
        
        Args:
            data: 要保存的数据
        """
        try:
            with open('jd_mobile_data.csv', 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=labels)
                writer.writeheader()
                writer.writerows(data)
            print("数据保存成功！")
        except Exception as e:
            print(f"数据保存失败: {e}")

    def next_page(self):
        """
        翻页
        """
        try:
            next_page_element = self.driver.find_element(By.XPATH, "//*[@id='page']/a[7]")
            next_page_element.click()
            time.sleep(5)
        except Exception as e:
            print(f"翻页失败: {e}")

if __name__ == '__main__':
    spider = JDMobileSpider()
    spider.init_edge_browser(jd_login_url)
    spider.jd_login('your_user_name', 'your_password')
    spider.get_mobile_data()

4、结果展示（部分）

5、总结

本实验成功构建了一个爬取京东手机相关数据的爬虫，大部分操作均可全自动进行。

未来改进方向：

1、加入检测验证模块：当连续访问多个商品界面时，京东会对用户进行人机验证，需要处理验证后继续数据获取。

2、数据初步处理：爬取的数据较为杂乱，可以在保存之前先一步进行处理，减小后期进行数据分析的工作量。