1、 实验目标
-
实现京东平台的模拟登录;
-
爬取手机类商品的基本信息与详情数据;
-
将数据保存为结构化文件(CSV格式);
-
实现多页爬取,确保数据完整性。
2、实验思路与过程
访问京东页面,发现通过搜索页面进入商品页面需要登录,所以本实验采用selenuim库驱动webdiver进行登录;
进行登录时,除了账号密码,还需完成滑块验证码,鉴于京东反爬较为严格,试错机会少,本实验采用人为干预完成滑块验证码;
登录成功后,进入搜索页面,遍历搜索结果的每一个商品,点击进入商品页面;
进入商品页面,通过xpath定位并获取详细信息,完成数据获取后关闭页面回到搜索页面;
第一页商品数据获取完成后,通过xpath定位并点击下一页,继续爬取;
最后保存所有数据为csv格式。
3、代码实现
python
import time
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import requests
import csv
jd_login_url = "https://passport.jd.com/uc/login"
jd_firstpage_url = "https://www.jd.com"
class JDMobileSpider:
def __init__(self):
#login_cookies = self.get_cookies()
self.driver = None
self.data = []
self.headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
'Connection': 'keep-alive',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
#'Cookie': login_cookies
}
def init_edge_browser(self,url):
"""初始化Edge浏览器驱动"""
try:
# 设置Edge浏览器选项
edge_options = Options()
# 常用配置选项
edge_options.add_argument('--disable-blink-features=AutomationControlled')
edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
edge_options.add_experimental_option('useAutomationExtension', False)
edge_options.add_argument('--no-sandbox')
edge_options.add_argument('--disable-dev-shm-usage')
# 初始化Edge浏览器
self.driver = webdriver.Edge(options=edge_options)
# 隐藏自动化特征
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
# 设置窗口大小
self.driver.set_window_size(1400, 900)
print("Edge浏览器初始化成功")
time.sleep(1)
print("正在打开京东登录页面...")
self.driver.get(url)
time.sleep(3)
return True
except Exception as e:
print(f"Edge浏览器初始化失败: {e}")
return False
def jd_login(self, username, password):
"""京东账号登录(使用Edge浏览器)"""
try:
# 等待页面加载并切换到账户登录
WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.ID, "loginname"))
)
# 尝试点击账户登录标签
#try:
# account_login = self.driver.find_element(By.XPATH, "//*[@id='loginsubmit']")
# account_login.click()
# time.sleep(3)
#except:
# print("未找到账户登录标签,可能已是账户登录模式")
# 输入用户名
print("正在输入用户名...")
user_input = self.driver.find_element(By.ID, "loginname")
user_input.clear()
user_input.send_keys(username)
time.sleep(3)
# 输入密码
print("正在输入密码...")
pwd_input = self.driver.find_element(By.ID, "nloginpwd")
pwd_input.clear()
pwd_input.send_keys(password)
time.sleep(3)
# 点击登录按钮
print("正在点击登录按钮...")
submit_btn = self.driver.find_element(By.ID, "loginsubmit")
submit_btn.click()
# 等待登录结果
time.sleep(5)
# 处理滑块验证
try:
# 设定一个等待时间,检查滑块验证码元素是否出现
print("正在处理特殊验证码...")
wait = WebDriverWait(self.driver, 5) # 等待最多10秒
slider = wait.until(EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div/div/div/div[2]/span[1]"))) # 使用京东滑块验证码的相关类名:cite[1]
if slider:
print("检测到滑块验证码,请手动完成滑动...")
input("完成后,请在控制台按回车键继续...") # 程序在此暂停,等待用户操作:cite[3]
except Exception as e:
# 如果在设定时间内没有找到滑块元素,可能直接登录成功或出现了其他情况
print("未检测到滑块验证码,或验证码形式有变化:", e)
# 验证登录是否成功
# 手动滑动完成后,检查登录是否成功
try:
#等待页面跳转并检查用户昵称元素
WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "nickname")) # 登录成功后会出现昵称元素
)
print("登录成功!")
except Exception as e:
print("登录可能失败,请检查:", e)
except Exception as e:
print("登录可能失败1,请检查:", e)
def get_mobile_data(self, pages=3):
"""
获取手机数据(每页约40个商品,3页约120个确保获取前60畅销)
Args:
keyword: 搜索关键词
pages: 爬取页数
"""
try:
all_products_labels = ["商品名称","价格","评论数","商品详情"]
all_products = []
for page in range(1, pages + 1):
jd_search_url = f"https://re.jd.com/search?keyword=%E4%BA%AC%E4%B8%9C%E6%89%8B%E6%9C%BA%E8%87%AA%E8%90%A5&page={page}&ad_od=3&re_dcp=21Sm2D2ZOw&traffic_source=1004&bd_vid=9286869643b8e0c2&cu=true&utm_source=haosou-search&utm_medium=cpc&utm_campaign=t_262767352_haosousearch&utm_term=63857745620_0_9998f764b1f84bdd80ee784885202003"
print(f"正在爬取第{page}页数据...")
# 京东搜索URL
self.driver.get(jd_search_url)
original_window = self.driver.current_window_handle
time.sleep(5)
# 模拟滚动加载所有商品
#self.scroll_page()
for i in range(5):
self.driver.execute_script('window.scrollBy(0, 100)')
time.sleep(1)
# 解析当前页商品
j = 0
print("获取商品数据中...")
products = self.driver.find_elements(By.XPATH, "//*[@id='shop_list']/li")
for product in products:
try:
price = product.find_element(By.XPATH, ".//*[@class='price']").text.strip()
name = product.find_element(By.XPATH, ".//*[@class='commodity_tit']").text.strip()
comment_count = product.find_element(By.XPATH, ".//*[@class='comment']").text.strip()
print(f"第{j+1}个商品:")
print(f"名字: {name}, 价格: {price}, 评论数: {comment_count}")
time.sleep(3)
except Exception as e:
print("未找到商品元素,跳过此商品")
continue
try:
print("正在获取商品链接...")
link_element = product.find_element(By.XPATH, ".//div/div[1]/a")
link_element.click()
time.sleep(6)
#link = link_element.get_attribute("href")
except Exception as e:
print(f"未找到商品链接: {e}")
WebDriverWait(self.driver, 10).until(EC.number_of_windows_to_be(2))
windows = self.driver.window_handles
for window in windows:
if window != original_window:
self.driver.switch_to.window(window)
print("切换到新窗口:", self.driver.current_url)
break
try:
details = []
print("正在获取手机详情数据...")
time.sleep(5)
#self.check_if_need_login()
WebDriverWait(self.driver, 10).until(
EC.presence_of_element_located((By.CLASS_NAME, "nickname")) #登录成功后会出现昵称元素
)
self.driver.execute_script('window.scrollBy(0, 200)')
time.sleep(3)
#input("等待继续执行,请在控制台按回车键继续...") # 程序在此暂停,等待用户操作
infs = self.driver.find_element(By.XPATH, "//*[@id='detail']/div[2]/div[3]/div[2]")
inf_labels = infs.find_elements(By.XPATH, ".//*[@class='flex-center']")
inf_values = infs.find_elements(By.XPATH, ".//*[@class='adaptive']")
for inf_label, inf_value in zip(inf_labels, inf_values):
inf_label_n = inf_label.text.strip()
inf_value_n = inf_value.text.strip()
details.append((inf_label_n, ':', inf_value_n))
#print(f"{inf_label_n}: {inf_value_n}")
time.sleep(3)
print("手机详情数据获取完毕!")
except Exception as e:
print(f"获取手机详情数据失败: {e}")
#details = self.get_mobile_detail()
print("正在返回上一页...")
time.sleep(3)
self.driver.close()
details = '|'.join([str(detail) for detail in details])
infmation = [name, price, comment_count, details]
one_product = dict(zip(all_products_labels, infmation))
all_products.append(one_product)
#print(f"第{j+1}个手机详情数据:")
#print(all_products[j])
time.sleep(3)
self.driver.switch_to.window(original_window)
j = j + 1
#self.next_page()
# 避免频繁请求
time.sleep(5)
self.save_data(all_products, all_products_labels)
time.sleep(3)
self.driver.quit()
except Exception as e:
print(f"获取手机数据失败: {e}")
def save_data(self, data, labels):
"""
保存数据
Args:
data: 要保存的数据
"""
try:
with open('jd_mobile_data.csv', 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=labels)
writer.writeheader()
writer.writerows(data)
print("数据保存成功!")
except Exception as e:
print(f"数据保存失败: {e}")
def next_page(self):
"""
翻页
"""
try:
next_page_element = self.driver.find_element(By.XPATH, "//*[@id='page']/a[7]")
next_page_element.click()
time.sleep(5)
except Exception as e:
print(f"翻页失败: {e}")
if __name__ == '__main__':
spider = JDMobileSpider()
spider.init_edge_browser(jd_login_url)
spider.jd_login('your_user_name', 'your_password')
spider.get_mobile_data()
4、结果展示(部分)

5、总结
本实验成功构建了一个爬取京东手机相关数据的爬虫,大部分操作均可全自动进行。
未来改进方向:
1、加入检测验证模块:当连续访问多个商品界面时,京东会对用户进行人机验证,需要处理验证后继续数据获取。
2、数据初步处理:爬取的数据较为杂乱,可以在保存之前先一步进行处理,减小后期进行数据分析的工作量。