python下几个淘宝、天猫、京东爬虫实例

以下是使用Python编写的针对淘宝、天猫、京东详情页的爬虫实例。请注意,这些实例仅供参考,实际使用时可能需要根据网站结构的变化进行调整,并且需要遵守各平台的爬虫协议和法律法规。

淘宝详情页爬虫实例

  1. 环境准备

    • Python 3.x
    • Selenium库
    • ChromeDriver(或对应浏览器的WebDriver)
  2. 代码实现

from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import csv

# Taobao product crawler: searches for a keyword, collects detail-page links
# from the result grid, then visits each detail page and writes
# title / price / seller / location / URL rows to taobao_products.csv.
# Requires Chrome plus a matching chromedriver on PATH.
driver = webdriver.Chrome()

try:
    # Open the Taobao home page.
    driver.get('https://www.taobao.com/')
    driver.maximize_window()    # maximize the browser window
    driver.implicitly_wait(10)  # implicit wait applied to element lookups

    # Search for a product (keyword here is '手机', i.e. "mobile phone").
    search_keyword = '手机'
    driver.find_element(By.XPATH, '//*[@id="q"]').send_keys(search_keyword)
    driver.find_element(By.XPATH, '//*[@id="J_TSearchForm"]/div[1]/button').click()

    # Wait for the search-result page to load.
    # NOTE(review): a fixed sleep is fragile; WebDriverWait would be more robust.
    time.sleep(5)

    # Parse the result page and collect the detail-page link of every item.
    product_links = []
    for item in driver.find_elements(By.XPATH, '//div[@class="grid g-clearfix"]/div/div'):
        detail_url = item.find_element(By.XPATH, './/div[@class="pic"]/a').get_attribute('href')
        product_links.append(detail_url)

    # Visit each detail page and append the extracted fields to the CSV file.
    with open('taobao_products.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['title', 'price', 'seller', 'location', 'detail_url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        for link in product_links:
            try:
                driver.get(link)
                time.sleep(3)  # wait for the detail page to load

                title = driver.find_element(By.XPATH, '//*[@id="J_DetailHeader"]/div[1]/h1').text
                price = driver.find_element(By.XPATH, '//*[@id="J_StrPrice"]/em').text
                seller = driver.find_element(By.XPATH, '//*[@id="J_OtherOptions"]/div[1]/p[1]/a').text
                location = driver.find_element(By.XPATH, '//*[@id="J_OtherOptions"]/div[1]/p[2]/span').text
            except Exception as e:
                # One broken page must not abort the whole crawl.
                print(f"Error extracting {link}: {e}")
                continue

            writer.writerow({
                'title': title,
                'price': price,
                'seller': seller,
                'location': location,
                'detail_url': link,
            })
finally:
    # Always release the browser, even when the crawl fails part-way;
    # the original never reached quit() after an exception.
    driver.quit()

天猫详情页爬虫实例

  1. 环境准备:与淘宝相同。
  2. 代码实现(以搜索"羽毛球"为例):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv

# Tmall product crawler: searches for a keyword, walks every result page via
# the "jump to page" form, and writes title / price / detail-URL rows to
# tmall_products.csv. Requires Chrome plus a matching chromedriver on PATH.
driver = webdriver.Chrome()

try:
    # Open Tmall's list site.
    driver.get('https://list.tmall.com/')
    driver.maximize_window()
    driver.implicitly_wait(10)

    # Search by navigating straight to the search URL (keyword: '羽毛球').
    search_keyword = '羽毛球'
    driver.get(f'https://list.tmall.com/search_product.htm?q={search_keyword}')

    # Wait for the pagination widget, then read the total page count.
    wait = WebDriverWait(driver, 10)
    page_total_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.tm-pagination .ui-page-item.ui-page-item-last em')))
    page_total = page_total_element.text

    # Walk every result page and collect the product summaries.
    product_info = []
    for page in range(1, int(page_total) + 1):
        try:
            # From page 2 onward, jump to the page via the skip-to form.
            if page > 1:
                input_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.ui-page > div.ui-page-wrap > b.ui-page-skip > form > input.ui-page-skipTo')))
                input_element.clear()
                input_element.send_keys(page)
                submit_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.ui-page > div.ui-page-wrap > b.ui-page-skip > form > button.ui-btn-s')))
                submit_button.click()
                time.sleep(2)  # wait for the new page to render

            # Extract every product card on the current page.
            goods = driver.find_elements(By.CSS_SELECTOR, '#J_ItemList .product')
            for good in goods:
                title = good.find_element(By.CSS_SELECTOR, '.productTitle').text
                price = good.find_element(By.CSS_SELECTOR, '.productPrice').text.replace('¥', '')
                detail_url = good.find_element(By.CSS_SELECTOR, '.productImg').get_attribute('href')
                product_info.append({
                    'title': title,
                    'price': price,
                    'detail_url': detail_url,
                })
        except Exception as e:
            # A failing page is logged and skipped; the crawl continues.
            print(f"Error on page {page}: {e}")

    # Persist everything that was collected to CSV.
    with open('tmall_products.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['title', 'price', 'detail_url']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for product in product_info:
            writer.writerow(product)
finally:
    # Always release the browser; the original leaked the Chrome process when
    # any step before the final quit() raised (e.g. pagination not found).
    driver.quit()

京东详情页爬虫实例

  1. 环境准备:与淘宝相同。
  2. 代码实现(以搜索"手机"为例,并提取详情页图片):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import os
import requests

# JD (京东) crawler: searches for a keyword, collects the detail-page link of
# every result item, then opens each detail page, extracts the gallery image
# URLs, and downloads the images into ./jd_images/<product-id>/.
# Requires Chrome plus a matching chromedriver on PATH.
driver = webdriver.Chrome()

try:
    # Open JD search and submit the keyword ('手机').
    driver.get('https://search.jd.com/')
    driver.maximize_window()
    driver.implicitly_wait(10)

    search_keyword = '手机'
    driver.find_element(By.XPATH, '//*[@id="key"]').send_keys(search_keyword)
    driver.find_element(By.XPATH, '//*[@id="search"]/div/button').click()

    # Explicit-wait helper for the result page.
    wait = WebDriverWait(driver, 10)

    # Collect the detail-page link of every result item.
    product_links = []
    for item in driver.find_elements(By.CSS_SELECTOR, '.gl-item'):
        detail_url = item.find_element(By.CSS_SELECTOR, '.p-name em a').get_attribute('href')
        product_links.append(detail_url)

    # Browser-like User-Agent for the image downloads.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for link in product_links:
        driver.get(link)
        time.sleep(3)  # wait for the detail page to load

        # Extract the gallery image URLs.
        image_urls = []
        try:
            images = driver.find_elements(By.CSS_SELECTOR, '.sku-gallery img')
            for img in images:
                image_urls.append(img.get_attribute('src'))
        except Exception as e:
            print(f"Error extracting images from {link}: {e}")
            continue

        # One folder per product, named after the last URL path segment.
        image_dir = f'./jd_images/{link.split("/")[-1]}'
        os.makedirs(image_dir, exist_ok=True)

        # Download each image. The original stopped here after creating the
        # directory and never used headers/requests — this completes the job.
        for index, image_url in enumerate(image_urls):
            if not image_url:
                continue  # some <img> tags are lazy-loaded and have no src
            try:
                response = requests.get(image_url, headers=headers, timeout=10)
                response.raise_for_status()
                with open(os.path.join(image_dir, f'{index}.jpg'), 'wb') as f:
                    f.write(response.content)
            except Exception as e:
                print(f"Error downloading {image_url}: {e}")
finally:
    # Always release the browser; the original never called quit() at all.
    driver.quit()

相关推荐
阿珊和她的猫1 小时前
v-scale-screen: 根据屏幕尺寸缩放内容
开发语言·前端·javascript
fouryears_234174 小时前
Flutter InheritedWidget 详解:从生命周期到数据流动的完整解析
开发语言·flutter·客户端·dart
我好喜欢你~4 小时前
C#---StopWatch类
开发语言·c#
lifallen6 小时前
Java Stream sort算子实现:SortedOps
java·开发语言
IT毕设实战小研6 小时前
基于Spring Boot 4s店车辆管理系统 租车管理系统 停车位管理系统 智慧车辆管理系统
java·开发语言·spring boot·后端·spring·毕业设计·课程设计
wyiyiyi6 小时前
【Web后端】Django、flask及其场景——以构建系统原型为例
前端·数据库·后端·python·django·flask
mit6.8246 小时前
[1Prompt1Story] 滑动窗口机制 | 图像生成管线 | VAE变分自编码器 | UNet去噪神经网络
人工智能·python
没有bug.的程序员6 小时前
JVM 总览与运行原理:深入Java虚拟机的核心引擎
java·jvm·python·虚拟机
甄超锋7 小时前
Java ArrayList的介绍及用法
java·windows·spring boot·python·spring·spring cloud·tomcat
cui__OaO7 小时前
Linux软件编程--线程
linux·开发语言·线程·互斥锁·死锁·信号量·嵌入式学习