文章目录
- [0 前提概要](#0 前提概要)
- [1 项目需要导入的库](#1 项目需要导入的库)
- [2 浏览头参数配置](#2 浏览头参数配置)
- [3 登录页面](#3 登录页面)
- [4 提取指定标签数据 + 数据保存](#4 提取指定标签数据 + 数据保存)
- [5 主程序](#5 主程序)
0 前提概要
- 创建虚拟环境
bash
# python.exe 替换成本地实际存放的地址
python.exe -m venv .venv
- 下载ChromeDriver驱动程序,存放在指定位置
选择与本地Chrome浏览器对应版本的驱动程序,可参考这篇博主文章
- 需要下载的库
bash
pip install selenium bs4
- 键盘快捷键
按 F12 打开开发者工具进行页面分析,或者点击浏览器【右上角"···" -> 更多工具 -> 开发人员工具/开发者工具】
在
https://www.dianping.com/shanghai/ch25 页面中切换不同的页码时,URL 会有规律地发生变化:第 2 页是
https://www.dianping.com/shanghai/ch25/p2,第 3 页是 https://www.dianping.com/shanghai/ch25/p3。页面变化规律很容易发现,接下来就开始编写代码。
1 项目需要导入的库
python
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time
import pickle
import csv
2 浏览头参数配置
python
def driver_instantiation(
    driver_path=r"C:\ChromeDriver\chromedriver.exe",
    *,
    user_agent="添加浏览器请求头信息",
    headless=False,
):
    """Create a Chrome WebDriver with automation-hiding options applied.

    Args:
        driver_path: Path of the local ChromeDriver executable.
        user_agent: User-Agent string handed to Chrome. The default is only
            a placeholder and should be replaced with a real browser
            User-Agent.
        headless: When True, Chrome is started with ``--headless=new``.

    Returns:
        The started ``webdriver.Chrome`` instance. The caller is responsible
        for calling ``quit()`` on it.
    """
    chrome_options = Options()

    # Stealth settings: drop the switches/features that advertise automation.
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    chrome_options.add_experimental_option("useAutomationExtension", False)
    for flag in (
        "--disable-blink-features=AutomationControlled",
        "--disable-extensions",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        chrome_options.add_argument(flag)

    if headless:
        chrome_options.add_argument("--headless=new")

    # Send a browser-like User-Agent to reduce the chance of being blocked.
    chrome_options.add_argument(f'--user-agent="{user_agent}"')

    service = Service(driver_path)
    return webdriver.Chrome(service=service, options=chrome_options)
3 登录页面
- 本篇选择使用手机号和短信验证码的方式登录
python
def login_interface(driver, url, timeout=10):
    """Open *url*, run the login flow and report whether it succeeded.

    Success is detected by the login-page marker text "扫码登录更安全"
    no longer being present in the page source. The page is polled for up
    to *timeout* seconds so that a slow redirect after the login click is
    not mistaken for a failed login.

    Args:
        driver: Selenium WebDriver used for the session.
        url: Page to open; it is expected to present the login form.
        timeout: Seconds to wait for the login page to go away.

    Returns:
        True when login succeeded, False when the login page was still
        shown after *timeout* seconds (the driver is quit in that case) or
        when any other error occurred.
    """
    # Local import: only this function needs the exception type.
    from selenium.common.exceptions import TimeoutException

    try:
        driver.get(url)
        login_info(driver)
        try:
            WebDriverWait(driver, timeout).until(
                lambda d: "扫码登录更安全" not in d.page_source
            )
        except TimeoutException:
            print("登录失败")
            driver.quit()
            return False
        print("登录成功")
        return True
    except Exception as e:
        print(f"出错:{e}")
        return False
def login_info(driver, phone_num="实际手机号"):
    """Fill in the phone number and SMS verification code, then submit.

    The verification code is read interactively from standard input after
    the "send code" button has been clicked.

    Args:
        driver: Selenium WebDriver currently showing the login page.
        phone_num: Phone number typed into the form. The default is only a
            placeholder and must be replaced with a real number.

    Raises:
        selenium.common.exceptions.TimeoutException: If the button that
            switches to account login does not become clickable within
            10 seconds. Errors while filling the form are printed instead
            of raised.
    """
    # Switch from the default login view to the account login form.
    switch_btn = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CLASS_NAME, "bottom-password-login"))
    )
    switch_btn.click()
    print("已切换到账号登录")

    try:
        # Wait for the form itself rather than sleeping a fixed time.
        phone_box = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.ID, "mobile-number-textbox"))
        )

        # 1. Enter the phone number.
        phone_box.send_keys(phone_num)
        print("手机号输入完成")

        # 2. Tick the agreement checkbox.
        driver.find_element(By.ID, "pc-check").click()

        # 3. Request the verification code.
        driver.find_element(By.ID, "send-vcode-button").click()

        # 4. The code is typed by the user.
        verification_num = input("请输入验证码:")
        driver.find_element(By.ID, "number-textbox").send_keys(verification_num)

        # 5. Submit the login form.
        driver.find_element(By.CLASS_NAME, "button-pc").click()
    except Exception as e:
        print(f"登录出错:{e}")
4 提取指定标签数据 + 数据保存
python
def extract_url(
    driver,
    base_url="https://www.dianping.com/shanghai/ch25",
    max_pages=50,
    output_path="title_href.csv",
):
    """Crawl the paginated shop listing and append name/link rows to a CSV.

    Page 1 is *base_url* itself; page ``n`` (n >= 2) is ``{base_url}/p{n}``.
    A page whose shop list does not load is reported and skipped; the crawl
    continues with the next page.

    Args:
        driver: Selenium WebDriver. It is expected to already show a listing
            page (e.g. right after login), because the shop list is awaited
            once before the first navigation.
        base_url: URL of the first listing page.
        max_pages: Number of pages to visit, starting from page 1.
        output_path: CSV file the ``[name, href]`` rows are appended to.

    Returns:
        None. Errors are printed; on ``KeyboardInterrupt`` the driver is
        quit and the function returns.
    """
    base_url = base_url.rstrip("/")
    try:
        # Wait once for the listing on the current page. Doing this inside
        # the loop would abort the whole crawl right after a skipped page.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, "//*[@class='shop-list J_shop-list shop-all-list']")
            )
        )

        for page in range(1, max_pages + 1):
            url = base_url if page == 1 else f"{base_url}/p{page}"
            driver.get(url)

            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "shop-all-list"))
                )
            except Exception:
                # Not a bare except: Ctrl+C must reach the outer handler.
                print(f"第 {page} 页加载超时,跳过")
                continue

            soup = BeautifulSoup(driver.page_source, "html.parser")
            shop_list = soup.find("div", {"id": "shop-all-list"})
            if not shop_list:
                print(f"第 {page} 页未找到店铺列表")
                continue

            list_items = shop_list.find_all("li")
            print(f"第 {page} 页找到{len(list_items)}个店铺")

            page_data = _shop_rows(list_items)
            if not page_data:
                print(f"第 {page} 页未提取到有效数据")
                continue

            try:
                # utf-8-sig keeps the Chinese names readable in Excel.
                with open(output_path, "a", newline="", encoding="utf-8-sig") as csvfile:
                    csv.writer(csvfile).writerows(page_data)
                print("成功保存到 title_href.csv 文件中")
            except Exception as e:
                print(f"写入 title_href.csv 文件出错:{e}")
    except KeyboardInterrupt:
        driver.quit()
    except Exception as e:
        print(f"标签提取过程出错:{e}")


def _shop_rows(list_items):
    """Return ``[name, href]`` rows for the list items that contain a shop link."""
    rows = []
    for item in list_items:
        try:
            txt_div = item.find("div", {"class": "txt"})
            link = txt_div.find("a") if txt_div else None
            if not link:
                continue
            href = link.get("href", "")
            name = link.get_text(strip=True)
            rows.append([name, href])
        except Exception as e:
            print(f"解析单个店铺信息出错:{e}")
    return rows
5 主程序
python
def main():
    """Start Chrome, log in, crawl the shop listing and always close the browser."""
    url = r"https://www.dianping.com/shanghai/ch25"

    # Create the ChromeDriver instance.
    driver = driver_instantiation()
    try:
        # Log in through the listing page.
        login_interface(driver, url)
        # Collect shop names and their detail-page URLs.
        extract_url(driver)
    finally:
        # Release the browser even if an interrupt or error escapes above.
        driver.quit()


if __name__ == "__main__":
    main()