"用 Python 写浏览器"可以有多种理解:可以是控制现有浏览器 (自动化测试/爬虫),也可以是解析和渲染网页 (构建浏览器引擎的一部分),还可以是实现一个简单的 HTTP 客户端+解析器。由于不可能真正穷举所有可能的语句,我将按照不同的技术方向,整理出最核心、最常用的 Python 语句和库,它们共同构成了"用 Python 与浏览器交互"的完整工具箱。
1. 浏览器自动化(控制真实浏览器)
这些库通过驱动 Chrome、Firefox 等浏览器,模拟真实用户操作。
1.1 Selenium(WebDriver)
python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Launch a browser and open a page.
driver = webdriver.Chrome()  # or Firefox(), Edge()
driver.get('https://example.com')

# Locate elements: one call per locator strategy.
elem = driver.find_element(By.ID, 'id')
elem = driver.find_element(By.NAME, 'name')
elem = driver.find_element(By.CLASS_NAME, 'class')
elem = driver.find_element(By.TAG_NAME, 'div')
elem = driver.find_element(By.LINK_TEXT, 'link')
elem = driver.find_element(By.PARTIAL_LINK_TEXT, 'partial')
elem = driver.find_element(By.XPATH, '//div[@class="foo"]')
elem = driver.find_element(By.CSS_SELECTOR, '#main > .content')

# Element operations.
elem.click()
elem.send_keys('text')
elem.clear()
elem.submit()
elem.get_attribute('href')
elem.text

# Execute JavaScript in the page.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

# Waiting: explicit wait for one element, and a global implicit wait.
# NOTE: the Selenium documentation advises against mixing both kinds in one
# session; pick one in real code.
wait = WebDriverWait(driver, 10)
wait.until(EC.presence_of_element_located((By.ID, 'myId')))
driver.implicitly_wait(5)

# Cookies: these need a live session, so they run before close()/quit().
driver.get_cookies()
driver.add_cookie({'name': 'foo', 'value': 'bar'})

# Window / tab management, then shutdown (always last).
driver.switch_to.window(driver.window_handles[0])
driver.close()  # closes the current window
driver.quit()   # ends the whole browser session
1.2 Playwright(现代跨浏览器自动化)
python
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    # page.pdf() below is only supported by headless Chromium, so launch
    # headless. Use headless=False (and drop the pdf() call) to watch the run.
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://example.com')

    # Locate elements and act on them.
    page.fill('#username', 'user')
    page.click('button[type="submit"]')
    page.check('#checkbox')
    page.select_option('#dropdown', value='option1')

    # Read content.
    text = page.text_content('.result')
    html = page.inner_html('.container')
    title = page.title()
    url = page.url

    # Waiting.
    page.wait_for_selector('.loaded', timeout=5000)
    page.wait_for_load_state('networkidle')

    # Screenshot and PDF.
    page.screenshot(path='screenshot.png')
    page.pdf(path='page.pdf')

    # Execute JavaScript in the page.
    page.evaluate('() => window.scrollTo(0, document.body.scrollHeight)')

    browser.close()
1.3 Pyppeteer(Python 版 Puppeteer)
python
import asyncio
from pyppeteer import launch


async def main():
    """Open a page, take a screenshot, type into a field and click a button."""
    browser = await launch(headless=False)
    page = await browser.newPage()
    await page.goto('https://example.com')
    await page.screenshot({'path': 'example.png'})
    await page.type('#input', 'text')
    await page.click('#button')
    await browser.close()


# asyncio.run() creates and closes the event loop itself; calling
# get_event_loop() with no running loop is deprecated in current Python.
asyncio.run(main())
2. 模拟浏览器请求(不渲染)
使用 HTTP 客户端库,可以发送各种请求,模拟浏览器的网络行为。
2.1 requests(最常用)
python
import requests

# Target used by the examples below that take a `url` variable.
url = 'https://example.com'

# Basic requests.
resp = requests.get('https://example.com')
resp = requests.post('https://httpbin.org/post', data={'key': 'value'})
resp = requests.put('https://httpbin.org/put', json={'key': 'value'})
resp = requests.delete('https://httpbin.org/delete')

# Custom request headers.
headers = {'User-Agent': 'Mozilla/5.0 ...'}
resp = requests.get(url, headers=headers)

# Sending cookies.
cookies = {'session_id': 'abc123'}
resp = requests.get(url, cookies=cookies)

# Session: state set by the login request is reused by later requests.
session = requests.Session()
session.post('https://example.com/login', data={'user': 'a'})
resp = session.get('https://example.com/dashboard')

# Proxies.
proxies = {'http': 'http://proxy:8080', 'https': 'http://proxy:8080'}
resp = requests.get(url, proxies=proxies)

# Working with the response.
resp.status_code
resp.headers
resp.text      # body as text
resp.content   # body as bytes
resp.json()    # parse the body as JSON
2.2 urllib(标准库)
python
from urllib import request, parse

# GET
with request.urlopen('https://example.com') as f:
    html = f.read().decode('utf-8')

# POST: build the request, set its headers, then send it.
data = parse.urlencode({'key': 'value'}).encode()
req = request.Request('https://httpbin.org/post', data=data, method='POST')
# Headers must be added before urlopen(); adding them afterwards has no
# effect on a request that was already sent.
req.add_header('User-Agent', 'Mozilla/5.0')
with request.urlopen(req) as f:
    result = f.read()
2.3 httpx(支持 HTTP/2,异步)
python
import httpx

# Synchronous API.
resp = httpx.get('https://example.com')
resp = httpx.post('https://httpbin.org/post', json={'key': 'value'})


# Asynchronous API.
async def fetch():
    """Fetch https://example.com with an async client and return the body text."""
    async with httpx.AsyncClient() as client:
        resp = await client.get('https://example.com')
        return resp.text
3. 解析 HTML / XML
获取网页源码后,需要提取数据。
3.1 BeautifulSoup4
python
from bs4 import BeautifulSoup

# `html` is the page source obtained earlier (e.g. resp.text).
soup = BeautifulSoup(html, 'html.parser')  # or 'lxml'

# Find elements.
soup.find('div', id='content')
soup.find_all('a', class_='link')
soup.select('div#main > p')  # CSS selector

# Read data from a tag. find() returns None when nothing matches, so guard
# before touching its attributes.
tag = soup.find('a')
if tag is not None:
    tag.text
    tag.get('href')
    tag.attrs
3.2 lxml(基于 libxml2)
python
import lxml.html

# Import the package by its full name: `from lxml import html` would rebind
# `html`, the name used for the page-source string in the other examples.
tree = lxml.html.fromstring(html)
elements = tree.xpath('//div[@class="item"]/a/@href')
# xpath() returns a list; a page without <title> yields an empty one.
titles = tree.xpath('//title/text()')
text = titles[0] if titles else None
3.3 pyquery(类似 jQuery 语法)
python
from pyquery import PyQuery as pq

doc = pq(html)
links = doc('a')
# .items() yields each match already wrapped as a PyQuery object, so there is
# no need to re-wrap the raw element with pq(...).
for link in links.items():
    print(link.attr('href'))
3.4 html.parser(标准库,简单)
python
from html.parser import HTMLParser
class MyParser(HTMLParser):
    """HTML parser that prints the name of every start tag it encounters."""

    def handle_starttag(self, tag, attrs):
        """Print the tag name; `attrs` (the tag's attributes) is ignored."""
        print(tag)
# Create the parser and feed it the document; each start tag is printed as it
# is parsed. `html` is presumably the page source obtained earlier — confirm
# it is defined before running this.
parser = MyParser()
parser.feed(html)
4. 无头浏览器渲染(JavaScript 支持)
对于需要执行 JS 才能看到内容的页面,可以使用无头浏览器。
4.1 Selenium 无头模式
python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)
try:
    driver.get('https://example.com')
    html = driver.page_source
finally:
    # Always shut the browser down, even if loading the page failed;
    # otherwise the headless browser process is left running.
    driver.quit()
4.2 Playwright 无头模式
python
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://example.com')
    html = page.content()  # full HTML of the page after it has loaded
    browser.close()
4.3 requests-html(基于 Pyppeteer 的无头浏览器)
python
from requests_html import HTMLSession

# Fetch the page through an HTMLSession, execute its JavaScript, then print
# the resulting markup.
session = HTMLSession()
r = session.get('https://example.com')
page = r.html
page.render()  # execute JS
print(page.html)
5. 内嵌浏览器(GUI 应用程序)
在 Python GUI 程序中嵌入一个真实的浏览器控件。
5.1 PyQt5 / PySide2 的 QWebEngineView
python
import sys
from PyQt5.QtCore import QUrl
from PyQt5.QtWidgets import QApplication, QMainWindow
from PyQt5.QtWebEngineWidgets import QWebEngineView

app = QApplication(sys.argv)
window = QMainWindow()
view = QWebEngineView()
# setUrl() takes a QUrl; passing a plain str raises TypeError in PyQt5.
view.setUrl(QUrl('https://example.com'))
window.setCentralWidget(view)
window.show()
sys.exit(app.exec_())
5.2 wxPython 的 WebView(需要 wx.html2)
python
import wx
import wx.html2

# Build the application and a top-level frame that hosts the web view.
app = wx.App()
frame = wx.Frame(None, title="Browser")

# Create the WebView as a child of the frame and point it at the page.
webview = wx.html2.WebView.New(frame)
webview.LoadURL("https://example.com")

# Show the window and hand control to the wx event loop.
frame.Show()
app.MainLoop()
5.3 tkinter + cefpython3(嵌入 Chromium)
python
from cefpython3 import cefpython as cef
import tkinter as tk


def main():
    """Initialise CEF, create the Tk root window and run the CEF message loop."""
    cef.Initialize()
    root = tk.Tk()
    root.geometry("800x600")
    # Creating the embedded browser window needs extra handling that is not
    # shown here; see the cefpython examples.
    cef.MessageLoop()
    cef.Shutdown()


if __name__ == '__main__':
    main()
6. 简单 HTTP 服务器(调试用)
有时需要启动一个本地服务器来测试网页。
6.1 http.server(标准库)
python
import http.server
import socketserver

PORT = 8000
Handler = http.server.SimpleHTTPRequestHandler

# Serve the current working directory until interrupted; the `with` block
# closes the listening socket on exit.
with socketserver.TCPServer(("", PORT), Handler) as httpd:
    print(f"Serving at port {PORT}")
    httpd.serve_forever()
6.2 Flask(轻量级 Web 框架)
python
from flask import Flask, render_template

app = Flask(__name__)


@app.route('/')
def index():
    """Serve a minimal HTML page at the site root."""
    return '<h1>Hello, Browser!</h1>'


# Start the development server only when run as a script, so importing this
# module does not block on the server loop.
if __name__ == '__main__':
    app.run(port=5000)
7. 浏览器引擎与高级应用
这些库用于构建自定义浏览器或进行网页分析。
7.1 PyQtWebEngine(完整的浏览器引擎)
python
from PyQt5.QtWidgets import QApplication
from PyQt5.QtWebEngineWidgets import QWebEngineView
# 同上,可以扩展为完整浏览器
7.2 mitmproxy(中间人代理,用于调试浏览器请求)
python
# mitmproxy 提供脚本 API,可在 Python 中修改请求/响应
from mitmproxy import ctx
def request(flow):
    """Overwrite the User-Agent header of the intercepted request.

    Modifies `flow.request.headers` in place and returns nothing.
    """
    flow.request.headers["User-Agent"] = "CustomBrowser"
7.3 selenium-stealth(隐藏自动化特征)
python
from selenium.webdriver import Chrome
from selenium_stealth import stealth

# Keyword arguments handed to selenium-stealth, gathered in one mapping.
stealth_options = {
    'languages': ["en-US", "en"],
    'vendor': "Google Inc.",
    'platform': "Win32",
    'webgl_vendor': "Intel Inc.",
    'renderer': "Intel Iris OpenGL Engine",
    'fix_hairline': True,
}

driver = Chrome()
stealth(driver, **stealth_options)
8. 辅助语句:下载资源、处理 JavaScript
python
# Download an image.
import requests

resp = requests.get('https://example.com/image.png')
resp.raise_for_status()  # fail loudly instead of saving an error page as a PNG
img_data = resp.content
with open('local.png', 'wb') as f:
    f.write(img_data)
# Extract a JavaScript variable from the page source (regex here; js2py is an
# alternative).
import json
import re

pattern = r'var data = ({.*?});'
# DOTALL lets `.` match newlines so the object literal may span several lines.
match = re.search(pattern, html, re.DOTALL)
if match:
    # json.loads only succeeds when the JS object literal is also valid JSON
    # (double-quoted keys, no trailing commas).
    data = json.loads(match.group(1))
# Run JavaScript code from Python (PyExecJS).
import execjs

# JavaScript source defining the function that is called below.
js_source = """
function add(a, b) {
return a + b;
}
"""
ctx = execjs.compile(js_source)
result = ctx.call("add", 1, 2)
总结
"Python 写浏览器"的语句可以归纳为以下几类:
| 类别 | 代表性语句/库 |
|---|---|
| 控制浏览器 | webdriver.Chrome(), page.goto(), element.click() |
| 模拟请求 | requests.get(), urllib.request.urlopen() |
| 解析页面 | BeautifulSoup(), lxml.html.fromstring(), PyQuery() |
| 无头渲染 | options.add_argument('--headless'), r.html.render() |
| 内嵌浏览器 | QWebEngineView, wx.html2.WebView |
| 本地服务器 | http.server, Flask.run() |
| 高级调试 | mitmproxy, selenium-stealth |
由于 Python 生态极为丰富,上述内容已覆盖了绝大多数开发者在"写浏览器"时需要用到的核心语句。如果你有特定的场景(如爬虫、自动化测试、定制浏览器),可以进一步深入探讨。