代码功能概述
本文展示了一个使用 Playwright 进行网页自动化操作的 Python 脚本,并针对网站的自动化检测做了规避处理。该脚本访问百度首页,自动输入查询词并提交搜索;同时通过禁用 Chromium 的 AutomationControlled 特性、隐藏 navigator.webdriver 属性以及在操作之间加入随机延时来模拟人类操作行为,从而降低被反爬虫机制识别的概率。
环境准备
bash
pip install playwright
python -m playwright install chromium
pip install beautifulsoup4  # 仅第二个示例(使用 BeautifulSoup 解析页面)需要
完整代码(带详细注释)
python
from playwright.sync_api import sync_playwright
import time
import random
def human_delay(a: float = 0.5, b: float = 1.5) -> None:
    """Pause for a random duration to imitate human reaction time.

    The delay is drawn uniformly between ``a`` and ``b``. A negative draw is
    clamped to zero because ``time.sleep`` raises ``ValueError`` for negative
    values.

    Args:
        a: One bound of the delay range, in seconds.
        b: The other bound of the delay range, in seconds.
    """
    time.sleep(max(0.0, random.uniform(a, b)))
with sync_playwright() as p:
    # Launch a visible Chromium with the AutomationControlled Blink feature
    # disabled (--disable-blink-features=AutomationControlled).
    browser = p.chromium.launch(
        headless=False,
        args=["--disable-blink-features=AutomationControlled"],
    )
    try:
        page = browser.new_page()
        # Inject a script that makes navigator.webdriver report undefined.
        # It is registered before goto() so it applies to the page we load.
        page.add_init_script(
            "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        )
        page.goto("https://baidu.com")
        page.fill("#chat-textarea", "ChatGPT")
        human_delay()
        page.click("#chat-submit-button")
        # Give the results time to render before reading the page text;
        # waiting for a specific result selector would work as well.
        page.wait_for_timeout(2000)
        text = page.inner_text("body")
        print(text)
        input("按回车关闭浏览器...")
    finally:
        # Close the browser even when one of the steps above raised.
        browser.close()
保存 Cookie 版本(使用持久化用户数据目录 my_profile,Cookie 和登录状态会在多次运行之间保留)
python
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import time
import random
def human_delay(a: float = 0.5, b: float = 1.5) -> None:
    """Pause for a random duration to imitate human reaction time.

    The delay is drawn uniformly between ``a`` and ``b``. A negative draw is
    clamped to zero because ``time.sleep`` raises ``ValueError`` for negative
    values.

    Args:
        a: One bound of the delay range, in seconds.
        b: The other bound of the delay range, in seconds.
    """
    time.sleep(max(0.0, random.uniform(a, b)))
with sync_playwright() as p:
    # A persistent context stores its profile on disk in user_data_dir.
    context = p.chromium.launch_persistent_context(
        user_data_dir="my_profile",  # this option only exists on launch_persistent_context
        headless=False,
        args=["--disable-blink-features=AutomationControlled"],
    )
    try:
        page = context.new_page()
        # Inject a script that makes navigator.webdriver report undefined.
        # It is registered before goto() so it applies to the page we load.
        page.add_init_script(
            "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})"
        )
        page.goto("https://baidu.com")
        page.fill("#chat-textarea", "ChatGPT")
        human_delay()
        page.wait_for_timeout(2000)  # or wait for a specific selector instead
        page.click("#chat-submit-button")
        page.wait_for_timeout(2000)  # or wait for a specific result selector

        html = page.content()
        soup = BeautifulSoup(html, "html.parser")
        # soup.title is None when the document has no <title> element.
        print(soup.title.text if soup.title is not None else "(no title)")
        print(soup.get_text())

        # Keep a copy of the rendered HTML for offline inspection.
        with open("baidu.html", "w", encoding="utf-8") as f:
            f.write(html)
        input("按回车关闭浏览器...")
    finally:
        # Closing stays best-effort, but the failure is reported instead of
        # being silently discarded.
        try:
            context.close()
        except Exception as exc:
            print(f"Failed to close the browser context cleanly: {exc}")