总结一下爬虫学习中的实例实战
1.金山翻译
python
# -*- coding: utf-8 -*-
import requests
# Translation endpoint with the extra signing parameters stripped off.
# The URL originally captured from the browser was:
# https://ifanyi.iciba.com/index.php?c=trans&m=fy&client=6&auth_user=key_web_new_fanyi&sign=9X%2BHAviAKqteMMuVvr%2B0X9RriqVIAJSQ%2BxmfU0q7dIE%3D
url = 'https://ifanyi.iciba.com/index.php?c=trans'
# Request headers sent with every translation request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Referer': 'https://www.iciba.com/',
    'Host': 'ifanyi.iciba.com'
}
while True:
    # Ask the user for the text to translate.
    content = input('请输入您想翻译的内容(输入"exit"结束程序):')
    # Typing "exit" (in any letter case) ends the program.
    if content.lower() == 'exit':
        break
    # Form fields of the POST request.
    post_data = {
        'from': 'auto',
        'to': 'auto',
        'q': content,
    }
    # Send the request.
    res = requests.post(url, headers=headers, data=post_data)
    # Parse the body as JSON instead of eval(): the body is remote data, so
    # eval() would execute whatever expression the server returns, and it also
    # fails on the JSON literals true/false/null.
    try:
        translated = res.json()['out']
    except (ValueError, KeyError, TypeError):
        # Not JSON, or JSON without an 'out' field: report it and keep the
        # prompt loop alive instead of crashing.
        print('翻译失败,服务器返回了无法解析的内容:', res.text[:200])
        continue
    # Print the translation.
    print(translated)
2.github模拟登录
python
# -*- coding: utf-8 -*-
import re
# 1.获取并模拟登录操作 2.保存登录会话信息 3.验证是否登录成功
import requests
from requests import Session
def do_auth_token(session: Session):
    """Fetch the GitHub login page and return its ``authenticity_token``.

    Args:
        session: Session used for the GET request, so that cookies set by the
            login page are kept for the following requests.

    Returns:
        The value of the first ``authenticity_token`` form field found in the
        login page HTML.

    Raises:
        SystemExit: With status 1 when the login page does not answer with
            HTTP 200 or contains no ``authenticity_token`` field.
    """
    response = session.get('https://github.com/login')
    if response.status_code != 200:
        print("请求失败,请稍后再试!")
        # Exit with a non-zero status so the failure is visible to the caller
        # of the script; SystemExit also works where the interactive-only
        # ``exit`` helper is not installed.
        raise SystemExit(1)
    login_html = response.content.decode()
    token_match = re.search(r'name="authenticity_token" value="(.*?)"', login_html)
    if token_match is None:
        # Without the token the sign-in form cannot be submitted; stop with a
        # clear message instead of an IndexError.
        print("未能在登录页中找到 authenticity_token!")
        raise SystemExit(1)
    return token_match.group(1)
def do_auth_login(session: Session, token=None):
    """Submit the GitHub sign-in form through ``session``.

    Args:
        session: Session that already loaded the login page; the POST goes
            through it so the resulting login cookies stay in the session.
        token: The ``authenticity_token`` taken from the login page. When
            omitted, the module-level ``auth_token`` variable is used, which
            is how the function was called originally.

    Returns:
        None. The outcome of the request is printed.
    """
    import os  # local import: only needed for the credential overrides below

    if token is None:
        # Backward-compatible fallback to the global set by the script entry.
        token = auth_token
    post_data = {
        "commit": "Sign in",
        "authenticity_token": token,
        # Credentials can be supplied through the environment so that real
        # ones never have to be written into this file; the literals are the
        # original placeholder values.
        "login": os.environ.get("GITHUB_LOGIN", "2834438515@qq.com"),
        "password": os.environ.get("GITHUB_PASSWORD", "991016csq"),  # placeholder, not a real password
        "webauthn-conditional": "undefined",
        "javascript-support": "true",
        "webauthn-support": "supported",
        "webauthn-iuvpaa-support": "unsupported",
        "return_to": "https://github.com/login"
    }
    response = session.post(url='https://github.com/session', data=post_data)
    if response.status_code != 200:
        print("请求失败,请检查参数!")
    else:
        print("请求session 成功!")
def do_login_status(session: Session):
    """Load the profile page, report the login state and save the page.

    The page ``<title>`` is inspected: a title that does not end with
    ``GitHub`` is reported as a successful login. The raw page bytes are
    written to ``github_profile.html`` in either case.

    Args:
        session: Session carrying the cookies of the preceding sign-in request.
    """
    profile = session.get('https://github.com/csqting')
    html_content = profile.content
    title_matches = re.findall(r'<title>(.+?)(GitHub)?</title>', html_content.decode('utf-8'))
    # findall yields (text, suffix) pairs; the suffix is "" when the optional
    # "GitHub" group did not take part in the match. No <title> at all is
    # treated like an empty suffix.
    suffix = title_matches[0][1] if title_matches else ""
    # A profile title without the trailing "GitHub" means the login worked.
    print("登录失败!" if suffix else "登录成功!")
    with open("github_profile.html", "wb") as f:
        f.write(html_content)
if __name__ == '__main__':
    # One session object is shared by all three steps so that cookies persist.
    session = requests.Session()
    session.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'
    }
    # Step 1: read the form token from the login page.
    auth_token = do_auth_token(session)
    # Step 2: submit the sign-in form; login cookies are stored in the session.
    do_auth_login(session)
    # Step 3: check whether the login succeeded.
    do_login_status(session)
3.百度贴吧爬取
python
# -*- coding: utf-8 -*-
import requests
from lxml import etree
# url
# headers
# 发送请求获取响应
# 从响应中提取数据
# 判断结束
class Tieba(object):
    """Crawl the thread list of one Baidu Tieba forum, page by page."""

    def __init__(self, name):
        """Build the first-page URL for the forum called ``name``."""
        self.url = "https://tieba.baidu.com/f?kw={}".format(name)
        print(self.url)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
            # "User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; T132461)"
        }

    def get_data(self, url):
        """Download ``url`` and return the raw response bytes.

        The body is also written to ``temp.html`` (overwritten on every call)
        so the last fetched page can be inspected.
        """
        response = requests.get(url, headers=self.headers)
        with open("temp.html", "wb") as f:
            f.write(response.content)
        return response.content

    def parse_data(self, data):
        """Extract thread titles/links and the next-page URL from one page.

        Args:
            data: Raw page bytes as returned by ``get_data``.

        Returns:
            A ``(data_list, next_url)`` tuple: ``data_list`` is a list of
            ``{'title': ..., 'link': ...}`` dicts, ``next_url`` is the URL of
            the following page or ``None`` when there is no next-page link.
        """
        # Strip HTML comment markers before parsing — presumably the thread
        # list is delivered inside a comment and would otherwise be invisible
        # to the XPath queries.
        data = data.decode().replace("<!--", "").replace("-->", "")
        html = etree.HTML(data)
        el_list = html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a')
        data_list = []
        for el in el_list:
            data_list.append({
                'title': el.xpath('./text()')[0],
                'link': 'https://tieba.baidu.com' + el.xpath('./@href')[0],
            })
        # Look for the next-page link; only the "no such link" case (empty
        # XPath result) means the crawl is finished, so only IndexError is
        # handled here and real errors are no longer swallowed.
        try:
            next_url = 'https:' + html.xpath('//a[contains(text(),"下一页>")]/@href')[0]
        except IndexError:
            next_url = None
        return data_list, next_url

    def save_data(self, data_list):
        """Print every extracted record."""
        for data in data_list:
            print(data)

    def run(self):
        """Fetch, parse and print pages until no next-page link is found."""
        next_url = self.url
        while True:
            # Send the request and get the response.
            data = self.get_data(next_url)
            # Extract the records and the URL used for paging.
            data_list, next_url = self.parse_data(data)
            self.save_data(data_list)
            print(next_url)
            # Stop after the last page.
            if next_url is None:
                break
if __name__ == '__main__':
    # Crawl the "美食天下" forum from its first page onwards.
    tieba = Tieba(name="美食天下")
    tieba.run()
4.斗鱼直播
python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
class Douyu(object):
    """Scroll the Douyu "all rooms" directory in Chrome and collect room data."""

    def __init__(self):
        """Start a Chrome driver with a 10-second implicit wait."""
        self.url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.Chrome()
        self.driver.implicitly_wait(10)  # implicit wait, at most 10 seconds

    def parse_data(self):
        """Return one dict per room node currently present in the page.

        Every call re-reads the whole room list, so the result covers all
        rooms on the page, not only those added since the previous call.
        """
        room_list = self.driver.find_elements(By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li/div')
        print(len(room_list))
        data_list = []
        # Walk the room nodes and read the data of each one.
        for room in room_list:
            temp = {}
            # Other fields that can be read from a room node:
            # temp['title'] = room.find_element(By.XPATH, './div[2]/div[1]/a').text
            # temp['type'] = room.find_element(By.XPATH, './div[2]/div[2]/span/a').text
            # temp['owner'] = room.find_element(By.XPATH, './div[1]/div/a/div/div[2]/div/div[1]/div').text
            # temp['num'] = room.find_element(By.XPATH, './div[1]/div/a/div/div[2]/div/div[2]/span').text
            temp['picture'] = room.find_element(By.XPATH, './div[1]/picture/source[1]').get_attribute('srcset')
            data_list.append(temp)
        return data_list

    def run(self, scroll_pause=2):
        """Scroll to the bottom repeatedly until no more rooms appear.

        Args:
            scroll_pause: Seconds to wait after each scroll so the page can
                load new content. Defaults to the original 2 seconds.
        """
        total_rooms = 0
        try:
            self.driver.get(self.url)
            while True:
                # Scroll to the bottom of the page.
                self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
                time.sleep(scroll_pause)  # give the page time to load new content
                new_data = self.parse_data()
                # parse_data returns every room on the page, so its length IS
                # the running total. Adding it to the previous total made the
                # count grow on every pass and the stop condition below could
                # never become true on a non-empty page.
                last_count, total_rooms = total_rooms, len(new_data)
                print(f"Total rooms : {total_rooms}")
                if total_rooms == last_count:  # nothing new was loaded: stop scrolling
                    print("No more new data to load.")
                    break
            print(f"Final total rooms fetched: {total_rooms}")
        finally:
            # Close the browser even when scraping fails part-way.
            self.driver.quit()
if __name__ == '__main__':
    # Open the browser and scroll the room directory until it stops growing.
    douyu = Douyu()
    douyu.run()
5.黑马贴吧
python
import requests
import re
def fetch_page(url):
    """Print the threads listed on one forum page and return the next page URL.

    Args:
        url: Absolute URL of the forum list page to download.

    Returns:
        The absolute URL of the following page, or ``None`` when the request
        did not return HTTP 200 or the page has no "next page" link.
    """
    from urllib.parse import urljoin  # local import: only this function needs it

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print("访问失败", response.status_code)
        return None
    page = response.text
    # Thread titles.
    titles = re.findall(r'class="s xst">([^<]+)</a>', page)
    # (author, publish time) pairs: the first group is the text inside the
    # link, the second is the text following "@ ".
    details = re.findall(
        r'<span style="margin-left: 0;">([^<]+)</span></a><span style="margin-left: 5px;">@ ([^<]+)</span>',
        page)
    # zip stops at the shorter list, so unmatched trailing entries are dropped.
    for title, (author, date) in zip(titles, details):
        print(f"文章标题: {title}")
        print(f"发布时间: {date}")
        print(f"文章作者: {author}")
        print('-' * 40)
    # First "next page" anchor in the document.
    next_page_link = re.search(r'<a href="([^"]+)" class="nxt">下一页</a>', page)
    if next_page_link is None:
        return None
    # The href may be relative and, being HTML, may carry "&amp;"; resolve it
    # against the current page so the caller always gets a URL it can request.
    # An already-absolute href passes through urljoin unchanged.
    return urljoin(url, next_page_link.group(1).replace('&amp;', '&'))
# First page of the forum.
current_url = 'https://bbs.itheima.com/forum-425-1.html'
# Keep following the next-page URL until fetch_page returns None.
while current_url:
    print(f"正在爬取: {current_url}")
    current_url = next_url = fetch_page(current_url)
6.网易云
python
# -*- coding: utf-8 -*-
# document.charset 查看源码编码格式
import requests
import time
import re
import os
# Directory the downloaded tracks are written to. No trailing backslash:
# paths are built with os.path.join below so the script also works where the
# path separator is not "\".
filename = 'musics'
# Create the directory when it does not exist yet.
os.makedirs(filename, exist_ok=True)
url = 'https://music.163.com/discover/toplist?id=3778678'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'}
response = requests.get(url, headers=headers)
time.sleep(5)
# re.findall returns every match of the pattern as a list of tuples:
#   (\d+)  captures the numeric song id
#   (.*?)  captures, non-greedily, the link text up to </a> (the song title)
html_data = re.findall(r'<li><a href="/song\?id=(\d+)">(.*?)</a>', response.text)
for num_id, title in html_data:
    music_download = f'https://music.163.com/song/media/outer/url?id={num_id}.mp3'
    music_content = requests.get(music_download, headers=headers)
    # Song titles can contain characters that are not allowed in file names
    # (or that would be read as a sub-directory); replace them so one odd
    # title cannot abort the whole download run.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    with open(os.path.join(filename, safe_title + '.mp3'), 'wb') as f:
        f.write(music_content.content)
    print(num_id, title)
7.微博热榜
python
# # -*- coding: utf-8 -*-
# import time
# from lxml import etree
# import requests
#
# url = 'https://m.weibo.cn/p/106003type=25&t=3&disable_hot=1&filter_type=realtimehot'
# headers = {
# 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'}
# response = requests.get(url, headers=headers)
# time.sleep(3)
# print(response.text)
# html = etree.HTML(response.text)
# el_list = html.xpath('//*[@id="app"]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/span[2]/span[1]/text()')
# print(len(el_list))
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
url = 'https://m.weibo.cn/p/106003type=25&t=3&disable_hot=1&filter_type=realtimehot'
driver = webdriver.Chrome()
try:
    driver.get(url)
    time.sleep(3)  # fixed pause before the list is read
    # Title nodes and heat nodes are siblings: span[1] and span[2] of the same parent.
    el_list = driver.find_elements(By.XPATH,'//*[@id="app"]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/span[2]/span[1]')
    # print(len(el_list))
    el_list1 = driver.find_elements(By.XPATH,'//*[@id="app"]/div[1]/div[2]/div[3]/div/div/div/div/div/div/div/span[2]/span[2]')
    # print(len(el_list1))
    # save_out = []
    # Number the entries from 1; zip pairs each title with its heat value.
    for i, (title, hot) in enumerate(zip(el_list, el_list1), start=1):
        # save_out.append(f"{i}\n")  # rank
        # save_out.append(f"文章标题: {title.text}\n")  # title
        # save_out.append(f"热度: {hot.text}\n")  # heat
        print(f"{i}")
        print(f"文章标题: {title.text}")
        print(f"热度: {hot.text}")
        print('-' * 40)
    # with open("weibo.txt","w") as file:
    #     file.writelines(save_out)
finally:
    # Always close the browser, even when loading or scraping raised.
    driver.quit()
8.驾校自动答题
python
# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
url = 'https://www.jsyks.com/kmy-mnks'
driver = webdriver.Chrome()
driver.get(url)
# Plan:
# 1. read the correct answer of every question from the page
# 2. translate the answer code into the text shown on the option
# 3. scroll to that option and click it
time.sleep(3)
el_list = driver.find_elements(By.XPATH, '/html/body/div[4]/div[1]/div[1]/ul/li')
# The "k" attribute of each question node holds the correct option:
# 'E' stands for wrong, 'R' for right.
k_values = [li.get_attribute('k') for li in el_list]
# Map 'R'/'E' to the option text; any other value is kept unchanged.
label_for_key = {'R': "正确", 'E': "错误"}
replaced_list = [label_for_key.get(x, x) for x in k_values]
for li, answer in zip(el_list, replaced_list):
    if answer in ('正确', '错误'):
        # True/false question: the option text contains the answer text.
        option = li.find_element(By.XPATH, f".//b[contains(text(),'{answer}')]")
    else:
        # Answer is A, B, C or D: pick the option whose text starts with that letter.
        option = li.find_element(By.XPATH, f".//b[starts-with(normalize-space(text()), '{answer}')]")
    # Scroll the option into view.
    driver.execute_script('arguments[0].scrollIntoView();', option)
    # Click the option through JavaScript.
    driver.execute_script("arguments[0].click();", option)
后期学习路线:继续在实战中总结反爬手段,学习反调试,以及之后的爬虫完整项目学习。