1. 斗鱼 Selenium 爬取（Douyu live-room scraping with Selenium）
python
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
class Douyu(object):
    """Scrape live-room data from Douyu's "all rooms" directory page.

    Drives Chrome via Selenium, scrolls to the bottom of the page until
    no new rooms are loaded, and collects each room's preview-image URL.
    """

    def __init__(self):
        # Directory page that lists all live rooms.
        self.url = 'https://www.douyu.com/directory/all'
        self.driver = webdriver.Chrome()
        # Implicit wait: element lookups retry for up to 10 seconds.
        self.driver.implicitly_wait(10)

    def parse_data(self):
        """Return a list of dicts, one per room currently on the page.

        Each dict currently holds only the 'picture' key (the srcset of
        the room's preview image); the other fields stay commented out.
        """
        room_list = self.driver.find_elements(
            By.XPATH, '//*[@id="listAll"]/section[2]/div[2]/ul/li/div')
        print(len(room_list))
        data_list = []
        # Walk the room nodes and pull the data we need from each one.
        for room in room_list:
            temp = {}
            # temp['title'] = room.find_element(By.XPATH, './div[2]/div[1]/a').text
            # temp['type'] = room.find_element(By.XPATH, './div[2]/div[2]/span/a').text
            # temp['owner'] = room.find_element(By.XPATH, './div[1]/div/a/div/div[2]/div/div[1]/div').text
            # temp['num'] = room.find_element(By.XPATH, './div[1]/div/a/div/div[2]/div/div[2]/span').text
            temp['picture'] = room.find_element(
                By.XPATH, './div[1]/picture/source[1]').get_attribute('srcset')
            data_list.append(temp)
        return data_list

    def run(self):
        """Open the page, scroll until no new rooms appear, then quit."""
        self.driver.get(self.url)
        total_rooms = 0
        last_count = 0  # room count observed on the previous pass
        while True:
            # Scroll to the bottom to trigger lazy loading.
            self.driver.execute_script(
                'window.scrollTo(0, document.body.scrollHeight);')
            time.sleep(2)  # give newly loaded content time to render
            # BUG FIX: parse_data() returns ALL rooms currently on the
            # page, not just the newly loaded ones.  The original code
            # did `total_rooms += len(new_data)`, so the total kept
            # growing and could never equal last_count — the loop only
            # stopped if zero rooms were found.  Compare the current
            # page total against the previous pass instead.
            total_rooms = len(self.parse_data())
            print(f"Total rooms : {total_rooms}")
            if total_rooms == last_count:
                # No new rooms appeared after scrolling: we are done.
                print("No more new data to load.")
                break
            last_count = total_rooms
        print(f"Final total rooms fetched: {total_rooms}")
        self.driver.quit()  # shut the browser down
if __name__ == '__main__':
    # Entry point: build the scraper and run the scroll-and-scrape loop.
    spider = Douyu()
    spider.run()
2. requests + MySQL 存储（fetch with requests and store in MySQL）
python
import pymysql
import requests
from lxml import etree
# Step 1: fetch the Baidu homepage and extract text/links from <div>
# elements whose class attribute contains one of a few target tokens.
url = 'https://www.baidu.com/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36'
}
# FIX: the original call had no timeout (the script could hang forever
# on a stalled connection) and never checked the HTTP status, so an
# error page would be parsed silently.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
html = etree.HTML(response.content.decode("utf-8"))
# Class-name fragments to match against each div's class attribute.
classes = ["normal", "c", "color", "t"]
extracted_data = []
for cls in classes:
    xpath_query = f'//div[contains(@class, "{cls}")]'
    elements = html.xpath(xpath_query)
    for element in elements:
        # Concatenate every text node under the element.
        text = ''.join(element.xpath('.//text()')).strip()
        # Take the first href under the element, assuming links live
        # in <a> tags; fall back to a placeholder when none exists.
        link = element.xpath('.//a/@href')
        link = link[0] if link else "No link found"
        extracted_data.append((text, link))
# Step 2: connect to MySQL and persist the extracted rows.
connection = pymysql.connect(
    host='localhost',      # database host
    user='root',           # MySQL user
    password='991016',     # SECURITY: hard-coded credential — load from env/config instead
    database='test',       # target schema
    charset='utf8mb4',     # full UTF-8, including 4-byte code points
    cursorclass=pymysql.cursors.DictCursor  # fetch rows as dicts
)
try:
    with connection.cursor() as cursor:
        # Create the destination table on first run.
        create_table_query = """
        CREATE TABLE IF NOT EXISTS web_content (
            id INT AUTO_INCREMENT PRIMARY KEY,
            text_content TEXT,
            link VARCHAR(255)
        );
        """
        cursor.execute(create_table_query)
        # Parameterized insert — values are bound by the driver, so no
        # SQL-injection risk from scraped content.
        insert_query = "INSERT INTO web_content (text_content, link) VALUES (%s, %s)"
        # FIX: the link column is VARCHAR(255); truncate longer URLs so
        # executemany cannot fail with a data-too-long error when the
        # server runs in strict SQL mode.
        rows = [(text, link[:255]) for text, link in extracted_data]
        cursor.executemany(insert_query, rows)
        # Commit the inserts before reading them back.
        connection.commit()
        # Read the table back to verify the data was stored.
        cursor.execute("SELECT * FROM web_content")
        results = cursor.fetchall()
        for row in results:
            print(row)
finally:
    # Always release the connection, even if an error occurred.
    connection.close()