需要用到的库
bash
#异步数据库
pip install aiomysql
#redis数据库进行去重
pip install redis
#用hashlib进行md5加密
#注意:hashlib是Python标准库,无需pip安装,直接import即可使用
#基于异步IO的网络请求库
pip install aiohttp
#xpath获取静态页面数据
pip install lxml
目标网站
目标网站:https://www.che168.com/china/a0_0msdgscncgpi1ltocsp1exf4x0/?pvareaid=102179#currengpostion
获取汽车具体数据的api:https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}
其中{}方便使用format进行格式化
思路分析
1、要获取详细的汽车数据,需要在首页点击某个汽车进入到详情页面数据,这个过程会携带该汽车类型的specid跳转到详情数据页面,并请求对应的接口返回具体数据信息。
2、那我们需要在这个首页的罗列数据提取到每个汽车对应的specid,因为这是一个静态页面,使用xpath获取每个汽车的specid。
3、在详情数据页面,获取汽车的具体数据需要分析对应的接口进行请求,这个请求的过程需要携带前面获取到的specid。
4、最后获取数据成功进行提取,去重,存储即可
代码示例
值得注意的是:这个网页有反爬机制,他的页面编码格式会切换,如果返回的编码格式是UTF-8-SIG,是获取不到网页数据的,也就是提取不到specid,那后面也就获取不了汽车详细数据,所以我们要验证页面编码格式,这里要用到chardet包
bash
pip install chardet
代码实现:
python
# -*- coding: utf-8 -*-
# @Time: 2024/06/24 12:51
# @File: 二手车.py
import aiomysql
import aiohttp
import redis
import hashlib
import chardet
import asyncio
from lxml import etree
class SpiderCar:
    """Async crawler for che168.com used-car listings.

    Workflow: fetch a listing page, extract every car's ``specid`` with
    XPath, call the detail API for each id, de-duplicate records via a
    Redis set of MD5 hashes, and persist new records into MySQL through
    an aiomysql connection pool.
    """

    def __init__(self):
        # Redis set 'car_info:filter' backs de-duplication across runs.
        self.redis = redis.Redis()
        # {} is filled with the 1-based listing page number.
        self.url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{}exf4x0/?pvareaid=102179#currengpostion'
        # {} is filled with a car's specid.
        self.api_url = 'https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        }

    async def get_car_id(self, page, session, pool):
        """Fetch one listing page and spawn a detail task per specid found."""
        async with session.get(self.url.format(page), headers=self.headers) as response:
            content = await response.read()
            # Anti-scraping countermeasure: the site switches the page
            # encoding.  A GB2312 page carries real data; a UTF-8-SIG page
            # is a decoy with no specids.  chardet can also return None for
            # an undetectable body, so fall back to utf-8 defensively
            # instead of raising TypeError from decode(None).
            encoding = chardet.detect(content)["encoding"]
            if encoding == 'GB2312':
                # GB2312 is a subset of GBK; decoding as gbk also covers
                # characters outside the strict GB2312 range.
                html = content.decode('gbk')
            else:
                html = content.decode(encoding or 'utf-8', errors='replace')
                print('被反爬了...')
            tree = etree.HTML(html)
            car_id_list = tree.xpath("//ul[@class='viewlist_ul']/li/@specid")
            if car_id_list:  # asyncio.wait() raises ValueError on an empty set
                car_info_tasks = [
                    asyncio.create_task(self.get_car_info(car_id, session, pool))
                    for car_id in car_id_list
                ]
                await asyncio.wait(car_info_tasks)

    async def get_car_info(self, car_id, session, pool):
        """Request the detail API for one specid and store selected fields."""
        async with session.get(self.api_url.format(car_id), headers=self.headers) as response:
            car_info = await response.json()
            # Guard against blocked/partial responses: missing keys or short
            # paramitems lists would otherwise raise and kill this task.
            param_types = car_info.get('result', {}).get('paramtypeitems')
            if not param_types:
                return
            try:
                basic = param_types[0]['paramitems']   # name / price / brand
                body = param_types[1]['paramitems']    # length / breadth / altitude
                # NOTE: key insertion order is significant — md5_hash() hashes
                # str(item), so reordering keys would invalidate the Redis
                # dedup set built by earlier runs.
                item = dict()
                item['name'] = basic[0]['value']
                item['price'] = basic[1]['value']
                item['brand'] = basic[2]['value']
                item['altitude'] = body[2]['value']
                item['breadth'] = body[1]['value']
                item['length'] = body[0]['value']
            except (KeyError, IndexError):
                return  # payload shape changed or record incomplete — skip
            await self.save_car_info(item, pool)

    @staticmethod
    def md5_hash(item):
        """Return the hex MD5 digest of ``str(item)`` — the dedup key."""
        hash_obj = hashlib.md5()
        hash_obj.update(str(item).encode())
        return hash_obj.hexdigest()

    async def save_car_info(self, item, pool):
        """Insert one record into MySQL unless its hash is already in Redis."""
        hash_item = self.md5_hash(item)
        # sadd returns 1 when the member is new, 0 when already present.
        # Check BEFORE acquiring a pool connection so duplicates cost nothing.
        if not self.redis.sadd('car_info:filter', hash_item):
            print('数据重复,跳过该条数据')
            return
        async with pool.acquire() as connect:
            async with connect.cursor() as cursor:
                # Explicit column list lets MySQL assign the auto_increment
                # id instead of inserting a literal 0 placeholder.
                sql = """
                    insert into car_info
                        (name, price, brand, altitude, breadth, length)
                    values
                        (%s, %s, %s, %s, %s, %s);
                """
                try:
                    await cursor.execute(sql, (
                        item['name'], item['price'], item['brand'],
                        item['altitude'], item['breadth'], item['length'],
                    ))
                    await connect.commit()
                    print('插入成功')
                except Exception as e:
                    print('插入失败!', e)
                    await connect.rollback()

    async def main(self, max_page=100):
        """Entry point: ensure the table exists, then crawl pages 1..max_page-1.

        :param max_page: exclusive upper bound of the listing page range
                         (default 100 preserves the original 1..99 crawl).
        """
        async with aiomysql.create_pool(user='root', password='root', db='py_spider') as pool:
            async with pool.acquire() as connect:
                async with connect.cursor() as cursor:
                    create_table_sql = """
                        create table car_info(
                            id int primary key auto_increment,
                            name varchar(100),
                            price varchar(100),
                            brand varchar(100),
                            altitude varchar(100),
                            breadth varchar(100),
                            length varchar(100)
                        );
                    """
                    # Probe for the table first: execute() returns the row
                    # count of "show tables like ..." — 1 if it exists, 0 if not.
                    result = await cursor.execute("show tables like 'car_info'")
                    if not result:
                        await cursor.execute(create_table_sql)
            async with aiohttp.ClientSession() as session:
                car_id_tasks = [
                    asyncio.create_task(self.get_car_id(page, session, pool))
                    for page in range(1, max_page)
                ]
                await asyncio.wait(car_id_tasks)
if __name__ == '__main__':
    spider_car = SpiderCar()
    # asyncio.run() creates, runs, and closes its own event loop;
    # asyncio.get_event_loop() outside a running loop is deprecated
    # since Python 3.10 and emits a DeprecationWarning.
    asyncio.run(spider_car.main())