爬虫笔记16——异步爬取二手汽车数据去重存入MySQL

需要用到的库

bash 复制代码

#异步数据库
pip install aiomysql
#redis数据库进行去重
pip install redis
#用hashlib计算md5摘要（作为去重指纹）；hashlib是Python标准库，无需pip安装
#基于异步IO的网络请求库
pip install aiohttp
#xpath获取静态页面数据
pip install lxml

目标网站

目标网站：https://www.che168.com/china/a0_0msdgscncgpi1ltocsp1exf4x0/?pvareaid=102179#currengpostion

获取汽车具体数据的api：https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}

其中{}方便使用format进行格式化

思路分析

1、要获取详细的汽车数据，需要在首页点击某个汽车进入详情页面，这个过程会携带该汽车对应的specid跳转到详情页面，并请求对应的接口返回具体数据信息。

2、因此我们需要先从首页的列表数据中提取出每个汽车对应的specid。因为首页是一个静态页面，可以使用xpath获取每个汽车的specid。

3、在详情数据页面，获取汽车的具体数据需要分析对应的接口进行请求，这个请求的过程需要携带前面获取到的specid。

4、最后获取数据成功进行提取，去重，存储即可

代码示例

值得注意的是：这个网站有反爬机制，它返回的页面编码格式会切换。如果返回页面的编码格式是UTF-8-SIG，就获取不到网页数据，也就提取不到specid，后面自然也获取不了汽车详细数据。所以我们要先检测页面的编码格式，这里要用到chardet包

bash 复制代码

pip install chardet

代码实现：

python 复制代码

# -*- coding: utf-8 -*-
# @Time:      2024/06/24 12:51
# @File:       二手车.py

import aiomysql
import aiohttp
import redis
import hashlib
import chardet
import asyncio
from lxml import etree

class SpiderCar:
    """Asynchronously crawl used-car parameters from che168 into MySQL.

    Listing pages are fetched to collect each car's ``specid``. Every specid
    is then sent to the parameter API, the extracted fields are fingerprinted
    with MD5, de-duplicated through a Redis set and inserted into the
    ``car_info`` table.
    """

    # Redis set that stores the MD5 fingerprints of rows already saved.
    FILTER_KEY = 'car_info:filter'

    def __init__(self):
        """Create the Redis client and the URL templates / request headers."""
        self.redis = redis.Redis()
        # ``{}`` is replaced by the listing page number.
        self.url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{}exf4x0/?pvareaid=102179#currengpostion'
        # ``{}`` is replaced by the specid taken from the listing page.
        self.api_url = 'https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}'
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
        }

    async def get_car_id(self, page, session, pool):
        """Fetch one listing page and crawl every car found on it.

        Args:
            page: Listing page number substituted into ``self.url``.
            session: aiohttp client session used for all HTTP requests.
            pool: aiomysql connection pool handed down to ``save_car_info``.
        """
        async with session.get(self.url.format(page), headers=self.headers) as response:
            content = await response.read()

        # The site switches its page encoding as an anti-crawling measure, so
        # the encoding is detected from the raw bytes instead of trusted.
        encoding = chardet.detect(content)["encoding"]
        if encoding == 'GB2312':
            # gbk is a superset of gb2312; undecodable bytes are replaced
            # rather than aborting the whole page.
            html = content.decode('gbk', errors='replace')
        else:
            print('被反爬了...')
            # chardet reports None when it cannot guess (e.g. empty body);
            # fall back to utf-8 instead of crashing on decode(None).
            html = content.decode(encoding or 'utf-8', errors='replace')

        if not html.strip():
            return

        tree = etree.HTML(html)
        if tree is None:
            return

        car_id_list = tree.xpath("//ul[@class='viewlist_ul']/li/@specid")
        if not car_id_list:
            return

        # return_exceptions=True keeps one failing car from hiding the others
        # and makes every failure visible instead of "never retrieved".
        results = await asyncio.gather(
            *(self.get_car_info(car_id, session, pool) for car_id in car_id_list),
            return_exceptions=True,
        )
        for car_id, result in zip(car_id_list, results):
            if isinstance(result, Exception):
                print('获取汽车详情失败：', car_id, result)

    async def get_car_info(self, car_id, session, pool):
        """Request the parameter API for one car and store the extracted row.

        Args:
            car_id: The specid substituted into ``self.api_url``.
            session: aiohttp client session used for the request.
            pool: aiomysql connection pool handed down to ``save_car_info``.
        """
        async with session.get(self.api_url.format(car_id), headers=self.headers) as response:
            car_info = await response.json()

        item = self._extract_item(car_info)
        if item is not None:
            await self.save_car_info(item, pool)

    @staticmethod
    def _extract_item(car_info):
        """Pick the stored fields out of the API payload.

        Returns:
            A dict with the keys name, price, brand, altitude, breadth and
            length (in that order), or ``None`` when the payload is empty or
            does not have the expected nested layout.
        """
        try:
            groups = car_info['result']['paramtypeitems']
            if not groups:
                return None
            first_group = groups[0]['paramitems']
            second_group = groups[1]['paramitems']
            # Key order is kept stable because the de-duplication fingerprint
            # is computed from str(item).
            return {
                'name': first_group[0]['value'],
                'price': first_group[1]['value'],
                'brand': first_group[2]['value'],
                'altitude': second_group[2]['value'],
                'breadth': second_group[1]['value'],
                'length': second_group[0]['value'],
            }
        except (KeyError, IndexError, TypeError):
            return None

    @staticmethod
    def md5_hash(item):
        """Return the hex MD5 digest of ``str(item)`` (the dedup fingerprint)."""
        return hashlib.md5(str(item).encode()).hexdigest()

    async def save_car_info(self, item, pool):
        """Insert ``item`` into ``car_info`` unless it was stored before.

        Args:
            item: Dict with the keys name, price, brand, altitude, breadth
                and length.
            pool: aiomysql connection pool to take a connection from.

        Insert errors are printed, not raised; in that case the fingerprint
        is removed from Redis again so the row can be retried later.
        """
        params = (
            item['name'], item['price'], item['brand'],
            item['altitude'], item['breadth'], item['length'],
        )
        hash_item = self.md5_hash(item)

        # Check for duplicates before taking a pooled DB connection.
        # sadd is truthy only when the fingerprint was not in the set yet.
        # NOTE(review): this is the synchronous redis client, so each call
        # briefly blocks the event loop.
        if not self.redis.sadd(self.FILTER_KEY, hash_item):
            print('数据重复，跳过该条数据')
            return

        # Columns are named explicitly so the auto-increment id is generated
        # by MySQL instead of relying on inserting 0 into it.
        sql = """
            insert into car_info (name, price, brand, altitude, breadth, length)
            values (%s, %s, %s, %s, %s, %s);
        """
        try:
            async with pool.acquire() as connect:
                async with connect.cursor() as cursor:
                    try:
                        await cursor.execute(sql, params)
                        await connect.commit()
                    except Exception:
                        await connect.rollback()
                        raise
        except Exception as e:
            # Undo the dedup mark; otherwise this row would be skipped as a
            # "duplicate" forever although it never reached MySQL.
            self.redis.srem(self.FILTER_KEY, hash_item)
            print('插入失败！', e)
        else:
            print('插入成功')

    async def main(self, pages=range(1, 100)):
        """Ensure the table exists, then crawl all listing pages concurrently.

        Args:
            pages: Iterable of listing page numbers to crawl. Defaults to
                pages 1 through 99.
        """
        create_table_sql = """
            create table car_info(
                id int primary key auto_increment,
                name varchar(100),
                price varchar(100),
                brand varchar(100),
                altitude varchar(100),
                breadth varchar(100),
                length varchar(100)
            );
        """
        check_table_query = "show tables like 'car_info'"

        async with aiomysql.create_pool(user='root', password='root', db='py_spider') as pool:
            async with pool.acquire() as connect:
                async with connect.cursor() as cursor:
                    # The table is checked for explicitly because the original
                    # author found `create table if not exists` unusable here.
                    # NOTE(review): presumably aiomysql surfaces MySQL's
                    # "table already exists" note as a Python warning - confirm.
                    # execute() returns 1 when the table exists, 0 otherwise.
                    table_exists = await cursor.execute(check_table_query)
                    if not table_exists:
                        await cursor.execute(create_table_sql)

            async with aiohttp.ClientSession() as session:
                page_list = list(pages)
                results = await asyncio.gather(
                    *(self.get_car_id(page, session, pool) for page in page_list),
                    return_exceptions=True,
                )
                for page, result in zip(page_list, results):
                    if isinstance(result, Exception):
                        print('获取列表页失败：', page, result)


if __name__ == '__main__':
    spider_car = SpiderCar()
    # asyncio.run() creates a fresh event loop, runs main() to completion and
    # closes the loop afterwards; calling asyncio.get_event_loop() without a
    # running loop is deprecated.
    asyncio.run(spider_car.main())