JD.com Shop Company-Name Crawler

This content is for learning and reference only. If it infringes any rights, please contact us for removal.

The idea: first obtain the company name behind a non-self-operated JD.com shop, then use that company name to look up contact information on other platforms (that second part of the code is omitted).
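The only non-obvious step is that JD gates each shop's business-licence page (`showLicence-<shop_id>.html`) behind a numeric image captcha, which the spider solves with ddddocr. Here is a minimal standalone sketch of just that step, assuming the two endpoints behave the same way outside the spider (the shop_id below is a placeholder):

```python
import random

import ddddocr
import requests

shop_id = "11111111"  # placeholder shop id, not a real target

session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0"

# 1. Fetch the captcha image; the session keeps the matching cookie.
img_bytes = session.get(
    f"https://mall.jd.com/sys/vc/createVerifyCode.html?random={random.random()}"
).content

# 2. OCR the digits with ddddocr.
code = ddddocr.DdddOcr(show_ad=False).classification(img_bytes)
print("recognized captcha:", code)

# 3. Post the recognized code to the licence page gated by the captcha.
resp = session.post(
    f"https://mall.jd.com/showLicence-{shop_id}.html",
    data={"verifyCode": code},
)
print(resp.status_code, len(resp.text))
```

The full spider: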

```python
from aioscrapy.spiders import Spider
from aioscrapy.http import Request, FormRequest
import ddddocr
import re
import random

from loguru import logger


class JingDongSpider(Spider):
    name = 'products:jd'

    custom_settings = {
        'CONCURRENT_REQUESTS': 4,
        # 'DOWNLOAD_DELAY': 0.5,
        'DOWNLOAD_TIMEOUT': 10,
        'RETRY_TIMES': 5,
        'HTTPERROR_ALLOWED_CODES': [503],
        'COOKIES_ENABLED': False,
        'DUPEFILTER_CLASS': 'aioscrapy.dupefilters.redis.RFPDupeFilter',  # redis-based request dedupe filter
        # 'LOG_LEVEL': 'DEBUG'
    }

    # ddddocr instance for solving the numeric captcha; use_gpu requires onnxruntime-gpu
    ocr = ddddocr.DdddOcr(show_ad=False, use_gpu=True)

    async def start_requests(self):
        # The original id range is not shown; iterate over candidate numeric shop ids
        # (the number in https://mall.jd.com/index-<shop_id>.html). Placeholder range below.
        for i in range(11111111, 11111112):
            yield Request(
                url=f"https://mall.jd.com/index-{i}.html?from=pc",
                method='GET',
                dont_filter=False,
                # fingerprint=str(i),
                meta={"shop_id": str(i)},
                priority=500)

    async def parse(self, response):
        """Shop homepage."""
        title = response.xpath('//title/text()').get() or ''
        shop_id = str(response.meta['shop_id'])
        # A "page does not exist" title or a near-empty body means an invalid shop id.
        if '您所访问的页面不存在' in title or len(response.text) < 25000:
            logger.info(f"invalid shop: {shop_id}")
            return

        logger.info(title.strip())
        product_list = self.get_product_items(response)
        # Links to the shop's product-search pages, used as a fallback product source.
        urls = re.findall(r"//\w+\.jd\.com/view_search-\d+-\d+-\d+-\d+-\d+-\d+\.html", response.text)

        # Fetch the captcha image that gates the business-licence page.
        yield Request(
            url=f"https://mall.jd.com/sys/vc/createVerifyCode.html?random={random.random()}",
            method='GET',
            callback=self.parse_img_code,
            dont_filter=True,
            meta={
                "data": {"product_url": 'https:' + urls[0] if urls else '',
                         "categorys": self.get_category(response),
                         "product_list": product_list,
                         # "shop_url": response.url,
                         "shop_id": shop_id}
            },
            priority=500)

    async def parse_img_code(self, response):
        """Solve the captcha image with ddddocr."""
        code = self.ocr.classification(response.body)
        # The captcha cookie must be sent back together with the recognized code.
        cookie = dict(response.cookies.items())
        shop_id = response.meta["data"]["shop_id"]
        if not code or not cookie:
            return

        yield FormRequest(
            url=f'https://mall.jd.com/showLicence-{shop_id}.html',
            method='POST',
            formdata={"verifyCode": str(code)},
            cookies=cookie,
            meta={"data": response.meta["data"]},
            callback=self.parse_shop_detail,
            dont_filter=True,
            priority=400)

    async def parse_shop_detail(self, response):
        """Parse the shop-detail (business licence) page."""
        company = response.xpath(
            '//*[contains(.,"企业名称:")]/following-sibling::span[position()=1]/text()').get() or ''
        shop_name = response.xpath(
            '//*[contains(.,"店铺名称:")]/following-sibling::span[position()=1]//text()').get() or ''
        shop_url = response.xpath('//*[contains(.,"店铺网址:")]/following-sibling::span[position()=1]//text()').get()
        # legal_person = response.xpath('//*[contains(.,"法定代表人姓名:")]/following-sibling::span[position()=1]//text()').get()
        # business_scope = response.xpath('//*[contains(.,"营业执照经营范围:")]/following-sibling::span[position()=1]//text()').get()
        license_img = response.xpath('//img[@class="qualification-img"]/@src').get() or ''
        # Skip shops without a company name and test shops ('测试' means "test").
        if not company or '测试' in shop_name or '测试' in company:
            if not company:
                logger.info(f"no company name: {response.url}")
            else:
                logger.info(f"test shop: {shop_name} => {company}")
            return
        logger.info(company)

        data = response.meta['data']
        data['company'] = company
        data['shop_name'] = shop_name

        items = dict(company=company,
                     shop_name=shop_name,
                     shop_url='https:' + shop_url if shop_url else response.url,
                     product_url=data['product_url'],
                     shop_id=data['shop_id'],
                     push_kafka_status=0,
                     license='https:' + license_img if license_img else '')

        # No products on the homepage: try the product-search page if we found one.
        if not data['product_list']:
            if data['product_url']:
                yield Request(
                    url=data['product_url'],
                    method='GET',
                    meta={"data": data},
                    callback=self.parse_product,
                    dont_filter=True,
                    priority=300)
            else:
                logger.warning(f"no product link found: {response.url}")
                items.pop('product_url')
            yield items

        else:
            # Products were already scraped from the homepage: attach the company
            # name and push them straight away.
            product_list = []
            for item in data['product_list']:
                item['entityId'] = company
                product_list.append(item)

            yield dict(
                source='jd.com',
                ocid='',
                entityId=company,
                product=product_list,
            )
            items['push_kafka_status'] = 1
            yield items

    async def parse_product(self, response):
        """Parse the product-search page."""
        data = response.meta['data']
        shop_name = data['shop_name']
        company = data['company']
        categorys = data['categorys']

        product_list = self.get_product_items(response, shop_name, company, categorys, data['product_url'])

        if product_list:
            yield dict(
                source='jd.com',
                ocid='',
                entityId=company,
                product=product_list,
            )
            logger.info(f"成功: {company} => {data['shop_id']}")

            yield dict(
                company=company,
                shop_id=data['shop_id'],
                push_kafka_status=1,

            )
        else:
            logger.error(f"{response.url} => {data['shop_id']}")

    def get_product_items(self, response, shop_name='', company='', categorys='', shop_url='') -> list:
        """Extract up to 10 product items from a shop homepage or product-search page."""
        ul = response.xpath('//li[@class="jSubObject"] | //li[@class="jSubObject gl-item"] | //div[@class="jItem"]')

        product_list = []
        for li in ul[:10]:
            title = li.xpath('.//div[@class="jDesc"]/a/@title').get() or ''
            # price = li.xpath('.//span[@class="jdNum"]/text()').get()
            img = str(li.xpath('.//div[@class="jPic"]//img/@src').get() or '').replace('s350x350', '')
            if not title and not img:
                continue
            if img:
                # Normalize thumbnail URLs (/n2/ .. /n9/ variants) to the full-size /n1/ path.
                img = re.sub(r"/n[23456789]/", "/n1/", img)
                img = 'https:' + img

            item_i = {}
            item_i["entityId"] = company
            item_i["productPic"] = img  # 's350x350' already stripped above
            item_i["productName"] = title  # product name
            item_i["productCategory"] = ""  # product category
            item_i["productKeyword"] = ""  # product keywords
            item_i["productPrice"] = ""  # product price
            item_i["mainProducts"] = categorys  # main product lines
            item_i["listingPlatform"] = "京东"
            item_i["productShopName"] = shop_name  # name of the shop listing the product
            item_i["dataLink"] = shop_url or response.url  # shop link
            product_list.append(item_i)

        return product_list

    @staticmethod
    def get_category(response) -> str:
        categorys = response.xpath(
            '//ul[@class="menu-list"]/li[@class="menu"]/a/text() | //div[@class="abs"]//div[@class="ins abs hdur_2"]/a/text()').getall()
        # Drop navigation entries such as "home", "all", "FAQ", "after-sales" and the like.
        skip_words = ('首页', '全部', '所有', '问题', '指导', '售后', '撰文')
        category = [i for i in categorys if not any(w in i for w in skip_words)]
        return ','.join(category)


if __name__ == '__main__':
    JingDongSpider.start()

```

The final scraped data:
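Pieced together from the dictionaries yielded above, each record pushed downstream looks roughly like this (all values are illustrative, not real scraped output):

```python
{
    "source": "jd.com",
    "ocid": "",
    "entityId": "某某科技有限公司",  # company name from the licence page
    "product": [
        {
            "entityId": "某某科技有限公司",
            "productPic": "https://img10.360buyimg.com/n1/xxx.jpg",
            "productName": "示例产品",
            "productCategory": "",
            "productKeyword": "",
            "productPrice": "",
            "mainProducts": "分类A,分类B",
            "listingPlatform": "京东",
            "productShopName": "示例旗舰店",
            "dataLink": "https://mall.jd.com/index-11111111.html",
        }
    ],
}
```

A separate shop-level item (company, shop_name, shop_url, license, push_kafka_status) is yielded alongside it.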

This content is for learning and reference only and must not be used for commercial purposes. If there is any copyright issue, please contact us for removal. Thank you!

Feel free to get in touch to study and discuss together: QQ 540513871
