高德地图美食

前言:今天讲下高德地图全国美食爬虫。

反爬点:

高德地图反爬主要是:

1.IP代理。

2.headers(referer、cookie 为必要项)。referer:随便填一个即可,可以保持不变。cookie:必要的参数为 isg、l、cna(可自行生成数字与大小写字母的组合),以及 uab_collina(固定值)。

3.限制最大45页,可将地区精确到区、二级分类精确到火锅等,以尽可能多地获取数据,避免触及最大页数限制。

import requests,random,string,time,pymongo,re,json,datetime,logging
from Config import Config
from urllib import parse
logging.basicConfig(filename="show.log",filemode="a",format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",level=logging.INFO)


class Amap(object):
    """Scraper for Amap (高德地图) nationwide food POIs.

    Anti-bot requirements (per the accompanying article): requests must
    go through an IP proxy and carry a Referer plus a Cookie containing
    isg / l / cna (arbitrary alphanumeric values are accepted) and a
    fixed _uab_collina token.  Amap caps listings at 45 pages, so
    queries are narrowed to district + second-level food category to
    stay under the cap.
    """

    def __init__(self):
        # Cookie tokens required by amap.com; the 'XXX' placeholders
        # must be replaced with real (or self-generated) values.
        self.isg = 'XXXX'
        self.l = 'XXX'
        self.cna = 'XXXX'
        # NOTE(review): get_pro() reads self.post_city (apparently a
        # MongoDB collection of provinces/cities) but nothing in this
        # snippet assigns it -- presumably set up from Config in the
        # complete source; confirm before running.

    def get_pro(self):
        """Iterate stored provinces/cities and scrape each city's food POIs.

        For every city, fetch the city-level "美食" landing JSON to
        discover its districts, then delegate to deal_areas().
        """
        get_pro_list = self.post_city.find({})
        # [9:] skips the first nine provinces -- looks like a manual
        # resume checkpoint from a previously interrupted run.
        for get_pro in get_pro_list[9:]:
            print('begin......{}'.format(get_pro['pro_name']))
            pro_name = get_pro['pro_name']
            for every_city in get_pro['city_list']:
                choose_city = every_city
                city_name = choose_city['city_name']
                print('begin city ....{}'.format(city_name))
                city_adcode = choose_city['city_adcode']
                # Step 1: fetch all districts of the city plus the
                # second-level food categories.
                show_url = 'https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=17&city={}&geoobj=121.9098|25.510585|111.923414|24.516816&_src=around&keywords=美食'.format(city_adcode)
                headers = self.get_headers(city_adcode)
                show_json = self.request_url(show_url, headers)
                if show_json:
                    area_list = []
                    if 'bizAreaData' in show_json:
                        # City has districts: collect (district name,
                        # sub-district list) pairs for deal_areas().
                        districts = show_json['bizAreaData']['districts']
                        for k in districts:
                            area_dict = {}
                            area_dict['area_name'] = k['name']
                            area_dict['area_value'] = k['districts']
                            area_list.append(area_dict)
                        self.deal_areas(pro_name, city_name, city_adcode, area_list)
                    else:
                        # No district data: pass an empty list so that
                        # deal_areas() takes the city-wide fallback path.
                        print('该市并未有区......')
                        area_list = []
                        self.deal_areas(pro_name, city_name, city_adcode, area_list)
                else:
                    print('{} 未获取到 json数据.......'.format(city_name))

    def deal_areas(self, pro_name, city_name, city_adcode, area_list):
        """Fan out over districts x food categories for one city.

        With districts present, query every (district, category) pair via
        deal_information(); otherwise fall back to city-wide queries per
        second-level category via deal_other_information().
        """
        classify_list = Config.classify_list
        if len(area_list) > 0:
            for j in area_list:
                area_name = j['area_name']
                area_site_list = j['area_value']
                for k in area_site_list:
                    # Skip the aggregate "全部" (all) pseudo-district.
                    if re.search('全部', k['name']):
                        continue
                    else:
                        area_site_adcode = k['adcode']
                        area_site_name = k['name']
                        for m in classify_list:
                            classify_name = m['classify_name']
                            classify_value = m['classify_value']
                            print('{}...{}..{}.get shop {} begin.....'.format(city_name, area_name, area_site_name,
                                                                              classify_name))
                            self.deal_information(pro_name, city_name, city_adcode, area_name, area_site_adcode, area_site_name, classify_name, classify_value)
        else:
            print('该市分区为0..........')
            area_name = ''
            area_site_adcode = ''
            area_site_name = ''
            classify_list2 = Config.classify_list2
            for m in classify_list2:
                classify_name = m['classify_name']
                second_classify_list = m['second_list']
                if len(second_classify_list) > 0:
                    # One query per second-level category name.
                    for l in second_classify_list:
                        print('{}...{}..{}.get shop {} begin.....'.format(city_name, area_name, area_site_name,
                                                                          classify_name))
                        self.deal_other_information(pro_name, city_name, city_adcode, area_name, area_site_adcode, area_site_name,
                                                    classify_name, l)
                else:
                    # No second-level names: query by the first-level
                    # category (empty second-level string).
                    print('{}...{}..{}.get shop {} begin.....'.format(city_name, area_name, area_site_name,
                                                                      classify_name))
                    self.deal_other_information(pro_name, city_name, city_adcode, area_name, area_site_adcode,
                                                area_site_name,
                                                classify_name, '')

    def deal_other_information(self, pro_name, city_name, city_adcode, area_name, area_site_adcode, area_site_name,
                               classify_name, second_classify_str):
        """Page through city-wide results for one food category.

        Used when a city has no districts; the keyword falls back to the
        first-level category name when no second-level name is given.
        Stops when the response is empty, has no poi_list, or fails.
        """
        if second_classify_str:
            second_un = parse.quote(second_classify_str)
        else:
            second_un = parse.quote(classify_name)
        geoobj = parse.quote('')
        i = 1   # current page number
        a = 0   # running count of shops fetched so far
        while True:
            url = 'https://ditu.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum={}&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=14.18&city={}&geoobj={}&keywords={}'.format(str(i), city_adcode, geoobj, second_un)
            headers = self.get_headers(city_adcode)
            resp_json = self.request_url(url, headers)
            if resp_json:
                shop_total = int(resp_json['data']['total'])
                print('总共{}个店铺'.format(resp_json['data']['total']))
                if 'poi_list' in resp_json['data']:
                    now_num = len(resp_json['data']['poi_list'])
                    a += now_num
                else:
                    # No poi_list key: we are past the last page.
                    break
                print('当前已爬取{}个店铺'.format(a))
                if shop_total > 0:
                    for j in resp_json['data']['poi_list']:
                        shop_id = j['id']
                        shop_name = j['name']
                        shop_address = j['address']
                        shop_tel = j['tel']
                        shop_latitude = j['latitude']
                        shop_longitude = j['longitude']
                        # Fixed: guard the index as well -- the original
                        # raised IndexError when domain_list had fewer
                        # than 9 entries.
                        domain_list = j['domain_list']
                        if len(domain_list) > 8 and 'value' in domain_list[8]:
                            second_classify = domain_list[8]['value']
                        else:
                            second_classify = ''
                        # NOTE(review): save_info() is not included in
                        # this snippet -- presumably persists the record.
                        self.save_info(pro_name, city_name, area_name, area_site_name, classify_name, shop_id, shop_name,
                                       shop_address, shop_tel, shop_latitude, shop_longitude, second_classify)
                else:
                    print('shop num is none.......')
                    break
            else:
                print('{}...{}..{}.get shop type  information failed'.format(city_name, area_name, area_site_name))
                break
            i += 1

    def deal_information(self, pro_name, city_name, city_adcode, area_name, area_site_adcode, area_site_name, classify_name, classify_value):
        """Page through one (district, category) query and save each shop.

        Stops when the response is empty, has no poi_list, or fails.
        """
        geoobj = parse.quote('')
        # Server-side filter string narrowing the query to the given
        # district and category.
        classify_data = parse.quote('business_area_flag=1;adcode={};custom=business_area:{}+{}+sort_rule=5;reserved_keywords=true'.format(area_site_adcode, area_site_name, classify_value))
        user_loc = parse.quote('')
        need_params = 'city={}&geoobj={}&_src=around&classify_data={}&user_loc={}&keywords=%E7%BE%8E%E9%A3%9F'.format(
            city_adcode, geoobj, classify_data, user_loc)
        i = 1   # current page number
        a = 0   # running count of shops fetched so far
        while True:
            need_url1 = 'https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum={}&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=17&'.format(str(i))
            every_url = need_url1 + need_params
            headers = self.get_headers(city_adcode)
            resp_json = self.request_url(every_url, headers)
            if resp_json:
                shop_total = int(resp_json['data']['total'])
                print('总共{}个店铺'.format(resp_json['data']['total']))
                if 'poi_list' in resp_json['data']:
                    now_num = len(resp_json['data']['poi_list'])
                    a += now_num
                else:
                    # No poi_list key: we are past the last page.
                    break
                print('当前已爬取{}个店铺'.format(a))
                if shop_total > 0:
                    for j in resp_json['data']['poi_list']:
                        shop_id = j['id']
                        shop_name = j['name']
                        shop_address = j['address']
                        shop_tel = j['tel']
                        shop_latitude = j['latitude']
                        shop_longitude = j['longitude']
                        # Fixed: guard the index as well (IndexError when
                        # domain_list has fewer than 9 entries).
                        domain_list = j['domain_list']
                        if len(domain_list) > 8 and 'value' in domain_list[8]:
                            second_classify = domain_list[8]['value']
                        else:
                            second_classify = ''
                        self.save_info(pro_name, city_name, area_name, area_site_name, classify_name, shop_id, shop_name, shop_address, shop_tel, shop_latitude, shop_longitude, second_classify)
                else:
                    print('shop num is none.......')
                    break
            else:
                print('{}...{}..{}.get shop type  information failed'.format(city_name, area_name, area_site_name))
                break
            i += 1

    def get_headers(self, city_adcode):
        """Build anti-bot request headers for amap.com.

        Referer and the cookie tokens (cna / _uab_collina / isg / l) are
        required; the User-Agent is rotated per request.
        """
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': 'www.amap.com',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': random.choice(Config.pc_user_agent_list),
            'Referer': 'XXXXXXXXXXXXXX'.format(city_adcode),
            # Fixed: the original referenced the bare names cna/isg/l
            # (NameError -- they exist only as instance attributes) and
            # contained stray garbled line fragments that broke parsing.
            'Cookie': 'cna={}; _uab_collina=XXXXXXXX;isg={}; l={}'.format(self.cna, self.isg, self.l),
        }
        return headers

    def request_url(self, url, headers):
        """GET *url* through the configured proxy and return parsed JSON.

        Retries up to five times when the body is not valid JSON or the
        JSON lacks the 'status' key (anti-bot interception); returns {}
        after the fifth failure so callers can treat it as falsy.
        """
        i = 0
        while i <= 5:
            if i == 5:
                print('retry five times {}'.format(url))
                logging.info("get url five times failed %s" % url)
                return {}
            try:
                # verify=False because requests go through a proxy;
                # NOTE(review): this disables TLS certificate checks.
                resp = requests.get(url, headers=headers, proxies=Config.proxies, verify=False, timeout=2)
                resp_json = json.loads(resp.text)
                if 'status' in resp_json:
                    return resp_json
                else:
                    print('被反爬啦,重新尝试了....{}次'.format(str(i)))
                    i += 1
                    continue
            except Exception as f:
                print('get json data failed {}'.format(str(f)))
            i += 1

来源:高德地图美食爬虫_高德,cookie-CSDN博客

相关推荐
许嵩66几秒前
IC脚本之perl
开发语言·perl
长亭外的少年12 分钟前
Kotlin 编译失败问题及解决方案:从守护进程到 Gradle 配置
android·开发语言·kotlin
直裾12 分钟前
Scala全文单词统计
开发语言·c#·scala
心仪悦悦12 分钟前
Scala中的集合复习(1)
开发语言·后端·scala
JIAY_WX15 分钟前
kotlin
开发语言·kotlin
代码小鑫1 小时前
A043-基于Spring Boot的秒杀系统设计与实现
java·开发语言·数据库·spring boot·后端·spring·毕业设计
smilejingwei1 小时前
面向 Java 程序员的 SQLite 替代品
开发语言·sqlite·spl·esproc spl
Beekeeper&&P...1 小时前
git bash是什么,git是什么,git中的暂存区是什么,git中的本地仓库是什么,git中工作目录指的是什么
开发语言·git·bash