Preface: this post walks through a crawler for nationwide food data on AMap (Gaode Maps).

Anti-crawling points:
AMap's anti-crawling countermeasures come down to three things:
1. IP proxies.
2. headers (Referer and Cookie are required). Referer: any fixed value will do. Cookie: the required parameters are isg, l and cna (these can be self-generated mixes of digits and upper/lower-case letters) plus _uab_collina (a fixed value). A minimal sketch of generating the tokens follows this list.
3. Results are capped at 45 pages per query. Narrowing each query down to a district and a second-level category (e.g. hotpot) keeps result sets small, so you can fetch as much as possible without hitting the page cap.
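Since isg, l and cna only need to look like plausible tokens, they can be faked with random alphanumeric strings. A minimal sketch (the token lengths here are assumptions, not values AMap is known to validate):

import random, string

def fake_token(length):
    # Random mix of digits and upper/lower-case letters, per point 2 above.
    # The length is an assumption; AMap does not appear to check it strictly.
    return ''.join(random.choices(string.ascii_letters + string.digits, k=length))

# Plug these in place of the 'XXXX' placeholders in __init__ below.
isg = fake_token(32)
l = fake_token(24)
cna = fake_token(16)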
import requests, random, string, time, pymongo, re, json, datetime, logging
from Config import Config
from urllib import parse

requests.packages.urllib3.disable_warnings()  # verify=False is used below
logging.basicConfig(filename="show.log", filemode="a", format="%(asctime)s-%(name)s-%(levelname)s-%(message)s", level=logging.INFO)


class Amap(object):
    def __init__(self):
        # Cookie tokens: any self-generated mix of digits and letters works (see above)
        self.isg = 'XXXX'
        self.l = 'XXX'
        self.cna = 'XXXX'
        # Assumed MongoDB collections (URI and names are placeholders): post_city holds
        # the pre-collected provinces/cities with adcodes that get_pro reads, and
        # post_shop is where save_info (sketched at the end) writes shops
        client = pymongo.MongoClient('mongodb://localhost:27017/')
        self.post_city = client['amap']['post_city']
        self.post_shop = client['amap']['post_shop']
    def get_pro(self):
        get_pro_list = self.post_city.find({})
        # The [9:] slice skips the first nine provinces, presumably resuming an earlier run
        for get_pro in get_pro_list[9:]:
            print('begin......{}'.format(get_pro['pro_name']))
            pro_name = get_pro['pro_name']
            for every_city in get_pro['city_list']:
                choose_city = every_city
                city_name = choose_city['city_name']
                print('begin city ....{}'.format(city_name))
                city_adcode = choose_city['city_adcode']
                # 1. Fetch all districts of the city plus the second-level food categories;
                #    keywords=美食 is the search term ("food")
                show_url = 'https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum=1&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=17&city={}&geoobj=121.9098|25.510585|111.923414|24.516816&_src=around&keywords=美食'.format(city_adcode)
                headers = self.get_headers(city_adcode)
                show_json = self.request_url(show_url, headers)
                if show_json:
                    # Collect the districts, if the response carries any
                    area_list = []
                    if 'bizAreaData' in show_json:
                        districts = show_json['bizAreaData']['districts']
                        for k in districts:
                            area_dict = {}
                            area_dict['area_name'] = k['name']
                            area_dict['area_value'] = k['districts']
                            area_list.append(area_dict)
                        self.deal_areas(pro_name, city_name, city_adcode, area_list)
                    else:
                        print('This city has no districts......')
                        area_list = []
                        self.deal_areas(pro_name, city_name, city_adcode, area_list)
                else:
                    print('{}: failed to fetch JSON data.......'.format(city_name))
    def deal_areas(self, pro_name, city_name, city_adcode, area_list):
        classify_list = Config.classify_list
        if len(area_list) > 0:
            # The city has districts: iterate district x category so each query
            # stays well under the 45-page cap
            for j in area_list:
                area_name = j['area_name']
                area_site_list = j['area_value']
                for k in area_site_list:
                    if re.search('全部', k['name']):  # skip the "全部" (all districts) pseudo entry
                        continue
                    else:
                        area_site_adcode = k['adcode']
                        area_site_name = k['name']
                        for m in classify_list:
                            classify_name = m['classify_name']
                            classify_value = m['classify_value']
                            print('{}...{}..{}.get shop {} begin.....'.format(city_name, area_name, area_site_name, classify_name))
                            self.deal_information(pro_name, city_name, city_adcode, area_name,
                                                  area_site_adcode, area_site_name,
                                                  classify_name, classify_value)
        else:
            # No districts: fall back to city-wide queries per second-level category
            print('This city has zero districts..........')
            area_name = ''
            area_site_adcode = ''
            area_site_name = ''
            classify_list2 = Config.classify_list2
            for m in classify_list2:
                classify_name = m['classify_name']
                second_classify_list = m['second_list']
                if len(second_classify_list) > 0:
                    for l in second_classify_list:
                        print('{}...{}..{}.get shop {} begin.....'.format(city_name, area_name, area_site_name, classify_name))
                        self.deal_other_information(pro_name, city_name, city_adcode, area_name,
                                                    area_site_adcode, area_site_name, classify_name, l)
                else:
                    print('{}...{}..{}.get shop {} begin.....'.format(city_name, area_name, area_site_name, classify_name))
                    self.deal_other_information(pro_name, city_name, city_adcode, area_name,
                                                area_site_adcode, area_site_name, classify_name, '')
    def deal_other_information(self, pro_name, city_name, city_adcode, area_name,
                               area_site_adcode, area_site_name, classify_name, second_classify_str):
        # Search by the second-level category name when there is one, else by the first-level name
        if second_classify_str:
            second_un = parse.quote(second_classify_str)
        else:
            second_un = parse.quote(classify_name)
        geoobj = parse.quote('')
        i = 1  # current page number
        a = 0  # shops fetched so far
        while True:
            url = 'https://ditu.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum={}&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=14.18&city={}&geoobj={}&keywords={}'.format(str(i), city_adcode, geoobj, second_un)
            headers = self.get_headers(city_adcode)
            resp_json = self.request_url(url, headers)
            if resp_json:
                shop_total = int(resp_json['data']['total'])
                print('{} shops in total'.format(shop_total))
                if 'poi_list' in resp_json['data']:
                    a += len(resp_json['data']['poi_list'])
                else:
                    break  # no poi_list means we have paged past the last result
                print('{} shops crawled so far'.format(a))
                if shop_total > 0:
                    for j in resp_json['data']['poi_list']:
                        shop_id = j['id']
                        shop_name = j['name']
                        shop_address = j['address']
                        shop_tel = j['tel']
                        shop_latitude = j['latitude']
                        shop_longitude = j['longitude']
                        # domain_list[8] carries the second-level category when present
                        if len(j['domain_list']) > 8 and 'value' in j['domain_list'][8]:
                            second_classify = j['domain_list'][8]['value']
                        else:
                            second_classify = ''
                        self.save_info(pro_name, city_name, area_name, area_site_name,
                                       classify_name, shop_id, shop_name, shop_address,
                                       shop_tel, shop_latitude, shop_longitude, second_classify)
                else:
                    print('shop num is none.......')
                    break
            else:
                print('{}...{}..{}.get shop type information failed'.format(city_name, area_name, area_site_name))
                break
            i += 1
    def deal_information(self, pro_name, city_name, city_adcode, area_name,
                         area_site_adcode, area_site_name, classify_name, classify_value):
        geoobj = parse.quote('')
        # classify_data pins the query to one district plus one category, which keeps
        # each result set under the 45-page cap (see the encoded example after this method)
        classify_data = parse.quote('business_area_flag=1;adcode={};custom=business_area:{}+{}+sort_rule=5;reserved_keywords=true'.format(area_site_adcode, area_site_name, classify_value))
        user_loc = parse.quote('')
        # keywords=%E7%BE%8E%E9%A3%9F is the URL-encoded search term 美食 ("food")
        need_params = 'city={}&geoobj={}&_src=around&classify_data={}&user_loc={}&keywords=%E7%BE%8E%E9%A3%9F'.format(city_adcode, geoobj, classify_data, user_loc)
        i = 1  # current page number
        a = 0  # shops fetched so far
        while True:
            need_url1 = 'https://www.amap.com/service/poiInfo?query_type=TQUERY&pagesize=20&pagenum={}&qii=true&cluster_state=5&need_utd=true&utd_sceneid=1000&div=PC1000&addr_poi_merge=true&is_classify=true&zoom=17&'.format(str(i))
            every_url = need_url1 + need_params
            headers = self.get_headers(city_adcode)
            resp_json = self.request_url(every_url, headers)
            if resp_json:
                shop_total = int(resp_json['data']['total'])
                print('{} shops in total'.format(shop_total))
                if 'poi_list' in resp_json['data']:
                    a += len(resp_json['data']['poi_list'])
                else:
                    break  # paged past the last result
                print('{} shops crawled so far'.format(a))
                if shop_total > 0:
                    for j in resp_json['data']['poi_list']:
                        shop_id = j['id']
                        shop_name = j['name']
                        shop_address = j['address']
                        shop_tel = j['tel']
                        shop_latitude = j['latitude']
                        shop_longitude = j['longitude']
                        # domain_list[8] carries the second-level category when present
                        if len(j['domain_list']) > 8 and 'value' in j['domain_list'][8]:
                            second_classify = j['domain_list'][8]['value']
                        else:
                            second_classify = ''
                        self.save_info(pro_name, city_name, area_name, area_site_name,
                                       classify_name, shop_id, shop_name, shop_address,
                                       shop_tel, shop_latitude, shop_longitude, second_classify)
                else:
                    print('shop num is none.......')
                    break
            else:
                print('{}...{}..{}.get shop type information failed'.format(city_name, area_name, area_site_name))
                break
            i += 1
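    # For illustration (hypothetical values, not from a real response): parse.quote
    # percent-encodes '=' as %3D, ';' as %3B, ':' as %3A and '+' as %2B, so with
    # adcode='110101', district name='XX区' and classify_value='050100' the
    # classify_data built above would come out as
    #   business_area_flag%3D1%3Badcode%3D110101%3Bcustom%3Dbusiness_area%3AXX%E5%8C%BA%2B050100%2Bsort_rule%3D5%3Breserved_keywords%3Dtrue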
    def get_headers(self, city_adcode):
        # No hard-coded Host header: requests fills it in per URL, which also keeps
        # the ditu.amap.com requests in deal_other_information valid
        headers = {
            'Accept': '*/*',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'X-Requested-With': 'XMLHttpRequest',
            'User-Agent': random.choice(Config.pc_user_agent_list),
            # The real Referer template is redacted; any fixed value works
            'Referer': 'XXXXXXXXXXXXXX'.format(city_adcode),
            # _uab_collina is a fixed (redacted) value; cna/isg/l come from __init__
            'Cookie': 'cna={}; _uab_collina=XXXXXXXX; isg={}; l={}'.format(self.cna, self.isg, self.l)
        }
        return headers
    def request_url(self, url, headers):
        i = 0
        while i <= 5:
            if i == 5:
                print('retried five times {}'.format(url))
                logging.info("get url five times failed %s" % url)
                return {}
            try:
                resp = requests.get(url, headers=headers, proxies=Config.proxies, verify=False, timeout=2)
                resp_json = json.loads(resp.text)
                # A normal payload carries a 'status' field; anything else means we were blocked
                if 'status' in resp_json:
                    return resp_json
                else:
                    print('Blocked by anti-crawling, retrying....attempt {}'.format(str(i)))
                    i += 1
                    continue
            except Exception as e:
                print('get json data failed {}'.format(str(e)))
                i += 1
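The save_info method is called throughout but not shown in this section. A minimal sketch, assuming shops are upserted into the post_shop collection set up in __init__ (the upsert-by-shop_id behavior is my assumption, not the original implementation), followed by an entry point:

    def save_info(self, pro_name, city_name, area_name, area_site_name, classify_name,
                  shop_id, shop_name, shop_address, shop_tel, shop_latitude,
                  shop_longitude, second_classify):
        # Sketch only: upsert by shop_id so re-running the crawler does not duplicate shops
        self.post_shop.update_one(
            {'shop_id': shop_id},
            {'$set': {'pro_name': pro_name, 'city_name': city_name,
                      'area_name': area_name, 'area_site_name': area_site_name,
                      'classify_name': classify_name, 'shop_name': shop_name,
                      'shop_address': shop_address, 'shop_tel': shop_tel,
                      'shop_latitude': shop_latitude, 'shop_longitude': shop_longitude,
                      'second_classify': second_classify,
                      'crawl_time': datetime.datetime.now()}},
            upsert=True)


if __name__ == '__main__':
    Amap().get_pro()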