小程序项目思路分享爬虫
具体需求:
有这几个就行,门店名称+门店地址+门店类型,再加上省、市、县/区
门店名称:storeName
门店地址:storeAddress
程序运行:
honor_spider获取经纬度信息。
经纬度------>详细店铺接口
这是荣耀店铺接口数据,请求参数带有省/市的经纬度信息。返回响应数据是省、市下的行政区划荣耀店铺信息
https://retail.hihonor.com/isrp/sms/online/store-info/getTargetStore/hi-honor
curl
curl -H 'Host: retail.hihonor.com' -H 'appsource: wxapp' -H 'x-from: wxapp' -H 'charset: utf-8' -H 'x-appname: wxapp' -H 'user-agent: Mozilla/5.0 (Linux; Android 13; Mi 10 Build/TKQ1.221114.001; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/116.0.0.0 Mobile Safari/537.36 XWEB/1160117 MMWEBSDK/20240301 MMWEBID/291 MicroMessenger/8.0.48.2580(0x2800303D) WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64 MiniProgramEnv/android' -H 'x-user-token: ' -H 'content-type: application/json' -H 'x-access-token: ' -H 'referer: https://servicewechat.com/wxde6bc80b07638016/409/page-frame.html' --data-binary '{"coordinateType":"2","longitude":116.31188232421874,"latitude":40.03590304904514,"isGetTargetStore":true,"operatingStatus":1,"isCheckSpu":true,"range":50000,"enableWechatApplet":1}' --compressed 'https://retail.hihonor.com/isrp/sms/online/store-info/getTargetStore/hi-honor'
接口特点:
接口只返回省市下的店铺信息,而不返回区级及以下的店铺信息。所以要提取行政区划信息,要对店铺地址写正则表达式提取。
代码:
import requests
import json
# Demo: POST one coordinate to the Honor store-locator endpoint and print the
# name and address of every store in the response.
headers = {
    "Host": "retail.hihonor.com",
    "appsource": "wxapp",
    "x-from": "wxapp",
    "charset": "utf-8",
    "x-appname": "wxapp",
    "user-agent": "Mozilla/5.0 (Linux; Android 13; Mi 10 Build/TKQ1.221114.001; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/116.0.0.0 Mobile Safari/537.36 XWEB/1160117 MMWEBSDK/20240301 MMWEBID/291 MicroMessenger/8.0.48.2580(0x2800303D) WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64 MiniProgramEnv/android",
    # The body below is JSON text; when data= is a plain string, requests does
    # not set a content type by itself, so declare it as the captured request does.
    "content-type": "application/json",
}
url = "https://retail.hihonor.com/isrp/sms/online/store-info/getTargetStore/hi-honor"
data = {
    "coordinateType": "2",
    "longitude": 114.571090,
    "latitude": 38.146190,
}
data = json.dumps(data, separators=(',', ':'))
# Bound the wait so a stalled connection cannot hang the script.
response = requests.post(url, headers=headers, data=data, timeout=10)

# A non-JSON body (for example an HTML error page) is reported as a failed
# request instead of aborting with a traceback.
try:
    response_data = response.json()
except ValueError:
    response_data = None

stores = None
if isinstance(response_data, dict) and response_data.get('status') == 'success':
    stores = response_data.get('result')

# Print storeName / storeAddress for each store; .get() tolerates missing keys.
if isinstance(stores, list):
    for store in stores:
        store_name = store.get('storeName')
        store_address = store.get('storeAddress')
        print('店铺名称:', store_name)
        print('店铺地址:', store_address)
else:
    print('请求失败或响应数据格式不正确')
结果
创建scrapy:scrapy startproject miniprogram
创建爬虫:scrapy genspider honor consumer.huawei.com(genspider 需要同时给出爬虫名和域名)
省市------>经纬度接口
url = 'https://apis.map.qq.com/jsapi?qt=geoc&addr={}&output=jsonp&pf=jsapi&ref=jsapi&cb=qq.maps._svcb3.geocoder0'.format(name)
请求参数是省市名称,返回的数据是经纬度信息。这里调用的是腾讯地图公开的 JSAPI 地理编码接口。
爬取直辖市
# Province-level division name -> region code passed to the region-list API
# as ``parent_alpha_2_code``. Insertion order follows the numeric code.
province_data = {
    '北京市': 'CN-11',
    '天津市': 'CN-12',
    '河北省': 'CN-13',
    '山西省': 'CN-14',
    '内蒙古自治区': 'CN-15',
    '辽宁省': 'CN-21',
    '吉林省': 'CN-22',
    '黑龙江省': 'CN-23',
    '上海市': 'CN-31',
    '江苏省': 'CN-32',
    '浙江省': 'CN-33',
    '安徽省': 'CN-34',
    '福建省': 'CN-35',
    '江西省': 'CN-36',
    '山东省': 'CN-37',
    '河南省': 'CN-41',
    '湖北省': 'CN-42',
    '湖南省': 'CN-43',
    '广东省': 'CN-44',
    '广西壮族自治区': 'CN-45',
    '海南省': 'CN-46',
    '重庆市': 'CN-50',
    '四川省': 'CN-51',
    '贵州省': 'CN-52',
    '云南省': 'CN-53',
    '西藏自治区': 'CN-54',
    '陕西省': 'CN-61',
    '甘肃省': 'CN-62',
    '青海省': 'CN-63',
    '宁夏回族自治区': 'CN-64',
    '新疆维吾尔自治区': 'CN-65',
}
请求测试逻辑
# import requests
# import json
# import re
#
# # 假设您已经定义了get_proxies方法,如果没有,您可以将proxies参数从下面的请求中移除
# def get_proxies():
# # 返回代理服务器的配置,例如:
#
# # 如果您不使用代理,可以返回None或者直接在请求中移除proxies参数
# return None
#
# # 坐标
# def zb(name): # name是区名
# print('爬取坐标')
# url = 'https://apis.map.qq.com/jsapi?qt=geoc&addr={}&output=jsonp&pf=jsapi&ref=jsapi&cb=qq.maps._svcb3.geocoder0'.format(name)
# headers = {
# "user-agent": '
# }
# try:
# response = requests.get(url=url, headers=headers, proxies=get_proxies(), timeout=7)
# html = response.text
# if html[-1] != ';':
# html = html + ';'
#
# html = html.replace('\n', '').replace('\t', '').replace('\r', '')
# html = re.findall('\({(.*?)\);', html)[0]
# html = json.loads('{' + html)
# return html
# except Exception as e:
# print('获取坐标错误', name, '重试!')
# print(e)
# return zb(name)
#
# # 测试代码
# if __name__ == "__main__":
# test_name = "北京市" # 测试用的区名,可以更换为其他区名。用provinceName
# result = zb(test_name)
# print(json.dumps(result, indent=4, ensure_ascii=False))
import requests
import json
import re
from connRedis import OPRedis
def get_proxies():
    """Return the proxy mapping passed to ``requests``, or ``None`` for a direct connection.

    The real proxy account was removed from the published code, so the three
    placeholders below are empty and the function returns ``None`` until they
    are filled in.

    Returns:
        dict | None: ``{"http": ..., "https": ...}`` pointing at the
        authenticated HTTP proxy, or ``None`` when no proxy is configured.
    """
    # Fill these in to route requests through an authenticated HTTP proxy.
    tid = ""        # proxy account / order id
    password = ""   # proxy password
    proxy_ip = ""   # proxy endpoint, "host:port"

    if not (tid and password and proxy_ip):
        return None

    proxy_url = "http://%(user)s:%(pwd)s@%(proxy)s/" % {"user": tid, "pwd": password, "proxy": proxy_ip}
    # The same HTTP proxy endpoint carries both plain and TLS traffic.
    proxies = {
        "http": proxy_url,
        "https": proxy_url,
    }
    return proxies
# 坐标
def zb(name, max_retries=5, retry_delay=1.0):  # name: place name to geocode
    """Look up the coordinates of a place name via the Tencent Maps JSONP endpoint.

    Args:
        name: Place name sent as the ``addr`` query parameter.
        max_retries: Total number of attempts before giving up.
        retry_delay: Seconds to wait between failed attempts.

    Returns:
        dict: ``{'pointx': ..., 'pointy': ...}`` copied from the ``detail``
        object of the response (the demo caller prints ``pointx`` as
        longitude and ``pointy`` as latitude).

    Raises:
        RuntimeError: if every attempt fails; the last underlying error is
            attached as the cause.
    """
    import time  # local import: the surrounding script does not import time

    print('爬取坐标')
    url = 'https://apis.map.qq.com/jsapi'
    # Passing the query as params lets requests encode the place name safely.
    params = {
        'qt': 'geoc',
        'addr': name,
        'output': 'jsonp',
        'pf': 'jsapi',
        'ref': 'jsapi',
        'cb': 'qq.maps._svcb3.geocoder0',
    }
    headers = {
        # The original value was removed before publishing; any ordinary
        # browser user agent can be substituted here.
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    }

    last_error = None
    # Bounded loop instead of unbounded recursion: an unstable proxy is
    # retried a few times, but a permanent failure terminates.
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url=url, params=params, headers=headers,
                                    proxies=get_proxies(), timeout=7)
            html = response.text.replace('\n', '').replace('\t', '').replace('\r', '').strip()
            # JSONP looks like callback({...}) with an optional trailing ';':
            # take everything between the first '(' and the last ')'.
            jsonp_str = html[html.index('(') + 1:html.rindex(')')]
            json_data = json.loads(jsonp_str)
            detail = json_data['detail']
            return {'pointx': detail['pointx'], 'pointy': detail['pointy']}
        except Exception as e:
            last_error = e
            print('获取坐标错误', name, '重试!')
            print(e)
            if attempt < max_retries:
                time.sleep(retry_delay)

    raise RuntimeError(
        'zb: giving up on {!r} after {} attempts'.format(name, max_retries)
    ) from last_error
# 测试代码
# Manual smoke test: geocode a single district and print both coordinates.
if __name__ == "__main__":
    district = "正定县"  # sample district name; any other name can be used
    point = zb(district)
    print('经度:', point['pointx'])
    print('纬度:', point['pointy'])
返回数据
import requests
import json
# Demo: replay the full captured mini-program request against the Honor
# store-locator endpoint and dump the raw response.
headers = {
    "Host": "retail.hihonor.com",
    "appsource": "wxapp",
    "x-from": "wxapp",
    "charset": "utf-8",
    "x-appname": "wxapp",
    # Value taken from the captured curl command; it was blanked out in this
    # snippet, which left the dict literal syntactically invalid.
    "user-agent": "Mozilla/5.0 (Linux; Android 13; Mi 10 Build/TKQ1.221114.001; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/116.0.0.0 Mobile Safari/537.36 XWEB/1160117 MMWEBSDK/20240301 MMWEBID/291 MicroMessenger/8.0.48.2580(0x2800303D) WeChat/arm64 Weixin NetType/WIFI Language/zh_CN ABI/arm64 MiniProgramEnv/android",
    "x-user-token": "",
    "content-type": "application/json",
    "x-access-token": "",
    "referer": "https://servicewechat.com/wxde6bc80b07638016/409/page-frame.html",
}
url = "https://retail.hihonor.com/isrp/sms/online/store-info/getTargetStore/hi-honor"
data = {
    "coordinateType": "2",
    "longitude": 116.231280,
    "latitude": 40.220770,
    "isGetTargetStore": True,
    "operatingStatus": 1,
    "isCheckSpu": True,
    "range": 50000,
    "enableWechatApplet": 1,
}
data = json.dumps(data, separators=(',', ':'))
# Bound the wait so a stalled connection cannot hang the script.
response = requests.post(url, headers=headers, data=data, timeout=10)
print(response.text)
print(response)
帮我提取出这段代码中的
问题
1.获取某些省,位置坐标报错
只能获取市级单位的经纬坐标
可见从头开始做测试的重要性,切忌想当然!!
2.代理不稳定,要重复请求
石家庄市
省------>市接口
"https://ccpce-cn.consumer.huawei.com/ccpcmd/services/dispatch/secured/CCPC/EN/ccpc/queryRegionList/1000"
这个接口比较特殊,是在华为消费者官网上找到的。通过这个接口,可以直接获取省下面的市名列表,这样就不用单独建数据库表了,直接调用就好。这里注明来源,防止之后失效。
代码
# 获取下一级,省->市->区/县->
def get_next_level(code, level, max_retries=3, retry_delay=2):  # this file only requests the "city" level
    """Fetch the child regions of ``code`` from the Huawei consumer region-list API.

    Args:
        code: Parent region code, sent as ``parent_alpha_2_code`` (e.g. "CN-36").
        level: Value sent as ``scopeGrade`` (the callers here pass "city").
        max_retries: Total number of attempts before giving up.
        retry_delay: Seconds to sleep between failed attempts.

    Returns:
        list: The entries of ``responseData.list`` from the response, or an
        empty list when every attempt failed.
    """
    import time  # local import: the surrounding script does not import time

    headers = {
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
        "Connection": "keep-alive",
        "Referer": "https://consumer.huawei.com/",
        "Sec-Fetch-Dest": "script",
        "Sec-Fetch-Mode": "no-cors",
        "Sec-Fetch-Site": "same-site",
        # The original value was removed before publishing; this is an
        # ordinary desktop Chrome user agent.
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        # NOTE(review): the pasted sec-ch-ua values were mangled by Windows
        # shell caret-escaping ("^\^Not", "^\^Windows^^"); restored to
        # well-formed values matching the user agent above - confirm.
        "sec-ch-ua": '"Not A(Brand";v="99", "Google Chrome";v="121", "Chromium";v="121"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"Windows"',
    }
    cookies = {}
    url = "https://ccpce-cn.consumer.huawei.com/ccpcmd/services/dispatch/secured/CCPC/EN/ccpc/queryRegionList/1000"
    now_ms = int(time.time() * 1000)
    params = {
        # jQuery-style JSONP callback name suffixed with the current timestamp.
        "jsonp": "jQuery36205836021093286132_{}".format(now_ms),
        "countryCode": "CN",
        "lang": "zh-cn",
        "parent_alpha_2_code": code,
        "scopeGrade": level,
        "curpage": "1",
        "pagesize": "1000",
        "_": now_ms,
    }

    attempts = 0
    while attempts < max_retries:
        # Count the attempt up front so the loop always terminates; the
        # original never incremented this counter and retried forever.
        attempts += 1
        try:
            response = requests.get(url=url, headers=headers, cookies=cookies,
                                    params=params, proxies=get_proxies(), timeout=7)
            html = response.text.strip()
            # JSONP looks like callback({...}) with an optional trailing ';':
            # take everything between the first '(' and the last ')'.
            payload = html[html.index('(') + 1:html.rindex(')')]
            parsed = json.loads(payload)
            return list(parsed['responseData']['list'])
        except Exception as e:
            # Best-effort by design: the proxy is unstable, so any failure is
            # logged and retried rather than propagated.
            print('获取下一级错误', code, '返回空数组!')
            print('异常类型:', type(e).__name__)
            print('异常信息:', e)
            if attempts >= max_retries:
                print('已达到最大尝试次数,返回空数组!')
                return []
            # Short pause before the next attempt.
            time.sleep(retry_delay)
    return []
# if __name__ == '__main__':
# data = get_next_level("CN-36","city")#逻辑是如果是省调用这个接口
# 主函数
# Entry point: for every province that is not a municipality, fetch its
# city-level regions and print the province name, city count and city names.
if __name__ == '__main__':
    for province_name, province_code in province_data.items():
        # Names ending in "市" are municipalities; they are skipped because
        # no city level is fetched for them.
        if not province_name.endswith('市'):
            cities = get_next_level(province_code, "city")
            total = len(cities)
            names = [entry['multi_lang_name'] for entry in cities]
            print(f'省份名称: {province_name}')
            print(f'市的个数: {total}')
            print(f'包含的市名列表: {names}')
            print('----------------------------------------')
结果
代码优化:
写成了面向对象,有需要的可以后台call我。
之后如果客户有其他小程序的需求,还需要用 scrapy 框架进行分布式爬取数据。
封装成scrapy框架
scrapy genspider honor consumer.huawei.com
实例:Scrapy框架请求逻辑
import scrapy
import json
import re
import time
from connRedis import OPRedis
class RegionSpider(scrapy.Spider):
    """Spider that lists the cities of each province via the Huawei region-list API.

    One request is issued per non-municipality entry of ``province_data``;
    ``parse_city`` unwraps the JSONP response and prints the province name,
    the number of cities and their names.
    """

    name = 'region_spider'
    allowed_domains = ['consumer.huawei.com']
    # Province name -> region code sent as parent_alpha_2_code.
    province_data = {
        # ... other provinces omitted ...
        '江西省': 'CN-36',
        # ... other provinces omitted ...
    }

    def start_requests(self):
        """Yield one city-list request per province, skipping municipalities."""
        for province_name, province_code in self.province_data.items():
            # Names ending in "市" are municipalities; no city level is fetched.
            if province_name.endswith('市'):
                continue
            yield scrapy.Request(
                url=self.get_url(province_code, "city"),
                callback=self.parse_city,
                meta={'province_name': province_name, 'province_code': province_code},
                cookies=self.get_cookies(),
                headers=self.get_headers(),
            )

    def parse_city(self, response):
        """Unwrap the JSONP body and print the province's city count and names.

        Parse failures are logged with the province code and otherwise ignored.
        """
        province_name = response.meta['province_name']
        province_code = response.meta['province_code']
        try:
            jsonp_string = response.text.strip()
            # JSONP looks like callback({...}) with an optional trailing ';'.
            # Slicing between the first '(' and the last ')' also works when
            # the ';' is missing or the body spans several lines.
            json_string = jsonp_string[jsonp_string.index('(') + 1:jsonp_string.rindex(')')]
            data = json.loads(json_string)
            city_data = data['responseData']['list']
            city_count = len(city_data)
            city_names = [item['multi_lang_name'] for item in city_data]
        except (ValueError, KeyError, TypeError) as e:
            # ValueError covers both a missing parenthesis and invalid JSON.
            self.logger.error(f'解析错误: {e}, 省份代码: {province_code}')
            return
        print(f'省份名称: {province_name}')
        print(f'市的个数: {city_count}')
        print(f'包含的市名列表: {city_names}')

    def get_url(self, code, level):
        """Build the region-list URL for parent region ``code`` at ``level``."""
        # The same millisecond timestamp feeds the JSONP callback name and "_".
        timestamp = int(time.time() * 1000)
        return (
            "https://ccpce-cn.consumer.huawei.com/ccpcmd/services/dispatch/secured/CCPC/EN/ccpc/queryRegionList/1000"
            f"?jsonp=jQuery36205836021093286132_{timestamp}"
            "&countryCode=CN&lang=zh-cn"
            f"&parent_alpha_2_code={code}&scopeGrade={level}"
            f"&curpage=1&pagesize=1000&_={timestamp}"
        )

    def get_cookies(self):
        """Return the cookies sent with every request."""
        return {
            # ... other cookies omitted ...
            # NOTE(review): hard-coded captured session id; presumably expires - confirm.
            "JSESSIONID": "F78B38998A33F9B74DE0077819BF987E603F6C1FC1AC1005",
            # ... other cookies omitted ...
        }

    def get_headers(self):
        """Return the HTTP headers sent with every request."""
        return {
            "Accept": "*/*",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
        }
代码用法
在 Scrapy 爬虫类中,start_requests 方法生成初始请求,parse_city 方法解析响应并打印结果;get_url、get_cookies、get_headers 方法分别用于构建请求的 URL、Cookies 和 Headers,get_proxies 方法用于提供代理(上面的示例类中没有给出它的实现)。
这里的 connRedis 模块和其中的 OPRedis 类是代理中间件。