Python Web Crawler Examples
Code
python
import requests
import json
import threading
from queue import Queue
import time


class HeiMaTouTiao:
    def __init__(self):
        # Request headers, including the JWT token used for authorization
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/92.0.4515.107 Safari/537.36",
            'Authorization': 'Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJIU'
                             'zI1NiJ9.eyJleHAiOjE2NTY2NTk3NjcsInVzZXJfaWQiOjEsInJlZn'
                             'Jlc2giOmZhbHNlLCJ2ZXJpZmllZCI6dHJ1ZX0.ZSdV5mT6w_yhEKLg'
                             'qcvWNln2GKHBxfxK7d8YXaoCMYg'}
        # Queue of article-list URLs waiting to be fetched
        self.url_queue = Queue()
        # Queue of extracted article data waiting to be saved
        self.content_queue = Queue()

    def get_url_list(self, start_page, end_page):
        # Build the article-list URL for every requested page and enqueue it
        url_temp = 'http://api-toutiao-web.itheima.net/mp/v1_0/articles?' \
                   'page={}&per_page=10&response_type=comment'
        url_list = [url_temp.format(i) for i in range(start_page, end_page + 1)]
        for url in url_list:
            print('正在请求:', url)
            self.url_queue.put(url)

    def get_data(self):
        # Consumer thread: take a URL from the queue, request it, and extract
        # the title and comment statistics of each article on that page
        while True:
            url = self.url_queue.get()
            comment = requests.get(url=url, headers=self.headers).text
            data = json.loads(comment)
            data = data['data']['results']
            # Collect one batch of articles per URL
            content_li = []
            for index in range(len(data)):
                content = dict()
                content['标题'] = data[index]['title']
                if data[index]['comment_status'] is True:
                    content['评论状态'] = '正常'
                else:
                    content['评论状态'] = '关闭'
                content['总评论数'] = data[index]['total_comment_count']
                content['粉丝评论数'] = data[index]['fans_comment_count']
                content_li.append(content)
            self.content_queue.put(content_li)
            self.url_queue.task_done()

    def save_data(self):
        # Consumer thread: append each extracted batch to toutiao.json
        while True:
            content_list = self.content_queue.get()
            with open('toutiao.json', mode='a+', encoding='utf-8') as f:
                f.write(json.dumps(content_list, ensure_ascii=False, indent=2))
            self.content_queue.task_done()

    def run(self):
        start_page = int(input('请输入抓取的起始页:'))
        end_page = int(input('请输入抓取的结束页:'))
        if start_page <= 0:
            print('抓取的起始页从1开始。')
            return
        # List of worker threads
        t_list = []
        # Thread that produces the URL list
        t_url = threading.Thread(target=self.get_url_list,
                                 args=(start_page, end_page))
        t_list.append(t_url)
        # Threads that fetch and extract article data
        for i in range(9):
            t_content = threading.Thread(target=self.get_data)
            t_list.append(t_content)
        # Thread that saves the data
        t_save = threading.Thread(target=self.save_data)
        t_list.append(t_save)
        for t in t_list:
            # Daemon threads exit automatically once the main thread finishes
            t.daemon = True
            t.start()
        # Block until every queued URL and every extracted batch has been processed
        for q in [self.url_queue, self.content_queue]:
            q.join()


if __name__ == '__main__':
    heimatoutiao = HeiMaTouTiao()
    start_time = time.time()
    heimatoutiao.run()
    end_time = time.time()
    print(f'总用时:{end_time - start_time}秒')
This Python code defines a class named HeiMaTouTiao that crawls article information from a training site's API and saves it to a local file. It follows a producer-consumer pattern: one thread enqueues the article-list URLs, nine threads fetch each URL and extract the title and comment statistics, and one thread appends the results to toutiao.json.
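Because save_data opens toutiao.json in append mode and writes one JSON array per batch, the file ends up containing several concatenated JSON documents rather than a single array, so a plain json.load will fail. Below is a minimal sketch for reading such a file back, assuming it was produced by the script above; the helper name read_batches is illustrative, not part of the original code.
python
import json


def read_batches(path='toutiao.json'):
    """Yield each JSON array written by save_data, even though the file
    holds several concatenated arrays instead of one JSON document."""
    decoder = json.JSONDecoder()
    with open(path, encoding='utf-8') as f:
        text = f.read()
    pos = 0
    while pos < len(text):
        # Skip whitespace between consecutive arrays
        while pos < len(text) and text[pos].isspace():
            pos += 1
        if pos >= len(text):
            break
        batch, pos = decoder.raw_decode(text, pos)
        yield batch


for batch in read_batches():
    for article in batch:
        print(article['标题'], article['总评论数'])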
Code
python
import requests
import json
from pymongo import MongoClient


class LittleRabbit:
    def __init__(self):
        # URL of the listing API for the car-accessories category
        self.init_url = 'https://apipc-xiaotuxian-front.itheima.net/category/goods/temporary'
        # Request headers
        self.headers = {
            "Content-Type": "application/json;charset=utf-8",
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/90.0.4430.212 Safari/537.36'}
        # Client connected to the local MongoDB server
        self.client = MongoClient('127.0.0.1', 27017)

    def load_category_page(self, page):
        """
        Fetch one listing page of the car-accessories category.
        :param page: page number to fetch
        :return: all goods on that page
        """
        # Prepare the request payload
        request_payload = {"page": page, "pageSize": 20, "categoryId": "1005009"}
        # Convert the dictionary into a JSON string for the request body
        json_data = json.dumps(request_payload)
        response = requests.post(url=self.init_url, data=json_data,
                                 headers=self.headers)
        # Parse the JSON response and pull out the list of goods
        all_goods = json.loads(response.text)["result"]["items"]
        return all_goods

    def load_detail_page(self, all_goods):
        """
        Fetch the detail page of every product.
        :param all_goods: all goods in the car-accessories category
        :return: detail data of every product
        """
        # Base URL of the product-detail API
        base_url = 'https://apipc-xiaotuxian-front.itheima.net/goods?'
        # Collect the detail data of every product
        goods_detail_info = []
        for good_info in all_goods:
            # Use the product ID as the query parameter
            good_id = dict(id=good_info['id'])
            # Send a GET request for the product's detail page
            response = requests.get(url=base_url, params=good_id)
            # Convert the JSON response into a dictionary
            good_detail = json.loads(response.text)
            goods_detail_info.append(good_detail)
        return goods_detail_info

    def parse_page(self, detail_data):
        """
        Parse the detail data and extract the target fields.
        :param detail_data: detail data of every product
        :return: extracted information of every product
        """
        # Collect the extracted information of every product
        all_goods_info = []
        temp_url = 'http://erabbit.itheima.net/#/product/'
        for info in detail_data:
            dict_data = dict()
            dict_data['商品名称'] = info['result']['name']
            dict_data['商品描述'] = info['result']['desc']
            dict_data['商品链接'] = temp_url + info['result']['id']
            dict_data['商品价格'] = info['result']['price']
            # Take the first picture shown on the detail page
            dict_data['商品图片'] = info['result']['mainPictures'][0]
            good_detail = info['result']['details']['properties']
            dict_data['商品详情'] = ''.join(
                [':'.join(prop.values()) + '\n' for prop in good_detail])
            all_goods_info.append(dict_data)
        return all_goods_info

    def save_data(self, goods_info):
        """
        Store the extracted product information in MongoDB.
        :param goods_info: extracted information of every product
        """
        # Reuse the client connected to the local MongoDB
        client = self.client
        # Access (or create) the database rabbit
        db = client.rabbit
        try:
            for good in goods_info:
                # Insert each product as a document into the collection little_rabbit
                db.little_rabbit.insert_one(good)
                print('保存成功')
            # Read back the documents in the collection
            result = db.little_rabbit.find()
            for doc in result:
                print(doc)
        except Exception as error:
            print(error)

    def run(self):
        """
        Start the crawler and control its workflow.
        """
        begin_page = int(input('起始页码:'))
        end_page = int(input('结束页码:'))
        if begin_page <= 0:
            print('起始页码从1开始')
        else:
            for page in range(begin_page, end_page + 1):
                print(f'正在抓取第{page}页')
                all_goods = self.load_category_page(page)
                goods_detail = self.load_detail_page(all_goods)
                goods_info = self.parse_page(goods_detail)
                self.save_data(goods_info)


if __name__ == '__main__':
    lr = LittleRabbit()
    lr.run()
This Python code defines a class named LittleRabbit that crawls product information (name, description, link, price, main picture, and detail properties) from the specified site and stores it in a MongoDB database.
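Once the crawler has finished, the stored documents can be inspected from any pymongo client. A minimal sketch, assuming MongoDB is running locally on the default port and the data was saved by the script above into the rabbit database and little_rabbit collection:
python
from pymongo import MongoClient

client = MongoClient('127.0.0.1', 27017)
collection = client.rabbit.little_rabbit

# Count the stored products, then print each product's name and price
print('documents:', collection.count_documents({}))
for doc in collection.find({}, {'_id': 0, '商品名称': 1, '商品价格': 1}):
    print(doc)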