Preface
Sometimes we need to collect a large amount of data and want the program to run efficiently, and sometimes we simply don't want to hand-write request code. In both situations I recommend Scrapy. If you have learned Django before, you will pick it up even faster, because the two frameworks share a similar architectural design.
I. What is Scrapy?
Scrapy is an open-source web crawling framework written in Python. It is designed for crawling websites and extracting structured data. (For a more detailed theoretical introduction, see other articles; this post focuses on hands-on practice.)
II. Usage Steps
1. Installation and project creation
1: Install the environment

```bash
pip install scrapy
```

2: Create a crawler project

```bash
scrapy startproject mySpider
```

3: Generate a spider script

```bash
scrapy genspider baidu https://www.baidu.com/  # scrapy genspider <file name> <url>
```
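For reference, `genspider` creates a spider skeleton under `mySpider/spiders/` roughly like the one below (a rough sketch; the exact template varies slightly between Scrapy versions):

```python
import scrapy


class BaiduSpider(scrapy.Spider):
    name = "baidu"
    allowed_domains = ["www.baidu.com"]
    start_urls = ["https://www.baidu.com/"]

    def parse(self, response):
        # Parse the response here and yield items or follow-up requests.
        pass
```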
2. Requests and parameters
- GET request

```python
# Inside a spider method, e.g. start_requests():
start_url = 'https://www.baidu.com/'
info = {'arg': 'arg'}
yield scrapy.Request(
    url=start_url,
    meta=info,                   # pass data along to the callback
    callback=self.parse_first,   # callback method
    dont_filter=False            # Scrapy filters out duplicate requests by default
)

def parse_first(self, response):
    arg = response.meta['arg']
```
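As an aside, newer Scrapy versions (1.7+) also support `cb_kwargs` for handing values to a callback as keyword arguments, which some people find cleaner than `meta`. A minimal sketch under that assumption; the spider name is a placeholder:

```python
import scrapy


class CbKwargsSpider(scrapy.Spider):
    # Sketch only: cb_kwargs (Scrapy >= 1.7) as an alternative to meta
    # for passing values to a callback.
    name = 'cb_kwargs_demo'

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.baidu.com/',
            cb_kwargs={'arg': 'arg'},    # delivered to the callback as keyword arguments
            callback=self.parse_first,
        )

    def parse_first(self, response, arg):
        self.logger.info('arg: %s', arg)
```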
- POST request (there are several ways; this is the one I use most often)

```python
import json
from scrapy.http.request.json_request import JsonRequest

# It is best to use single quotes in form_data here
form_data = {'arg': 'arg'}
yield JsonRequest(
    url=url,
    body=json.dumps(form_data),  # equivalent to the json=data argument of requests.post in the requests library
    method='POST',
    meta=info,
    callback=self.get_pro_price,
    dont_filter=True,
)
```
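The snippet above sends a JSON body. If the target endpoint expects form-encoded data instead, `scrapy.FormRequest` is a common alternative; below is a minimal sketch in which the spider name, URL, and callback are placeholders of mine, not part of the original example:

```python
import scrapy


class FormPostSpider(scrapy.Spider):
    # Minimal sketch: form-encoded POST with scrapy.FormRequest.
    # The spider name, URL, and callback below are placeholders.
    name = 'form_post_demo'

    def start_requests(self):
        form_data = {'arg': 'arg'}
        yield scrapy.FormRequest(
            url='https://example.com/api',  # hypothetical endpoint
            formdata=form_data,             # sent as application/x-www-form-urlencoded
            meta={'arg': 'arg'},
            callback=self.parse_result,
            dont_filter=True,
        )

    def parse_result(self, response):
        self.logger.info('status: %s', response.status)
```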
3. Proxy pools
- User-Agent pool: add a class to middlewares.py

```python
import logging
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
    "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
    "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    # ... more User-Agent strings ...
]


class RotateUserAgentMiddleware(UserAgentMiddleware):
    # Pick a random User-Agent for each request
    def process_request(self, request, spider):
        user_agent = random.choice(user_agent_list)
        if user_agent:
            request.headers.setdefault('User-Agent', user_agent)
            # print(f"User-Agent:{user_agent} is using.")
        return None

    def process_exception(self, request, exception, spider):
        error_info = f"spider:{spider.name} RotateUserAgentMiddleware has error with {exception}"
        # print(error_info)
        logging.error(error_info)
```
- IP proxy pool: add a class to middlewares.py (the proxy address below is only a placeholder; plug in your own proxy source)

```python
class ProxyDownloaderMiddleware:
    def process_request(self, request, spider):
        """
        Attach a proxy to the outgoing request.
        """
        # Placeholder address: replace with a proxy taken from your own pool or provider.
        request.meta['proxy'] = 'http://<proxy-host>:<port>'
        return None
```
- Configuration in settings.py:

```python
DOWNLOADER_MIDDLEWARES = {
    "mySpider.middlewares.ProxyDownloaderMiddleware": 543,
    "mySpider.middlewares.RotateUserAgentMiddleware": 544,
}  # lower numbers mean higher priority
```
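One quick way to check that both middlewares are actually taking effect (my own suggestion, not from the original post) is to crawl a header-echo service such as https://httpbin.org/headers and inspect what the server saw:

```python
import scrapy


class HeaderCheckSpider(scrapy.Spider):
    # Sketch for verifying the UA/proxy middlewares: httpbin echoes back
    # the request headers it received. The spider name is a placeholder.
    name = 'header_check'
    start_urls = ['https://httpbin.org/headers']

    def parse(self, response):
        self.logger.info(response.text)
```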
4. Handling request errors
- settings.py configuration

```python
DUPEFILTER_DEBUG = True
# RETRY_HTTP_CODES = [429, 403, 504, 522, 502, 400]  # status codes that trigger a retry
# RETRY_TIMES = 5
# Retry failed requests up to 50 times
RETRY_ENABLED = True
RETRY_TIMES = 50
DOWNLOAD_TIMEOUT = 15  # maximum download timeout of 15 seconds
```
- To customize the error-handling logic, add a class to middlewares.py

```python
import logging

from scrapy.downloadermiddlewares.retry import RetryMiddleware

logger = logging.getLogger(__name__)


class CustomRetryMiddleware(RetryMiddleware):
    def process_response(self, request, response, spider):
        if response.status != 200:
            # Log requests whose status code is not 200
            logger.error(request.url)
            logger.error(response.status)
            logger.error(request.meta)
        return response

    def process_exception(self, request, exception, spider):
        retries = request.meta.get('retry_times', 0)
        max_retry_times = request.meta.get('max_retry_times', self.max_retry_times)
        if retries >= max_retry_times:
            # Max retries reached: log the request that ultimately failed
            logger.error("giving up on %s after %s retries: %s", request.url, retries, exception)
        return super().process_exception(request, exception, spider)
```

Configuration in settings.py:

```python
DOWNLOADER_MIDDLEWARES = {
    "mySpider.middlewares.CustomRetryMiddleware": 544,
}
```
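If the intent is for CustomRetryMiddleware to fully replace Scrapy's built-in RetryMiddleware rather than run alongside it, the built-in entry can be disabled in the same dict by mapping it to None (standard Scrapy behavior; whether you want this depends on your setup):

```python
DOWNLOADER_MIDDLEWARES = {
    "mySpider.middlewares.CustomRetryMiddleware": 544,
    # Setting a built-in middleware to None disables it.
    "scrapy.downloadermiddlewares.retry.RetryMiddleware": None,
}
```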
5. Writing scraped data to a database
- Define the item

```python
import scrapy


class SanItem(scrapy.Item):
    col1 = scrapy.Field()
    col2 = scrapy.Field()
    col3 = scrapy.Field()
```
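For context, a spider callback would populate and yield this item roughly like below (a sketch; the spider name, URL, and selector expressions are placeholders, not from the original post):

```python
import scrapy

from mySpider.items import SanItem


class SanSpider(scrapy.Spider):
    # Sketch only: how a callback might fill and yield SanItem.
    name = 'san_demo'
    start_urls = ['https://example.com/']

    def parse(self, response):
        item = SanItem()
        item['col1'] = response.css('title::text').get()  # placeholder selector
        item['col2'] = response.url
        item['col3'] = response.status
        yield item
```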
- Writing data to MySQL

```python
import pymysql


class MySQLSanPipeline(object):
    def __init__(self):
        self.connection = pymysql.connect(host='localhost',
                                          user='root',
                                          password='1234',
                                          database='test',
                                          charset='utf8mb4',
                                          cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.connection.cursor()
        self.data = []

    def open_spider(self, spider):
        pass

    def process_item(self, item, spider):
        self.data.append(
            (item['col1'], item['col2'], item['col3'])
        )
        if len(self.data) >= 100:  # write to the database in batches of 100 rows
            self._write_to_db()
        return item

    def _write_to_db(self):
        # executemany (instead of execute) inserts multiple rows in one call
        sql = """
            insert into sangon(col1, col2, col3)
            values (%s, %s, %s)
        """
        self.cursor.executemany(sql, self.data)
        self.connection.commit()  # flush the buffered rows to the database
        self.data.clear()         # clear data after each batch to avoid inserting duplicates

    def close_spider(self, spider):
        if len(self.data) > 0:  # flush any leftover rows that never reached the 100-row threshold
            self._write_to_db()
        self.connection.close()


# ================= Remember to configure this in settings.py =================
ITEM_PIPELINES = {
    "mySpider.pipelines.MySQLSanPipeline": 300
}
```
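Hard-coding the connection parameters works, but they can also be read from settings.py via from_crawler, the same pattern the MongoDB pipeline below uses. A sketch of how __init__ could be rewritten, assuming MYSQL_* setting names of my own choosing:

```python
import pymysql


class MySQLSanPipeline(object):
    # Sketch: read connection parameters from settings.py instead of hard-coding them.
    # The MYSQL_* setting names are my own convention, not Scrapy built-ins.
    def __init__(self, host, user, password, database):
        self.connection = pymysql.connect(host=host,
                                          user=user,
                                          password=password,
                                          database=database,
                                          charset='utf8mb4',
                                          cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.connection.cursor()
        self.data = []

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST', 'localhost'),
            user=crawler.settings.get('MYSQL_USER', 'root'),
            password=crawler.settings.get('MYSQL_PASSWORD', ''),
            database=crawler.settings.get('MYSQL_DB', 'test'),
        )
```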
- Updating data in MongoDB

```python
from pymongo import MongoClient


class MongoPipeline(object):
    def __init__(self, host, port, user, pwd, db, table):
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.db = db
        self.table = table

    @classmethod
    def from_crawler(cls, crawler):
        HOST = crawler.settings.get('HOST')
        PORT = crawler.settings.get('PORT')
        USER = crawler.settings.get('USER')
        PWD = crawler.settings.get('PWD')
        DB = crawler.settings.get('DB')
        TABLE = crawler.settings.get('TABLE')
        return cls(HOST, PORT, USER, PWD, DB, TABLE)

    def open_spider(self, spider):
        self.conn = MongoClient("mongodb://{}:{}@{}:{}/{}".format(self.user, self.pwd, self.host, self.port, self.db))
        self.db_conn = self.conn[self.db]
        self.set_conn = self.db_conn[self.table]

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        self.set_conn.update_one(
            {'_id': item['_id']},  # query: match the document with this _id
            {'$set': {
                'col1': item['col1'],
            }}  # update operation
        )
        # Updating a field inside a list, e.g. a document shaped like
        # {'_id': _id, 'list': [{'code': '1', 'size': 's1'}]}
        self.set_conn.update_one(
            {'_id': item['_id'], 'list.code': '1'},  # query: match the document and the list element
            {'$set': {'list.$.size': 's2'}}          # the positional $ updates the matched list element
        )
        return item


# ================= Remember to configure this in settings.py =================
ITEM_PIPELINES = {
    "mySpider.pipelines.MongoPipeline": 300
}
```
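For the from_crawler lookups above to work, the corresponding keys have to exist in settings.py. The names below match the code; the values are obviously placeholders:

```python
# settings.py (values are placeholders)
HOST = 'localhost'
PORT = 27017
USER = 'mongo_user'
PWD = 'mongo_password'
DB = 'test'
TABLE = 'sangon'
```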
6. Logging and other configuration
- Logging configuration

```python
import datetime
import logging

import scrapy


class MySpider(scrapy.Spider):
    custom_settings = {
        'LOG_FILE': 'E:/Lu_zong_data/logs/log_{}_{}_{}_{}.txt'.format(
            datetime.datetime.today().year,
            datetime.datetime.today().month,
            datetime.datetime.today().day,
            datetime.datetime.today().hour),
    }
    # Write to the log file with e.g. logging.info('test')
```
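Two small optional tweaks on top of this (my additions, not from the original post): strftime builds the same timestamped filename in a single call, and the LOG_LEVEL setting controls how verbose the log file gets:

```python
import datetime

import scrapy


class MySpider(scrapy.Spider):
    # Same idea as above, just a more compact filename and an explicit log level.
    custom_settings = {
        'LOG_FILE': 'E:/Lu_zong_data/logs/log_{}.txt'.format(
            datetime.datetime.today().strftime('%Y_%m_%d_%H')),
        'LOG_LEVEL': 'INFO',  # only INFO and above goes into the file
    }
```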
- Running spiders from the terminal gets tedious, so you can set up a run.py

```python
from scrapy import cmdline

cmdline.execute('scrapy crawl baidu'.split())
```
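An alternative to cmdline.execute is CrawlerProcess, which runs the spider in-process with the project settings applied (both approaches are standard Scrapy; pick whichever you prefer):

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Run the 'baidu' spider in-process, with the project's settings.py applied.
process = CrawlerProcess(get_project_settings())
process.crawl('baidu')
process.start()  # blocks until the crawl finishes
```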
Summary
Scrapy really is a powerful crawler framework. I also recommend pairing it with the Crawlab crawler management platform; the overall experience and workflow become even better!