1: Add the MongoDB configuration to the settings.py file:
```python
# settings.py
# MongoDB configuration
MONGO_URI = 'mongodb://localhost:27017'  # MongoDB connection string
MONGO_DATABASE = 'yiche_cars'            # database name
MONGO_COLLECTION = 'car_info'            # collection name
```
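If your MongoDB instance requires authentication, the credentials and auth database can be embedded directly in the connection string. A minimal sketch, assuming a user defined in the `admin` database (the username, password, and `authSource` here are placeholders, not part of the original setup):

```python
# settings.py — hypothetical authenticated connection string
MONGO_URI = 'mongodb://user:password@localhost:27017/?authSource=admin'
```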
2: Create the MongoDB pipeline:
```python
# pipelines.py
import pymongo
from itemadapter import ItemAdapter
from scrapy.exceptions import DropItem


class MongoDBPipeline:
    def __init__(self, mongo_uri, mongo_db, collection_name=None):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.collection_name = collection_name  # optional: custom collection name
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'scrapy_db'),
            collection_name=crawler.settings.get('MONGO_COLLECTION')  # optional
        )

    def open_spider(self, spider):
        try:
            self.client = pymongo.MongoClient(self.mongo_uri, serverSelectionTimeoutMS=5000)
            self.db = self.client[self.mongo_db]
            # Ping the server to verify the connection actually works
            self.client.server_info()
            spider.logger.info("Successfully connected to MongoDB!")
        except pymongo.errors.ServerSelectionTimeoutError as err:
            spider.logger.error('Failed to connect to MongoDB: %s', err)
            raise  # abort the crawl; DropItem is only meaningful inside process_item

    def close_spider(self, spider):
        if self.client:
            self.client.close()

    def process_item(self, item, spider):
        # Prefer the configured collection name; fall back to the spider name
        collection_name = self.collection_name if self.collection_name else spider.name
        try:
            self.db[collection_name].insert_one(ItemAdapter(item).asdict())
            spider.logger.debug(f"Item written to MongoDB: {self.mongo_db}/{collection_name}")
        except pymongo.errors.PyMongoError as e:
            spider.logger.error("MongoDB write error: %s", e)
            raise DropItem("Failed to write to database")
        return item  # must return the item, otherwise later pipelines cannot process it
```
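If the spider may be re-run over the same pages, repeated `insert_one` calls will store duplicate documents. A common variation is to key writes on a unique field and upsert instead. A sketch of a drop-in replacement for `process_item` in the `MongoDBPipeline` above, assuming each item carries a unique identifier field (hypothetically named `car_id`):

```python
# pipelines.py — variant process_item for MongoDBPipeline: upsert instead of blind insert.
# Assumes items have a unique 'car_id' field; adjust the key to your item schema.
def process_item(self, item, spider):
    collection_name = self.collection_name if self.collection_name else spider.name
    data = ItemAdapter(item).asdict()
    try:
        # Update the existing document when 'car_id' matches, insert otherwise
        self.db[collection_name].update_one(
            {'car_id': data.get('car_id')},
            {'$set': data},
            upsert=True,
        )
    except pymongo.errors.PyMongoError as e:
        spider.logger.error("MongoDB upsert error: %s", e)
        raise DropItem("Failed to write to database")
    return item
```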
3: Enable the MongoDB pipeline in settings.py:
```python
# settings.py
ITEM_PIPELINES = {
    'spt_spider.pipelines.MongoDBPipeline': 300,  # must match the class name in pipelines.py
    # other pipelines...
}
```
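The integer value sets the execution order: Scrapy runs pipelines with lower values first, and values are conventionally kept in the 0-1000 range. For example, a hypothetical validation pipeline (not part of this project) could be placed ahead of the MongoDB write:

```python
# settings.py — ordering example; ValidationPipeline is hypothetical
ITEM_PIPELINES = {
    'spt_spider.pipelines.ValidationPipeline': 100,  # runs first (lower value)
    'spt_spider.pipelines.MongoDBPipeline': 300,     # runs after validation
}
```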
Run the spider:

```bash
scrapy crawl yiche
```
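After the crawl finishes, a few lines of pymongo can confirm the data actually landed. A quick check, assuming the default settings shown above:

```python
# verify.py — a quick sketch to inspect the stored items
import pymongo

client = pymongo.MongoClient('mongodb://localhost:27017')
collection = client['yiche_cars']['car_info']  # MONGO_DATABASE / MONGO_COLLECTION from settings.py
print(collection.count_documents({}))          # number of stored items
print(collection.find_one())                   # a sample document
```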