import scrapy
from bs4 import BeautifulSoup
from ..items import HaohaoItem
class HaotuijianSpider(scrapy.Spider):
    """Scrape selected category links from the hao123 recommendation page.

    The spider looks at the first ``<ul>`` inside the ``div.v2-nav`` element
    and yields one ``HaohaoItem`` for every link whose text is one of the
    whitelisted category titles.
    """

    name = 'haotuijian'
    allowed_domains = ['tuijian.hao123.com']
    start_urls = ['http://tuijian.hao123.com/']

    # Link texts that should be exported; every other entry is ignored.
    wanted_titles = frozenset({"娱乐", "体育", "财经", "科技", "历史"})

    def parse(self, response):
        """Yield a ``HaohaoItem`` for each whitelisted navigation link.

        Args:
            response: The downloaded page; only ``response.text`` is read.

        Yields:
            HaohaoItem: An item with ``title`` (link text, whitespace
            stripped) and ``href`` (the anchor's ``href`` attribute as
            written in the page).
        """
        soup = BeautifulSoup(response.text, 'html.parser')

        # The page layout may change; bail out cleanly instead of raising
        # AttributeError/IndexError when the navigation block is missing.
        nav_div = soup.find('div', class_='v2-nav')
        first_list = nav_div.find('ul') if nav_div is not None else None
        if first_list is None:
            self.logger.warning(
                "Navigation list (div.v2-nav > ul) not found on %s",
                response.url,
            )
            return

        for li in first_list.find_all('li'):
            a_tag = li.find('a')
            if a_tag is None:
                continue

            # Strip surrounding whitespace so padded link text still matches
            # the whitelist.
            title = a_tag.get_text(strip=True)
            if title not in self.wanted_titles:
                continue

            # An anchor without an href carries nothing worth saving.
            href = a_tag.get('href')
            if not href:
                continue

            # Create a HaohaoItem instance to carry the data to the pipelines.
            item = HaohaoItem()
            item['title'] = title
            item['href'] = href
            yield item
2. items.py 文件代码（Python）：
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class HaohaoItem(scrapy.Item):
    """Item holding one scraped link: its title and its href."""

    # Text of the link.
    title = scrapy.Field()
    # Target of the link (the anchor's href attribute).
    href = scrapy.Field()
3. pipelines.py 文件代码（Python，保存数据到 MongoDB、MySQL、Excel 中）：
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from pymongo import MongoClient
import openpyxl
import pymysql
#保存到mongodb中
class HaohaoPipeline:
    """Buffer scraped items and bulk-insert them into MongoDB.

    Items are collected in ``self.data`` and written with ``insert_many``
    whenever the buffer reaches ``BATCH_SIZE``; whatever is left over is
    written when the spider closes.
    """

    # Number of buffered documents that triggers a bulk insert.
    BATCH_SIZE = 100

    def __init__(self):
        """Connect to the local MongoDB server and start with an empty buffer."""
        self.client = MongoClient('mongodb://localhost:27017/')
        self.db = self.client['qiangzi']
        self.collection = self.db['hao123']
        self.data = []

    def close_spider(self, spider):
        """Flush the remaining buffered documents and close the client.

        The client is closed even when the final flush raises, so the
        connection is never leaked.
        """
        try:
            self._write_to_db()
        finally:
            self.client.close()

    def process_item(self, item, spider):
        """Buffer ``item`` and flush once the batch is full.

        Returns:
            The same ``item``, unchanged, so later pipelines can process it.
        """
        self.data.append({
            'title': item['title'],
            'href': item['href'],
        })
        # ">=" rather than "==": if an earlier flush failed, the buffer may
        # already be past the threshold and must still be retried.
        if len(self.data) >= self.BATCH_SIZE:
            self._write_to_db()
        return item

    def _write_to_db(self):
        """Insert all buffered documents and empty the buffer.

        Does nothing when the buffer is empty. The buffer is cleared only
        after a successful insert.
        """
        if not self.data:
            return
        self.collection.insert_many(self.data)
        self.data.clear()
#保存到mysql中
class MysqlPipeline:
    """Buffer scraped items and bulk-insert them into the MySQL ``haohao`` table.

    Rows are collected in ``self.data`` as ``(title, href)`` tuples and
    written with ``executemany`` whenever the buffer reaches ``BATCH_SIZE``;
    whatever is left over is written when the spider closes.
    """

    # Number of buffered rows that triggers a bulk insert.
    BATCH_SIZE = 100

    def __init__(self):
        """Open the MySQL connection and start with an empty buffer."""
        # NOTE(review): the database credentials are hard-coded here; they
        # should come from settings or the environment rather than source.
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='789456MLq',
            db='pachong',
            charset='utf8mb4'
        )
        self.cursor = self.conn.cursor()
        self.data = []

    def close_spider(self, spider):
        """Flush the remaining buffered rows and close the connection.

        The connection is closed even when the final flush raises.
        """
        try:
            self._writer_to_db()
        finally:
            self.conn.close()

    def process_item(self, item, spider):
        """Buffer ``item`` as a ``(title, href)`` row and flush a full batch.

        Returns:
            The same ``item``, unchanged, so later pipelines can process it.
        """
        self.data.append(
            (item['title'], item['href'])
        )
        # ">=" rather than "==": if an earlier flush failed, the buffer may
        # already be past the threshold and must still be retried.
        if len(self.data) >= self.BATCH_SIZE:
            self._writer_to_db()
        return item

    def _writer_to_db(self):
        """Insert all buffered rows in one transaction and empty the buffer.

        Does nothing when the buffer is empty. On failure the transaction is
        rolled back, the buffer is kept, and the exception is re-raised.
        """
        if not self.data:
            return
        try:
            self.cursor.executemany(
                'insert into haohao (title,href)'
                'values (%s,%s)',
                self.data
            )
            self.conn.commit()
        except Exception:
            # Undo the partial batch so a later commit cannot persist it.
            self.conn.rollback()
            raise
        self.data.clear()
#保存到excel中
class ExcelPipeline:
    """Write every scraped item as a row of an Excel worksheet.

    The workbook is built in memory and saved to ``haohao.xlsx`` when the
    spider closes.
    """

    def __init__(self):
        """Create the workbook, name its active sheet and add the header row."""
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet.title = 'haohao'
        sheet.append(('title', 'href'))
        self.wb = workbook
        self.ws = sheet

    def open_spider(self, spider):
        """Do nothing; all setup happens in ``__init__``."""

    def close_spider(self, spider):
        """Save the workbook to ``haohao.xlsx``."""
        self.wb.save('haohao.xlsx')

    def process_item(self, item, spider):
        """Append the item's title and href as one row and return the item."""
        row = (item['title'], item['href'])
        self.ws.append(row)
        return item