0 Basic workflow
python
1 Create the project: scrapy startproject project_name
2 cd into the spiders folder
3 Create the spider file:
scrapy genspider -t crawl spider_name domain_to_crawl
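For example, using the project name read_book_101 and the spider name read_book that appear later in this post (the exact paths are an assumption about the default project layout), the commands might look like this:
python
scrapy startproject read_book_101
cd read_book_101/read_book_101/spiders
scrapy genspider -t crawl read_book www.dushu.com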
1 Configure a log file in settings.py
python
# Usually not the preferred approach
# LOG_LEVEL = 'WARNING'
# Writing the log to a file is recommended instead
LOG_FILE = 'log.log'
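If you want the log file to stay small, the two settings can also be combined so only warnings and errors are written to it (by default Scrapy logs at DEBUG level); a minimal sketch:
python
# Write the log to a file, but only record WARNING and above
LOG_FILE = 'log.log'
LOG_LEVEL = 'WARNING'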
2 Use Scrapy to crawl book names and image URLs from dushu.com
2.1 Create the project
python
scrapy startproject project_name
2.2 Create the spider file
python
scrapy genspider -t crawl spider_name domain  # domain, e.g. www.dushu.com for this project
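The -t crawl template generates a CrawlSpider skeleton roughly like the following (exact contents vary with the Scrapy version); section 2.3 replaces the placeholder rule and parse_item body with the real logic:
python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ReadbookSpider(CrawlSpider):
    name = "read_book"
    allowed_domains = ["www.dushu.com"]
    start_urls = ["https://www.dushu.com/"]

    # Placeholder rule generated by the template
    rules = (Rule(LinkExtractor(allow=r"Items/"), callback="parse_item", follow=True),)

    def parse_item(self, response):
        item = {}
        return item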
2.3 Write the crawling logic in the spider file
python
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from read_book_101.items import ReadBookItem


class ReadbookSpider(CrawlSpider):
    name = "read_book"
    allowed_domains = ["www.dushu.com"]
    start_urls = ["https://www.dushu.com/book/1188_1.html"]

    # Follow every pagination link of the form /book/1188_<n>.html
    rules = (
        Rule(LinkExtractor(allow=r"/book/1188_\d+\.html"),
             callback="parse_item",
             follow=True),
    )

    def parse_item(self, response):
        img_list = response.xpath('//div[@class="bookslist"]//img')
        for img in img_list:
            # The book title is in the alt attribute; the lazy-loaded
            # image URL is in data-original
            name = img.xpath('./@alt').extract_first()
            src = img.xpath('./@data-original').extract_first()
            book = ReadBookItem(name=name, src=src)
            yield book
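Before running the full crawl, the XPath expressions can be sanity-checked in scrapy shell (the URL and selectors below mirror the spider above; the actual output depends on the live page):
python
scrapy shell https://www.dushu.com/book/1188_1.html
>>> imgs = response.xpath('//div[@class="bookslist"]//img')
>>> imgs[0].xpath('./@alt').extract_first()            # book title
>>> imgs[0].xpath('./@data-original').extract_first()  # lazy-loaded image URL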
2.4 In items.py
python
import scrapy


class ReadBookItem(scrapy.Item):
    name = scrapy.Field()
    src = scrapy.Field()
2.5 In pipelines.py
python
from itemadapter import ItemAdapter
from scrapy.utils.project import get_project_settings  # load the project's settings
import pymysql


class ReadBook101Pipeline:
    # Write every item to a local text file
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        self.fp.close()


class MysqlPipeline:
    def open_spider(self, spider):
        # Read the database configuration from settings.py
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.port = settings['DB_PORT']
        self.charset = settings['DB_CHARSET']
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            user=self.user,
            password=self.password,
            host=self.host,
            database=self.name,
            port=self.port,
            charset=self.charset,
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Use a parameterized query so quotes in book titles don't break the SQL
        sql = 'insert into book(name, src) values(%s, %s)'
        self.cursor.execute(sql, (item['name'], item['src']))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
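The MysqlPipeline assumes that the spider01 database and a book table with name and src columns already exist. A one-off setup script along these lines can create them (the id column and the VARCHAR lengths are assumptions, not part of the original post):
python
import pymysql

# One-off setup script; credentials mirror the DB_* values in settings.py
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root', charset='utf8')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS spider01 DEFAULT CHARACTER SET utf8')
cursor.execute('USE spider01')
cursor.execute(
    'CREATE TABLE IF NOT EXISTS book ('
    '  id INT PRIMARY KEY AUTO_INCREMENT,'
    '  name VARCHAR(255),'   # book title  (length is an assumption)
    '  src VARCHAR(255)'     # image URL   (length is an assumption)
    ')'
)
conn.commit()
cursor.close()
conn.close()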
2.6 Enable the pipelines and configure the database in settings.py
python
DB_HOST = '127.0.0.1'
DB_PORT = 3306
DB_USER = 'root'
DB_PASSWORD = 'root'
DB_NAME = 'spider01'
# Use 'utf8' here, not 'utf-8'; the hyphenated form makes pymysql fail with a NoneType error
DB_CHARSET = 'utf8'

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    "read_book_101.pipelines.ReadBook101Pipeline": 300,
    "read_book_101.pipelines.MysqlPipeline": 301,
}