爬取页面数据的详细信息
分析如下:
1.图片是个img标签拿到url
2.类别是两个button里面的span标签
3.上映时间是一个div里的span标签
4.评分是p标签(标签属性中含 score)
5.剧情简介也是一个p标签,外面有个div drama
注释:re.S 的作用是让 "." 匹配任意字符:默认情况下 "." 匹配除换行符之外的所有字符,使用 re.S 之后 "." 也可以匹配换行符。
python
#! /usr/bin/env python3
import logging
import requests
import re
from urllib.parse import urljoin
import pymongo
# Configure logging: `level` sets the minimum severity that gets recorded.
logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s')
# Root URL of the site being scraped; index URLs are built from it and
# relative detail links are resolved against it.
Basic_url = 'https://ssr1.scrape.center'
# NOTE(review): TOTAL_PAGE is not referenced by any code visible in this file;
# presumably the number of index pages to crawl -- confirm before relying on it.
TOTAL_PAGE = 10
def scrape_index(page):
    """Fetch one index (listing) page of the site.

    Args:
        page: Page number interpolated into ``{Basic_url}/page/{page}``.

    Returns:
        Whatever ``scrape_page`` returns for that URL.
    """
    return scrape_page(f'{Basic_url}/page/{page}')
def scrape_page(url, timeout=10):
    """Download ``url`` with HTTP GET and return the response body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The response text when the status code is 200; ``None`` when the
        status code is anything else or a ``requests.RequestException`` was
        raised. Both failure cases are logged.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Network-level failure: log it with the traceback and report "no page".
        logging.error("抓取 %s 时发生异常", url, exc_info=True)
        return None
    if response.status_code == 200:
        return response.text
    logging.info("抓取 %s 时返回无效的状态码 %s ", url, response.status_code)
    return None
def parse_index(html, base_url=None):
    """Yield the absolute URL of every detail page linked from an index page.

    A detail link is an ``<a>`` tag that carries an ``href`` attribute
    followed by ``class="name"``.

    Args:
        html: Index page markup. ``None`` or an empty string (for example
            after a failed download) yields nothing instead of raising.
        base_url: URL that relative links are resolved against. Defaults to
            the module-level ``Basic_url``.

    Yields:
        Absolute detail-page URLs, in document order.
    """
    if not html:
        return
    if base_url is None:
        base_url = Basic_url
    # Every part of the pattern is confined to a single tag via [^>] / [^"],
    # so several anchors on one line (e.g. minified HTML) are each matched.
    # A greedy ".*" before href would keep only the last link of such a line.
    pattern = re.compile(r'<a\s[^>]*?href="([^"]*)"[^>]*?class="name"[^>]*>')
    for href in pattern.findall(html):
        # Convert the relative link into an absolute one.
        yield urljoin(base_url, href)
def scrape_details(url):
    """Fetch a detail page; delegates directly to ``scrape_page``.

    Args:
        url: URL of the detail page.

    Returns:
        Whatever ``scrape_page`` returns for ``url``.
    """
    detail_html = scrape_page(url)
    return detail_html
def parse_detail(html):
    """Extract the movie fields from a detail page using regular expressions.

    Args:
        html: Detail page markup. ``None`` or an empty string is treated as a
            page on which nothing matches.

    Returns:
        A dict with these keys; every value is ``None`` when its pattern does
        not match:

        * ``image``: cover ``src`` value up to (not including) the first
          ``@``, as a string without the surrounding quotes.
        * ``name``: text of the first ``<h2>``, split on whitespace (list).
        * ``category``: list of the ``<span>`` texts inside category buttons.
        * ``published``: list of ``YYYY-MM-DD`` dates followed by "上映".
        * ``score``: text of the score ``<p>``, split on whitespace (list).
        * ``drama``: text of the first ``<p>`` after the drama ``<div>``,
          split on whitespace (list).
    """
    # A failed download hands us None; searching an empty string simply
    # produces "no match" for every field.
    if not html:
        html = ''

    # re.S lets "." also match newlines, so a pattern can span several lines.
    # The quotes around src sit outside the capture group so the captured
    # URL does not start with a stray '"'.
    image_pattern = re.compile(r'class="el-col.*?<img.*?src="(.*?)".*?class="cover">', re.S)
    name_pattern = re.compile(r'<h2.*?>(.*?)</h2>')
    categories_pattern = re.compile(r'<button.*?category.*?<span>(.*?)</span>.*?</button>', re.S)
    published_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})\s上映', re.S)
    score_pattern = re.compile(r'<p.*?score.*?>(.*?)</p>', re.S)
    drama_pattern = re.compile(r'<div.*?drama.*?>.*?<p.*?>(.*?)</p>', re.S)

    def first_group(pattern):
        # Search once and return group 1, or None when there is no match.
        match = pattern.search(html)
        return match.group(1) if match else None

    def tokens(text):
        # NOTE(review): split() returns a list of whitespace-separated tokens;
        # strip() may have been intended. The list shape is kept so existing
        # consumers of the result are not broken -- confirm with the author.
        return text.split() if text is not None else None

    image_src = first_group(image_pattern)

    return {
        # Drop the "@<size>" suffix; guard against a missing image instead of
        # indexing into None.
        'image': image_src.split('@')[0] if image_src is not None else None,
        'name': tokens(first_group(name_pattern)),
        'category': categories_pattern.findall(html) or None,
        'published': published_pattern.findall(html) or None,
        'score': tokens(first_group(score_pattern)),
        'drama': tokens(first_group(drama_pattern)),
    }
def main():
    """Scrape index page 1, then fetch, parse and log each linked detail page.

    A page that could not be downloaded (``scrape_index`` / ``scrape_details``
    returned nothing) is skipped rather than handed to the parsers, which
    expect a string. ``scrape_page`` has already logged the failure.
    """
    index_html = scrape_index(1)
    if not index_html:
        return
    for details_url in parse_index(index_html):
        details_html = scrape_details(details_url)
        if not details_html:
            # Skip this movie and keep going with the remaining links.
            continue
        data = parse_detail(details_html)
        logging.info("抓取数据 %s ", data)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
注释:重点在于能够正确编写正则表达式来匹配所需的内容