爬取动态网页(下)
文章目录
前言
上篇主要讲了如何去爬取数据,这篇来讲一下如何在获取的同时将数据整理保存到excel文档中。
上一篇《Python 爬虫之简单的爬虫(三)》链接:https://blog.csdn.net/weixin_57061292/article/details/135073002
一、大致内容
以上一篇文章为基础。在原来的代码上进行增添和修改。
增添的内容是:Python 操作 Excel 文档所需的库及相关代码。
修改的内容是:对上一篇的《3.获取指定数据》进行修改,遍历获取的数据的同时把它们添加到新创建的excel文档里。
运行效果图:
二、基本思路
接着上一篇的基本思路继续写:
- 第五步:导入一下需要的新的软件库
- 第六步:主要是将上一篇《3.获取指定数据》里面的 print() 替换成将数据保存到文档中的操作。
- 第七步:删除文档中默认的Sheet工作表,并保存文档。
三、代码编写
1.引入库
代码如下:
python
# Imports carried over from the previous article (the lines below)
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
# 以下是新添加的
from openpyxl.styles import Font, Alignment, Border, Side
import openpyxl
import re
2.加载网页数据
代码如下:
python
# --- Carried over from the previous article ---
# Open the Douban 2022 annual film ranking page in a Firefox session.
annual_page_url = "https://movie.douban.com/annual/2022/?fullscreen=1&source=movie_navigation"
driver = webdriver.Firefox()
driver.get(annual_page_url)
# Pause five seconds before scrolling (presumably so the page can finish
# loading), then jump to the bottom of the document.
time.sleep(5)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')

# --- New in this article ---
# Workbook instance that the scraped data is written into.
wb = openpyxl.Workbook()
这里新创建了一个 Workbook 实例对象,用来生成 Excel 文档。
3.获取并保存
代码如下:
python
# Fetch the heading element of each film/TV category on the page.
comment_Titles = driver.find_elements(by=By.CSS_SELECTOR, value='.module-top10-grid-chart-title')

# Column captions written into row 1 of every worksheet.
header_labels = ['片名', '演员', '评分', '产地']

# Create one worksheet per category heading, named after the heading text
# and inserted at the heading's position in the list.
for sheet_position, comment in enumerate(comment_Titles):
    ws = wb.create_sheet(title=comment.text, index=sheet_position)
    # Freeze the first row so the captions stay visible while scrolling.
    ws.freeze_panes = 'A2'
    # Header row: bold, thin border on all four sides, centred both ways.
    for column_number, label in enumerate(header_labels, start=1):
        wc = ws.cell(row=1, column=column_number, value=label)
        wc.font = Font(bold=True)
        header_edge = Side(border_style='thin')
        wc.border = Border(left=header_edge, right=header_edge,
                           top=header_edge, bottom=header_edge)
        wc.alignment = Alignment(horizontal='center', vertical='center')
# Fetch the title element of the first-place entry in each category.
which_mo_list = driver.find_elements(by=By.CSS_SELECTOR, value='.subject-top-title')

# Worksheet that receives the n-th first-place title, in list order.
top_title_sheets = ('评分最高华语电影', '评分最高外语电影', '年度冷门佳片', '华语剧集')
top_title_edge = Side(border_style='thin')
top_title_border = Border(left=top_title_edge, right=top_title_edge,
                          top=top_title_edge, bottom=top_title_edge)

# Write each first-place title, wrapped in 《》, into cell A2 of its sheet.
for position, each_mo in enumerate(which_mo_list):
    movie_title = each_mo.get_attribute('title')
    if position >= len(top_title_sheets):
        # Only the first four elements have a target worksheet.
        continue
    ws = wb[top_title_sheets[position]]
    wc = ws.cell(column=1, row=2, value=f'《{movie_title}》')
    wc.border = top_title_border
# Fetch the rating element of the first-place entry in each category.
movies_top_scores_list = driver.find_elements(by=By.CSS_SELECTOR, value='.rating-card-value')

# Position in the list -> worksheet that receives that rating.
score_sheet_by_position = {
    0: '评分最高华语电影',
    1: '评分最高外语电影',
    2: '年度冷门佳片',
    3: '华语剧集',
}
top_score_edge = Side(border_style='thin')
top_score_border = Border(left=top_score_edge, right=top_score_edge,
                          top=top_score_edge, bottom=top_score_edge)

# Write each first-place rating into cell C2 of its sheet.
for position, movie_top_score in enumerate(movies_top_scores_list):
    score = movie_top_score.text
    target_sheet = score_sheet_by_position.get(position)
    if target_sheet is None:
        # Elements past the fourth have no target worksheet.
        continue
    ws = wb[target_sheet]
    wc = ws.cell(column=3, row=2, value=score)
    wc.border = top_score_border
# Fetch the credit element (people information) of every entry on the page.
persons_list = driver.find_elements(by=By.CSS_SELECTOR, value='.subject-credit')
# Write the credit text into column 2 of the matching worksheet.
# NOTE(review): this listing has lost its indentation, so the loop nesting is
# not visible here. The branches read `actor`, so they presumably sit inside
# the inner loop, with `b += 1` following them at the same level - confirm
# against the original script before reformatting.
# `b` is a running counter that selects the worksheet and row:
#   b in 1..10  -> rows 2..11 of the first sheet   (row = b + 1)
#   b in 12..21 -> rows 2..11 of the second sheet  (row = b - 10)
#   b in 23..32 -> rows 2..11 of the third sheet   (row = b - 21)
#   b in 34..43 -> rows 2..11 of the fourth sheet  (row = b - 32)
# b == 0, 11, 22, 33 (and anything above 43) match no branch and are skipped;
# presumably those positions hold a line that is not wanted - TODO confirm.
b = 0
for person in persons_list:
person_title = person.find_elements(by=By.TAG_NAME, value='p')
for title in person_title:
# Text of this <p> element: the credit/actor line to store.
actor = title.text
if 0 < b <= 10:
ws = wb['评分最高华语电影']
wc = ws.cell(column=2, row=b+1, value=actor)
# Thin border on all four sides of the cell.
wc.border = Border(left=Side(border_style='thin'), right=Side(border_style='thin'),
top=Side(border_style='thin'), bottom=Side(border_style='thin'))
elif 11 < b <= 21:
ws = wb['评分最高外语电影']
wc = ws.cell(column=2, row=b-10, value=actor)
# Thin border on all four sides of the cell.
wc.border = Border(left=Side(border_style='thin'), right=Side(border_style='thin'),
top=Side(border_style='thin'), bottom=Side(border_style='thin'))
elif 22 < b <= 32:
ws = wb['年度冷门佳片']
wc = ws.cell(column=2, row=b-21, value=actor)
# Thin border on all four sides of the cell.
wc.border = Border(left=Side(border_style='thin'), right=Side(border_style='thin'),
top=Side(border_style='thin'), bottom=Side(border_style='thin'))
elif 33 < b <= 43:
ws = wb['华语剧集']
wc = ws.cell(column=2, row=b-32, value=actor)
# Thin border on all four sides of the cell.
wc.border = Border(left=Side(border_style='thin'), right=Side(border_style='thin'),
top=Side(border_style='thin'), bottom=Side(border_style='thin'))
b += 1
# Fetch the title elements of every ranked entry except each category's
# first place.
movies_title_list = driver.find_elements(by=By.CSS_SELECTOR, value='.subjects-rank-title')

# Worksheets in the order their entries appear in the list. Each sheet takes
# nine consecutive titles, written to column 1, rows 3-11 (row 2 holds the
# first-place entry).
rank_title_sheets = ('评分最高华语电影', '评分最高外语电影', '年度冷门佳片', '华语剧集')
titles_per_sheet = 9

# First run of consecutive CJK ideographs in the element text. Compiled once
# instead of being re-specified on every iteration.
chinese_run = re.compile(r'([\u4e00-\u9fff]+)')

rank_title_edge = Side(border_style='thin')
rank_title_border = Border(left=rank_title_edge, right=rank_title_edge,
                           top=rank_title_edge, bottom=rank_title_edge)

for d, movie_title in enumerate(movies_title_list):
    raw_title = movie_title.text
    found = chinese_run.search(raw_title)
    # Fix: the original called .group(1) on the search result unconditionally,
    # which raises AttributeError when the text contains no Chinese character
    # (re.search returns None). Fall back to the stripped raw text instead.
    chinese_text = found.group(1) if found else raw_title.strip()

    # d = 0..8 -> sheet 0, 9..17 -> sheet 1, 18..26 -> sheet 2, 27..35 -> sheet 3.
    sheet_number, offset = divmod(d, titles_per_sheet)
    if sheet_number >= len(rank_title_sheets):
        # Entries past the 36th have no target worksheet.
        continue

    ws = wb[rank_title_sheets[sheet_number]]
    wc = ws.cell(column=1, row=offset + 3, value=f'《{chinese_text}》')
    # Thin border on all four sides of the cell.
    wc.border = rank_title_border
# Fetch the place-of-origin element of every ranked entry except each
# category's first place.
addresses_list = driver.find_elements(by=By.CSS_SELECTOR, value='.subjects-rank-credits > div:nth-child(2)')

# Worksheets in list order; each one takes nine consecutive entries,
# written to column 4, rows 3-11.
origin_sheets = ('评分最高华语电影', '评分最高外语电影', '年度冷门佳片', '华语剧集')
origin_edge = Side(border_style='thin')
origin_border = Border(left=origin_edge, right=origin_edge,
                       top=origin_edge, bottom=origin_edge)

for position, addresses in enumerate(addresses_list):
    address_text = addresses.text
    # position 0..8 -> sheet 0, 9..17 -> sheet 1, 18..26 -> sheet 2, 27..35 -> sheet 3.
    sheet_number, offset = divmod(position, 9)
    if sheet_number >= len(origin_sheets):
        # Entries past the 36th are not written anywhere.
        continue
    ws = wb[origin_sheets[sheet_number]]
    wc = ws.cell(column=4, row=offset + 3, value=address_text)
    wc.border = origin_border
# Fetch the rating element of every ranked entry except each category's
# first place.
movies_scores_list = driver.find_elements(by=By.CSS_SELECTOR, value='.subjects-rank-rating')

# Worksheets in list order; every block of nine ratings goes to the next
# sheet, in column 3, rows 3-11.
rating_sheets = ['评分最高华语电影', '评分最高外语电影', '年度冷门佳片', '华语剧集']
ratings_per_sheet = 9
rating_edge = Side(border_style='thin')
rating_border = Border(left=rating_edge, right=rating_edge,
                       top=rating_edge, bottom=rating_edge)

for position, movie_score in enumerate(movies_scores_list):
    score = movie_score.text
    sheet_number = position // ratings_per_sheet
    if sheet_number < len(rating_sheets):
        # Rank within the block (0-8) shifted down past the header row and
        # the first-place row.
        target_row = position % ratings_per_sheet + 3
        ws = wb[rating_sheets[sheet_number]]
        wc = ws.cell(column=3, row=target_row, value=score)
        wc.border = rating_border
代码很多哈,但都是有规律的。上一篇是把获取到的数据变成一个列表,然后遍历并打印出来。
这里变了。不是遍历打印了,改成遍历保存了。因为上面获取的每个列表里面的元素顺序是有规律的(需要大家自己动手去体会啦),结合一定的逻辑判断,分别把它们填写到四个类型的工作表中去(再添加一些对表格美化的操作的代码)。
4.保存文档
代码如下:
python
# Drop the default worksheet that openpyxl creates with a new workbook;
# only the sheets added by this script are kept.
default_sheet = wb['Sheet']
wb.remove(default_sheet)
# Save using a relative file name that embeds the current Unix timestamp in
# whole seconds.
wb.save(f'example{int(time.time())}.xlsx')
删除文档中默认的 Sheet 工作表(用不到),然后保存文档(默认保存到当前文件夹下)。
总结
其它的还好,主要是数据的遍历保存的逻辑判断部分的代码,这个需要大家手动去搞一遍才能明白。这篇用的是Python 3.11.6 版本的环境,基本环境因素要注意哦,要不然就算一样的代码运行起来也可能会有问题。