import os
import re

import requests

# Fetch the pearvideo category listing (categoryId=1, first batch of items).
res = requests.get('https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0')
# print(res.text)

# Extract every video's relative detail-page path with a regex.
video_list = re.findall('<a href="(.*?)" class="vervideo-lilink actplay">', res.text)

# Ensure the download directory exists before writing any files.
os.makedirs('./video', exist_ok=True)

for video in video_list:
    real_url = 'https://www.pearvideo.com/' + video
    video_id = video.split('_')[-1]
    # The videoStatus endpoint rejects requests without a Referer header;
    # the Referer must be the video's detail-page URL.
    # contId is the numeric video id.
    header = {
        'Referer': real_url
    }
    res = requests.get(
        'https://www.pearvideo.com/videoStatus.jsp?contId=%s&mrd=0.05520583472057039' % video_id,
        headers=header,
    )
    real_mp4_url = res.json()['videoInfo']['videos']['srcUrl']
    # srcUrl contains a fake timestamp path segment; replacing it with
    # 'cont-<id>' yields the actually playable mp4 address.
    mp4 = real_mp4_url.replace(real_mp4_url.split('/')[-1].split('-')[0], 'cont-%s' % video_id)
    print('能播放的视频地址:', mp4)
    # Download the video, streaming to disk in chunks rather than
    # byte-by-byte (iter_content() without chunk_size yields single bytes).
    res = requests.get(mp4)
    with open('./video/%s.mp4' % video_id, 'wb') as f:
        for chunk in res.iter_content(chunk_size=8192):
            f.write(chunk)
爬取新闻
示例站点:汽车之家(使用 bs4 解析库提取新闻列表)
bs4 解析库 pip3 install beautifulsoup4
lxml: pip3 install lxml
爬取所有数据:
python 代码:
import requests
from bs4 import BeautifulSoup

# Download the first page of the autohome news listing and dump the raw HTML.
list_page = 'https://www.autohome.com.cn/news/1/#liststart'
res = requests.get(list_page)
print(res.text)
取出文章详情:
python 代码:
import requests
from bs4 import BeautifulSoup

# Fetch the news list page and extract each article's title, summary,
# link and cover image.
res = requests.get('https://www.autohome.com.cn/news/1/#liststart')
print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')  # parse with the stdlib backend
# Every <ul class="article"> holds one column of article <li> entries.
ul_list = soup.find_all(name='ul', class_='article')
for ul in ul_list:
    # Each <li> under the <ul> is one article card.
    li_list = ul.find_all(name='li')
    for li in li_list:
        h3 = li.find(name='h3')  # the article title lives in an <h3>
        if not h3:
            # Ad/placeholder items carry no <h3>; skip them.
            continue
        title = h3.text
        p = li.find('p')         # first <p> holds the summary
        a = li.find(name='a')    # article link
        img_tag = li.find('img') # cover image
        # Guard against malformed entries: the original crashed with
        # AttributeError/TypeError when any of these tags was missing.
        if not (p and a and img_tag):
            continue
        content = p.text
        url = 'https:' + a.attrs['href']  # .attrs exposes the tag's attributes
        img = img_tag['src']  # attributes can also be indexed directly
        print('''
文章标题:%s
文章摘要:%s
文章url:%s
文章图片:%s
''' % (title, content, url, img))
bs4介绍和遍历文档树
bs4 的概念:用于解析 xml/html 格式字符串的解析库
不但可以解析(爬虫),还可以修改文档内容
python 代码:
from bs4 import BeautifulSoup

# Sample document used to demonstrate bs4 document-tree traversal.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" id='id_xx' xx='zz'>lqz <b>The Dormouse's story <span>彭于晏</span></b> xx</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# soup = BeautifulSoup(html_doc, 'html.parser')
soup = BeautifulSoup(html_doc, 'lxml')  # pip3 install lxml

# --- children / descendants ---
print(soup.p.contents)     # all direct children of the first <p>, as a list
print(soup.p.children)     # same direct children, as an iterator
for i, child in enumerate(soup.p.children):
    print(i, child)
print(soup.p.descendants)  # ALL descendants, not just direct children (generator)
for i, child in enumerate(soup.p.descendants):
    print(i, child)

# --- parent / ancestors ---
print(soup.a.parent)        # direct parent of the first <a>
print(list(soup.a.parents)) # every ancestor: parent, grandparent, ... up to the root

# --- siblings ---
print(soup.a.next_sibling)      # the next sibling node
print(soup.a.previous_sibling)  # the previous sibling node
print(list(soup.a.next_siblings))      # all following siblings (generator -> list)
# Wrapped in list() for consistency with next_siblings above; the original
# printed the bare generator object, which shows nothing useful.
print(list(soup.a.previous_siblings))  # all preceding siblings (generator -> list)
搜索文档树
解析库:
python 代码:
# Sample document used by the search-the-tree (find / find_all) examples.
# NOTE(review): the original pasted this entire setup twice verbatim
# (html_doc, import, soup); the duplicate re-assignment had no effect
# and has been removed.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p id="my_p" class="title"><b id="bbb" class="boldest">The Dormouse's story</b>
</p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'lxml')