python
import requests
from lxml import etree

# 1. Request the listing page (placeholder URL -- replace with the real address).
url = '链接'
res = requests.get(url)

# 2. Parse the response HTML into an lxml element tree.
tree = etree.HTML(res.text)

# 3. Select every <a> tag inside the dl element with class "textList".
ls = tree.xpath('//dl[@class="textList"]//a')
for i in ls:
    # XPath relative to the current tag ('.') yields its text and href attribute.
    name = i.xpath('./text()')[0]
    detail_url = i.xpath('./@href')[0]

    # 4. Fetch the detail page and extract its paragraph text.
    # NOTE(review): the original notes only sketched this step; '//p' is a
    # guess at the content selector -- confirm against the target site.
    detail_tree = etree.HTML(requests.get(detail_url).text)
    for p in detail_tree.xpath('//p'):
        text_ls = p.xpath('.//text()')
        print(''.join(text_ls).replace('\r\n', '').replace(' ', ''))
python
import requests
from bs4 import BeautifulSoup

url = "链接"
# A browser-like user-agent so the server does not reject the request.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36'
}
res = requests.get(url, headers=headers)
res.encoding = 'utf-8'

# Build a BeautifulSoup tree: BeautifulSoup(html_string, parser).
# 'lxml' requires the lxml package to be installed.
bs = BeautifulSoup(res.text, 'lxml')

# CSS selector: all <li> items under the element with id "browserItemList".
lis = bs.select('#browserItemList li')
for count, item in enumerate(lis, start=1):
    title = item.find('a', class_="l").text
    # Strip newlines and spaces from the description text.
    des = item.find('p', class_='info tip').text.replace('\n', '').replace('\r', '').replace(' ', '')
    fade = item.find('small', class_="fade").text
    ren = item.find('span', class_="tip_j").text
    print(count, title, des, fade, ren)
安装模块：
lxml（HTML 解析器）：pip install lxml
beautifulsoup4：pip install beautifulsoup4