-
安装lxml
```bash
pip3 install lxml
```
-
用法
python
import requests
from bs4 import BeautifulSoup
# Placeholder: replace with the address of the page to parse.
url = 'xxxx'
res = requests.get(url).text
# Parse the downloaded HTML using the lxml parser backend.
soup = BeautifulSoup(res, 'lxml')

## --------------------- bs4 find_all() usage ---------------------
# 1. Find every <a> tag; the result is list-like and its elements are bs4 Tag objects.
print(soup.find_all('a'))
# 2. Return all <div> and <a> tags.
print(soup.find_all(['div', 'a']))
# 3. Locate tags by attribute only (any tag whose class is "xxx").
print(soup.find_all(attrs={"class": "xxx"}))
# 4. Locate tags by tag name and attribute together.
print(soup.find_all('div', attrs={"class": "xxx"}))

## --------------------- CSS selector usage -----------------------
# 1. Select by tag name.
print(soup.select('h1'))
# 2. Select by CSS class: elements with class="xxx".
print(soup.select('.xxx'))
# 3. Select by id: the element with id="xx".
print(soup.select('#xx'))
# 4. Descendant selector: class="yy" elements underneath id="xx".
print(soup.select('#xx .yy'))
# 5. Attribute selector: <div> elements whose id attribute equals "xx".
print(soup.select('div[id="xx"]'))

## --------------------- Getting text and attributes --------------
# NOTE: the [0] lookups below raise IndexError when nothing matches the
# placeholder class "xxx"; substitute a class that exists on the real page.
# 1. Get a tag's text via the .text attribute.
print(soup.find_all('div', attrs={"class": "xxx"})[0].text)
# 2. Get a tag's text via .string (per the bs4 docs this is None when the tag
#    has more than one child, so prefer .text / get_text() for nested markup).
print(soup.find_all('div', attrs={"class": "xxx"})[0].string)
# 3. Get a tag's text via get_text().
print(soup.find_all('div', attrs={"class": "xxx"})[0].get_text())
# 4. Get a tag's text via getText() (same call as get_text(), camelCase spelling).
print(soup.find_all('div', attrs={"class": "xxx"})[0].getText())
# 5. Get the value of a tag attribute with get().
print(soup.find_all('div')[0].get('class'))
-
实例
# A simple crawler built on requests + BeautifulSoup (the parsing below uses
# bs4 find()/find_all(), not XPath). It scrapes Lianjia housing listings and
# collects, for each listing: title, location, layout (e.g. three bedrooms and
# one living room), follower count, unit price and total price.
import requests
from bs4 import BeautifulSoup


def get_req(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests may block indefinitely on a stalled connection.

    Returns:
        The decoded response body (``Response.text``).
    """
    # Send a desktop-browser User-Agent instead of the requests default.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    }
    res = requests.get(url, headers=headers, timeout=timeout)
    return res.text


def _find(node, name, css_class=None):
    """Return the first *name* descendant of *node*, or None if *node* is None.

    When *css_class* is given, the tag's class attribute must equal it.
    """
    if node is None:
        return None
    attrs = {"class": css_class} if css_class is not None else {}
    return node.find(name, attrs=attrs)


def _text(node):
    """Return the text of *node*, or an empty string when *node* is None."""
    return node.text if node is not None else ''


def get_all(data):
    """Parse a listing page and return one dict per listing.

    Args:
        data: HTML source of the listing page.

    Returns:
        A list of dicts keyed by '标题', '位置', '房屋的格局', '关注人数',
        '总价' and '单价'. A field whose element is missing from a listing is
        stored as an empty string rather than aborting the whole parse.
        Each dict is also printed as it is produced.
    """
    house_data = []
    soup = BeautifulSoup(data, 'lxml')
    # Each listing card is an element whose class attribute is "info clear".
    for item in soup.find_all(attrs={"class": "info clear"}):
        title = _text(_find(_find(item, 'div', 'title'), 'a'))

        # The location is the text of every link inside positionInfo, joined by '-'.
        position_div = _find(item, 'div', 'positionInfo')
        position_links = position_div.find_all('a') if position_div is not None else []
        position_info = '-'.join(link.text for link in position_links)

        # houseInfo is a '|'-separated summary; the first segment is the layout.
        house_icon = _text(_find(item, 'div', 'houseInfo')).split('|')[0]
        total_price = _text(_find(_find(item, 'div', 'totalPrice totalPrice2'), 'span'))
        star_icon = _text(_find(item, 'div', 'followInfo'))
        unit_price = _text(_find(_find(item, 'div', 'unitPrice'), 'span'))

        house_msg = {}
        house_msg['标题'] = title
        house_msg['位置'] = position_info
        house_msg['房屋的格局'] = house_icon
        house_msg['关注人数'] = star_icon
        house_msg['总价'] = total_price
        house_msg['单价'] = unit_price
        house_data.append(house_msg)
        print(house_msg)
    return house_data


if __name__ == '__main__':
    url = 'https://sh.lianjia.com/ershoufang/pudong/pg2/'
    all_data = get_req(url)
    get_all(all_data)