安装BeautifulSoup
bash
# beautifulsoup4 provides bs4; lxml backs the 'lxml' parser used by the
# examples below; requests is used to download the page.
pip install beautifulsoup4 lxml requests
基本使用
python
import requests
from bs4 import BeautifulSoup

# `url` is the address of the page to fetch; define it before running this snippet.
# The timeout keeps the request from blocking indefinitely on an unresponsive server.
res = requests.get(url, timeout=10)
# Parse the response body with the lxml parser.
soup = BeautifulSoup(res.text, 'lxml')
删除标签、属性
python
# Remove tags: detach every <style> and <video> tag from the tree.
for tag_name in ['style', 'video']:
    # Calling soup(name) is shorthand for soup.find_all(name).
    for tag in soup(tag_name):
        tag.extract()

# Remove attributes from tags.
for element in soup.select('img'):
    del element['srcset']
for element in soup.select('a'):
    del element['href']

# Remove all <div> tags whose class attribute is "abc".
remove_list = soup.find_all('div', attrs={'class': 'abc'})
# A plain loop is used because extract() is called only for its side effect.
for tag in remove_list:
    tag.extract()
寻找标签
python
# find() returns only the first matching tag.
div1 = soup.find('div', attrs={'id': 'me'})
# find_all() returns every matching tag; note that this rebinds div1.
div1 = soup.find_all('div', class_='abc')
删除标签,但保留其子标签且位置不变
python
from bs4 import BeautifulSoup

html = '''
<html>
<body>
<div id="container">
<p>This is a one</p>
<div id="remove-me">
<span>Child 1<a>111</a></span>
<span>Child 2</span>
</div>
<p>This is a two</p>
</div>
</body>
</html>
'''
soup = BeautifulSoup(html, 'lxml')

# Find the <div> to remove.
div_to_remove = soup.find('div', id='remove-me')
if div_to_remove is not None:
    # unwrap() replaces the tag with its own children, keeping their order
    # and position inside the parent. It locates the tag by identity, so an
    # equal-looking earlier sibling cannot shift the insertion point the way
    # a list.index() lookup (which compares tags by ==) could.
    div_to_remove.unwrap()

# Print the result.
print(soup.prettify())