test.xml
XML
<?xml version="1.0" encoding="utf-8"?>
<!-- Sample document for the XPath examples: a bookstore root with two
     top-level book elements; the first one nests another book inside abc. -->
<bookstore>
<book name="halibote">
<title lang="en">Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
<abc>
<book lang="中文">neibu</book>
</abc>
</book>
<book name="hongloumeng">
红楼梦
</book>
</bookstore>
hello.html
html
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<!-- hello.html: sample page for the XPath examples; one div wrapping a ul of five li items, each containing a link -->
<div>
<ul>
<li class="item-0">meiguo<a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html"><span
class="bold">third item</span></a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</body>
</html>
选取节点
python
from lxml import etree
# Load the sample document; every query below runs against it.
doc = etree.parse("test.xml")

# Attribute step: the "name" attributes of the <book> elements matched by
# the relative path "book".
book_names = doc.xpath("book/@name")
print(book_names[0])

# Absolute path: the <bookstore> element at the top of the document.
stores = doc.xpath("/bookstore")
print(stores[0])

# Child steps: <title> elements directly under the matched <book> elements.
titles = doc.xpath("book/title")
print(titles[0].text)

# "//" descends to any depth: <book> elements nested inside a matched <book>.
nested_books = doc.xpath("book//book")
print(nested_books)

# Every "lang" attribute anywhere in the document.
lang_values = doc.xpath("//@lang")
print(lang_values)
谓语:指路径表达式的附加条件
python
from lxml import etree
doc = etree.parse("test.xml")

# The predicate [2] keeps only the second <book> match; XPath positions
# are 1-based.
second_book = doc.xpath("book[2]")
print(second_book[0].text)
选取未知节点
python
from lxml import etree
doc = etree.parse("test.xml")

# "*" is a wildcard for element names: all element children of <bookstore>.
store_children = doc.xpath("/bookstore/*")
print(store_children)
选取若干路径
python
from lxml import etree
doc = etree.parse("test.xml")

# "|" is the XPath union operator: the result holds the nodes matched by
# either of the two paths.
titles_and_prices = doc.xpath("//book/title | //book/price")
print(titles_and_prices)
通过轴限定
python
from lxml import etree
doc = etree.parse("test.xml")

# The descendant axis selects <book> elements at any depth below the
# context node.
all_books = doc.xpath("descendant::book")
print(all_books)
操作XML节点
python
from lxml import etree
# Build a small tree by hand: a <root> element created with attribute a="1".
root = etree.Element("root", a="1")
# Attributes can also be added after creation.
root.set("b", "2")
root.text = "yilang"
# Append an empty <child> element under <root>.
child = etree.SubElement(root, "child")

# Show the serialized element, then its tag name and its text.
for shown in (etree.tostring(root), root.tag, root.text):
    print(shown)

# Parse XML from a string; etree.XML returns the root element.
markup = (
    "<root>"
    "<a x='123'>aText"
    "<b/>"
    "<c/>"
    "<b/>"
    "</a>"
    "</root>"
)
root = etree.XML(markup)

# find() searches from root and returns the first match; print its tag name.
first_a = root.find("a")
print(first_a.tag)

# findall() returns a list of matches; ".//a[@x]" matches <a> elements at
# any depth that carry an "x" attribute. Print the tag name of the first one.
a_with_x = root.findall(".//a[@x]")
print(a_with_x[0].tag)
在HTML中搜索
python
from lxml import etree
# hello.html is parsed with lxml's HTML parser instead of the default XML one.
html_parser = etree.HTMLParser()
page = etree.parse("hello.html", parser=html_parser)

# All <li> elements anywhere in the page.
items = page.xpath("//li")
# .text is the text that precedes the element's first child.
print(items[0].text)