PyQuery库
初始化
字符串初始化
(参考:崔庆才的爬虫教程)
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse the HTML string into a PyQuery document, then query it with CSS selectors.
doc=pq(html)
print(doc('p')) # tag selector: every <p> element
print(doc('#link1')) # id selector: the element with id="link1"
print(doc('.title')) # class selector: elements with class="title"
print(doc('.story #link2')) # descendant selector: id="link2" inside an element with class="story"
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<p class="title"><b>The Dormouse's story</b></p>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
url初始化
from pyquery import PyQuery as pq
# Build the document from a URL; the encoding is passed explicitly as utf-8.
doc=pq(url="http://www.baidu.com",encoding='utf-8')
print(doc('head')) # print the <head> element of the parsed page
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下,你就知道</title></head>
文档初始化
from pyquery import PyQuery as pq
# Load the document from a local HTML file.
# The path has to be passed through the ``filename`` keyword: a bare positional
# string is parsed as markup itself, so the file would never be opened and the
# query below would match nothing.
doc=pq(filename=".\\Text\\upload\\HTML.html")
print(doc('table')) # every <table> element found in the file
CSS选择器
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123" class="ul">我是ul</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Parse the fixture and demonstrate class, id and nested selectors.
# NOTE(review): the fixture's <table> tag separates its attributes with commas,
# which is not valid HTML; the captured output shows only id="table" surviving.
doc=pq(html)
print(doc('.title')) # class selector
print(doc('#123')) # id selector
print(doc('#123 #table .ul')) # nested selection: class="ul" inside id="table" inside id="123"
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123" class="ul">我是ul</ul>
</table>
</div>
<ul id="t" name="123" class="ul">我是ul</ul>
查找元素
子元素
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# find() searches below the current selection and returns another PyQuery
# object, so calls can be chained (doc -> div -> ul).
doc=pq(html)
div=doc.find('div')
print(div)
print(type(div))
# Search again, this time only inside the <div> selected above.
ul=div.find('ul')
print(type(ul))
print(ul)
<div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
</table>
</div>
<class 'pyquery.pyquery.PyQuery'>
<class 'pyquery.pyquery.PyQuery'>
<ul id="t" name="123">我是ul</ul>
t=doc.find('#123.div #table') # element with id="table" under the element that has both id="123" and class="div"
print(t)
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
</table>
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# children() returns the direct children of the current selection; for the
# document root the captured output shows <head> and <body>.
doc=pq(html)
div=doc.children()
print(type(div))
print(div)
<class 'pyquery.pyquery.PyQuery'>
<head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>
父元素
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# parent() returns the immediate parent of the selected element
# (the <table> that contains the <ul id="t">, per the captured output).
doc=pq(html)
ul=doc.find("#t")
parent=ul.parent()
print(parent)
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
</table>
# parents() returns every ancestor of the selection, not just the direct parent.
doc=pq(html)
ul=doc.find('#t')
parents=ul.parents()
print(type(parents)) # still a PyQuery object, holding all ancestor nodes
print(parents)
<class 'pyquery.pyquery.PyQuery'>
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html><body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body><div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
兄弟节点
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# siblings(selector) returns the siblings of the selected elements, filtered by
# the optional selector.
doc=pq(html)
siblings=doc.find('p').siblings('#123') # siblings of the <p> elements that have id="123"
print(siblings)
<div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
遍历
单个元素
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# A selection that matches a single element is still a PyQuery object and can
# be printed directly.
doc=pq(html)
print(type(doc.find('#123')))
print(doc.find('#123'))
<class 'pyquery.pyquery.PyQuery'>
<div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
多个元素
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Iterate over a selection that matches several elements.
doc=pq(html)
ls=doc.find('a').items() # .items() returns an iterator that yields each match as its own PyQuery object
for p in ls:
    # The loop body must be indented; without the indent this cell fails with
    # an IndentationError before anything is printed.
    print(p)
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
获取信息
获取属性
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<p class="story" name="我是一个p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# Read an attribute from a selection. Two equivalent spellings are shown:
# the attr('name') call and the attr.name attribute access.
doc=pq(html)
l=doc.find('.story')
print(l.attr('name')) # read the name attribute; pass any other attribute name the same way
print(l.attr.name)
我是一个p
我是一个p
获取文本
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# text() returns the text content of the selection with the markup stripped.
doc=pq(html)
div=doc.find('#123.div')
print(div)
print(div.text()) # text content only
<div id="123" class="div" name="d">
<table id="table">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
我是table
我是ul
ulllll
获取HTML
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# html() returns the inner HTML of the selection.
doc=pq(html)
a=doc.find('a')
print(a)
# Three <a> elements are selected, but the captured output shows only "Elsie":
# html() reports the first matched element only.
print(a.html())
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
Elsie
DOM操作
addClass removeClass
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
# removeClass / addClass modify the class attribute of every selected element
# in place; the selection is printed after each step to show the change.
doc=pq(html)
p=doc.find('p')
p.removeClass('title') # drop the "title" class
print(p)
print("........")
p.addClass('newTitle') # add the "newTitle" class to every selected <p>
print(p)
<p class=""><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
........
<p class="newTitle"><b>The Dormouse's story</b></p>
<p class="story newTitle">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story newTitle">...</p>
attr,css
from pyquery import PyQuery as pq
html="""
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<div id="123" class="div" name="d">
<table id="table",name="T",class="te">
我是table
<ul id="t" name="123">我是ul</ul>
<ul id="t" name="111" class="test">ulllll</ul>
</table>
</div>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<span>我是span</span>
<p class="story">...</p>
"""
# attr(name, value) and css(property, value) act as setters when given two
# arguments; the <span> is printed after each mutation.
doc=pq(html)
span=doc.find('span')
print(span)
span.attr('name','span') # add a name attribute
print(span)
span.attr('class','sd') # add a class attribute
print(span)
span.attr('class','') # empty the class value; the class attribute itself remains
print(span)
span.css('font-size','14px') # add an inline style declaration
print(span)
<span>我是span</span>
<span name="span">我是span</span>
<span name="span" class="sd">我是span</span>
<span name="span" class="">我是span</span>
<span name="span" class="" style="font-size: 14px">我是span</span>
remove
html="""
<html><head></head>
<body>
<span class="span">我是span
<a>我是a</a>
</span>
</body>
</html>
"""
from pyquery import PyQuery as pq
# remove() detaches the matched elements from the document.
doc=pq(html)
span=doc.find('.span')
print(span)
# t holds the result of remove(); per the captured output, its text is that of
# the removed <a>, not of the remaining <span>.
t=span.find('a').remove()
print(t.text())
<span class="span">我是span
<a>我是a</a>
</span>
我是a
伪类选择器
from pyquery import PyQuery as pq
html="""
<html>
<head></head>
<body>
<p>p1</p>
<p>p2</p>
<p>p3</p>
<p>p4</p>
<p>p5</p>
<p>p6</p>
<li>name</li>
<li></li>
</body>
</html>
"""
# Pseudo-class selectors. The *-child selectors test an element's position
# among its parent's children, not its position among the matched <p> elements.
doc=pq(html)
l1=doc("p:first-child") # <p> that is the first child of its parent
print(l1)
print("............")
l2=doc("p:last-child") # <p> that is the last child of its parent; empty here because <body> ends with an <li>
print(l2)
print("...........")
l3=doc("p:nth-child(2)") # <p> that is the second child of its parent
print(l3)
print("...........")
l4=doc("p:gt(3)") # matched <p> elements with zero-based index greater than 3, i.e. after the fourth
print(l4)
print("...........")
l5=doc("p:nth-child(2n)") # <p> elements at even child positions
print(l5)
print(".............")
l6=doc("li:contains(na)") # <li> elements whose text contains the substring "na"
print(l6)
<p>p1</p>
............
...........
<p>p2</p>
...........
<p>p5</p>
<p>p6</p>
...........
<p>p2</p>
<p>p4</p>
<p>p6</p>
.............
<li>name</li>