赞
踩
pip install beautifulsoup4
解析器 | 使用方式 |
---|---|
python标准库 | BeautifulSoup(markup, 'html.parser') |
lxml HTML解析器 | BeautifulSoup(markup, 'lxml') |
lxml XML解析器 | BeautifulSoup(markup, 'xml') |
html5lib | BeautifulSoup(markup, 'html5lib') |
导入模块
from bs4 import BeautifulSoup
设置解析器,传递需要解析的html文档
soup = BeautifulSoup(html,'lxml')
格式化代码
soup.prettify()
查看title标签内的文本
soup.title.string
返回title标签
soup.title
返回标签的类型
type(soup.title) #返回<class 'bs4.element.Tag'>一个类变量
返回head标签
soup.head
返回p标签,只返回第一个p标签,如果有多个只能输出第一个
soup.p
返回title标签的名称
soup.title.name #返回title 最外层标签的名称
html = '''<p class="title" name="zhaojia">the paragraph</p>'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
soup.p.attrs['name'] #返回zhaojia
soup.p['name'] #返回结果同上
html = '''<p class="title" name="zhaojia">the paragraph</p>'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
soup.p.string #返回the paragraph
html = '''<head><p class="title" name="zhaojia">the paragraph</p></head>'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
soup.head.p.string #返回the paragraph
soup.head.p.attrs['name'] # 返回zhaojia
html = '''<head><p class="title" name="zhaojia">the paragraph</p></head>'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
soup.head.contents #返回所有子节点列表
the = soup.head.children
for i,children in enumerate(the):
    print(i,children) # 返回一个iterable 必须使用for循环遍历出来
soup.head.descendants #获取所有的子孙节点,返回类型是iterable object
soup.a.parent #获取父节点,输出父节点全部内容
soup.a.parents #获取祖先节点
soup.a.next_siblings
soup.a.previous_siblings
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
soup.find_all('p') # 查找所有的p标签,并且以列表的形式返回
type(soup.find_all('p')) # 返回类型是<class 'bs4.element.ResultSet'>
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
soup.find_all(attrs = {'id':'list-1'})
from bs4 import BeautifulSoup
soup = BeautifulSoup(html,'lxml')
soup.find_all(text = 'the')
find方法只返回第一个匹配的结果,find_all方法返回所有匹配的结果
通过select()直接传入CSS选择器就可以查找
class 选择器前面加一个点号
id 选择器前面加一个#
标签什么都不需要加
soup.select('.class')
soup.select('#id #id2') # 两个条件用空格隔开
soup.select('tag1 tag3')
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
for ul in soup.select('ul'):
    print(ul.get_text())
soup.select('title')
soup.select('.classname')
#通过ID来查找
soup.select('#identified')
#通过属性来查找
soup.select("a[class= 'bri' ]")
soup.select("head>title")
#通过一个兄弟标签进行查找
res = soup.select(".classname ~ .broclassname")
print(res[0].get_text()) #输出一个兄弟标签里面的内容。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。