import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}
url = 'http://sc.chinaz.com/tag_tupian/YaZhouMeiNv.html'
page_text = requests.get(url, headers=headers).text  # get the response body as a string
# parse out the image addresses with a regular expression
ex = '<a.*?<img src2="(.*?)" alt.*?</a>'
img_src_list = re.findall(ex, page_text, re.S)  # re.S lets . match newlines as well
Regular expressions were covered earlier, so they are not described in detail here; the images themselves can then be fetched from the extracted addresses, as sketched below.
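A minimal download loop over img_src_list; this is a sketch assuming the extracted src2 values are complete URLs and naming each file after its last URL segment:

import os

os.makedirs('imgs', exist_ok=True)  # target folder for the downloads
for src in img_src_list:
    img_data = requests.get(src, headers=headers).content  # binary image bytes
    img_name = src.split('/')[-1]  # take the last URL segment as the file name
    with open(os.path.join('imgs', img_name), 'wb') as f:
        f.write(img_data)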
pip install bs4
pip install lxml  # the 'lxml' parser used below also needs this package
The local test file b.html used throughout the examples below:

<html lang="en">
<head>
    <meta charset="UTF-8" />
    <title>测试bs4</title>
</head>
<body>
    <div>
        <p>百里守约</p>
    </div>
    <div class="song">
        <p>李清照</p>
        <p>王安石</p>
        <p>苏轼</p>
        <p>柳宗元</p>
        <a href="http://www.song.com/" title="赵匡胤" target="_self">
            <span>this is span</span>
        宋朝是最强大的王朝,不是军队的强大,而是经济很强大,国民都很有钱</a>
        <a href="" class="du">总为浮云能蔽日,长安不见使人愁</a>
        <img src="http://www.baidu.com/meinv.jpg" alt="" />
    </div>
    <div class="tang">
        <ul>
            <li><a href="http://www.baidu.com" title="qing">清明时节雨纷纷,路上行人欲断魂,借问酒家何处有,牧童遥指杏花村</a></li>
            <li><a href="http://www.163.com" title="qin">秦时明月汉时关,万里长征人未还,但使龙城飞将在,不教胡马度阴山</a></li>
            <li><a href="http://www.126.com" alt="qi">岐王宅里寻常见,崔九堂前几度闻,正是江南好风景,落花时节又逢君</a></li>
            <li><a href="http://www.sina.com" class="du">杜甫</a></li>
            <li><a href="http://www.dudu.com" class="du">杜牧</a></li>
            <li><b>杜小月</b></li>
            <li><i>度蜜月</i></li>
            <li><a href="http://www.haha.com" id="feng">凤凰台上凤凰游,凤去台空江自流,吴宫花草埋幽径,晋代衣冠成古丘</a></li>
        </ul>
    </div>
</body>
</html>
Instantiating a bs4 object
from bs4 import BeautifulSoup
f = open('b.html', 'rb')
soup = BeautifulSoup(f, 'lxml')              # way 1: pass a file handle to parse a local file
soup = BeautifulSoup(response.text, 'lxml')  # way 2: pass HTML text fetched from the network
There are two ways:
Way 1: BeautifulSoup(fp, 'lxml'): parses a locally stored HTML file
Way 2: BeautifulSoup(page_text, 'lxml'): parses HTML text fetched from the network
Accessing a tag name as an attribute returns the first matching tag:
from bs4 import BeautifulSoup
f = open('b.html', 'rb')
soup = BeautifulSoup(f, 'lxml')  # way 1: pass a file handle to parse the local file
print(soup.div)
'''
<div>
<p>百里守约</p>
</div>
'''
from bs4 import BeautifulSoup

f = open('b.html', 'rb')
soup = BeautifulSoup(f, 'lxml')
print(soup.find('div', class_='song'))  # first div whose class is "song"
print(soup.find('a', id='feng'))        # first a whose id is "feng"
"""
<div class="song">
<p>李清照</p>
<p>王安石</p>
<p>苏轼</p>
<p>柳宗元</p>
<a href="http://www.song.com/" target="_self" title="赵匡胤">
<span>this is span</span>
宋朝是最强大的王朝,不是军队的强大,而是经济很强大,国民都很有钱</a>
<a class="du" href="">总为浮云能蔽日,长安不见使人愁</a>
<img alt="" src="http://www.baidu.com/meinv.jpg"/>
</div>
<a href="http://www.haha.com" id="feng">凤凰台上凤凰游,凤去台空江自流,吴宫花草埋幽径,晋代衣冠成古丘</a>
"""
print(soup.find_all('a', class_='du'))  # find_all returns every match as a list
[<a class="du" href="">总为浮云能蔽日,长安不见使人愁</a>, <a class="du" href="http://www.sina.com">杜甫</a>, <a class="du" href="http://www.dudu.com">杜牧</a>]
class_ needs the trailing underscore because class is a Python keyword.
print(soup.select('#feng'))  # locate the a tag via its id selector
print(soup.select('.song'))  # locate tags whose class is "song"
# hierarchical selectors
print(soup.select('.tang > ul > li > a'))  # > means exactly one level down, step by step
print(soup.select('.tang a'))              # a space spans any number of levels
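select always returns a list, even for a single hit, so results are indexed or iterated; a small usage sketch with the selector above:

for a in soup.select('.tang > ul > li > a'):
    print(a['href'])  # each list element is a Tag, so attribute access works as usual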
Extracting the data stored inside a tag
print(soup.p.string)
'''
百里守约
'''
print(soup.div.text)
"""
百里守约
"""
Extracting data stored in a tag's attributes
print(soup.find('a')['href'])
"""
http://www.song.com/
"""
Example: scraping a novel's chapters from shicimingju.com
import requests
from bs4 import BeautifulSoup

url = 'http://www.shicimingju.com/book/sanguoyanyi.html'
rs = requests.get(url=url)
rs = BeautifulSoup(rs.text, 'lxml')
lis_bookmulu = rs.select('.book-mulu > ul > li > a')
for site in lis_bookmulu:
    name = site.string                                    # title of each chapter
    is_url = 'http://www.shicimingju.com' + site['href']  # full URL of the chapter page
    a = requests.get(url=is_url)
    b = BeautifulSoup(a.text, 'lxml')
    down_load = b.find('div', class_='chapter_content').text
    with open('小说/{}.txt'.format(name), 'w', encoding='utf-8') as f:  # the 小说 folder must already exist
        f.write(down_load)
    break  # stops after the first chapter; remove the break to download them all
UnicodeEncodeError: 'gbk' codec can't encode character '\xa0' in position 4: illegal multibyte sequence
When this error appeared, half an hour of searching showed the cause: encoding='utf-8' was missing from open(), so the write fell back to the system's default gbk codec, which cannot encode the '\xa0' character.
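A minimal reproduction, assuming a Chinese-locale Windows machine where open() defaults to the gbk codec:

text = '\xa0'  # non-breaking space, common in scraped HTML
# with open('demo.txt', 'w') as f:  # no encoding given: falls back to gbk and raises UnicodeEncodeError
#     f.write(text)
with open('demo.txt', 'w', encoding='utf-8') as f:  # explicit utf-8 encodes it fine
    f.write(text)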
How xpath parsing works: it operates over the HTML tag structure; instantiate an etree object, then locate tags with xpath expressions.
pip install lxml
from lxml import etree
tre = etree.parse('b.html')  # way 1: parse a local file
tre = etree.HTML(page_text)  # way 2: parse HTML text fetched from the network
print(tre)
"""
<lxml.etree._ElementTree object at 0x000001FDDC3AFF08>
"""
# what you get back is a parsed object
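If a local file is not well-formed XML, etree.parse will refuse it; passing lxml's tolerant HTML parser explicitly (standard lxml API) is the usual workaround:

from lxml import etree

parser = etree.HTMLParser()          # tolerant parser that accepts sloppy HTML
tre = etree.parse('b.html', parser)  # now even non-XHTML files parse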
from lxml import etree
tre = etree.parse('b.html')  # way 1: parse a local file
print(tre.xpath('/html/head'))
print(tre.xpath('/html/body'))
print(tre.xpath('//div/p'))  # find p tags under any div
"""
[<Element p at 0x21a9b954588>, <Element p at 0x21a9b995a88>, <Element p at 0x21a9b995d88>, <Element p at 0x21a9b99aa48>, <Element p at 0x21a9b99a308>]
"""
print(tre.xpath('/html//meta'))
"""
[<Element meta at 0x1cebd1d4588>]
"""
# // means any number of intervening levels
# locate the a tag whose class attribute is "du"
print(tre.xpath('//a[@class="du"]'))
# locate the a tag whose id attribute is "feng"
print(tre.xpath('//a[@id="feng"]'))
print(tre.xpath('//li[1]'))
# indexing starts at 1
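Besides plain indices, standard XPath predicates such as last() and position() also work in lxml; a quick sketch:

print(tre.xpath('//li[last()]'))        # the last li of the ul
print(tre.xpath('//li[position()<3]'))  # the first two li tags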
Fuzzy matching
Rarely used; knowing it exists is enough. See the sketch after this list.
- //div[contains(@class, "ng")]: locates div tags whose class value contains "ng"
- //div[starts-with(@class, "ta")]: locates div tags whose class value starts with "ta"
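Run against the b.html above, both expressions behave as described:

print(tre.xpath('//div[contains(@class, "ng")]'))     # matches class="song" and class="tang"
print(tre.xpath('//div[starts-with(@class, "ta")]'))  # matches class="tang"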
Extracting text from tags
print(tre.xpath('//a[@id="feng"]/text()')[0])
# xpath returns a list, so take the value with [0] even when there is only one element
print(tre.xpath('//div[@class="song"]//text()'))
# //text() gets all the text content underneath
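The //text() call returns a list of fragments, many of them whitespace from the markup; a common cleanup sketch:

fragments = tre.xpath('//div[@class="song"]//text()')
clean = ''.join(s.strip() for s in fragments if s.strip())  # drop whitespace-only pieces, join the rest
print(clean)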
Extracting attribute data
print(tre.xpath('//a[@id="feng"]/@href')[0])
print(tre.xpath('//div[@class="song"]/a[1]/@title'))
"""
http://www.haha.com
['赵匡胤']
"""
# for an image, grab @src the same way
import requests
from lxml import etree

url = 'http://pic.netbian.com/4kdongwu/index_%d.html'
for i in range(1, 6):
    if i == 1:
        new_url = 'http://pic.netbian.com/4kdongwu/index.html'  # the first page has no index number
    else:
        new_url = url % i
    rs = requests.get(url=new_url)
    rs.encoding = 'gbk'  # the site is gbk-encoded; this fixes garbled names
    page_text = rs.text
    tree = etree.HTML(page_text)
    img_list = tree.xpath('//ul[@class="clearfix"]/li')
    for img in img_list:
        img_url = 'http://pic.netbian.com' + img.xpath('./a/@href')[0]  # detail page of the picture
        img_name = img.xpath('./a/img/@alt')[0]
        rs_img = requests.get(url=img_url)
        big_img = etree.HTML(rs_img.text)
        img_b = big_img.xpath('//*[@id="img"]/img')[0]
        down_load = 'http://pic.netbian.com' + img_b.xpath('./@src')[0]  # download address of the full-size image
        img_down_load = requests.get(url=down_load)  # request the full-size image
        with open('img/{}.jpg'.format(img_name), 'wb') as f:
            f.write(img_down_load.content)