赞
踩
lxml: 它可以解析 XML 文件。HTML 与 XML 结构相似(严格来说 HTML 并不是 XML 的子集,但 lxml 提供了容错的 HTML 解析器),所以分析 HTML 文档除了可以使用正则表达式,也可以使用 lxml。
示例文档
<bookstore>
<li id='test3'> li test3</li>
<book>
<title>Harry Potter</title>
<author>J K. Rowling</author>
<year>2005</year>
<price>29.99</price>
<li>li test1</li>
<li id='test2'>li test2</li>
</book>
</bookstore>
<test>
<li id='test3'>li test4</li>
</test>
lxml示例
实例1: 找到所有 li 标签的文本和 id 属性
一个完整示例:
from lxml import etree html = ''' <bookstore> <li id='test3'> li test3</li> <book> <title>Harry Potter</title> <author>J K. Rowling</author> <year>2005</year> <price>29.99</price> <li>li test1</li> <li id='test2'>li test2</li> </book> </bookstore> <test> <li id='test3'>li test4</li> </test>''' dom = etree.HTML(html) ret = dom.xpath('//li/text()') print(ret) ret = dom.xpath('//li/@id') print(ret)
一个完整示例:
from lxml import etree html = ''' <bookstore> <li id='test3'> li test3</li> <book> <title>Harry Potter</title> <author>J K. Rowling</author> <year>2005</year> <price>29.99</price> <li>li test1</li> <li id='test2'>li test2</li> </book> </bookstore> <test> <li id='test3'>li test4</li> </test>''' dom = etree.HTML(html) ret = dom.xpath('//li[@id]') for li in ret: print(li.text) print(li.attrib['id']) print(etree.tostring(li).decode()) print('=' * 50) #爬取暴漫非人哉漫画 #author : shuaijie_liu #date 2019-05-01 #email : 15028349493@163.com import requests from lxml import etree def down_html(url,timeout=10,headers=None,verify=True): if not headers: headers = { 'User-Agent':r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36,' } req = requests.get(url=url,headers=headers,verify=verify,timeout=timeout) return req.text def find_imgs(data,exp): dom = etree.HTML(data) ret = dom.xpath(exp) return ret def download_img(url,filename,timeout=10,headers=None,verify=True): if not headers: headers = { 'User-Agent':r'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36,' } req = requests.get(url=url,headers=headers,verify=verify,timeout=timeout) with open(filename,'wb') as f: f.write(req.content) if __name__ == '__main__': for page in range(27): url = r'http://baozoumanhua.com/channels/1562?page={}'.format(page) imgs = r'//div[@class="article-body"]//img/@src' try: html = down_html(url=url) except Exception as e: print('Html Error {} : {}'.format(url,e)) continue img_urls = find_imgs(html,imgs) ret = [img_urls[0]] for url in img_urls: if url != ret[-1]: ret.append(url) filename = 0 for url in ret: filename += 1 file = "{}-{}.jpg".format(page+1,filename) print('down load {}'.format(url)) try: download_img(url,file) except Exception as e: print('IMAGE ERROR {}:{}'.format(url,e))
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。