赞
踩
pip install lxml
from lxml import etree
etree.parse(filePath)
etree.HTML(page_text)  (page_text 是从互联网上获取到的页面源码字符串)
xpath('xpath表达式')
https://blog.csdn.net/xiaobai729/article/details/124079260
属性定位://div[@class='song'],通用格式为 tag[@attrName="attrValue"]
//div[@class="song"]/p[3]
索引是从1开始的。
/text():获取的是标签中直系的文本内容
//text():获取的是标签中非直系的文本内容(所有的文本内容)
/@attrName:获取标签的属性值,例如 img/@src
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from lxml import etree
if __name__ == "__main__":
    # Load the local HTML file into an lxml element tree.
    # etree.parse() defaults to the strict XML parser, which raises
    # XMLSyntaxError on ordinary HTML (unclosed <meta>/<img>/<br> tags, etc.),
    # so an explicit HTML parser is supplied. If the file does not declare its
    # charset, pass e.g. etree.HTMLParser(encoding="utf-8") instead.
    html_parser = etree.HTMLParser()
    tree = etree.parse('test.html', html_parser)

    # Other XPath expressions that can be tried against the same tree:
    #   tree.xpath('/html/body/div')
    #       div elements that are direct children of /html/body
    #   tree.xpath('/html//div')
    #       div elements at any depth below html
    #   tree.xpath('//div')
    #       div elements anywhere in the document
    #   tree.xpath('//div[@class="song"]')
    #       div elements anywhere whose class attribute equals "song"
    #   tree.xpath('//div[@class="tang"]//li[5]/a/text()')[0]
    #       first direct text node of the <a> under the 5th <li> inside
    #       a div with class "tang"
    #   tree.xpath('//li[7]//text()')
    #       every text node under an <li> that is the 7th <li> of its parent
    #   tree.xpath('//div[@class="tang"]//text()')
    #       every text node under a div with class "tang"

    # Select the src attribute of each <img> that is a direct child of a div
    # with class "song"; the result is a list of attribute values.
    r = tree.xpath('//div[@class="song"]/img/@src')
    print(r)
爬取强国内容
import requests
from lxml import etree
if __name__ == '__main__':
    # Read the search term from the user.
    content = input("请输入内容:")

    # Step 1: specify the URL. The query is passed through `params` so that
    # requests percent-encodes the user-supplied term; concatenating it into
    # the URL by hand breaks on characters such as '&', '#' or '+'.
    url = 'http://www.syiban.com/search/index/init.html'
    query = {'modelid': 1, 'q': content}

    # Step 2: send the GET request. A timeout keeps the script from hanging
    # forever on an unresponsive server, and raise_for_status() stops an HTTP
    # error page from being parsed and saved as if it were a search result.
    response = requests.get(url=url, params=query, timeout=10)
    response.raise_for_status()

    # Step 3: .text gives the response body decoded to a string; parse it
    # into an element tree and select the question/answer nodes.
    page_text = response.text
    tree = etree.HTML(page_text)
    question = tree.xpath('//span[@class="title_color"]')
    answer = tree.xpath('//div[@class="yzm-news-right"]/p/span')
    print(question)

    # Walk both result lists in lockstep. zip() stops at the shorter list, so
    # a page with unequal numbers of matches no longer raises IndexError.
    for question_node, answer_node in zip(question, answer):
        # XPath string() returns the concatenated text content of the node.
        print(question_node.xpath('string()'))
        print(answer_node.xpath('string()'))

    # Step 4: persist the raw page source to disk.
    with open('./强国.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('爬取数据结束!!!')
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。