(1) Fetching the data with CSS selectors
import requests, csv
import codecs                      # codecs: open the CSV with an explicit encoding
from bs4 import BeautifulSoup


def main():
    res = requests.get("http://book.dangdang.com/20180504_by11")
    res.encoding = res.apparent_encoding          # switch to the detected (real) page encoding
    soup = BeautifulSoup(res.text, 'html.parser')
    info = []
    for item in soup.select('#bd li'):
        names = item.select('.name')[0].text.strip()
        price_n = item.select('.price span')[2].text.strip()   # integer part of the sale price
        price_f = item.select('.price span')[3].text.strip()   # fractional part of the sale price
        price = price_n + price_f
        info.append([names, price])
    # print(info)
    # write the collected records to a CSV file
    with codecs.open('book.csv', 'w', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['No.', 'Title', 'Price'])
        for i, l in enumerate(info):
            writer.writerow([i, l[0], l[1]])


if __name__ == '__main__':
    main()
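To see how the CSS-selector calls behave in isolation, here is a minimal, self-contained sketch run against a made-up HTML snippet; the snippet only mimics the id/class names targeted above and is not Dangdang's actual markup.

from bs4 import BeautifulSoup

# Illustrative HTML only -- the id/class names mimic the ones used in main().
html = """
<div id="bd">
  <li>
    <p class="name"><a>Sample Book</a></p>
    <p class="price">
      <span>List price:</span><span>¥50.00</span>
      <span class="num">¥39</span><span class="tail">.50</span>
    </p>
  </li>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')
for item in soup.select('#bd li'):                 # same selector as in main()
    name = item.select('.name')[0].text.strip()
    spans = item.select('.price span')             # indexes 2 and 3 hold the sale price
    print(name, spans[2].text.strip() + spans[3].text.strip())
    # -> Sample Book ¥39.50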
(2) Fetching the data with XPath
import urllib.request
import csv
import codecs                      # codecs: open the CSV with an explicit encoding
from lxml import etree


def main():
    url = "http://book.dangdang.com/20180504_by11"
    request = urllib.request.Request(url=url)     # no custom headers needed for this page
    response = urllib.request.urlopen(request)
    content = response.read().decode('GBK')       # the page declares charset=gb2312
    tree = etree.HTML(content)
    # book titles come from the <a> tags inside each <li>
    book_list = tree.xpath("//div[@class='con body']//li/p/a/text()")
    # each price is split across two <span> elements: integer part and fractional part
    price_n_list = tree.xpath("//div[@class='con body']//li/p[@class='price']//span[@class='num']/text()")
    price_f_list = tree.xpath("//div[@class='con body']//li/p[@class='price']//span[@class='tail']/text()")
    info = []                                     # list holding one [title, price] record per book
    for item in range(len(book_list)):
        names = book_list[item]
        price_n = price_n_list[item]
        price_f = price_f_list[item]
        price = price_n + price_f
        info.append([names, price])
    # print(info)
    # write the collected records to a CSV file
    with codecs.open('book.csv', 'w', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['No.', 'Title', 'Price'])
        for i, l in enumerate(info):
            writer.writerow([i, l[0], l[1]])


if __name__ == '__main__':
    main()
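For a quick sanity check of the XPath expressions, here is a similar minimal sketch with lxml; again, the HTML is a made-up stand-in, kept just close enough to the structure assumed by the expressions above.

from lxml import etree

# Illustrative HTML only -- structured to satisfy the XPath expressions above.
html = """
<div class="con body">
  <ul>
    <li>
      <p><a>Sample Book</a></p>
      <p class="price"><span class="num">¥39</span><span class="tail">.50</span></p>
    </li>
  </ul>
</div>
"""

tree = etree.HTML(html)
titles = tree.xpath("//div[@class='con body']//li/p/a/text()")
nums = tree.xpath("//div[@class='con body']//li/p[@class='price']//span[@class='num']/text()")
tails = tree.xpath("//div[@class='con body']//li/p[@class='price']//span[@class='tail']/text()")
for title, num, tail in zip(titles, nums, tails):
    print(title, num + tail)                      # -> Sample Book ¥39.50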