赞
踩
我们这里需要用到 selenium 等库来爬取这些由 JS 动态加载的信息,下面让我带领大家完成这个程序。
from selenium import webdriver
import time
import xlwt
先导入所需要的库
# Path to the local ChromeDriver binary -- replace with your own path.
driver_path=r"C:\Users\*****\chromedriver.exe"
# NOTE(review): `executable_path` is deprecated in Selenium 4 (use Service);
# kept here because this walkthrough targets the Selenium 3 API -- confirm.
driver=webdriver.Chrome(executable_path=driver_path)
将谷歌驱动的路径写入
# Workbook + sheet that will hold the scraped rows.
f=xlwt.Workbook(encoding="utf8")
sheet01=f.add_sheet(u'sheet1',cell_overwrite_ok=True)
# write(row, col, value): the first index is the row, the second the column.
sheet01.write(0,0,'标题')  # header row: title
sheet01.write(0,1,'标价')  # header row: price
sheet01.write(0,2,'购买人数')  # header row: purchase count
创建工作簿,设置表格的编码格式,并写入表头等内容。
def parse_page(url, number):
    """Scrape `number` pages of Taobao search results.

    Loads `url` in the module-level `driver`, then for each page collects
    every item's title, price and purchase count, clicks the "next page"
    button, and finally hands all accumulated rows to save_infos().

    :param url: search-result URL to start from
    :param number: how many result pages to scrape
    """
    contents = []  # one dict per item, accumulated across all pages
    driver.get(url)
    for page in range(number):
        # NOTE(review): selectors assume the 2018-era Taobao page layout --
        # verify the id/class names still match the live site.
        divs = driver.find_elements_by_xpath(
            "//div[@id='searchResult']/div[@id='ItemWrapper']/div[@class='item']")
        for div in divs:
            title = div.find_element_by_xpath(".//span[@class='title']").text
            price = div.find_element_by_xpath(".//span[@class='pricedetail']/strong").text
            # renamed from `number`: the original shadowed the page-count parameter
            pay_num = div.find_element_by_xpath(".//span[@class='payNum']").text
            contents.append({
                'title': title,
                'price': price,
                'number': pay_num
            })
        print('第%d页数据获取完成' % (page + 1))
        # .click() returns None, so there is nothing useful to bind it to
        driver.find_element_by_xpath("//a[@class='page-next iconfont']").click()
        time.sleep(1)  # crude wait for the next page to render
    save_infos(contents)
解析网页,循环点击淘宝的"下一页"按钮进行翻页,把每个商品的内容存入字典并追加到列表,最后调用存储函数,并把该列表作为参数传入。
def save_infos(contents):
    """Write the scraped rows to the worksheet and save the workbook.

    Row 0 already holds the header, so data rows start at row 1.

    :param contents: list of dicts with keys 'title', 'price', 'number'
    """
    # enumerate replaces the original manual counter `w`
    for row, content in enumerate(contents, start=1):
        sheet01.write(row, 0, content['title'])
        sheet01.write(row, 1, content['price'])
        sheet01.write(row, 2, content['number'])
    # Replace with your own output path.
    f.save(r"C:\Users\*********结果.xls")
存储数据用的函数:从列表中逐条提取字典内容,然后写入 xls 文件。
def main():
    """Entry point: scrape ten pages of the menswear search results."""
    target_url = "https://s.taobao.com/search?q=%E7%94%B7%E8%A3%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180705&ie=utf8&bcoffset=6&ntoffset=6&p4ppushleft=1%2C48&s=0"
    page_count = 10
    parse_page(target_url, page_count)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
完整代码如下:
from selenium import webdriver
import time
import xlwt
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
# Path to the local ChromeDriver binary -- replace with your own path.
driver_path=r"C:\Users\**********\chromedriver.exe"
# NOTE(review): `executable_path` is deprecated in Selenium 4 (use Service) -- confirm.
driver=webdriver.Chrome(executable_path=driver_path)
# Workbook + sheet that will hold the scraped rows.
f=xlwt.Workbook(encoding="utf8")
sheet01=f.add_sheet(u'sheet1',cell_overwrite_ok=True)
# write(row, col, value): the first index is the row, the second the column.
sheet01.write(0,0,'标题')  # header row: title
sheet01.write(0,1,'标价')  # header row: price
sheet01.write(0,2,'购买人数')  # header row: purchase count
def parse_page(url, number):
    """Scrape `number` pages of Taobao search results.

    Loads `url` in the module-level `driver`, then for each page collects
    every item's title, price and purchase count, clicks the "next page"
    button, and finally hands all accumulated rows to save_infos().

    :param url: search-result URL to start from
    :param number: how many result pages to scrape
    """
    contents = []  # one dict per item, accumulated across all pages
    driver.get(url)
    for page in range(number):
        # Selenium 4 removed find_elements_by_xpath; use the By API
        # already imported at the top of this file.
        divs = driver.find_elements(
            By.XPATH,
            "//div[@id='searchResult']/div[@id='ItemWrapper']/div[@class='item']")
        print(len(divs))
        # .//div[@class='row row-2 title']/a/span[@class='baoyou-intitle icon-service-free']
        for div in divs:
            title = div.find_element(By.XPATH, ".//span[@class='title']").text
            price = div.find_element(By.XPATH, ".//span[@class='pricedetail']/strong").text
            # renamed from `number`: the original shadowed the page-count parameter
            pay_num = div.find_element(By.XPATH, ".//span[@class='payNum']").text
            info = {
                'title': title,
                'price': price,
                'number': pay_num
            }
            print(info)
            contents.append(info)
        print('第%d页数据获取完成' % (page + 1))
        # .click() returns None, so there is nothing useful to bind it to
        driver.find_element(By.XPATH, "//a[@class='page-next iconfont']").click()
        time.sleep(1)  # crude wait for the next page to render
    save_infos(contents)
def save_infos(contents):
    """Write the scraped rows to the worksheet and save the workbook.

    Row 0 already holds the header, so data rows start at row 1.

    :param contents: list of dicts with keys 'title', 'price', 'number'
    """
    # enumerate replaces the original manual counter `w`
    for row, content in enumerate(contents, start=1):
        sheet01.write(row, 0, content['title'])
        sheet01.write(row, 1, content['price'])
        sheet01.write(row, 2, content['number'])
    # Replace with your own output path.
    f.save(r"C:\Users\*********结果.xls")
def main():
    """Entry point: scrape ten pages of the womenswear search results."""
    target_url = "https://uland.taobao.com/sem/tbsearch?refpid=mm_26632258_3504122_32538762&keyword=%E5%A5%B3%E8%A3%85&clk1=44c369a534bf95506aa0a87518971645&upsid=44c369a534bf95506aa0a87518971645"
    page_count = 10
    parse_page(target_url, page_count)
# Run the scraper only when executed as a script, not on import.
if __name__ == '__main__':
    main()
大家一定要注意把代码中的路径换成自己的文件路径。另外,淘宝这个网站最近出了一点小问题:鼠标点击"下一页"后页面不能跳转,以至于我们的程序无法自动翻页,但程序本身的写法是没有问题的——按钮点击就是这样实现的,这里也给大家起到学习参考的作用。
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。