
Python Crawler: Scraping Taobao Product Listings with Selenium


Taobao's search results are rendered by JavaScript, so a plain HTTP request won't see them. Here we use the selenium library to drive a real browser and scrape this dynamic content; let me walk you through the program step by step.

from selenium import webdriver
import time
import xlwt

First, import the required libraries.

driver_path = r"C:\Users\*****\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)

Point Selenium at your local ChromeDriver executable.
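Note that executable_path only works on Selenium 3; it was deprecated in Selenium 4 and removed in later 4.x releases. If you are on Selenium 4, a minimal sketch of the equivalent setup (same placeholder path as above) looks like this:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Selenium 4 takes the driver path via a Service object instead of executable_path
service = Service(r"C:\Users\*****\chromedriver.exe")  # replace with your own path
driver = webdriver.Chrome(service=service)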

f = xlwt.Workbook(encoding="utf8")
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
sheet01.write(0, 0, '标题')  # write(row, col, value): first index is the row, second the column
sheet01.write(0, 1, '标价')
sheet01.write(0, 2, '购买人数')

Create the workbook, set its encoding, and write the header row (title, price, purchase count).

def parse_page(url, number):
    contents = []
    driver.get(url)
    for i in range(0, number):
        # every item card on the current results page
        divs = driver.find_elements_by_xpath("//div[@id='searchResult']/div[@id='ItemWrapper']/div[@class='item']")
        for div in divs:
            title = div.find_element_by_xpath(".//span[@class='title']").text
            price = div.find_element_by_xpath(".//span[@class='pricedetail']/strong").text
            # renamed from "number" so it no longer shadows the page-count parameter
            pay_num = div.find_element_by_xpath(".//span[@class='payNum']").text
            info = {
                'title': title,
                'price': price,
                'number': pay_num
            }
            contents.append(info)
        print('第%d页数据获取完成' % (i + 1))
        # .click() returns None, so there is nothing useful to assign here
        driver.find_element_by_xpath("//a[@class='page-next iconfont']").click()
        time.sleep(1)
    save_infos(contents)

parse_page loads the page, collects every item's title, price, and purchase count into a dict, clicks Taobao's next-page button to move through the result pages, and finally hands the accumulated list to the save function.
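The fixed time.sleep(1) is fragile: if the next page has not finished rendering, the next find_element call fails. A minimal sketch of an explicit wait for the same next-page button (assuming the page keeps the page-next iconfont class) would be:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# wait up to 10 seconds for the next-page link to become clickable;
# raises TimeoutException if it never does
next_btn = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//a[@class='page-next iconfont']"))
)
next_btn.click()

The complete code below already imports WebDriverWait, expected_conditions, and By, though it never uses them; this is the kind of wait they enable.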

def save_infos(contents):
    w = 0
    for content in contents:
        sheet01.write(w + 1, 0, content['title'])  # row w+1 leaves row 0 for the header
        sheet01.write(w + 1, 1, content['price'])
        sheet01.write(w + 1, 2, content['number'])
        w = w + 1
    f.save(r"C:\Users\*********结果.xls")

The save function extracts each field from the dicts and writes it into the .xls file, one row per item.
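xlwt can only produce the legacy .xls format. As an alternative (my substitution, not part of the original article), the same save step with openpyxl writes a modern .xlsx file:

from openpyxl import Workbook

def save_infos_xlsx(contents, path):
    wb = Workbook()
    ws = wb.active
    ws.append(['标题', '标价', '购买人数'])  # header row
    for content in contents:
        ws.append([content['title'], content['price'], content['number']])
    wb.save(path)  # e.g. save_infos_xlsx(contents, 'result.xlsx')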

def main():
    url = "https://s.taobao.com/search?q=%E7%94%B7%E8%A3%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180705&ie=utf8&bcoffset=6&ntoffset=6&p4ppushleft=1%2C48&s=0"
    number = 10  # number of pages to crawl
    parse_page(url, number)

if __name__ == '__main__':
    main()

The complete code:

from selenium import webdriver
import time
import xlwt
# imported for explicit waits; unused in this script as written
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

driver_path = r"C:\Users\**********\chromedriver.exe"
driver = webdriver.Chrome(executable_path=driver_path)

f = xlwt.Workbook(encoding="utf8")
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
sheet01.write(0, 0, '标题')  # write(row, col, value): first index is the row, second the column
sheet01.write(0, 1, '标价')
sheet01.write(0, 2, '购买人数')

def parse_page(url, number):
    contents = []
    driver.get(url)
    for i in range(0, number):
        # every item card on the current results page
        divs = driver.find_elements_by_xpath("//div[@id='searchResult']/div[@id='ItemWrapper']/div[@class='item']")
        print(len(divs))
        # .//div[@class='row row-2 title']/a/span[@class='baoyou-intitle icon-service-free']
        for div in divs:
            title = div.find_element_by_xpath(".//span[@class='title']").text
            price = div.find_element_by_xpath(".//span[@class='pricedetail']/strong").text
            # renamed from "number" so it no longer shadows the page-count parameter
            pay_num = div.find_element_by_xpath(".//span[@class='payNum']").text
            info = {
                'title': title,
                'price': price,
                'number': pay_num
            }
            print(info)
            contents.append(info)
        print('第%d页数据获取完成' % (i + 1))
        # .click() returns None, so there is nothing useful to assign here
        driver.find_element_by_xpath("//a[@class='page-next iconfont']").click()
        time.sleep(1)
    save_infos(contents)

def save_infos(contents):
    w = 0
    for content in contents:
        sheet01.write(w + 1, 0, content['title'])  # row w+1 leaves row 0 for the header
        sheet01.write(w + 1, 1, content['price'])
        sheet01.write(w + 1, 2, content['number'])
        w = w + 1
    f.save(r"C:\Users\*********结果.xls")

def main():
    # url = "https://s.taobao.com/search?q=%E7%94%B7%E8%A3%85&imgfile=&js=1&stats_click=search_radio_all%3A1&initiative_id=staobaoz_20180705&ie=utf8&bcoffset=6&ntoffset=6&p4ppushleft=1%2C48&s=0"
    url = "https://uland.taobao.com/sem/tbsearch?refpid=mm_26632258_3504122_32538762&keyword=%E5%A5%B3%E8%A3%85&clk1=44c369a534bf95506aa0a87518971645&upsid=44c369a534bf95506aa0a87518971645"
    number = 10  # number of pages to crawl
    parse_page(url, number)

if __name__ == '__main__':
    main()

Be sure to swap the file paths in the code for your own. Also note that Taobao recently changed this page: clicking the next-page button no longer navigates forward, so the pagination loop stalls there. The click logic itself is written correctly, and the pattern is still worth learning from.
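One more compatibility note (my addition, not from the original article): the find_element_by_xpath / find_elements_by_xpath helpers were deprecated in Selenium 4 and removed in later 4.x releases. On Selenium 4, each lookup takes the By-based form instead; a sketch of the item loop, assuming the driver from the script above:

from selenium.webdriver.common.by import By

divs = driver.find_elements(By.XPATH, "//div[@id='searchResult']/div[@id='ItemWrapper']/div[@class='item']")
for div in divs:
    title = div.find_element(By.XPATH, ".//span[@class='title']").text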
