当前位置:   article > 正文

爬取淘宝商品信息selenium+pyquery+mongodb_pyspider+mongodb爬取淘宝商品信息

selenium+pyquery+mongodb爬取淘宝商品信息
  1. '''
  2. 爬取淘宝商品信息,通过selenium获得渲染后的源码,pyquery解析,mongodb存储
  3. '''
  4. from selenium import webdriver
  5. from selenium.webdriver.common.by import By
  6. from selenium.webdriver.support import expected_conditions as EC
  7. from selenium.common.exceptions import TimeoutException
  8. from selenium.webdriver.support.wait import WebDriverWait
  9. from urllib.parse import quote
  10. from pyquery import PyQuery as pq
  11. import pymongo
  12. BASEURL = 'https://s.taobao.com/search?q='
  13. KEYWORD = 'python'
  14. driver = webdriver.Chrome()
  15. wait = WebDriverWait(driver, 10)
  16. client = pymongo.MongoClient('mongodb://admin:admin123@localhost:27017/')
  17. db = client.taobao
  18. collection = db.products
  19. def get_page(page):
  20. ```
  21. 跳转到传入页面,获得源码,调用商品解析函数
  22. ```
  23. #driver = webdriver.Chrome()
  24. #wait = WebDriverWait(driver, 10)
  25. try:
  26. driver.get(BASEURL + quote(KEYWORD))
  27. print('你当前访问的是第%d页' % page)
  28. if page > 1:
  29. J_input = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#mainsrp-pager div.form > input' )))
  30. J_submit = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#mainsrp-pager div.form > span.btn.J_Submit')))
  31. J_input.clear()
  32. J_input.send_keys(page)
  33. J_submit.click()
  34. wait.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR,
  35. '#mainsrp-pager li.item.active > span'), str(page)))
  36. wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.m-itemlist .items .item')))
  37. html = driver.page_source
  38. get_products(html)
  39. except TimeoutException:
  40. print('try again')
  41. get_page(page)
  42. def get_products(html):
  43. '''
  44. 解析出每件商品信息,调用存储函数存储
  45. '''
  46. doc = pq(html)
  47. items = doc('#mainsrp-itemlist .items .item').items()
  48. for item in items:
  49. product = {}
  50. product['image'] = item.find('.img').attr('src')
  51. product['price'] = item.find('.price').text()
  52. product['payment'] = item.find('.deal-cnt').text()
  53. product['title'] = item.find('.title').text()
  54. product['location'] = item.find('.location').text()
  55. product['shop'] = item.find('.shopname').text()
  56. product['shop-link'] = item.find('.shopname').attr('href')
  57. print(product)
  58. save_to_mongo(product)
  59. def save_to_mongo(product):
  60. ```
  61. 存储函数,将商品信息存入数据库
  62. ```
  63. try:
  64. if collection.insert(product):
  65. print('存储成功')
  66. except Exception as e:
  67. print('失败',e.__class__)
  68. if __name__ == '__main__':
  69. for i in range(1, 3):
  70. get_page(i)
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/小丑西瓜9/article/detail/159538
推荐阅读
相关标签
  

闽ICP备14008679号