当前位置:   article > 正文

使用selenium爬取淘宝商品数据(包含登录功能)_爬取淘宝数据要登入

爬取淘宝数据要登入

项目简介:

spider_taobao

以下是部分代码(博主只把抓取到的商品数据打印出来,尚未做进一步处理):

import time
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
from selenium.webdriver.common.action_chains import ActionChains
from config import *
from urllib.parse import quote

# Build the Chrome options used for the whole scraping session.
chrome_options = webdriver.ChromeOptions()
# Uncomment the next line to run Chrome without a visible window.
# chrome_options.add_argument('--headless')
# Drop Chrome's "enable-automation" switch (presumably so the session looks
# less like an automated one to the site — confirm this is still effective).
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
# Pass the options by keyword: in Selenium releases before 4.10 the first
# positional parameter of webdriver.Chrome is the driver executable path, so a
# positional ChromeOptions object would be misinterpreted there.
browser = webdriver.Chrome(options=chrome_options)

# Shared explicit wait: every wait.until(...) call gives up after 30 seconds.
wait = WebDriverWait(browser, 30)

def login():
    """
    Open the Taobao login page, type the credentials and submit the form.

    Uses the module-level ``browser`` and ``wait`` objects. If a slider
    captcha is visible after the credentials have been typed, a single drag
    of the slider is attempted (best effort) before the login button is
    clicked.

    :return: None
    :raises TimeoutException: if the login fields or the login button do not
        become available within the wait timeout.
    """
    # Imported locally: the file's import block only brings in TimeoutException.
    from selenium.common.exceptions import WebDriverException

    url = "https://login.taobao.com/member/login.jhtml?redirectURL=https%3a%2f%2fs.taobao.com:443/search%2F_____tmd_____%2Fpage%2Flogin_jump%3Frand%3DS3WxGHAgAt756EpznwfNzJq2AFA2qBNla3j6EINUS8We9dazM_iKElp8DwVSHZUevpC41Bx7RzivXIj9RnZgdg%26_lgt_%3D56187c2d2a96aec483e3f50a4baeeaa1___215918___bad338c181575046793221abbcea4855___eaebc79cac1eb5d2f7d8b4595e00ec73344a42d5a0b8cf56539c823cd24ac06c6b213ffd502da27c5771922daeb449eaeab84d3310934d804dd027a8d75c0275cdb3d5eb3b5d25381a7341f118cdf5120eb265e1c82cd48343995861cd625f2cad31a31f10b4650a1ef2ca0e10e585fa7e062c8e4c21cea1d0aa21ff1c6f4bd9254facd14bb5207b2f4873ebc10a73c154ca108f5af14608caea993432e6050d53b8568a3b95049b3641155db964afbdbd1ca290b8dcdb475a166232a82573f8f74e9970c1432e542a05d1f12eed775d&uuid=56187c2d2a96aec483e3f50a4baeeaa1"
    browser.get(url)
    login_id = wait.until(EC.presence_of_element_located((By.ID, "fm-login-id")))
    login_password = wait.until(EC.presence_of_element_located((By.ID, "fm-login-password")))
    # Taobao account name and password.
    # NOTE(review): 账号 / 密码 are not defined in this block — presumably they
    # are provided by `from config import *`; confirm before running.
    login_id.send_keys(账号)
    login_password.send_keys(密码)

    # Give the page a moment to decide whether to show the slider captcha.
    time.sleep(2)
    # Check whether a slider captcha appeared. find_elements() returns an empty
    # list when nothing matches, so "no captcha" needs no exception handling;
    # it also replaces find_element_by_xpath, which Selenium 4.3+ removed.
    try:
        sliders = browser.find_elements(
            By.XPATH, "//span[contains(@class, 'nc_iconfont btn_slide')]")
        if sliders and sliders[0].is_displayed():
            slider = sliders[0]
            ActionChains(browser).click_and_hold(on_element=slider).perform()
            ActionChains(browser).move_by_offset(xoffset=258, yoffset=0).perform()
            ActionChains(browser).pause(0.5).release().perform()
    except WebDriverException as exc:
        # Dragging the slider stays best effort, but the failure is reported
        # instead of being silently swallowed.
        print(f"Slider captcha handling failed, continuing with login: {exc}")

    # Click the login button once it is actually clickable.
    button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, 'password-login')))
    button.click()

# Scrape the data of one result page.
def index_page(page, max_retries=3):
    """
    Load the search results for KEYWORD, jump to ``page`` and extract products.

    The search URL is opened with the module-level ``browser``; for pages
    after the first, the page number is typed into the pagination input and
    the jump button is clicked. Once the results are present,
    ``get_products()`` is called.

    :param page: 1-based page number to scrape.
    :param max_retries: how many times to attempt loading the page when a
        wait times out; after that many timeouts the page is skipped.
    :return: None
    """
    print(f"正在爬取第{page}页")
    encoded_keyword = quote(KEYWORD)
    print(encoded_keyword)
    url = 'https://s.taobao.com/search?q=' + encoded_keyword

    # Bounded retry loop: retrying via unbounded recursion would never stop
    # if the page keeps timing out.
    for attempt in range(1, max_retries + 1):
        try:
            browser.get(url)
            if page > 1:
                # Wait until the page-number input exists.
                page_input = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div > span > input')))
                # Wait until the jump button is visible and enabled so it can be clicked.
                submit_button = wait.until(EC.element_to_be_clickable(
                    (By.CLASS_NAME, 'next-pagination-jump-go')))
                # Clear whatever is in the page-number input.
                page_input.clear()
                # Type the target page number.
                page_input.send_keys(str(page))
                # Click to jump to that page.
                submit_button.click()
            # Wait until the highlighted pager entry shows the requested page
            # number; merely waiting for the element to exist would succeed
            # before the jump has happened.
            # NOTE(review): assumes the element with class "next-current"
            # displays the current page number — confirm against the live page.
            wait.until(EC.text_to_be_present_in_element(
                (By.CLASS_NAME, 'next-current'), str(page)))
            wait.until(EC.presence_of_all_elements_located(
                (By.CLASS_NAME, 'Card--doubleCardWrapper--L2XFE73')))
        except TimeoutException:
            print(f"Timed out loading page {page} (attempt {attempt}/{max_retries})")
            continue
        get_products()
        return
    print(f"Giving up on page {page} after {max_retries} attempts")

def get_products():
    """
    Extract the product cards from the page currently loaded in the browser.

    Parses ``browser.page_source`` with pyquery, builds one dict per element
    matching ``.Card--doubleCardWrapper--L2XFE73`` and prints each dict.

    :return: list of the product dicts, each with the keys ``title``,
        ``image``, ``price``, ``shoping``, ``shop`` and ``location``.
    """
    html = browser.page_source
    doc = pq(html)
    products = []
    for item in doc('.Card--doubleCardWrapper--L2XFE73').items():
        product = {
            'title': item.find('.Title--title--jCOPvpf').text(),
            'image': item.find('.MainPic--mainPic--rcLNaCv').attr('src'),
            'price': item.find('.Price--priceInt--ZlsSi_M').text(),
            'shoping': item.find('.Price--realSales--FhTZc7U').text(),
            'shop': item.find('.ShopInfo--shopName--rg6mGmy').text(),
            # `item` is itself the card-wrapper element and find() only
            # searches descendants, so the href is read from the item directly.
            'location': item.attr('href'),
        }
        print(product)
        products.append(product)
    return products
def main():
    """
    Log in, then scrape pages 1 through MAX_PAGE one after another.

    The browser is always closed when the run ends, whether it finishes
    normally or an exception propagates.

    :return: None
    """
    try:
        login()
        for i in range(1, MAX_PAGE + 1):
            index_page(i)
            # Pause between pages; nothing follows the last page, so no
            # pause is needed there.
            if i < MAX_PAGE:
                time.sleep(20)
    finally:
        # Release the browser window and its driver process.
        browser.quit()

# Run the scraper only when this file is executed directly, not when imported.
if __name__ == '__main__':
    main()
  • 1
  • 2
  • 3
  • 4
  • 5
  • 6
  • 7
  • 8
  • 9
  • 10
  • 11
  • 12
  • 13
  • 14
  • 15
  • 16
  • 17
  • 18
  • 19
  • 20
  • 21
  • 22
  • 23
  • 24
  • 25
  • 26
  • 27
  • 28
  • 29
  • 30
  • 31
  • 32
  • 33
  • 34
  • 35
  • 36
  • 37
  • 38
  • 39
  • 40
  • 41
  • 42
  • 43
  • 44
  • 45
  • 46
  • 47
  • 48
  • 49
  • 50
  • 51
  • 52
  • 53
  • 54
  • 55
  • 56
  • 57
  • 58
  • 59
  • 60
  • 61
  • 62
  • 63
  • 64
  • 65
  • 66
  • 67
  • 68
  • 69
  • 70
  • 71
  • 72
  • 73
  • 74
  • 75
  • 76
  • 77
  • 78
  • 79
  • 80
  • 81
  • 82
  • 83
  • 84
  • 85
  • 86
  • 87
  • 88
  • 89
  • 90
  • 91
  • 92
  • 93
  • 94
  • 95
  • 96
  • 97
  • 98
  • 99
声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/神奇cpp/article/detail/970109
推荐阅读
相关标签
  

闽ICP备14008679号