赞
踩
项目简介:
spider_taobao
以下是部分代码,博主没有对数据进行处理
# Scrape Taobao search results with Selenium and print every product found.
import time
from urllib.parse import quote

from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# NOTE(review): the wildcard import is kept on purpose. This script reads
# KEYWORD and MAX_PAGE from it, and the credential names used in login() are
# presumably defined there as well -- confirm, then import them explicitly.
from config import *  # noqa: F401,F403

LOGIN_URL = "https://login.taobao.com/member/login.jhtml?redirectURL=https%3a%2f%2fs.taobao.com:443/search%2F_____tmd_____%2Fpage%2Flogin_jump%3Frand%3DS3WxGHAgAt756EpznwfNzJq2AFA2qBNla3j6EINUS8We9dazM_iKElp8DwVSHZUevpC41Bx7RzivXIj9RnZgdg%26_lgt_%3D56187c2d2a96aec483e3f50a4baeeaa1___215918___bad338c181575046793221abbcea4855___eaebc79cac1eb5d2f7d8b4595e00ec73344a42d5a0b8cf56539c823cd24ac06c6b213ffd502da27c5771922daeb449eaeab84d3310934d804dd027a8d75c0275cdb3d5eb3b5d25381a7341f118cdf5120eb265e1c82cd48343995861cd625f2cad31a31f10b4650a1ef2ca0e10e585fa7e062c8e4c21cea1d0aa21ff1c6f4bd9254facd14bb5207b2f4873ebc10a73c154ca108f5af14608caea993432e6050d53b8568a3b95049b3641155db964afbdbd1ca290b8dcdb475a166232a82573f8f74e9970c1432e542a05d1f12eed775d&uuid=56187c2d2a96aec483e3f50a4baeeaa1"

# Class name carried by each product card on the search result page.
CARD_CLASS = 'Card--doubleCardWrapper--L2XFE73'

chrome_options = webdriver.ChromeOptions()
# Enable the next line to run Chrome without a visible window.
# chrome_options.add_argument('--headless')
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
# Pass the options by keyword: the first positional parameter of
# webdriver.Chrome is not the options object on every Selenium release.
browser = webdriver.Chrome(options=chrome_options)
# Maximum time (seconds) every explicit wait below may block.
wait = WebDriverWait(browser, 30)


def _drag_slider_if_present():
    """Drag the slider captcha to the right when the login page shows one.

    Best effort: a missing slider is the normal case, and a failed drag is
    reported but does not abort the login attempt.
    """
    try:
        slider = browser.find_element(
            By.XPATH, "//span[contains(@class, 'nc_iconfont btn_slide')]")
    except NoSuchElementException:
        return
    try:
        if slider.is_displayed():
            (ActionChains(browser)
             .click_and_hold(on_element=slider)
             .move_by_offset(xoffset=258, yoffset=0)
             .pause(0.5)
             .release()
             .perform())
    except WebDriverException as exc:
        print(f"slider captcha could not be dragged: {exc}")


def login():
    """Open the Taobao login page, fill in the credentials and submit.

    The names ``账号`` (account) and ``密码`` (password) are not defined in
    this file; they must be in scope before this runs (presumably provided
    by ``config`` -- TODO confirm).
    """
    browser.get(LOGIN_URL)
    login_id = wait.until(
        EC.presence_of_element_located((By.ID, "fm-login-id")))
    login_password = wait.until(
        EC.presence_of_element_located((By.ID, "fm-login-password")))
    # Taobao account name and password.
    login_id.send_keys(账号)
    login_password.send_keys(密码)
    time.sleep(2)
    # A slider captcha may appear once the credentials are typed.
    _drag_slider_if_present()
    # Click the login button once it can actually receive the click.
    button = wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'password-login')))
    button.click()


def index_page(page, max_retries=3):
    """Load result page ``page`` for KEYWORD and extract its products.

    :param page: 1-based page number to scrape.
    :param max_retries: how many times to attempt the page when a wait times
        out, instead of retrying without bound.
    :return: list of product dicts, or an empty list when every attempt
        timed out.
    """
    for attempt in range(1, max_retries + 1):
        print(f"正在爬取第{page}页")
        try:
            url = 'https://s.taobao.com/search?q=' + quote(KEYWORD)
            print(quote(KEYWORD))
            browser.get(url)
            if page > 1:
                # Page-number input box of the pagination bar.
                page_input = wait.until(EC.presence_of_element_located(
                    (By.CSS_SELECTOR, 'div > span > input')))
                # The jump button must be visible and enabled to be clicked.
                submit_button = wait.until(EC.element_to_be_clickable(
                    (By.CLASS_NAME, 'next-pagination-jump-go')))
                page_input.clear()
                page_input.send_keys(str(page))
                submit_button.click()
            # Wait until the highlighted pagination item shows the requested
            # page number; its mere presence is already true on the old page
            # and would let stale results be scraped.
            # NOTE(review): assumes the 'next-current' element's text is the
            # page number -- confirm against the live page.
            wait.until(EC.text_to_be_present_in_element(
                (By.CLASS_NAME, 'next-current'), str(page)))
            wait.until(EC.presence_of_all_elements_located(
                (By.CLASS_NAME, CARD_CLASS)))
            return get_products()
        except TimeoutException:
            print(f"timed out on page {page} "
                  f"(attempt {attempt}/{max_retries})")
    print(f"giving up on page {page} after {max_retries} attempts")
    return []


def get_products():
    """Parse the current page source and print every product card.

    :return: list of the product dicts that were printed.
    """
    doc = pq(browser.page_source)
    products = []
    for item in doc('.' + CARD_CLASS).items():
        product = {
            'title': item.find('.Title--title--jCOPvpf').text(),
            'image': item.find('.MainPic--mainPic--rcLNaCv').attr('src'),
            'price': item.find('.Price--priceInt--ZlsSi_M').text(),
            'shoping': item.find('.Price--realSales--FhTZc7U').text(),
            'shop': item.find('.ShopInfo--shopName--rg6mGmy').text(),
            # ``item`` is the card element itself and find() only searches
            # descendants, so read the link from the card first and keep the
            # descendant lookup as a fallback.
            'location': (item.attr('href')
                         or item.find('.' + CARD_CLASS).attr('href')),
        }
        print(product)
        products.append(product)
    return products


def main():
    """Log in, then scrape pages 1..MAX_PAGE, pausing between pages."""
    try:
        login()
        for page in range(1, MAX_PAGE + 1):
            index_page(page)
            time.sleep(20)
    finally:
        # Always shut the browser down, even when scraping fails.
        browser.quit()


if __name__ == '__main__':
    main()
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。