赞
踩
有些网页的反爬虫做得比较好,链接被写成 href="javascript:void(0);" 或 href="javascript:;",含义是留在原处不跳转
此时无法直接从href中获取链接,链接直接写进监听事件里,从.js文件中也无从(难以)获取
使用Selenium模拟用户点击网页进行爬取
注意!!每次调用driver之前建议调用time.sleep(),因为程序运行速度远远比浏览器操作快
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait(locator, timeout=10):
    """Block until an element matching *locator* is present in the DOM.

    Uses the module-level ``driver``.

    :param locator: ``(By.<strategy>, value)`` tuple identifying the element
    :param timeout: maximum number of seconds to wait
    """
    WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))


# Pause after each simulated click so the browser can start navigating.
# The script runs much faster than the browser; increase on a slow network.
sleep_second = 0.01

driver = webdriver.Chrome()
link = []
try:
    driver.get('某链接')
    # find_elements(*locator) is the locator-based replacement for the
    # removed find_elements_by_class_name helper.
    locator = (By.CLASS_NAME, '要爬取的类')
    wait(locator)
    linkNum = len(driver.find_elements(*locator))
    for i in range(linkNum):
        wait(locator)
        # Re-query on every iteration: references obtained before the page
        # navigated away would raise StaleElementReferenceException.
        elements = driver.find_elements(*locator)
        if i >= len(elements):
            # The page now shows fewer matches than on first load.
            break
        # Click through JavaScript, simulating a user click on the element.
        driver.execute_script('arguments[0].click();', elements[i])
        time.sleep(sleep_second)
        print(i, driver.current_url)
        link.append(driver.current_url)
        # Leave time for the page before going back; increase on a slow
        # network (the original notes driver.implicitly_wait() has no
        # effect here).
        time.sleep(0.01)
        driver.back()
finally:
    # Always shut the browser down, even if the crawl fails midway.
    driver.quit()
print('共{}条链接'.format(len(link)))
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def get_javascript0_links(url, class_name, sleep_second=0.01):
    """Collect target URLs by simulating clicks with Selenium.

    Intended for pages whose anchors use ``href="javascript:void(0);"`` so
    the destination cannot be read from the ``href`` attribute. Each element
    with the given class is clicked in a headless Chrome session, the
    resulting ``current_url`` is recorded, and the browser navigates back.

    :param url: page to crawl
    :param class_name: CSS class of the elements to click
    :param sleep_second: seconds to pause after each click and again before
        navigating back; increase on a slow network
    :return: list of URLs reached by clicking each matching element
    """
    locator = (By.CLASS_NAME, class_name)

    def wait(timeout=10):
        """Block until an element matching ``locator`` is present in the DOM."""
        WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))

    options = Options()
    options.add_argument("--headless")  # run without a visible browser window
    # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
    driver = webdriver.Chrome(options=options)
    link = []
    try:
        driver.get(url)
        wait()
        # find_elements(*locator) is the locator-based replacement for the
        # removed find_elements_by_class_name helper.
        link_num = len(driver.find_elements(*locator))
        for i in range(link_num):
            wait()
            # Re-query on every iteration: references obtained before the
            # page navigated away would be stale.
            elements = driver.find_elements(*locator)
            if i >= len(elements):
                # The page now shows fewer matches than on first load.
                break
            driver.execute_script("arguments[0].click();", elements[i])
            time.sleep(sleep_second)
            link.append(driver.current_url)
            time.sleep(sleep_second)
            driver.back()
    finally:
        # Always shut the browser down, even if the crawl fails midway.
        driver.quit()
    return link


if __name__ == "__main__":
    url = "目标页面"
    class_name = "模拟点击的类"
    link = get_javascript0_links(url, class_name)
    for i, _link in enumerate(link):
        print(i, _link)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。