赞
踩
Chromedriver所有版本下载链接:
https://chromedriver.storage.googleapis.com/index.html
注意版本对应关系,这里我用的是
版本 101.0.4951.41(正式版本) (64 位)
下载与浏览器版本对应的 ChromeDriver 即可;只需主版本号一致,最后一位不同也可正常使用。
# Build a Chrome WebDriver configured to evade basic bot detection and to
# skip image loading for faster scraping.
import copy  # FIX: was missing in this snippet; needed for deepcopy below

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
# Hide the "Chrome is being controlled by automated test software" infobar.
chrome_options.add_argument('disable-infobars')
chrome_options.add_argument('--headless')  # headless mode; more error-prone on some sites
# 2 = block images entirely, which speeds page loads up considerably.
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)
chrome_options.add_argument(
    'User-Agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"')
chrome_options.add_argument('--disable-gpu')  # documented workaround for a Chrome bug
# Strip the automation switches so navigator.webdriver-style checks are harder.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument('log-level=3')  # only log fatal errors
chrome_options.add_argument("--start-maximized")  # maximize the window
chrome_options.add_argument("--disable-cache")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-dev-shm-usage")  # avoid /dev/shm exhaustion in containers
chrome_options.add_argument("--disable-browser-side-navigation")
chrome_options.add_argument("--disable-software-rasterizer")
chrome_options.binary_location = r"~/yourpath/chrome.exe"  # path to the Chrome binary

# "none" hands control back immediately after the initial response, letting
# the script decide for itself how long to wait for dynamic content.
desired_capabilities = copy.deepcopy(DesiredCapabilities.CHROME)
desired_capabilities["pageLoadStrategy"] = "none"

# If Chrome misbehaves, point --user-data-dir at your own profile directory:
# chrome_options.add_argument(r'--user-data-dir=C:\Users\ADMINI~1\AppData\Local\Temp\scoped_dir11488_22853\Default')

# NOTE(review): chrome_options= / executable_path= / desired_capabilities= are
# the Selenium 3 API; Selenium 4 replaces them with options= and Service().
browser = webdriver.Chrome(
    chrome_options=chrome_options,
    executable_path=r"~/yourpath/chromedriver.exe",
    desired_capabilities=desired_capabilities,
)
# Pool of desktop user-agent strings; one is picked at random and pushed into
# the running browser via CDP so sessions do not share a single fingerprint.
import random  # FIX: was missing in this snippet

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

ua = random.choice(user_agent_list)
if ua:
    # Override the UA for the current session via the DevTools protocol;
    # works in headless mode too, unlike the command-line flag.
    browser.execute_cdp_cmd("Emulation.setUserAgentOverride", {"userAgent": ua})
其中stealth.min.js 文件可自行下载;
# Inject stealth.min.js before every document loads so headless-Chrome
# fingerprints (navigator.webdriver, plugins, languages, ...) are masked.
# FIX: the pasted snippet had broken indentation (the read was dedented out
# of the `with` block), which made it a SyntaxError as written.
remove_webdriver_feature_js = os.path.join(
    os.path.dirname(__file__), 'stealth.min.js')
with open(remove_webdriver_feature_js, encoding='utf-8') as fid:
    js = fid.read()
# Scripts registered this way run in every new document, before page JS.
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
对于有批量下载需求的,可以使用相应的代理服务进行下载配置
"""Price-comparison scraper for manmanbuy.com driven by Selenium Chrome.

Crawls category listing pages into per-category JSON-lines snapshots and,
optionally, follows each item's detail page to collect per-mall links.
"""
# FIX: the original imported copy, pdb, numpy, webdriver and
# DesiredCapabilities two or three times each; duplicates merged.
import copy
import json
import os
import pdb
import random
import re
import time
from urllib import parse

import numpy as np
import requests
import scrapy
import selenium
from bs4 import BeautifulSoup
from lxml import etree
from requests.exceptions import RequestException
from scrapy import Selector
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# --- Chrome configuration --------------------------------------------------
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('disable-infobars')  # hide the automation infobar
# chrome_options.add_argument('--headless')  # headless is more error-prone here
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)  # do not load images
chrome_options.add_argument(
    'User-Agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"')
chrome_options.add_argument('--disable-gpu')  # documented Chrome-bug workaround
# Strip the automation switches so navigator.webdriver-style checks are harder.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument('log-level=3')  # only log fatal errors
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-cache")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-browser-side-navigation")
chrome_options.add_argument("--disable-software-rasterizer")

# "none" hands control back right after the initial response; the script then
# decides for itself how long to wait for dynamic content.
desired_capabilities = copy.deepcopy(DesiredCapabilities.CHROME)
desired_capabilities["pageLoadStrategy"] = "none"

# If Chrome misbehaves, point --user-data-dir at your own profile directory:
# chrome_options.add_argument(r'--user-data-dir=C:\Users\ADMINI~1\AppData\Local\Temp\scoped_dir11488_22853\Default')

browser = webdriver.Chrome(
    chrome_options=chrome_options,
    executable_path=os.path.join(os.path.dirname(__file__), 'chromedriver.exe'),
    desired_capabilities=desired_capabilities,
)

# Inject stealth.min.js into every new document to mask automation fingerprints.
remove_webdriver_feature_js = os.path.join(
    os.path.dirname(__file__), 'stealth.min.js')
with open(remove_webdriver_feature_js) as fid:
    js = fid.read()
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})

wait = WebDriverWait(browser, 10)  # wait at most 10 s for elements to appear

# Pool of desktop user agents, rotated periodically in parse_detail_urls.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]


def scroll_down():
    """Scroll the page down six screens to trigger lazy-loaded content."""
    for _ in range(6):
        browser.execute_script('window.scrollBy(0, 2000)')
        time.sleep(0.5 * np.random.rand())  # jitter to look less robotic


def parse_page(subclass_url='http://www.manmanbuy.com/list_6.aspx',
               snap_filepath='tmp.json'):
    """Crawl every listing page of one category, appending one JSON record per
    item (platform count, detail url, picture url, title) to snap_filepath."""
    browser.get(subclass_url)
    with open(snap_filepath, 'w', encoding='utf-8') as fid:
        failure_cnt = 0
        num_pattern = re.compile(r'\d+')
        page_current_total_pattern = re.compile(
            r'(?P<current_page>\d+)/(?P<total_page>\d+)')
        while True:
            scroll_down()
            try:
                wait.until(EC.presence_of_all_elements_located((By.ID, "dispage")))
            except Exception:
                browser.delete_all_cookies()
                failure_cnt += 1
                # BUG FIX: the original checked failure_cnt only *after* the
                # except branch's `continue`, so a persistently failing page
                # retried forever; give up after 5 consecutive failures.
                if failure_cnt > 5:
                    break
                continue
            failure_cnt = 0
            content = BeautifulSoup(browser.page_source, 'lxml')
            for item in content.find_all(attrs={'class': 'item'}):
                platform_num = num_pattern.search(
                    item.find('div', attrs={'class': 'btns'}).a.text).group()
                pic_node = item.find('div', attrs={'class': 'pic'})
                tmp_dict = {
                    'platform_num': platform_num,
                    'url': "http://www.manmanbuy.com/" + pic_node.a['href'],
                    'pic_url': pic_node.a.img['original'],
                    'title': item.find('div', attrs={'class': 'name'}).a.text,
                }
                fid.write('{}\n'.format(json.dumps(tmp_dict, ensure_ascii=False)))
            # Locate the "next page" link inside the pager element.
            disp_item = content.find(attrs={'id': 'dispage'})
            next_url = None
            for page_item in disp_item.find_all('a'):
                if page_item.text == '下一页':
                    next_url = 'http://www.manmanbuy.com/' + '{}'.format(page_item['href'])
            page_status = page_current_total_pattern.search(disp_item.text)
            if int(page_status['current_page']) % 10 == 0:
                fid.flush()  # persist progress every 10 pages
            if int(page_status['current_page']) < int(page_status['total_page']):
                print("第", page_status['current_page'],
                      "页数据爬取完毕,共", page_status['total_page'], "页")
                browser.get(next_url)
                time.sleep(3 * np.random.rand())
            else:
                break


def parse_detail_urls(json_path=r'洗衣机.json'):
    """For each item snapshotted by parse_page, open its detail page and write
    the per-mall comparison links to <json_path stem>_lst.json."""
    result_json_path = os.path.splitext(json_path)[0] + "_lst.json"
    with open(json_path, 'r', encoding='utf-8') as fid:
        json_lines = fid.readlines()
    with open(result_json_path, 'w', encoding='utf-8') as fid:
        for cnt, line in enumerate(json_lines):
            if cnt % 20 == 0:
                # Rotate the user agent and drop cookies every 20 pages.
                ua = random.choice(user_agent_list)
                if ua:
                    browser.execute_cdp_cmd("Emulation.setUserAgentOverride",
                                            {"userAgent": ua})
                browser.delete_all_cookies()
            json_info = json.loads(line)
            browser.get(json_info['url'])
            time.sleep(3)
            try:
                scroll_down()
                content = BeautifulSoup(browser.page_source, 'lxml')
                pro_mall_nodes = content.find_all(
                    'div', attrs={'class': 'pro-mall-list'})
                if not pro_mall_nodes:
                    continue
                result_mall_pro_list = []
                for pro_mall in pro_mall_nodes[0].find_all('li'):
                    if pro_mall.div is None:
                        continue
                    # SECURITY: eval() on scraped markup executes arbitrary
                    # code; prefer json.loads/ast.literal_eval if the payload
                    # format allows it.
                    pro_head_info = eval(pro_mall.div['v'])
                    if pro_head_info is None:
                        continue
                    redirect_url = pro_mall.find('a')['href']
                    if not redirect_url.startswith('http'):
                        redirect_url = 'http://www.manmanbuy.com/' + redirect_url
                    result_mall_pro_list.append({
                        'sitename': pro_head_info['sitename'],
                        'skuid': pro_mall.div['skuid'],
                        'redirect_url': redirect_url,
                    })
                json_info['mall_list'] = result_mall_pro_list
                fid.write('{}\n'.format(json.dumps(json_info, ensure_ascii=False)))
                fid.flush()
            except Exception as err:
                # Best-effort: log and move on to the next item.
                print('Error happends: {}'.format(err))


def get_start_urls():
    """Collect {category name: listing url} from the price-comparison home page."""
    browser.get('http://home.manmanbuy.com/bijia.aspx')
    time.sleep(6)  # pageLoadStrategy is "none": give the page time to render
    html = browser.page_source
    response = HtmlResponse(url=browser.current_url, body=html, encoding='utf-8')
    # FIX: raw string — the original "\." escapes were invalid escape sequences.
    pattern = r"http://www\.manmanbuy\.com/list_.*\.aspx"
    link_extractor = LinkExtractor(allow=pattern)
    links = link_extractor.extract_links(response)
    return {i.text: i.url for i in links}


if __name__ == '__main__':
    detail_urls = get_start_urls()
    # FIX: removed a leftover pdb.set_trace() that halted every run here.
    for typeflag, url in detail_urls.items():
        parse_page(subclass_url=url, snap_filepath='{}.json'.format(typeflag))
    # for _json_path in ['洗衣机.json']:
    #     parse_detail_urls(json_path=_json_path)
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。