赞
踩
现在有新方法可以搞定这个模拟登陆了,不会被检测出来,可以完美的绕过对window.navigator.webdriver的检测,pyppeteer是个好东西!
需要用到的python包:asyncio、pyppeteer
友情提醒一下,第一次运行pyppeteer的会下载chromium,速度很慢慢慢慢,还有可能失败。务必耐心等待!!!然后,这个pyppeteer对网速和电脑运行速度还有一定的要求
下载提示信息(不显示下载进度可以kill掉再run):
[W:pyppeteer.chromium_downloader] start chromium download.
Download may take a few minutes.
如果在centos上使用,需要安装下面的依赖
yum install pango.x86_64 libXcomposite.x86_64 libXcursor.x86_64 libXdamage.x86_64 libXext.x86_64 libXi.x86_64 libXtst.x86_64 cups-libs.x86_64 libXScrnSaver.x86_64 libXrandr.x86_64 GConf2.x86_64 alsa-lib.x86_64 atk.x86_64 gtk3.x86_64 -y
browser = await launch();//启动浏览器实例
page = await browser.newPage();//打开一个空白页
await page.goto('https://example.com');//在地址栏输入网址并等待加载
await page.screenshot({path: 'example.png'});//截个图
await page.setUserAgent(
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299')
await page.goto('https://www.baidu.com')
await page.type('#kw', keyword, {'delay': 100}) 获取输入框焦点并逐字输入文字,delay 为每个字符之间的输入间隔(毫秒),也可用随机值如 {'delay': input_time_random() - 50} 模拟人工输入
await page.click('#su')
await page.waitForNavigation({
waitUntil: 'load'});//等待页面加载出来,等同于window.onload
await browser.close();//关掉浏览器
content = await page.content()
cookies = await page.cookies()
await page.evaluate(js1) //在网页中执行js代码
await page.keyboard.press 模拟键盘按下某个按键,目前mac上组合键无效为已知bug
page.waitFor 页面等待,可以是时间、某个元素、某个函数
page.url 获得当前访问的url
await get_cookie(page)
page.frames() 获取当前页面所有的 iframe,然后根据 iframe 的名字精确获取某个想要的 iframe
iframe.$('.srchsongst') 获取 iframe 中的某个元素
iframe.evaluate() 在浏览器中执行函数,相当于在控制台中执行函数,返回一个 Promise
Array.from 将类数组对象转化为数组
page.click() 点击一个元素
iframe.$eval() 相当于在 iframe 中运行 document.querySelector 获取指定元素,并将其作为第一个参数传递
iframe.$$eval 相当于在 iframe 中运行 document.querySelectorAll 获取指定元素数组,并将其作为第一个参数传递
#await page.waitFor(10000)
url_elements = await page.querySelectorAll('.result h3 a')
for url_element in url_elements:
url = await page.evaluate('(url_element) => url_element.href', url_element)
url_queue.put(url)
爬取网页代码
import asyncio
import pyppeteer
import os
os.environ['PYPPETEER_CHROMIUM_REVISION'] ='588429'  # pin the chromium build pyppeteer downloads/launches
pyppeteer.DEBUG = True  # enable pyppeteer's verbose debug logging
async def main():
    """Fetch http://www.baidu.com with headless chromium.

    Returns:
        dict with keys:
            'content' - the rendered page HTML (str).
            'cookies' - the cookies reported by the browser (list of dicts).
    """
    print("in main ")
    print(os.environ.get('PYPPETEER_CHROMIUM_REVISION'))
    browser = await pyppeteer.launch()
    try:
        page = await browser.newPage()
        await page.goto('http://www.baidu.com')
        content = await page.content()
        cookies = await page.cookies()
        # await page.screenshot({'path': 'example.png'})
        return {'content': content, 'cookies': cookies}
    finally:
        # Fix: always release the chromium process, even when navigation
        # or content extraction raises; the original leaked the browser
        # on any exception before browser.close().
        await browser.close()
# Drive the scraping coroutine to completion on the default event loop
# and print the resulting {'content': ..., 'cookies': ...} payload.
loop = asyncio.get_event_loop()
result = loop.run_until_complete(main())
print(result)
与scrapy的整合
加入downloadmiddleware
from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
import random
import pyppeteer
import asyncio
import os
from scrapy.http import HtmlResponse
pyppeteer.DEBUG = False  # silence pyppeteer's debug logging in the middleware module
class FundscrapyDownloaderMiddleware(object):
    """Scrapy downloader middleware that renders pages with pyppeteer.

    A single headless-chromium browser and a single page (tab) are started
    eagerly in __init__ and reused for every request, so all requests are
    serialized through that one tab.

    NOTE(review): sharing one page across requests is not safe if Scrapy
    issues concurrent requests (CONCURRENT_REQUESTS > 1) — navigations can
    interleave. If that matters, open a fresh tab per request via
    getnewpage() inside usePypuppeteer — TODO confirm the required
    concurrency level.
    """

    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        print("Init downloaderMiddleware use pyppeteer.")
        # Pin the chromium build pyppeteer downloads/launches.
        os.environ['PYPPETEER_CHROMIUM_REVISION'] = '588429'
        # pyppeteer.DEBUG = False
        print(os.environ.get('PYPPETEER_CHROMIUM_REVISION'))
        # Launch the browser synchronously so the middleware is fully
        # initialized before the first request arrives.
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.getbrowser())
        print(self.browser)
        print(self.page)

    async def getbrowser(self):
        """Start headless chromium and open the single shared tab."""
        self.browser = await pyppeteer.launch()
        self.page = await self.browser.newPage()

    async def getnewpage(self):
        """Open and return a fresh tab in the shared browser."""
        return await self.browser.newPage()

    async def closebrowser(self):
        """Shut down the shared chromium process."""
        await self.browser.close()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        # Fix: close the browser when the spider finishes; the original
        # never closed it and leaked the chromium process.
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_request(self, request, spider):
        """Render request.url in the browser and return its HTML.

        Returning an HtmlResponse here short-circuits Scrapy's normal
        downloader for this request.
        """
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        loop = asyncio.get_event_loop()
        task = asyncio.ensure_future(self.usePypuppeteer(request))
        loop.run_until_complete(task)
        return HtmlResponse(url=request.url, body=task.result(),
                            encoding="utf-8", request=request)

    async def usePypuppeteer(self, request):
        """Navigate the shared tab to request.url and return the page HTML."""
        print(request.url)
        # page = await self.browser.newPage()
        await self.page.goto(request.url)
        content = await self.page.content()
        return content

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.
        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.
        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

    def spider_closed(self, spider):
        # Release the headless browser when the crawl ends.
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self.closebrowser())
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。