赞
踩
我使用 Selenium 简单地爬取搜索结果的 URL,这对自动化漏洞测试应该有用。我本想使用谷歌搜索,奈何没钱买代理;Google 搜索语法感觉比百度语法有用多了。
代码
- # -*- coding: utf-8 -*-
- """
- Created on Sat May 2 15:17:58 2020
- @author: 14504
- """
-
-
- from selenium import webdriver
- from selenium.common.exceptions import TimeoutException
- from selenium.webdriver.support.wait import WebDriverWait
- from urllib.parse import quote
- from pyquery import PyQuery as pq
- import requests
- import time
-
# File that scraped target URLs are appended to.
url_save_path="./url.txt"
# Baidu search query; uses Google-style "inurl:" dork syntax.
SearchInformation="inurl: (admin)"
starPage=1 # first results page to scrape (1-based)
endPage=1  # last results page to scrape (inclusive)

# Run Chrome without a visible window.
options = webdriver.ChromeOptions()
options.add_argument('--headless')
browser = webdriver.Chrome(options=options)

# NOTE(review): `wait` is created but never used anywhere below.
wait= WebDriverWait(browser,10)
# Browser-like headers for the redirect-resolving requests.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:74.0) Gecko/20100101 Firefox/74.0'
}
-
def searchURL(page):
    """Load one Baidu results page for the global query and scrape it.

    page: result offset passed as Baidu's ``pn`` query parameter
          (page_index * 10, i.e. 10 results per page).
    Returns the number of URLs extracted from the page; returns 0 on a
    timeout (the original returned None here, which made main()'s
    ``urlnum+urlsum`` raise TypeError).
    """
    page_param = "&pn=" + str(page)
    url = "https://www.baidu.com/s?wd=" + quote(SearchInformation) + page_param
    try:
        browser.get(url)
        return geturl()
    except TimeoutException:
        print("请求超时")
        return 0
-
def geturl():
    """Parse the currently loaded Baidu results page.

    For every organic result, resolve the Baidu redirect link to its real
    target URL and append it to the output file.
    Returns the number of URLs successfully decoded and saved.
    """
    doc = pq(browser.page_source)
    saved = 0
    for result in doc('div#content_left .result.c-container').items():
        redirect_link = result.children('div.f13 > a').attr('href')
        target = urlDecode(redirect_link)
        if target == "":
            print("none")
        else:
            saveTotxt(target)
            saved += 1
    print("这一页成功爬取了" + str(saved) + "个\n")
    return saved
-
# Resolve Baidu's redirect URLs to the real target.
def urlDecode(BDurl):
    """Follow a Baidu result link one hop to find the real target URL.

    Baidu result anchors point at a redirector; request it WITHOUT
    following redirects and read the target from the ``Location`` header.
    Returns "" when the link is missing/malformed or the request fails.
    """
    try:
        # headers/timeout added: the module-level `headers` was defined but
        # never sent, and requests has no default timeout.
        res = requests.get(BDurl, headers=headers,
                           allow_redirects=False, timeout=10)
        return res.headers['Location']
    except KeyError:
        # No Location header: the response was not a redirect.
        return ""
    except requests.exceptions.ConnectionError as e:
        print('ConnectionError', e.args)
        return ""
    except requests.exceptions.MissingSchema as e:
        print('Schema is none', e.args)
        return ""
    except requests.exceptions.RequestException:
        # Any other requests failure (timeout, invalid URL, ...).
        # The original used a bare ``except:`` here, which also hid
        # genuine programming errors.
        return ""
-
-
def saveTotxt(real_url):
    """Append one URL (newline-terminated) to the file at url_save_path."""
    with open(url_save_path, 'a', encoding='utf-8') as out:
        out.write(real_url + "\n")
-
def main():
    """Scrape Baidu result pages starPage..endPage and report the total."""
    urlsum = 0
    for page_index in range(starPage - 1, endPage):
        print("正在爬取第" + str(page_index + 1) + "页")
        # Baidu's ``pn`` parameter is an offset: 10 results per page.
        # (The original rebound the loop variable ``page`` here, which
        # made the code harder to follow.)
        urlnum = searchURL(page_index * 10)
        # Guard against None (searchURL returned None on timeout in the
        # original, which made this addition raise TypeError).
        urlsum = urlsum + (urlnum or 0)
        time.sleep(1)  # be polite between requests

    print("成功爬取" + str(urlsum) + "个url地址")


# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()
-

Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。