Because crawlers are a sensitive subject, the specific site information is masked in this article:
import time
import requests
import random
import logging
from selenium import webdriver

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(levelname)s - %(message)s')

TIME = 5  # upper bound, in seconds, for the random pauses between requests


# Visit a single article link in a headless browser.
# (A Selenium 4 variant of this driver setup is sketched after the listing.)
def selenium_browser(page_number, addr):
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')  # do not open a visible browser window
    browser = webdriver.Chrome(
        options=option, executable_path=r"D:\program\chromedriver_win32\chromedriver.exe")
    browser.set_page_load_timeout(60)  # give up on the page load after 60 seconds
    try:
        browser.get(addr)
        t = browser.title

        time.sleep(random.randint(0, TIME))

        browser.quit()
        logging.info("Page " + str(page_number) + " - article: " + t + " done - " + addr)
    except Exception:
        logging.info("Page " + str(page_number) + " - failed - " + addr)
        browser.quit()


# page_number: index of the article-list page to crawl.
def http_page(page_number):
    u = "http://*************/s/articlelist_**********_0_" + \
        str(page_number) + ".html"
    r = requests.get(u)

    logging.info("Fetching list page " + str(page_number))

    r.raise_for_status()
    # Split the HTML on whitespace and keep the tokens that start with the
    # article href prefix. (A regex-based variant is sketched after the listing.)
    tag = r'href="http://************/s/blog_'
    parts = r.text.split()

    addrs = []

    for part in parts:
        if part.startswith(tag):
            addr = part.split('"')[1]  # the URL sits between the first pair of quotes
            addrs.append(addr)

    random.shuffle(addrs)  # visit the articles in a random order

    for addr in addrs:
        selenium_browser(page_number, addr)
        time.sleep(random.randint(0, TIME))


def start_get_web():
    array = list(range(1, 9))
    random.shuffle(array)  # randomize the order of the list pages
    print("Shuffled list-page order:")
    print(array)

    for i in array:
        http_page(i)
        time.sleep(random.randint(0, TIME))


while True:
    try:
        start_get_web()
        time.sleep(random.randint(0, TIME))
    except Exception as e:
        logging.error(str(e))
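The listing above follows the Selenium 3 style, where the chromedriver path is passed straight to webdriver.Chrome via executable_path. Selenium 4 removed that argument in favor of a Service object, so here is a minimal sketch of the equivalent headless setup; it is not part of the original program and simply assumes the same chromedriver path:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Sketch for Selenium 4+: the driver path goes through a Service object and
# headless mode is requested with a Chrome command-line argument.
def make_headless_browser():
    option = webdriver.ChromeOptions()
    option.add_argument("--headless")
    service = Service(executable_path=r"D:\program\chromedriver_win32\chromedriver.exe")
    return webdriver.Chrome(service=service, options=option)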
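The link extraction in http_page splits the HTML on whitespace and keeps the tokens that start with the href prefix, which only works when each href happens to land in its own token. A regular expression is a sturdier way to pull out the same links. The sketch below is not from the original program, and example.com is only a stand-in because the real domain is masked:

import re

# Extract every blog-post URL from the raw HTML of a list page.
# Assumption: "example.com" replaces the masked domain used in the article.
def extract_blog_links(html):
    return re.findall(r'href="(http://example\.com/s/blog_[^"]+)"', html)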
Note: this article is for research purposes only; do not use it for anything malicious.