from selenium import webdriver
from selenium.webdriver.common.by import By
import csv
import threading
import time
from lxml import etree
from queue import Queue


class BaiduSpider(object):
    def __init__(self):
        self.url = 'https://www.baidu.com/'
        self.driver = webdriver.Chrome()
        self.q = Queue()

    def get_first_page(self):
        # Open Baidu and submit a search for '注册页面' (registration pages)
        self.driver.get(self.url)
        self.driver.find_element(By.NAME, 'wd').send_keys('注册页面')
        self.driver.find_element(By.ID, 'su').click()
        time.sleep(2)

    def parse_page(self):
        # Parse the current result page: collect the detail-page URLs and the
        # CSS class of the last pager link (used to detect the last page)
        response = self.driver.page_source
        html = etree.HTML(response)
        hrefs = html.xpath('//div[@class="result c-container "]/h3[@class="t"]/a/@href')
        flag = html.xpath('//div[@id="page"]/a[last()]/@class')[0]
        return flag, hrefs

    def parse_detail_page(self):
        while True:
            if self.q.qsize():
                # Take a URL from the queue and open it in a fresh browser
                url = self.q.get()
                driver = webdriver.Chrome()
                driver.get(url)
                try:
                    html = driver.page_source
                except Exception:
                    driver.quit()
                else:
                    driver.quit()
                    # Save the URL locally
                    self.save_new_url(url)
            else:
                time.sleep(10)

    def save_new_url(self, url):
        # The original post calls save_new_url but never defines it; this is a
        # minimal stand-in that appends the URL to a local CSV file.
        with open('urls.csv', 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow([url])

    def get_page_html(self):
        # Fetch the first result page
        self.get_first_page()
        while True:
            # Parse the page: decide whether to keep crawling and collect
            # the detail-page URLs
            flag, urls = self.parse_page()
            # If the last pager link is not the "next page" link (class 'n'),
            # this is the last result page
            if flag != 'n':
                print('All Baidu result pages have been crawled')
                break
            # Enqueue the detail-page URLs
            for url in urls:
                self.q.put(url)
            # Move on to the next page
            time.sleep(4)
            self.driver.find_element(By.XPATH, '//div[@id="page"]/a[last()]').click()

    def get_detail_html(self):
        while True:
            if self.q.qsize() != 0:
                print(self.q.qsize())
                url = self.q.get()
                print(url)
            else:
                time.sleep(5)

    def run(self):
        # Producer thread: collect detail-page URLs page by page
        c = threading.Thread(target=self.get_page_html)
        c.start()
        # Consumer thread: process the queued detail-page URLs
        t = threading.Thread(target=self.get_detail_html)
        t.start()


if __name__ == '__main__':
    zhuce = BaiduSpider()
    zhuce.run()