
Python + Selenium multi-threaded and multi-process scraping

This post uses Python + Selenium to scrape announcement data from the Shenzhen Stock Exchange (SZSE). I originally crawled with a single process; I recently reworked the code to fetch with multiple processes and with multiple threads, which is much faster. If you are not yet familiar with Selenium, please learn the basics elsewhere first.
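Both scripts rely on the listing pages' simple numbering scheme: page 1 is index.html, and page n (for n ≥ 2) is index_{n-1}.html. A tiny helper (hypothetical, not part of the scripts below) makes the scheme explicit:

# Hypothetical helper: build the listing-page URL for page j.
def page_url(j):
    base = "http://www.szse.cn/disclosure/notice/"
    return base + ("index.html" if j == 1 else "index_" + str(j - 1) + ".html")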

Multi-process scraping

# coding=utf-8
'''
Scrape announcement data from the Shenzhen Stock Exchange with multiple processes.
Titles and announcement bodies are written to separate CSV files.
Author: 西兰
Date: 2019-11-30
'''

from selenium import webdriver
import time
import csv
from multiprocessing import Process

def process(start,end,num):
    driver_path = r"D:\chromedriver.exe"
    # "developer mode": hide Chrome's automation banner
    #options = webdriver.ChromeOptions()
    #options.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(executable_path=driver_path)
    browser.implicitly_wait(1)


    count=1
    for j in range(start,end):
      if count % 21 == 0:
          count = 1
      if j == 1:
          url = "http://www.szse.cn/disclosure/notice/index.html"
      else:
          url = "http://www.szse.cn/disclosure/notice/index_" + str(j - 1) + ".html"
      # if j % 10 == 0:  # restart the browser every 10 pages
      #     browser.quit()
      #     browser = webdriver.Chrome(executable_path=driver_path)

      for i in range(20):
        browser.get(url)
        browser.maximize_window()
        print("####################################################第",j,"页,第",count,"条记录")
        # 获取列表页handle
        list_page_handle = browser.current_window_handle
        div_content = browser.find_element_by_css_selector('div.g-content-list')
        li_list = div_content.find_elements_by_tag_name('li')
        a_href = li_list[i].find_element_by_tag_name('a').get_attribute('href')
        # skip links that point directly at PDF/DOC files
        if '.pdf' in a_href or '.doc' in a_href or '.DOC' in a_href:
            continue
        print(a_href)
        li_list[i].find_element_by_tag_name('a').click()
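        # the click opens the announcement in a new window; find and switch to it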
        all_handles = browser.window_handles
        for handle in all_handles:
            if (handle != list_page_handle):
                browser.switch_to.window(handle)
        # title
        title_div = browser.find_element_by_css_selector('div.des-header')
        title_h2 = title_div.find_element_by_tag_name('h2')
        print(title_h2.text)
        data_row_title = [title_h2.text]
        with open('./data/sz_data_title' + str(num) + '.csv', 'a+', newline="", encoding='utf-8') as f:
            csv_add = csv.writer(f)
            csv_add.writerow(data_row_title)
        # announcement body
        content_div = browser.find_element_by_id('desContent')
        p_content_list = content_div.find_elements_by_tag_name('p')
        final_text=""
        for p in p_content_list:
            final_text+=p.text.strip()
        print(final_text)
        data_row = [final_text]
        with open('./data/sz_data'+ str(num) +'.csv', 'a+', newline="",encoding='utf-8') as f:
            csv_add = csv.writer(f)
            csv_add.writerow(data_row)
        time.sleep(1)
        count += 1
        browser.close()
        browser.switch_to.window(list_page_handle)

def main():
    # start 4 processes, each given a range of page numbers
    process_list = []
    p1 = Process(target=process, args=(400, 600, 1))
    p1.start()
    p2 = Process(target=process, args=(600, 800, 2))
    p2.start()
    p3 = Process(target=process, args=(800, 1000, 3))
    p3.start()
    p4 = Process(target=process, args=(1000, 1129, 4))
    p4.start()
    process_list.append(p1)
    process_list.append(p2)
    process_list.append(p3)
    process_list.append(p4)
    for t in process_list:
        t.join()

if __name__ == '__main__':
    s = time.time()
    main()
    e = time.time()
    print('Total time:', e - s)
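Each worker appends to its own pair of CSV files (the num argument keeps sz_data_title1.csv through sz_data_title4.csv apart), so the four processes never write to the same file concurrently. Once all workers have finished, the per-worker files can be concatenated; a minimal sketch, where the output name sz_data_title_all.csv is my own choice:

# Merge the per-worker title files into one CSV (sketch; the output
# filename sz_data_title_all.csv is hypothetical).
import csv
import glob

parts = sorted(glob.glob('./data/sz_data_title[0-9]*.csv'))
with open('./data/sz_data_title_all.csv', 'w', newline='', encoding='utf-8') as out:
    writer = csv.writer(out)
    for path in parts:
        with open(path, newline='', encoding='utf-8') as f:
            writer.writerows(csv.reader(f))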


Multi-threaded scraping

# coding=utf-8
'''
Scrape announcement data from the Shenzhen Stock Exchange with multiple threads.
Author: 西兰
Date: 2019-11-30
'''

from selenium import webdriver
import time
import csv
from threading import Thread

def process(start,end,num):
    driver_path = r"D:\chromedriver.exe"
    # "developer mode": hide Chrome's automation banner
    #options = webdriver.ChromeOptions()
    #options.add_experimental_option('excludeSwitches', ['enable-automation'])
    browser = webdriver.Chrome(executable_path=driver_path)
    browser.implicitly_wait(1)


    count=1
    for j in range(start,end):
      if count % 21 == 0:
          count = 1
      if j == 1:
          url = "http://www.szse.cn/disclosure/notice/index.html"
      else:
          url = "http://www.szse.cn/disclosure/notice/index_" + str(j - 1) + ".html"
      # if j % 10 == 0:  # restart the browser every 10 pages
      #     browser.quit()
      #     browser = webdriver.Chrome(executable_path=driver_path)

      for i in range(20):
        browser.get(url)
        browser.maximize_window()
        print("####################################################第",j,"页,第",count,"条记录")
        # 获取列表页handle
        list_page_handle = browser.current_window_handle
        div_content = browser.find_element_by_css_selector('div.g-content-list')
        li_list = div_content.find_elements_by_tag_name('li')
        a_href = li_list[i].find_element_by_tag_name('a').get_attribute('href')
        # skip links that point directly at PDF/DOC files
        if '.pdf' in a_href or '.doc' in a_href or '.DOC' in a_href:
            continue
        print(a_href)
        li_list[i].find_element_by_tag_name('a').click()
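        # the click opens the announcement in a new window; find and switch to it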
        all_handles = browser.window_handles
        for handle in all_handles:
            if (handle != list_page_handle):
                browser.switch_to.window(handle)
        # title
        title_div = browser.find_element_by_css_selector('div.des-header')
        title_h2 = title_div.find_element_by_tag_name('h2')
        print(title_h2.text)
        data_row_title = [title_h2.text]
        with open('./data/sz_data_title' + str(num) + '.csv', 'a+', newline="", encoding='utf-8') as f:
            csv_add = csv.writer(f)
            csv_add.writerow(data_row_title)
        # announcement body
        content_div = browser.find_element_by_id('desContent')
        p_content_list = content_div.find_elements_by_tag_name('p')
        final_text=""
        for p in p_content_list:
            final_text+=p.text.strip()
        print(final_text)
        data_row = [final_text]
        with open('./data/sz_data'+ str(num) +'.csv', 'a+', newline="",encoding='utf-8') as f:
            csv_add = csv.writer(f)
            csv_add.writerow(data_row)
        time.sleep(1)
        count += 1
        browser.close()
        browser.switch_to.window(list_page_handle)

def main():
    # start 4 threads, each given a range of page numbers
    thread_list = []
    t1 = Thread(target=process, args=(400, 600, 1))
    t1.start()
    t2 = Thread(target=process, args=(600, 800, 2))
    t2.start()
    t3 = Thread(target=process, args=(800, 1000, 3))
    t3.start()
    t4 = Thread(target=process, args=(1000, 1129, 4))
    t4.start()
    thread_list.append(t1)
    thread_list.append(t2)
    thread_list.append(t3)
    thread_list.append(t4)
    for t in thread_list:
        t.join()

if __name__ == '__main__':
    s = time.time()
    main()
    e = time.time()
    print('Total time:', e - s)
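Even though CPython's GIL prevents threads from executing Python bytecode in parallel, this workload is I/O-bound: each thread spends most of its time blocked on the browser and the network, during which the GIL is released, so four threads still overlap their waiting and finish much faster than one. A more idiomatic way to launch and join the workers is a thread pool; a minimal sketch that reuses the process(start, end, num) function above:

# Thread-pool variant of main() (sketch, reusing process() defined above).
from concurrent.futures import ThreadPoolExecutor

ranges = [(400, 600, 1), (600, 800, 2), (800, 1000, 3), (1000, 1129, 4)]
with ThreadPoolExecutor(max_workers=4) as pool:
    futures = [pool.submit(process, *args) for args in ranges]
    for fut in futures:
        fut.result()  # propagate any exception raised in a worker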


If you enjoy programming, feel free to follow my WeChat official account and let's improve together!

Reference: Python multiprocessing and multithreading
