
Python: a faster crawler with a thread pool (multiprocessing Pool combined with a Queue)

1. How to use the thread pool

  1. Instantiate the pool object

    from multiprocessing.dummy import Pool
    pool = Pool(5)  # the default size is the number of CPUs
  2. Combine sending the request, extracting the data, and saving it into a single function, and hand that function to the pool to run asynchronously

    Usage: pool.apply_async(func)

    def exetute_requests_item_save(self):
        url = self.queue.get()
        html_str = self.parse_url(url)
        content_list = self.get_content_list(html_str)
        self.save_content_list(content_list)
        self.total_response_num += 1

    pool.apply_async(self.exetute_requests_item_save)
  3. Add a callback function

    apply_async runs the function asynchronously, but only once. To make it run repeatedly, pass a callback: _callback re-submits the task, so it effectively calls itself recursively, and a flag provides the condition for that recursion to stop (see the standalone sketch after this list).

    def _callback(self, temp):
        if self.is_running:
            pool.apply_async(self.exetute_requests_item_save, callback=self._callback)

    pool.apply_async(self.exetute_requests_item_save, callback=self._callback)
  4. Decide when the program ends: it can stop once the number of responses received equals the number of URLs

    while True:  # keep the main thread alive
        time.sleep(0.0001)  # avoid busy-spinning the CPU and wasting resources
        if self.total_response_num >= self.total_requests_num:
            self.is_running = False
            break
    self.pool.close()  # close the pool so no new tasks can be submitted
    # self.pool.join()  # wait for all worker threads to finish
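Before the full crawler, here is a minimal, self-contained sketch of the apply_async + callback pattern from steps 2 to 4. The names do_one_task, done and running are illustrative stand-ins for the spider's method and attributes, not part of the original article:

    from multiprocessing.dummy import Pool  # thread-backed Pool with the same API as multiprocessing.Pool
    import time

    pool = Pool(3)
    total_tasks = 10
    done = 0        # counts finished tasks, like total_response_num in the crawler
    running = True  # exit flag, like is_running

    def do_one_task():
        global done
        time.sleep(0.1)  # stands in for "send request + parse + save"
        done += 1        # note: += is not atomic across threads; acceptable for a sketch

    def _callback(_result):
        if running:  # re-submit the task so each chain keeps going until the flag flips
            pool.apply_async(do_one_task, callback=_callback)

    for _ in range(3):  # seed the pool; each callback keeps one chain of tasks alive
        pool.apply_async(do_one_task, callback=_callback)

    while True:  # keep the main thread alive
        time.sleep(0.0001)
        if done >= total_tasks:
            running = False
            break
    pool.close()  # no new tasks after this point
    print("finished", done, "tasks")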

2. Full implementation of the crawler with a thread pool

    # coding=utf-8
    import requests
    from lxml import etree
    from queue import Queue
    from multiprocessing.dummy import Pool
    import time


    class QiubaiSpider:
        def __init__(self):
            self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
            self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) "
                                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
            self.queue = Queue()
            self.pool = Pool(5)
            self.is_running = True
            self.total_requests_num = 0
            self.total_response_num = 0

        def get_url_list(self):  # build the list of URLs
            for i in range(1, 14):
                self.queue.put(self.url_temp.format(i))
                self.total_requests_num += 1

        def parse_url(self, url):  # send the request and get the response
            return requests.get(url, headers=self.headers).content.decode()

        def get_content_list(self, html_str):  # extract the jokes
            html = etree.HTML(html_str)
            div_list = html.xpath("//div[@id='content-left']/div")
            content_list = []
            for div in div_list:
                content = {}
                content["content"] = div.xpath(".//div[@class='content']/span/text()")
                print(content)
                content_list.append(content)
            return content_list

        def save_content_list(self, content_list):  # save the data
            pass

        def exetute_requests_item_save(self):
            url = self.queue.get()
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            self.save_content_list(content_list)
            self.total_response_num += 1

        def _callback(self, temp):
            if self.is_running:
                self.pool.apply_async(self.exetute_requests_item_save, callback=self._callback)

        def run(self):
            self.get_url_list()
            for i in range(2):  # control the level of concurrency (number of callback chains)
                self.pool.apply_async(self.exetute_requests_item_save, callback=self._callback)
            while True:  # keep the main thread alive
                time.sleep(0.0001)  # avoid busy-spinning the CPU and wasting resources
                if self.total_response_num >= self.total_requests_num:
                    self.is_running = False
                    break
            self.pool.close()  # close the pool so no new tasks can be submitted
            # self.pool.join()  # wait for all worker threads to finish


    if __name__ == '__main__':
        qiubai = QiubaiSpider()
        qiubai.run()
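save_content_list is left as a stub above. As one possibility (not part of the original article), it could append each item as a JSON line to a local file; the file name qiubai.jsonl is only an illustrative choice:

    # illustrative replacement for the save_content_list stub; put "import json" with the other imports
    import json

    def save_content_list(self, content_list):  # save the data
        # Several worker threads may call this concurrently; each item is written with a single
        # f.write call, but wrap the block in a threading.Lock if strict ordering matters.
        with open("qiubai.jsonl", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False) + "\n")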

3. Full implementation of the crawler with a coroutine pool (gevent)

    # coding=utf-8
    import gevent.monkey
    gevent.monkey.patch_all()  # patch blocking I/O before importing requests
    from gevent.pool import Pool
    import requests
    from lxml import etree
    from queue import Queue
    import time


    class QiubaiSpider:
        def __init__(self):
            self.url_temp = "https://www.qiushibaike.com/8hr/page/{}/"
            self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) "
                                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36"}
            self.queue = Queue()
            self.pool = Pool(5)
            self.is_running = True
            self.total_requests_num = 0
            self.total_response_num = 0

        def get_url_list(self):  # build the list of URLs
            for i in range(1, 14):
                self.queue.put(self.url_temp.format(i))
                self.total_requests_num += 1

        def parse_url(self, url):  # send the request and get the response
            return requests.get(url, headers=self.headers).content.decode()

        def get_content_list(self, html_str):  # extract the jokes
            html = etree.HTML(html_str)
            div_list = html.xpath("//div[@id='content-left']/div")
            content_list = []
            for div in div_list:
                content = {}
                content["content"] = div.xpath(".//div[@class='content']/span/text()")
                print(content)
                content_list.append(content)
            return content_list

        def save_content_list(self, content_list):  # save the data
            pass

        def exetute_requests_item_save(self):
            url = self.queue.get()
            html_str = self.parse_url(url)
            content_list = self.get_content_list(html_str)
            self.save_content_list(content_list)
            self.total_response_num += 1

        def _callback(self, temp):
            if self.is_running:
                self.pool.apply_async(self.exetute_requests_item_save, callback=self._callback)

        def run(self):
            self.get_url_list()
            for i in range(2):  # control the level of concurrency (number of callback chains)
                self.pool.apply_async(self.exetute_requests_item_save, callback=self._callback)
            while True:  # keep the main thread alive
                time.sleep(0.0001)  # avoid busy-spinning the CPU and wasting resources
                if self.total_response_num >= self.total_requests_num:
                    self.is_running = False
                    break


    if __name__ == '__main__':
        qiubai = QiubaiSpider()
        qiubai.run()
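A note on the ordering above: gevent.monkey.patch_all() should run as early as possible, before requests (and therefore socket and ssl) is imported, so that the blocking network calls are replaced by cooperative ones and the coroutine pool actually yields a speed-up. A quick, illustrative way to check that the patch took effect:

    import gevent.monkey
    gevent.monkey.patch_all()

    import socket
    print(socket.socket)  # after patching, this prints gevent's cooperative socket class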