
Python multithreading + queue (improving crawler efficiency)

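The script below hands links from one producer thread to several consumer threads through the standard-library queue.Queue, which is thread-safe. As a quick, standalone illustration of the three operations the in-code comment refers to (not part of the crawler itself):

from queue import Queue

q = Queue()
q.put("task-1")      # put an item onto the queue
q.put("task-2")
print(q.qsize())     # 2 items waiting
print(q.get())       # "task-1" -- items come out in FIFO order
print(q.queue)       # peek at the underlying deque (debugging only)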

# For learning purposes only. Please leave a message if this infringes on any rights and it will be removed.

from queue import Queue
import requests
from bs4 import BeautifulSoup
import time
import threading

q = Queue()
'''
Queue usage:
.queue   inspect the queue's contents
.get()   take an item off the queue
.put()   put an item onto the queue
'''

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
}


# Producer
def req_list_page():
    '''
    Request the article list pages and put every post link onto the queue.
    :return: None
    '''
    print("Producer")
    print("Fetching the list pages and putting each post link onto the queue")
    for i in range(1, 3):
        url = 'https://club.autohome.com.cn/o/bbs/forum-c-4410-{}.html#pvareaid=6830274'.format(i)
        res = requests.get(url, headers=headers).text
        soup = BeautifulSoup(res, 'lxml')
        for item in soup.select('dl[class="list_dl"]'):
            # print(item)
            try:
                link = 'https://club.autohome.com.cn' + item.select('dt > a')[0].get('href')
                q.put(link)
            except (IndexError, TypeError):
                # Skip list entries that have no usable link
                pass
        print("Queue size after this page:")
        # print(q.queue)
        print(q.qsize())
        time.sleep(3)


# Consumer
def req_info_page():
    '''
    Take links off the queue; a real crawler would request each detail page here.
    :return: None
    '''
    while True:
        # Stop once the queue has been drained
        if q.empty():
            break
        else:
            print("Consumer")
            link = q.get()
            print("Requested: {}".format(link))
            # print(q.queue)
            print(q.qsize())
            time.sleep(1)
    print(1111)  # debug marker: this consumer thread has exited


if __name__ == '__main__':
    start = time.time()

    # Create the threads: one producer and five consumers
    product = threading.Thread(target=req_list_page)
    consume1 = threading.Thread(target=req_info_page)
    consume2 = threading.Thread(target=req_info_page)
    consume3 = threading.Thread(target=req_info_page)
    consume4 = threading.Thread(target=req_info_page)
    consume5 = threading.Thread(target=req_info_page)

    # Start the producer first and wait until it has collected all the links
    product.start()
    product.join()

    # Then drain the queue with five consumer threads in parallel
    consume1.start()
    consume2.start()
    consume3.start()
    consume4.start()
    consume5.start()

    # Wait for every consumer to finish before stopping the timer
    consume1.join()
    consume2.join()
    consume3.join()
    consume4.join()
    consume5.join()

    end = time.time()
    print('Total time elapsed:', end - start)
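Note that the layout above only works because the producer is joined before any consumer starts; if producer and consumers ran at the same time, the consumers' q.empty() check would be a race (the queue can be momentarily empty while the producer is still fetching pages). A common alternative is to shut the consumers down with sentinel values. The sketch below is illustrative only and not part of the original script; NUM_CONSUMERS and the example links are made up.

import threading
from queue import Queue

NUM_CONSUMERS = 5
q = Queue()

def producer(links):
    for link in links:
        q.put(link)
    for _ in range(NUM_CONSUMERS):
        q.put(None)  # one sentinel per consumer

def consumer():
    while True:
        link = q.get()
        if link is None:  # sentinel: no more work is coming
            break
        print("processing", link)

if __name__ == '__main__':
    links = ['https://example.com/post/{}'.format(i) for i in range(10)]
    threads = [threading.Thread(target=producer, args=(links,))]
    threads += [threading.Thread(target=consumer) for _ in range(NUM_CONSUMERS)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

With this pattern the consumers can start immediately and begin processing links while the producer is still crawling the list pages, instead of waiting for it to finish.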
