# NOTE: "赞 / 踩" vote-widget text and the site footer were artifacts of the web
# page this script was copied from; they are not part of the program.
# For educational use only; please leave a message for removal if this infringes.
import threading
import time
from queue import Empty, Queue

import requests
from bs4 import BeautifulSoup
q = Queue()
'''
队列使用,
.queue 查看队列内容
.get() 获取队列内容
.put()添加队列内容
'''
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36"
}
#生产者
def req_list_page():
'''
请求文章列表
:return:
'''
#判断队列是否为空
print("生产者")
print("获取首页内容,获取link添加队列中")
for i in range(1,3):
url = 'https://club.autohome.com.cn/o/bbs/forum-c-4410-{}.html#pvareaid=6830274'.format(i)
res = requests.get(url,headers=headers).text
soup =BeautifulSoup(res,'lxml')
for item in soup.select('dl[class="list_dl"]') :
# print(item)
try:
link = 'https://club.autohome.com.cn'+item.select('dt > a')[0].get('href')
q.put(link)
except :
pass
print("初始队列状态")
# print(q.queue)
print(q.qsize())
time.sleep(3)
#消费者
def req_info_page():
'''
:return:
'''
while True:
#当队列没有内容了则终止
if q.empty():
break
else:
print("消费者")
link = q.get()
print("请求了:{}".format(link))
# print(q.queue)
print(q.qsize())
time.sleep(1)
print(1111)
if __name__ == '__main__':
start = time.time()
#创建线程,一个生产,两个消费
product = threading.Thread(target=req_list_page)
consume1 = threading.Thread(target=req_info_page)
consume2 = threading.Thread(target=req_info_page)
consume3 = threading.Thread(target=req_info_page)
consume4 = threading.Thread(target=req_info_page)
consume5 = threading.Thread(target=req_info_page)
#启动线程
#获取所有link(生产者)
product.start()
product.join()
#同时消费5个队列内容
consume1.start()
consume2.start()
consume3.start()
consume4.start()
consume5.start()
#设置守护线程,子线程执行完毕,主线程结束
consume1.join()
consume2.join()
consume3.join()
consume4.join()
consume5.join()
end = time.time()
print('总消耗时间:',end-start)
# (Site footer from the scraped page, kept as a comment so the file parses:)
# Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。