赞
踩
测试模型:resnet50和resnet101
测试显卡:2080ti
单独测试耗时:resnet50 24.4ms resnet101 48.22ms
初始化种子和热机:为了保证每次验证的一致性,需要初始化种子,使得每次测试的输入数据保持一致,同时,为了准确统计时间,测试前先跑100次热机,然后再统计时间。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/8 16:20
# @Author : wangjianrong
# @File : 1.模型串行.py
"""Benchmark baseline: run resnet50 and resnet101 serially on one GPU.

Seeds every RNG source for reproducible inputs, warms both models up for
100 iterations, then times 100 serial (model1 -> model2) inference rounds.
"""
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time


def init_seed(seed):
    """Seed every RNG source so each benchmark run sees identical inputs.

    Covers Python's `random`, the hash seed, NumPy, and all torch/CUDA
    generators. With seed == 0 also forces deterministic cuDNN kernels.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs)
    # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


def inference(model, x, name):
    """Run one forward pass of `model` on `x`; return `name` as a tag."""
    y = model(x)
    return name


def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data:", e - s)

    warm_cnt = 100
    repeat = 100
    model1 = resnet50(True).cuda().eval()
    model2 = resnet101(True).cuda().eval()

    # Pure inference benchmark: skip autograd graph construction.
    with torch.no_grad():
        s = time()
        for i in range(warm_cnt):
            y = model1(fake_input)
        # CUDA kernel launches are asynchronous; wait for the GPU to
        # finish before reading the clock, or the timings are meaningless.
        torch.cuda.synchronize()
        e = time()
        print("warm up res50:", e - s)

        s = time()
        for i in range(warm_cnt):
            y = model2(fake_input)
        torch.cuda.synchronize()
        e = time()
        print("warm up re101:", e - s)

        s = time()
        for i in range(repeat):
            y = inference(model1, fake_input, 1)
            y = inference(model2, fake_input, 1)
        torch.cuda.synchronize()
        e = time()
        print("模型串行耗时:", e - s)


if __name__ == '__main__':
    main()
结果:
warm up res50: 2.331266403198242
warm up re101: 4.534073352813721
模型串行耗时: 6.889774560928345
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/8 16:26
# @Author : wangjianrong
# @File : 2.多线程.py
"""Benchmark: run resnet50 and resnet101 concurrently via a thread pool.

Compares two fan-out styles on a 2-worker ThreadPoolExecutor:
`Executor.map` and `submit` + `wait(..., ALL_COMPLETED)`.
"""
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED


def init_seed(seed):
    """Seed every RNG source so each benchmark run sees identical inputs."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs)
    # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


def inference(model, x, name):
    """Run one forward pass of `model` on `x`; return `name` as a tag."""
    y = model(x)
    return name


def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data:", e - s)

    warm_cnt = 100
    repeat = 100
    model1 = resnet50(True).cuda().eval()
    model2 = resnet101(True).cuda().eval()

    # Pure inference benchmark: skip autograd graph construction.
    with torch.no_grad():
        s = time()
        for i in range(warm_cnt):
            y = model1(fake_input)
        # CUDA kernel launches are asynchronous; synchronize before timing.
        torch.cuda.synchronize()
        e = time()
        print("warm up res50:", e - s)

        s = time()
        for i in range(warm_cnt):
            y = model2(fake_input)
        torch.cuda.synchronize()
        e = time()
        print("warm up re101:", e - s)

        # Context manager guarantees worker threads are shut down on exit.
        with ThreadPoolExecutor(max_workers=2) as pool:
            # Method 1: Executor.map — yields the functions' return values
            # in submission order.
            s = time()
            for i in range(repeat):
                for res in pool.map(inference,
                                    [model1, model2],
                                    [fake_input, fake_input],
                                    ["resnet50", "res101"]):
                    # print(res)
                    pass
            torch.cuda.synchronize()
            e = time()
            print("多线程map:", e - s)

            # Method 2: submit + wait — submit returns Future objects;
            # wait blocks until both have completed.
            s = time()
            for i in range(repeat):
                f1 = pool.submit(inference, model1, fake_input, 'res50')
                f2 = pool.submit(inference, model2, fake_input, 'res101')
                res = wait([f1, f2], return_when=ALL_COMPLETED)
                for r in res.done:
                    # print(r.result())
                    pass
            torch.cuda.synchronize()
            e = time()
            print("多线程wait:", e - s)


if __name__ == '__main__':
    main()
结果:两种方式时间整体上比较接近,总时间约等于耗时比较长的模型的时间,比串行模型耗时少很多
warm up res50: 2.4041590690612793
warm up re101: 4.691877365112305
多线程map: 4.694884538650513
多线程wait: 4.744607210159302
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/5/8 16:49
# @Author : wangjianrong
# @File : 3.协程.py
"""Benchmark: run resnet50 and resnet101 concurrently via asyncio.

Compares three variants: awaiting plain coroutines, offloading the
blocking call to the loop's default executor, and offloading to an
explicit 2-worker ThreadPoolExecutor.
"""
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time
import asyncio
from concurrent.futures import ThreadPoolExecutor


def init_seed(seed):
    """Seed every RNG source so each benchmark run sees identical inputs."""
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs)
    # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


def inference(model, x, name):
    """Blocking forward pass; return `name` as a tag."""
    y = model(x)
    return name


async def ainference(model, x, name):
    # NOTE: model(x) is a blocking call with no await point, so this
    # coroutine never yields control back to the event loop — two such
    # coroutines still execute one after the other.
    y = model(x)
    return name


async def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data:", e - s)

    warm_cnt = 100
    repeat = 100
    model1 = resnet50(True).cuda().eval()
    model2 = resnet101(True).cuda().eval()

    # Pure inference benchmark: skip autograd graph construction.
    with torch.no_grad():
        s = time()
        for i in range(warm_cnt):
            y = model1(fake_input)
        # CUDA kernel launches are asynchronous; synchronize before timing.
        torch.cuda.synchronize()
        e = time()
        print("warm up res50:", e - s)

        s = time()
        for i in range(warm_cnt):
            y = model2(fake_input)
        torch.cuda.synchronize()
        e = time()
        print("warm up re101:", e - s)

        loop = asyncio.get_running_loop()

        # Method 1: await coroutine functions directly.
        # asyncio.wait() has rejected bare coroutines since Python 3.11,
        # so wrap them into Tasks explicitly.
        s = time()
        for i in range(repeat):
            tasks = [
                asyncio.ensure_future(ainference(model1, fake_input, 'res50')),
                asyncio.ensure_future(ainference(model2, fake_input, 'res101')),
            ]
            done, pending = await asyncio.wait(tasks)
        torch.cuda.synchronize()
        e = time()
        print("直接使用协程函数:", e - s)

        # Method 2: turn the blocking function into awaitable futures on
        # the loop's default executor.
        s = time()
        for i in range(repeat):
            f1 = loop.run_in_executor(None, inference, model1, fake_input, "res50")
            f2 = loop.run_in_executor(None, inference, model2, fake_input, "res101")
            done, pending = await asyncio.wait([f1, f2])
        torch.cuda.synchronize()
        e = time()
        print("将非协程函数转成协程:", e - s)

        # Method 3: same, but on an explicit 2-worker pool; the context
        # manager guarantees the pool is shut down.
        with ThreadPoolExecutor(max_workers=2) as pool:
            s = time()
            for i in range(repeat):
                f1 = loop.run_in_executor(pool, inference, model1, fake_input, "res50")
                f2 = loop.run_in_executor(pool, inference, model2, fake_input, "res101")
                done, pending = await asyncio.wait([f1, f2])
            torch.cuda.synchronize()
            e = time()
            print("通过线程池:", e - s)


if __name__ == '__main__':
    asyncio.run(main())
结果:
直接调用协程函数耗时和串行耗时差不多:协程内部的 model(x) 是阻塞调用,中间没有 await 让出事件循环的时机,因此两个协程实际上仍然是顺序执行,无法并发。
后两者的区别可能是线程池默认线程数不同导致的,最大线程数为2时耗时最少,等于1则和串行耗时差不多,大于2耗时均会有所增加
warm up res50: 2.3446831703186035
warm up re101: 4.573963165283203
直接使用协程函数: 6.90220308303833
将非协程函数转成协程: 5.143332481384277
通过线程池: 4.59693717956543
Copyright © 2003-2013 www.wpsshop.cn 版权所有,并保留所有权利。