PyTorch multi-model asynchronous inference: running multiple models with multithreading in Python

Test models: resnet50 and resnet101
Test GPU: RTX 2080 Ti
Standalone latency: resnet50 24.4 ms, resnet101 48.22 ms
Seeding and warm-up: to keep runs comparable, the random seed is fixed so every test sees identical input data, and each model runs 100 warm-up iterations before any timing is recorded.
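One caveat on the methodology: CUDA kernels launch asynchronously, so wall-clock timing with time() is only accurate if the GPU is synchronized before each timestamp is read (the scripts below call torch.cuda.synchronize() for this). A more precise alternative is to time with CUDA events; here is a minimal sketch using the standard torch.cuda.Event API, with model and fake_input standing in for any module and matching tensor:

import torch

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()                     # mark the start on the current stream
y = model(fake_input)              # the work being timed
end.record()                       # mark the end on the current stream
torch.cuda.synchronize()           # wait until both events have fired
print("elapsed:", start.elapsed_time(end), "ms")  # GPU-side elapsed time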

Case 1: run the two models serially

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/5/8 16:20
# @Author  : wangjianrong
# @File    : 1.models_serial.py
 
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time
 
 
def init_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
 
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
 
 
def inference(model, x, name):
    # One forward pass; returning `name` tells the caller which model finished.
    y = model(x)
    return name
 
def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data + transfer to GPU:", e - s)
    warm_cnt = 100
    repeat = 100
    model1 = resnet50(pretrained=True).cuda().eval()
    model2 = resnet101(pretrained=True).cuda().eval()
    s = time()
    for i in range(warm_cnt):
        y = model1(fake_input)
    torch.cuda.synchronize()  # kernels launch asynchronously; drain before reading the clock
    e = time()
    print("warm up res50:", e - s)
    s = time()
    for i in range(warm_cnt):
        y = model2(fake_input)
    torch.cuda.synchronize()
    e = time()
    print("warm up res101:", e - s)
 
    s = time()

    for i in range(repeat):
        y = inference(model1, fake_input, "res50")
        y = inference(model2, fake_input, "res101")

    torch.cuda.synchronize()
    e = time()
    print("serial inference time:", e - s)
 
 
if __name__ == '__main__':
    main()

Result:

warm up res50: 2.331266403198242
warm up res101: 4.534073352813721
serial inference time: 6.889774560928345

Case 2: run both models concurrently with a thread pool

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/5/8 16:26
# @Author  : wangjianrong
# @File    : 2.multithreading.py
 
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED
 
 
def init_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
 
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
 
 
def inference(model, x, name):
    # One forward pass; returning `name` tells the caller which model finished.
    y = model(x)
    return name
 
def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data + transfer to GPU:", e - s)
    warm_cnt = 100
    repeat = 100
    model1 = resnet50(pretrained=True).cuda().eval()
    model2 = resnet101(pretrained=True).cuda().eval()
    s = time()
    for i in range(warm_cnt):
        y = model1(fake_input)
    torch.cuda.synchronize()  # kernels launch asynchronously; drain before reading the clock
    e = time()
    print("warm up res50:", e - s)
    s = time()
    for i in range(warm_cnt):
        y = model2(fake_input)
    torch.cuda.synchronize()
    e = time()
    print("warm up res101:", e - s)
 
    pool = ThreadPoolExecutor(max_workers=2)
 
    s = time()
    for i in range(repeat):
        # Method 1: map
        # `res` is the value returned by inference()
        for res in pool.map(inference, [model1, model2], [fake_input, fake_input], ["res50", "res101"]):
            # print(res)
            pass
    torch.cuda.synchronize()
    e = time()
    print("threaded map:", e - s)
 
    s = time()
    for i in range(repeat):
        # Method 2: submit + wait
        # submit() returns a Future object
        f1 = pool.submit(inference, model1, fake_input, 'res50')
        f2 = pool.submit(inference, model2, fake_input, 'res101')
        res = wait([f1, f2], return_when=ALL_COMPLETED)
        for r in res.done:
            # print(r.result())
            pass
    torch.cuda.synchronize()
    e = time()
    print("threaded wait:", e - s)
 
 
if __name__ == '__main__':
    main()

Result: the two approaches take roughly the same time, and the total is close to the standalone time of the slower model, far below the serial case. This works even for compute-heavy code because PyTorch's forward pass runs in C++/CUDA kernels that release the GIL, so the two threads genuinely overlap.

warm up res50: 2.4041590690612793
warm up res101: 4.691877365112305
threaded map: 4.694884538650513
threaded wait: 4.744607210159302
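A related option, not measured in this post, is to make the overlap explicit on the GPU by giving each model its own CUDA stream instead of relying on thread scheduling. A minimal sketch with the standard torch.cuda.Stream API, reusing the model1/model2/fake_input names from the script above:

import torch

stream1 = torch.cuda.Stream()
stream2 = torch.cuda.Stream()

torch.cuda.synchronize()  # make sure fake_input's default-stream work has finished
with torch.no_grad():
    with torch.cuda.stream(stream1):   # enqueue model1's kernels on stream1
        y1 = model1(fake_input)
    with torch.cuda.stream(stream2):   # enqueue model2's kernels on stream2
        y2 = model2(fake_input)
torch.cuda.synchronize()               # wait for both streams before using y1/y2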

Case 3: using coroutines

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/5/8 16:49
# @Author  : wangjianrong
# @File    : 3.coroutines.py
 
from torchvision.models.resnet import resnet50, resnet101
import random
import os
import numpy as np
import torch
from time import time
import asyncio
from concurrent.futures import ThreadPoolExecutor
 
 
def init_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
 
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Remove randomness (may be slower on Tesla GPUs) # https://pytorch.org/docs/stable/notes/randomness.html
    if seed == 0:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
 
 
def inference(model, x, name):
    # One forward pass; returning `name` tells the caller which model finished.
    y = model(x)
    return name


async def ainference(model, x, name):
    # async def in name only: model(x) is a blocking call that never awaits,
    # so this coroutine cannot overlap with anything else on the event loop.
    y = model(x)
    return name
 
 
async def main():
    init_seed(0)
    s = time()
    fake_input = torch.randn(1, 3, 224, 224)
    e = time()
    print("gen data:", e - s)
    fake_input = fake_input.cuda()
    e = time()
    print("gen data + transfer to GPU:", e - s)
    warm_cnt = 100
    repeat = 100
    model1 = resnet50(pretrained=True).cuda().eval()
    model2 = resnet101(pretrained=True).cuda().eval()
    s = time()
    for i in range(warm_cnt):
        y = model1(fake_input)
    torch.cuda.synchronize()  # kernels launch asynchronously; drain before reading the clock
    e = time()
    print("warm up res50:", e - s)
    s = time()
    for i in range(warm_cnt):
        y = model2(fake_input)
    torch.cuda.synchronize()
    e = time()
    print("warm up res101:", e - s)
 
    loop = asyncio.get_running_loop()
    # Method 1: await the coroutine functions directly
    # (passing bare coroutines to asyncio.wait() was removed in Python 3.11; wrap in tasks)
    s = time()
    for i in range(repeat):
        tasks = [asyncio.create_task(ainference(model1, fake_input, 'res50')),
                 asyncio.create_task(ainference(model2, fake_input, 'res101'))]
        done, pending = await asyncio.wait(tasks)
    torch.cuda.synchronize()
    e = time()
    print("coroutines directly:", e - s)
 
    # Method 2: wrap the plain function as an awaitable future on the default executor
    s = time()
    for i in range(repeat):
        f1 = loop.run_in_executor(None, inference, model1, fake_input, "res50")
        f2 = loop.run_in_executor(None, inference, model2, fake_input, "res101")
        done, pending = await asyncio.wait([f1, f2])
    torch.cuda.synchronize()
    e = time()
    print("plain function via run_in_executor:", e - s)
 
    # Method 3: run_in_executor with an explicit thread pool
    pool = ThreadPoolExecutor(max_workers=2)
    s = time()
    for i in range(repeat):
        f1 = loop.run_in_executor(pool, inference, model1, fake_input, "res50")
        f2 = loop.run_in_executor(pool, inference, model2, fake_input, "res101")
        done, pending = await asyncio.wait([f1, f2])
    torch.cuda.synchronize()
    e = time()
    print("via explicit thread pool:", e - s)
 
 
 
if __name__ == '__main__':
    asyncio.run(main())

Result:

Calling the coroutine functions directly takes about as long as serial execution: the forward pass is a blocking, compute-bound call that never awaits, so the event loop has no chance to interleave the two coroutines.

The gap between the last two methods is likely down to thread-pool size: the default executor sizes itself from the CPU count (min(32, os.cpu_count() + 4) on Python 3.8+), while the explicit pool is capped at 2. Empirically, a maximum of 2 threads is fastest, 1 thread is about the same as serial, and more than 2 adds some overhead; see the sketch below for pinning the default executor's size.
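If you want Method 2 to use a fixed-size pool as well, you can install one as the loop's default executor. A minimal, self-contained sketch using the standard loop.set_default_executor API, with blocking_work as a hypothetical stand-in for a model forward pass:

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor


def blocking_work(name):
    time.sleep(0.1)  # hypothetical stand-in for a blocking forward pass
    return name


async def main():
    loop = asyncio.get_running_loop()
    # Pin the default executor to 2 workers, matching the explicit pool in Method 3.
    loop.set_default_executor(ThreadPoolExecutor(max_workers=2))
    f1 = loop.run_in_executor(None, blocking_work, "res50")
    f2 = loop.run_in_executor(None, blocking_work, "res101")
    done, pending = await asyncio.wait([f1, f2])
    print([f.result() for f in done])


asyncio.run(main())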

warm up res50: 2.3446831703186035
warm up res101: 4.573963165283203
coroutines directly: 6.90220308303833
plain function via run_in_executor: 5.143332481384277
via explicit thread pool: 4.59693717956543
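On Python 3.9+, asyncio.to_thread offers the same off-loading with less boilerplate. A minimal sketch, assuming the inference function and the model1/model2/fake_input names from the script above:

import asyncio

async def run_both(model1, model2, fake_input):
    # Each to_thread call runs the blocking forward pass in a worker thread,
    # so the two inferences overlap just like the run_in_executor variants.
    return await asyncio.gather(
        asyncio.to_thread(inference, model1, fake_input, "res50"),
        asyncio.to_thread(inference, model2, fake_input, "res101"),
    )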