pb_graph = tf.graph_util.convert_variables_to_constants(sess, sess.graph.as_graph_def(), [v.op.name for v in outputs])
with tf.gfile.FastGFile('./pbmodel_name.pb', mode='wb') as f:
    f.write(pb_graph.SerializeToString())
To generate the pb model, simply run the code above after the model has been defined and the weights have been loaded: tf.graph_util.convert_variables_to_constants converts the variables into constants, where outputs is the list of tensors to be used as outputs; finally, pb_graph.SerializeToString() serializes the graph and writes it into the pb file.
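For context, the snippet above is normally run right after restoring the trained weights into the session. Below is a minimal sketch of how it fits into a full freeze script; build_model(), the checkpoint path ./model.ckpt, and the outputs list are placeholders for your own model code, not names from the original article.

import tensorflow as tf

# Rebuild (or import) the graph that defines the model; `outputs` is the
# list of output tensors the frozen graph should expose.
outputs = build_model()  # hypothetical model-definition function

with tf.Session() as sess:
    # Restore the trained weights into the session.
    saver = tf.train.Saver()
    saver.restore(sess, './model.ckpt')  # hypothetical checkpoint path

    # Convert all variables reachable from the output nodes into constants.
    pb_graph = tf.graph_util.convert_variables_to_constants(
        sess, sess.graph.as_graph_def(), [v.op.name for v in outputs])

    # Serialize the frozen graph and write it to a .pb file.
    with tf.gfile.FastGFile('./pbmodel_name.pb', mode='wb') as f:
        f.write(pb_graph.SerializeToString())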
To convert the pb model to UFF, run the convert_to_uff script shipped with the uff package:
python /usr/lib/python2.7/site-packages/uff/bin/convert_to_uff.py pbmodel_name.pb
When the conversion succeeds, the tool prints the total number of nodes in the graph together with the input and output nodes it inferred.
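Alternatively, the conversion can be scripted with the uff Python package instead of the command-line tool. A hedged sketch follows; the output node name 'logits' and the file names are placeholders, so substitute the node names reported for your own model.

import uff

# Convert the frozen pb graph to UFF and write it to disk.
uff_model = uff.from_tensorflow_frozen_model(
    'pbmodel_name.pb',
    output_nodes=['logits'],              # hypothetical output node name
    output_filename='pbmodel_name.uff')   # destination UFF file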
3. Deploying the model with TensorRT
To deploy the generated UFF model with TensorRT, first import the network structure and weights stored in the UFF file, then run the optimization pass to build the corresponding inference engine.
// Initialize the NvInfer plugins
initLibNvInferPlugins(&gLogger.getTRTLogger(), "");
// 1. Create the IBuilder
IBuilder* builder = createInferBuilder(gLogger.getTRTLogger());
assert(builder != nullptr);
// Create the UFF parser
auto parser = createUffParser();
// Register the input node name, dimensions and channel order
parser->registerInput(inputtensor_name, DimsCHW(INPUT_C, INPUT_H, INPUT_W), UffInputOrder::kNCHW);
// MarkOutput_0 is a node created by the UFF converter when we specify an output with -O.
parser->registerOutput(outputtensor_name);
// Parse the UFF model to populate the network, then set the outputs.
INetworkDefinition* network = builder->createNetwork();
gLogInfo << "Begin parsing model..." << std::endl;
if (!parser->parse(uffFile, *network, nvinfer1::DataType::kFLOAT))
{
    gLogError << "Failure while parsing UFF file" << std::endl;
    return nullptr;
}
gLogInfo << "End parsing model..." << std::endl;
// Build the engine.
builder->setMaxBatchSize(maxBatchSize);
// The _GB literal operator is defined in common/common.h
builder->setMaxWorkspaceSize(MAX_WORKSPACE); // We need about 1GB of scratch space for the plugin layer for batch size 5.
if (gArgs.runInInt8)
{
    builder->setInt8Mode(gArgs.runInInt8);
    builder->setInt8Calibrator(calibrator);
}
builder->setFp16Mode(gArgs.runInFp16);
samplesCommon::enableDLA(builder, gArgs.useDLACore);
gLogInfo << "Begin building engine..." << std::endl;
ICudaEngine* engine = builder->buildCudaEngine(*network);
if (!engine)
{
    gLogError << "Unable to create engine" << std::endl;
    return nullptr;
}
gLogInfo << "End building engine..." << std::endl;
// We don't need the network any more, and we can destroy the parser.
network->destroy();
parser->destroy();
builder->destroy();
shutdownProtobufLibrary();
Once the engine is built, inference can be performed. Running inference requires an execution context, IExecutionContext* context, obtained from engine->createExecutionContext().
context = engine->createExecutionContext();
assert(context != nullptr);
The core inference call is:
context.execute(batchSize, &buffers[0]);
Here buffers is an array of void* holding the device addresses of the model's input and output tensors. Allocate the device memory (GPU memory) required by each input and output with cudaMalloc and store the resulting pointers in the buffers array. Before calling execute, copy the input data (the input image) to the corresponding input device buffer with cudaMemcpy; after execute returns, copy the results back from the device, again with cudaMemcpy.
// Run inference.
doInference(*context, &data[0], &detectionOut[0], &keepCount[0], N);

void doInference(IExecutionContext& context, float* inputData, float* detectionOut, int* keepCount, int batchSize)
{
    //auto t_start = std::chrono::high_resolution_clock::now();
    const ICudaEngine& engine = context.getEngine();
    // Input and output buffer pointers that we pass to the engine - the engine requires exactly
    // IEngine::getNbBindings() of these, but in this case we know that there is exactly 1 input and 2 outputs.
    int nbBindings = engine.getNbBindings();
    std::vector<void*> buffers(nbBindings);
    std::vector<std::pair<int64_t, DataType>> buffersSizes = calculateBindingBufferSizes(engine, nbBindings, batchSize);
    for (int i = 0; i < nbBindings; ++i)
    {
        auto bufferSizesOutput = buffersSizes[i];
        buffers[i] = samplesCommon::safeCudaMalloc(bufferSizesOutput.first * samplesCommon::getElementSize(bufferSizesOutput.second));
    }

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings().
    int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME),
        outputIndex0 = engine.getBindingIndex(OUTPUT_BLOB_NAME0),
        outputIndex1 = outputIndex0 + 1; //engine.getBindingIndex(OUTPUT_BLOB_NAME1);

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // DMA the input to the GPU, execute the batch asynchronously, and DMA it back:
    CHECK(cudaMemcpyAsync(buffers[inputIndex], inputData, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));

    auto t_start = std::chrono::high_resolution_clock::now();
    context.execute(batchSize, &buffers[0]);
    auto t_end = std::chrono::high_resolution_clock::now();
    float total = std::chrono::duration<float, std::milli>(t_end - t_start).count();
    gLogInfo << "Time taken for inference is " << total << " ms." << std::endl;

    for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx)
    {
        if (engine.bindingIsInput(bindingIdx))
            continue;
        auto bufferSizesOutput = buffersSizes[bindingIdx];
        printOutput(bufferSizesOutput.first, bufferSizesOutput.second, buffers[bindingIdx]);
    }

    CHECK(cudaMemcpyAsync(detectionOut, buffers[outputIndex0], batchSize * detectionOutputParam.keepTopK * 7 * sizeof(float), cudaMemcpyDeviceToHost, stream));
    CHECK(cudaMemcpyAsync(keepCount, buffers[outputIndex1], batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);

    // Release the stream and the buffers
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[inputIndex]));
    CHECK(cudaFree(buffers[outputIndex0]));
    CHECK(cudaFree(buffers[outputIndex1]));
}