# Freeze the session graph: convert its variables to constants, keeping the
# ops named by `outputs` as the graph outputs.
pb_graph = tf.graph_util.convert_variables_to_constants(
    sess,
    sess.graph.as_graph_def(),
    [v.op.name for v in outputs],
)
# Serialize the frozen GraphDef to a binary .pb file.
with tf.gfile.FastGFile('./pbmodel_name.pb', mode='wb') as f:
    f.write(pb_graph.SerializeToString())
# Run the convert_to_uff.py script on pbmodel_name.pb to produce the UFF model.
python /usr/lib/python2.7/site-packages/uff/bin/convert_to_uff.py pbmodel_name.pb
3. tensorrt部署模型
使用tensorrt部署生成好的uff模型需要先将uff中保存的模型权值以及网络结构导入进来,然后执行优化算法生成对应的inference engine。
//初始化NVINFER PLUGINS initLibNvInferPlugins(&gLogger.getTRTLogger(), ""); //1.IBulider IBuilder * builder = createInferBuilder(gLogger.getTRTLogger()); assert(builder ! = nullptr); //建立UFFParser auto parser = createUffParser(); //登记输入的node名,尺寸,通道顺序 parser->registerInput(inputtensor_name,DimsCHW(INPUT_C,INPUT_H,INPUT_W),UffInputOrder::kNCHW); // MarkOutput_0 is a node created by the UFF converter when we specify an ouput with -O. parser->registerOutput() parser->registerOutput(outputtensor_name); // Parse the UFF model to populate the network, then set the outputs. INetworkDefinition* network=builder->createeNetwork(); gLogInfo << "Begin parsing model..." << std::endl; if(!parser->parse(uffFile,*network,nvinfer1::DataType::kFLOAT)) { gLogError<<"Failure while parsing UFF file"<<std::endl; return nullptr; } gLogInfo << "End parsing model..." << std::endl; // Build the engine. builder->setMaxBatchSize(maxBatchSize); // The _GB literal operator is defined in common/common.h builder->setMaxWorkspaceSize(MAX_WORKSPACE); // We need about 1GB of scratch space for the plugin layer for batch size 5. if (gArgs.runInInt8) { builder->setInt8Mode(gArgs.runInInt8); builder->setInt8Calibrator(calibrator); } builder->setFp16Mode(gArgs.runInFp16); samplesCommon::enableDLA(builder, gArgs.useDLACore); gLogInfo << "Begin building engine..." << std::endl; ICudaEngine* engine = builder->buildCudaEngine(*network); if (!engine) { gLogError << "Unable to create engine" << std::endl; return nullptr; } gLogInfo << "End building engine..." << std::endl; // We don't need the network any more, and we can destroy the parser. network->destroy(); parser->destroy(); builder->destroy(); shutdownProtobufLibrary();
生成engine之后就可以进行推断了。执行推断时需要一个执行上下文IExecutionContext* context,通过engine->createExecutionContext()获得。
// Obtain an execution context from the built engine; inference is run through it.
context = engine->createExecutionContext();
assert(context != nullptr);
// `context` is a pointer (it is compared against nullptr above), so its
// members must be reached with `->`, not `.`.
context->execute(batchSize, &buffers[0]);
// Run inference. doInference(*context, &data[0], &detectionOut[0], &keepCount[0], N); void doInference(IExecutionContext& context, float* inputData, float* detectionOut, int* keepCount, int batchSize) { //建立 //auto t_start = std::chrono::high_resolution_clock::now(); const ICudaEngine& engine = context.getEngine(); // Input and output buffer pointers that we pass to the engine - the engine requires exactly IEngine::getNbBindings(), // of these, but in this case we know that there is exactly 1 input and 2 output. int nbBindings = engine.getNbBindings(); std::vector<void*> buffers(nbBindings); std::vector<std::pair<int64_t, DataType>> buffersSizes = calculateBindingBufferSizes(engine, nbBindings, batchSize); for (int i = 0; i < nbBindings; ++i) { auto bufferSizesOutput = buffersSizes[i]; buffers[i] = samplesCommon::safeCudaMalloc(bufferSizesOutput.first * samplesCommon::getElementSize(bufferSizesOutput.second)); } // In order to bind the buffers, we need to know the names of the input and output tensors. // Note that indices are guaranteed to be less than IEngine::getNbBindings(). int inputIndex = engine.getBindingIndex(INPUT_BLOB_NAME), outputIndex0 = engine.getBindingIndex(OUTPUT_BLOB_NAME0), outputIndex1 = outputIndex0 + 1; //engine.getBindingIndex(OUTPUT_BLOB_NAME1); cudaStream_t stream; CHECK(cudaStreamCreate(&stream)); // DMA the input to the GPU, execute the batch asynchronously, and DMA it back: CHECK(cudaMemcpyAsync(buffers[inputIndex], inputData, batchSize * INPUT_C * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream)); auto t_start = std::chrono::high_resolution_clock::now(); context.execute(batchSize, &buffers[0]); auto t_end = std::chrono::high_resolution_clock::now(); float total = std::chrono::duration<float, std::milli>(t_end - t_start).count(); gLogInfo << "Time taken for inference is " << total << " ms." 
<< std::endl; for (int bindingIdx = 0; bindingIdx < nbBindings; ++bindingIdx) { if (engine.bindingIsInput(bindingIdx)) continue; auto bufferSizesOutput = buffersSizes[bindingIdx]; printOutput(bufferSizesOutput.first, bufferSizesOutput.second, buffers[bindingIdx]); } CHECK(cudaMemcpyAsync(detectionOut, buffers[outputIndex0], batchSize * detectionOutputParam.keepTopK * 7 * sizeof(float), cudaMemcpyDeviceToHost, stream)); CHECK(cudaMemcpyAsync(keepCount, buffers[outputIndex1], batchSize * sizeof(int), cudaMemcpyDeviceToHost, stream)); cudaStreamSynchronize(stream); // Release the stream and the buffers cudaStreamDestroy(stream); CHECK(cudaFree(buffers[inputIndex])); CHECK(cudaFree(buffers[outputIndex0])); CHECK(cudaFree(buffers[outputIndex1])); }
