上一篇学习了tensorrt转onnx后如何使用python API进行批量化的处理,这篇文章的目的是将python API翻译成c++ API并测试下其时间消耗。
代码如下:
#include "argsParser.h"
#include "buffers.h"
#include "common.h"
#include "logger.h"
#include "parserOnnxConfig.h"
#include "NvInfer.h"
#include <cuda_runtime_api.h>
#include <chrono>
#include <fstream>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <vector>
using namespace nvinfer1;
// Owning smart pointer for TensorRT objects; InferDeleter calls ->destroy().
template <typename T>
using SampleUniquePtr = std::unique_ptr<T, samplesCommon::InferDeleter>;
// Test name reported to the sample logger.
const std::string gSampleName = "TensorRT.sample_dynamic_reshape";
// Batch size used for all buffers, binding dimensions and timing below.
const int BatchSize = 32;
// Deserialize a TensorRT engine from a serialized plan file.
// engine:  path to the serialized engine file.
// DLACore: DLA core to run on, or -1 to stay on the GPU.
// err:     stream receiving error messages.
// Returns the deserialized ICudaEngine (caller owns it and must destroy
// it), or nullptr on failure.
ICudaEngine* loadEngine(const std::string& engine, int DLACore, std::ostream& err)
{
    std::ifstream engineFile(engine, std::ios::binary);
    if (!engineFile)
    {
        err << "Error opening engine file: " << engine << std::endl;
        return nullptr;
    }
    // Determine the file size by seeking to the end.
    engineFile.seekg(0, engineFile.end);
    const long int fsize = engineFile.tellg();
    engineFile.seekg(0, engineFile.beg);
    if (fsize <= 0)
    {
        err << "Engine file is empty: " << engine << std::endl;
        return nullptr;
    }
    std::vector<char> engineData(fsize);
    engineFile.read(engineData.data(), fsize);
    if (!engineFile)
    {
        err << "Error loading engine file: " << engine << std::endl;
        return nullptr;
    }
    // Own the runtime with SampleUniquePtr so it is released on every
    // return path (the original leaked the raw IRuntime*).
    SampleUniquePtr<IRuntime> runtime{createInferRuntime(gLogger.getTRTLogger())};
    if (!runtime)
    {
        err << "Error creating inference runtime" << std::endl;
        return nullptr;
    }
    if (DLACore != -1)
    {
        runtime->setDLACore(DLACore);
    }
    return runtime->deserializeCudaEngine(engineData.data(), fsize, nullptr);
}
// Partition the bindings that belong to optimization profile `profile_idx`
// into input and output binding indices (appended to the two out-vectors).
// Bindings in a TensorRT engine are laid out contiguously per profile, so
// profile p owns bindings [p*perProfile, (p+1)*perProfile).
void get_binding_idxs(ICudaEngine* engine, int profile_idx,
std::vector<int>& input_binding_idxs, std::vector<int>& output_binding_idxs)
{
    const int num_bindings_per_profile = engine->getNbBindings() / engine->getNbOptimizationProfiles();
    const int start_binding = profile_idx * num_bindings_per_profile;
    const int end_binding = start_binding + num_bindings_per_profile;
    std::cout << "Engine/Binding Metadata\n";
    std::cout << "\tNumber of optimization profiles: " << engine->getNbOptimizationProfiles() << "\n";
    std::cout << "\tNumber of bindings per profile: " << num_bindings_per_profile << "\n";
    // Report the actual profile index instead of the hard-coded "0" the
    // original printed regardless of profile_idx.
    std::cout << "\tFirst binding for profile " << profile_idx << ": " << start_binding << "\n";
    std::cout << "\tLast binding for profile " << profile_idx << ": " << end_binding - 1 << "\n";
    // Split the profile's bindings into inputs and outputs.
    for (int i = start_binding; i < end_binding; ++i)
    {
        if (engine->bindingIsInput(i))
            input_binding_idxs.push_back(i);
        else
            output_binding_idxs.push_back(i);
    }
}
// Fill a host buffer with random values in (0, 1] simulating a batch of
// 3x224x224 input images, and return a pointer to it.
// The engine/context/input_binding_idxs parameters are unused but kept for
// interface compatibility with existing callers.
// The returned pointer stays valid for the lifetime of the program: the
// original returned .data() of a function-local std::vector, which dangled
// the moment the function returned and was then handed to cudaMemcpyAsync.
float* get_random_inputs(ICudaEngine* engine, SampleUniquePtr<IExecutionContext>& context,
std::vector<int>& input_binding_idxs)
{
    std::cout << "Read imgs from: " << "\n";
    // static: persists after return so the caller's pointer is valid.
    static std::vector<float> fileData(BatchSize * 3 * 224 * 224);
    for (int i = 0; i < BatchSize * 3 * 224 * 224; ++i)
        fileData[i] = 1 - float(rand() % 255 / 255.0);
    return fileData.data();
}
// Unimplemented stub: in the Python original this step derives the output
// binding shapes from the inputs; here main() sets the dimensions directly
// via setBindingDimensions instead, so the body is intentionally empty.
// NOTE(review): "setuo" looks like a typo for "setup_binding_shapes";
// renaming would change the interface, so it is only flagged here.
void setuo_binding_shapes(ICudaEngine* engine, IExecutionContext* context)
{}
// Benchmark: run 1000 batched inferences of a dynamic-shape ResNet18
// engine and report the average per-batch / per-image latency.
int main(int argc, char** argv)
{
    std::string engineFile = "../resnet18_dynamic_fp16.engine";
    auto sampleTest = gLogger.defineTest(gSampleName, argc, argv);
    gLogger.reportTestStart(sampleTest);
    // Load the serialized CUDA engine. Pass -1 to stay on the GPU: the
    // original passed 1, which requests DLA core 1 and fails on GPUs
    // without DLA (such as the V100 used for the benchmarks below).
    // Owning the engine with SampleUniquePtr fixes the original leak;
    // it is declared before `context` so it is destroyed after it.
    SampleUniquePtr<nvinfer1::ICudaEngine> engine{loadEngine(engineFile, -1, gLogError)};
    if (!engine)
    {
        gLogError << "Failed to deserialize engine: " << engineFile << "\n";
        return gLogger.reportFail(sampleTest);
    }
    std::cout << "CudaEngine: " << engineFile << " has been deserialized successful!\n";
    // Create the execution context from the engine; it is reused for
    // every inference below.
    auto context = SampleUniquePtr<IExecutionContext>(engine->createExecutionContext());
    context->setOptimizationProfile(0);
    std::cout << "Activate Optimization Profile: 0\n";
    // Query the input/output binding indices from the engine (these could
    // also be specified by hand).
    std::vector<int> input_binding_idxs, output_binding_idxs;
    get_binding_idxs(engine.get(), context->getOptimizationProfile(), input_binding_idxs, output_binding_idxs);
    std::vector<const char*> input_names;
    for (int i : input_binding_idxs)
        input_names.push_back(engine->getBindingName(i));
    // Generate random host inputs, allocate device memory and copy the
    // inputs to the GPU. `buffers` is indexed by binding index (one
    // input + one output binding for this engine).
    std::vector<void*> buffers(2);
    float* host_inputs = get_random_inputs(engine.get(), context, input_binding_idxs);
    CHECK(cudaMalloc(&buffers[input_binding_idxs[0]], BatchSize * 3 * 224 * 224 * sizeof(float)));
    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));
    CHECK(cudaMemcpyAsync(buffers[input_binding_idxs[0]], host_inputs, BatchSize * 3 * 224 * 224 * sizeof(float),
        cudaMemcpyHostToDevice, stream));
    std::cout << "Input Metadata" << "\n";
    std::cout << "\tNumber of Inputs: " << input_binding_idxs.size() << "\n";
    // Dynamic-shape engine: set the concrete input shape, then allocate
    // the output buffer (sized by hand: BatchSize x 1000 class scores).
    context->setBindingDimensions(input_binding_idxs[0], Dims4{BatchSize, 3, 224, 224});
    if (!context->allInputDimensionsSpecified())
        std::cout << "Some input dimension is not specified!\n";
    CHECK(cudaMalloc(&buffers[output_binding_idxs[0]], BatchSize * 1000 * sizeof(float)));
    // Time 1000 synchronous inferences (executeV2 blocks until done).
    typedef std::chrono::high_resolution_clock Time;
    typedef std::chrono::duration<double, std::ratio<1, 1000>> ms;
    typedef std::chrono::duration<float> fsec;
    double total = 0.0;
    auto t0 = Time::now();
    for (int i = 0; i < 1000; ++i)
    {
        bool status = context->executeV2(buffers.data());
        if (!status) std::cout << "Something is wrong in inference!\n";
    }
    auto t1 = Time::now();
    fsec fs = t1 - t0;
    ms d = std::chrono::duration_cast<ms>(fs);
    total += d.count();
    std::cout << "Running time of 1000 Batch is: " << total / 1000 << " ms\n";
    std::cout << "Running time of 1000 image is: " << total / BatchSize / 1000 << " ms\n";
    // Copy results back to the host. Use a heap-backed vector: the
    // original used a ~128 KB stack array (float[32*1000]), which risks
    // overflowing the stack.
    std::vector<float> output(BatchSize * 1000);
    CHECK(cudaMemcpyAsync(output.data(), buffers[output_binding_idxs[0]], BatchSize * 1000 * sizeof(float),
        cudaMemcpyDeviceToHost, stream));
    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    CHECK(cudaFree(buffers[input_binding_idxs[0]]));
    CHECK(cudaFree(buffers[output_binding_idxs[0]]));
    return gLogger.reportPass(sampleTest);
}
使用 trtexec 工具由 onnx 模型转换成 engine 的方法见上篇。
TensorRT的C++ API查询 NVIDIA/TensorRT/C++ API
在V100上测试ResNet18分别使用TensorRT和Libtorch推理1000次获得的每帧平均处理时间(ms)
batchsize | 1 | 2 | 4 | 8 | 16 | 24 | 32 |
---|---|---|---|---|---|---|---|
TensorRT(FP32) | 1.69 | 0.91 | 0.52 | 0.44 | 0.36 | 0.36 | 0.35 |
TensorRT(FP16) | 1.26 | 0.64 | 0.34 | 0.19 | 0.11 | 0.09 | 0.09 |
libtorch(FP32) | 1.81 | 1.06 | 0.71 | 0.51 | 0.40 | 0.36 | 0.34 |
libtorch(FP16) | 0.43 | 0.15 | 0.08 | 0.04 | 0.03 | 0.03 | 0.02 |