jetbot11 之 hand detect

walletiger發表於2020-12-11

一直陷在物體檢測的坑裡出不來了。誰讓這坑如此之深 ! 繼續加點深度 ! 自己訓練一個 object detect 模型 再跑跑 tensorRT 加速吧。

技術主題:

使用 yolov5s 訓練人手檢測模型,並使用 TensorRT 加速。

 

一 準備資料集

1.1 下載

https://www.robots.ox.ac.uk/~vgg/data/hands/

1.2 轉換 yolo 需要的格式

參考:

https://github.com/coldlarry/YOLOv3-complete-pruning/blob/master/data/converter.py

 

1.3 擴充資料集

這個資料集下載下來只有 5000 多張圖片,訓練顯得有點少。怎麼多快好省地步入共產主義? 寫個指令碼把圖片鏡像一下吧。 1w 資料集有了。

為訓練準備 yolov5/data/hand.yaml

train: /workspace/data/hand_dataset/images/train/
val: /workspace/data/hand_dataset/images/validation/

# number of classes
nc: 1

# class names
names: ['hand']

二 訓練

我是先到了一個終點栽了坑,又反向傳播回到這裡,建議:

      jetson nano 下 跑訓練的模型 img-size 就用 416

      yolov5 使用 3.1 版本

yolov5 下載:

https://github.com/ultralytics/yolov5/tags

同時下載 yolov5s.pt

yolov5/ 下執行:

python3 train.py --img 416 --batch 16 --epochs 300  --data hand.yaml --weights yolov5s.pt --cfg yolov5s.yaml --cache-images --single-cls

最後的訓練結果:

P->0.8322 , Recall->0.9549 , mAP@0.5->0.9663 , mAP@0.5:0.95->0.6981

   294/299     1.82G   0.02242    0.0224         0   0.04482        74       416    0.8299    0.9553    0.9667    0.6961   0.02342   0.02203         0
   295/299     1.82G   0.02225   0.02221         0   0.04446        68       416      0.83    0.9553    0.9668    0.6966   0.02339   0.02202         0
   296/299     1.82G    0.0224   0.02195         0   0.04435        50       416    0.8312    0.9553    0.9669    0.6975   0.02336     0.022         0
   297/299     1.82G   0.02225   0.02205         0    0.0443        65       416    0.8326    0.9551    0.9668    0.6982   0.02334   0.02199         0
   298/299     1.82G   0.02235   0.02194         0   0.04429        69       416    0.8325    0.9551    0.9667    0.6981   0.02331   0.02198         0
   299/299     1.82G   0.02256   0.02214         0    0.0447        52       416    0.8322    0.9549    0.9663    0.6981   0.02329   0.02197         0

三 開發主機環境驗證

python3 detect.py --source 0 --weights runs/exp10/weights/best.pt 

VID

 

四 jetson nano 下部署驗證

4.1 簡單驗證

把訓練好的 模型 best.pt 放到 jetson nano下可跟開發環境同樣驗證。

測試速度 : 640x360@8fps

python3 detect.py --source test_dir/ --weights runs/exp10/weights/best.pt 

4.2 yolov5s 轉 onnx

python3 models/export.py --weights runs/exp10/weights/best.pt --img-size 416 

得到 onnx 模型用 onnxruntime 執行. onnxruntime 編譯了四個多小時 , 檢測速度沒多少提升呢。

# 部分程式碼


def main(image_path='/tmp/in.jpg', model_path="hand_best_300epoch.onnx"):
    """Run the exported ONNX hand detector once on a single image and print timings.

    Args:
        image_path: Image file to read with ``cv2.imread``.
        model_path: ONNX model file passed to ``onnxruntime.InferenceSession``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` returns ``None`` for ``image_path``
            (missing, unreadable, or undecodable file).
    """
    img_raw = cv2.imread(image_path)
    # cv2.imread signals failure by returning None rather than raising; fail
    # here with a clear message instead of an AttributeError on `.shape`.
    if img_raw is None:
        raise FileNotFoundError("cannot read image: %s" % image_path)
    print(img_raw.shape)

    t0 = time.time()
    # process_image_raw returns the image used for drawing and the tensor fed
    # to the network (helper defined elsewhere in the original script).
    img, test_data = process_image_raw(img_raw)
    t1 = time.time()
    print("process img raw cost = %.1f ms " %( 1000 * (t1 - t0)))

    load_start = time.time()
    session = onnxruntime.InferenceSession(model_path)
    load_end = time.time()
    print("load onnx models cost = %.1f ms " %( 1000 * (load_end - load_start)))

    # Only the first input is fed; every declared output is requested.
    input_name = session.get_inputs()[0].name
    output_names = [node.name for node in session.get_outputs()]
    print("inputs name:",input_name,"outputs name:",output_names)

    # Time only session.run so the figure is not skewed by lookups/printing.
    infer_start = time.time()
    prediction = session.run(output_names, {input_name: test_data})
    infer_end = time.time()
    print("infer  cost = %.1f ms " %( 1000 * (infer_end - infer_start)))

    post_start = time.time()
    # NOTE(review): 0.25 / 0.6 are presumably the confidence and IoU
    # thresholds of getBoxes -- confirm against its definition.
    boxes = getBoxes(prediction,0.25,0.6)
    drawBox(boxes,img)
    post_end = time.time()
    print("draw box  cost = %.1f ms " %( 1000 * (post_end - post_start)))


if __name__ == "__main__":
    main()

4.3  yolov5s 轉 tensorRT

yolov5s.pt -> yolov5s.wts -> yolov5s.engine

參考這裡:

https://github.com/wang-xinyu/tensorrtx/tree/master/yolov5

 

劃重點:

yololayer.h 裡:

把 CLASS_NUM, INPUT_W, INPUT_H 都改了

--- a/yolov5/yololayer.h
+++ b/yolov5/yololayer.h
@@ -16,9 +16,9 @@ namespace Yolo
         float anchors[CHECK_COUNT * 2];
     };
     static constexpr int MAX_OUTPUT_BBOX_COUNT = 1000;
-    static constexpr int CLASS_NUM = 80;
-    static constexpr int INPUT_H = 608;
-    static constexpr int INPUT_W = 608;
+    static constexpr int CLASS_NUM = 1;
+    static constexpr int INPUT_H = 416;
+    static constexpr int INPUT_W = 416;
 

 

終於, 編譯出來的 可執行程式  yolov5 -d 測試可以到 40ms 一幀的速度

 

可是。。 我需要的是 庫。

可是。。我想在python 下用。

有個 yolov5_trt.py, 一執行 記憶體吃沒了。。 卡死不得其解。

自己包裝吧

4.4 包裝 yolov5 TensorRT 為 C++庫

//yolov5_lib.h

#pragma once 

#ifdef __cplusplus
extern "C" 
{
#endif 

void * yolov5_trt_create(const char * engine_name);

const char * yolov5_trt_detect(void *h, cv::Mat &img, float threshold);

void yolov5_trt_destroy(void *h);

#ifdef __cplusplus
}
#endif 
//yolov5_lib.cpp 

#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <vector>
#include "cuda_runtime_api.h"
#include "logging.h"
#include "common.hpp"
#include "yolov5_lib.h"

// Build-time configuration of the wrapper library.
// NOTE(review): USE_FP16, DEVICE and CONF_THRESH are not referenced by the
// library code shown alongside these definitions (detection uses the
// caller-supplied threshold) -- presumably kept for parity with the upstream
// sample; confirm before removing.
#define USE_FP16  // comment out this if want to use FP32
#define DEVICE 0  // GPU id
#define NMS_THRESH 0.4
#define CONF_THRESH 0.5
#define BATCH_SIZE 1

// stuff we know about the network and the input/output blobs
static const int INPUT_H = Yolo::INPUT_H;
static const int INPUT_W = Yolo::INPUT_W;
static const int CLASS_NUM = Yolo::CLASS_NUM;
static const int OUTPUT_SIZE = Yolo::MAX_OUTPUT_BBOX_COUNT * sizeof(Yolo::Detection) / sizeof(float) + 1;  // we assume the yololayer outputs no more than MAX_OUTPUT_BBOX_COUNT boxes that conf >= 0.1
// Binding names looked up in the engine. Kept file-local and immutable so the
// shared library does not export mutable globals that could clash with
// same-named symbols in programs linking against it.
static const char* const INPUT_BLOB_NAME = "data";
static const char* const OUTPUT_BLOB_NAME = "prob";
static Logger gLogger;


// Run one batch through the engine: copy `input` to device buffer 0, enqueue
// the execution context, copy device buffer 1 back into `output`, then block
// until `stream` has finished. The return values of enqueue() and
// cudaStreamSynchronize() are not inspected.
static void doInference(IExecutionContext& context, cudaStream_t& stream, void **buffers, float* input, float* output, int batchSize) {
    // Transfer sizes in bytes for the whole batch.
    const auto input_bytes = batchSize * 3 * INPUT_H * INPUT_W * sizeof(float);
    const auto output_bytes = batchSize * OUTPUT_SIZE * sizeof(float);

    // Host -> device copy of the input batch.
    CHECK(cudaMemcpyAsync(buffers[0], input, input_bytes, cudaMemcpyHostToDevice, stream));

    // Inference is queued on the same stream as the copies.
    context.enqueue(batchSize, buffers, stream, nullptr);

    // Device -> host copy of the raw network output.
    CHECK(cudaMemcpyAsync(output, buffers[1], output_bytes, cudaMemcpyDeviceToHost, stream));

    // Wait for everything queued above to complete before returning.
    cudaStreamSynchronize(stream);
}


// State owned by one handle returned from yolov5_trt_create().
struct Yolov5TRTContext
{
    float *data;                     // host input buffer: BATCH_SIZE * 3 * INPUT_H * INPUT_W floats
    float *prob;                     // host output buffer: BATCH_SIZE * OUTPUT_SIZE floats
    IRuntime *runtime;               // runtime used to deserialize the engine
    ICudaEngine *engine;             // deserialized engine
    IExecutionContext *exe_context;  // execution context created from `engine`
    void* buffers[2];                // device buffers, indexed by inputIndex / outputIndex
    cudaStream_t cuda_stream;        // stream used for copies and inference
    int inputIndex;                  // binding index of INPUT_BLOB_NAME
    int outputIndex;                 // binding index of OUTPUT_BLOB_NAME
    char result_json_str[16384];     // text returned by the most recent yolov5_trt_detect()
};

oid * yolov5_trt_create(const char * engine_name)
{
    size_t size = 0;
    char *trtModelStream = NULL;
    Yolov5TRTContext * trt_ctx = NULL;

    trt_ctx = new Yolov5TRTContext();

    std::ifstream file(engine_name, std::ios::binary);
    printf("yolov5_trt_create  ... \n");
    if (file.good()) {
        file.seekg(0, file.end);
        size = file.tellg();
        file.seekg(0, file.beg);
        trtModelStream = new char[size];
        assert(trtModelStream);
        file.read(trtModelStream, size);
        file.close();
    }else
        return NULL;

    trt_ctx->data = new float[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
    trt_ctx->prob = new float[BATCH_SIZE * OUTPUT_SIZE];
    trt_ctx->runtime = createInferRuntime(gLogger);
    assert(trt_ctx->runtime != nullptr);

    printf("yolov5_trt_create  cuda engine... \n");
    trt_ctx->engine = trt_ctx->runtime->deserializeCudaEngine(trtModelStream, size);
    assert(trt_ctx->engine != nullptr);
    trt_ctx->exe_context = trt_ctx->engine->createExecutionContext();


    delete[] trtModelStream;
    assert(trt_ctx->engine->getNbBindings() == 2);

    // In order to bind the buffers, we need to know the names of the input and output tensors.
    // Note that indices are guaranteed to be less than IEngine::getNbBindings()
    trt_ctx->inputIndex = trt_ctx->engine->getBindingIndex(INPUT_BLOB_NAME);
    trt_ctx->outputIndex = trt_ctx->engine->getBindingIndex(OUTPUT_BLOB_NAME);

    assert(trt_ctx->inputIndex == 0);
    assert(trt_ctx->outputIndex == 1);
    // Create GPU buffers on device

    printf("yolov5_trt_create  buffer ... \n");
    CHECK(cudaMalloc(&trt_ctx->buffers[trt_ctx->inputIndex], BATCH_SIZE * 3 * INPUT_H * INPUT_W * sizeof(float)));
    CHECK(cudaMalloc(&trt_ctx->buffers[trt_ctx->outputIndex], BATCH_SIZE * OUTPUT_SIZE * sizeof(float)));
    // Create stream

    printf("yolov5_trt_create  stream ... \n");
    CHECK(cudaStreamCreate(&trt_ctx->cuda_stream));
    printf("yolov5_trt_create  done ... \n");
    return (void *)trt_ctx;


}


const char * yolov5_trt_detect(void *h, cv::Mat &img, float threshold)
{
    Yolov5TRTContext *trt_ctx;
    int i;
    int delay_preprocess;
    int delay_infer;

    trt_ctx = (Yolov5TRTContext *)h;


    trt_ctx->result_json_str[0] = 0;

    if (img.empty()) return trt_ctx->result_json_str;

    auto start0 = std::chrono::system_clock::now();

    //printf("yolov5_trt_detect start preprocess img \n");
    cv::Mat pr_img = preprocess_img(img);



    //printf("yolov5_trt_detect start convert img to float\n");
    // letterbox BGR to RGB
    i = 0;
    for (int row = 0; row < INPUT_H; ++row) {
        uchar* uc_pixel = pr_img.data + row * pr_img.step;
        for (int col = 0; col < INPUT_W; ++col) {
            trt_ctx->data[i] = (float)uc_pixel[2] / 255.0;
            trt_ctx->data[i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
            trt_ctx->data[i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
            uc_pixel += 3;
            ++i;
        }
    }
    auto end0 = std::chrono::system_clock::now();

    delay_preprocess =  std::chrono::duration_cast<std::chrono::milliseconds>(end0 - start0).count();

    // Run inference
    //printf("yolov5_trt_detect start do inference\n");
    auto start = std::chrono::system_clock::now();
    doInference(*trt_ctx->exe_context, trt_ctx->cuda_stream, trt_ctx->buffers, trt_ctx->data, trt_ctx->prob, BATCH_SIZE);

    auto end = std::chrono::system_clock::now();
    delay_infer = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

    std::cout <<"delay_proress:" << delay_preprocess << "ms, " << "delay_infer:" << delay_infer << "ms" << std::endl;

    //printf("yolov5_trt_detect start do process infer result \n");

    int fcount = 1;
    int str_len;
    std::vector<std::vector<Yolo::Detection>> batch_res(1);
    auto& res = batch_res[0];
    nms(res, &trt_ctx->prob[0], threshold, NMS_THRESH);

    sprintf(trt_ctx->result_json_str,
                "{\"delay_preprocess\": %d,"
                "\"delay_infer\": %d,"
                "\"num_det\":%d, \"objects\":[", delay_preprocess, delay_infer, (int) res.size());

    str_len = strlen(trt_ctx->result_json_str);

    i = 0;
    for(i = 0 ; i < res.size(); i++){
        int x1, y1, x2, y2;
        int class_id;

        cv::Rect r = get_rect(img, res[i].bbox);

        x1 = r.x;
        y1 = r.y;
        x2 = r.x + r.width;
        y2 = r.y + r.height;
        class_id = (int)res[i].class_id;


        if (0 == i){
            sprintf(trt_ctx->result_json_str + str_len, "(%d,%d,%d,%d,%d)", class_id, x1, y1, x2, y2);
        }else {
            sprintf(trt_ctx->result_json_str + str_len, ",(%d,%d,%d,%d,%d)", class_id, x1, y1, x2, y2);
        }
        str_len = strlen(trt_ctx->result_json_str);

        if (str_len >= 16300)
            break;

    }

    sprintf(trt_ctx->result_json_str + str_len, "]}");


    return trt_ctx->result_json_str;

}


void yolov5_trt_destroy(void *h)
{
    Yolov5TRTContext *trt_ctx;

    trt_ctx = (Yolov5TRTContext *)h;

    // Release stream and buffers
    cudaStreamDestroy(trt_ctx->cuda_stream);
    CHECK(cudaFree(trt_ctx->buffers[trt_ctx->inputIndex]));
    CHECK(cudaFree(trt_ctx->buffers[trt_ctx->outputIndex]));
    // Destroy the engine
    trt_ctx->exe_context->destroy();
    trt_ctx->engine->destroy();
    trt_ctx->runtime->destroy();

    delete trt_ctx->data;
    delete trt_ctx->prob;

    delete trt_ctx;

}


 

修改 CMakeLists.txt

 

diff --git a/yolov5/CMakeLists.txt b/yolov5/CMakeLists.txt
index f40e006..be0f7b1 100644
--- a/yolov5/CMakeLists.txt
+++ b/yolov5/CMakeLists.txt
@@ -10,7 +10,7 @@ set(CMAKE_BUILD_TYPE Debug)
 
 find_package(CUDA REQUIRED)
 
-set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_30;code=sm_30)
+set(CUDA_NVCC_PLAGS ${CUDA_NVCC_PLAGS};-std=c++11;-g;-G;-gencode;arch=compute_53;code=sm_53)
 
 include_directories(${PROJECT_SOURCE_DIR}/include)
 # include and link dirs of cuda and tensorrt, you need adapt them if yours are different
@@ -23,8 +23,8 @@ link_directories(/usr/lib/x86_64-linux-gnu/)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall -Ofast -Wfatal-errors -D_MWAITXINTRIN_H_INCLUDED")
 
-cuda_add_library(myplugins SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu)
-target_link_libraries(myplugins nvinfer cudart)
+cuda_add_library(yolov5_trt SHARED ${PROJECT_SOURCE_DIR}/yololayer.cu ${PROJECT_SOURCE_DIR}/yolov5_lib.cpp )
+target_link_libraries(yolov5_trt nvinfer cudart)
 
 find_package(OpenCV)
 include_directories(OpenCV_INCLUDE_DIRS)
@@ -32,7 +32,7 @@ include_directories(OpenCV_INCLUDE_DIRS)
 add_executable(yolov5 ${PROJECT_SOURCE_DIR}/yolov5.cpp)
 target_link_libraries(yolov5 nvinfer)
 target_link_libraries(yolov5 cudart)
-target_link_libraries(yolov5 myplugins)
+target_link_libraries(yolov5 yolov5_trt)
 target_link_libraries(yolov5 ${OpenCV_LIBS})
 

編譯得到 libyolov5_trt.so

4.5 包裝 yolov5 TensorRT 為 python 庫 (基於 c++庫)

python modules , 參考:

https://github.com/walletiger/tensorrt_retinaface_with_python/tree/main/python

python wrap

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <Python.h>

#include <opencv2/opencv.hpp>
#include <opencv2/highgui/highgui.hpp>
#include "../yolov5_lib.h"
#include "pyboostcvconverter/pyboostcvconverter.hpp"
#include <boost/python.hpp>


using namespace cv;
using namespace boost::python;



// Python: create(engine_path: str) -> int
// Creates a detector from a serialized engine file and returns its handle as
// an unsigned integer. The handle is 0 when yolov5_trt_create() returns NULL.
// Raises TypeError (set by PyArg_ParseTuple) when the argument is not a string.
static PyObject * mpyCreate(PyObject *self,  PyObject *args)
{
    const char *engine_path = NULL;
    void *trt_engine = NULL;

    // On failure PyArg_ParseTuple has already set an exception; returning NULL
    // is what propagates it. Returning a value here would surface as a
    // confusing SystemError instead.
    if (!PyArg_ParseTuple(args, "s", &engine_path))
        return NULL;

    trt_engine = yolov5_trt_create(engine_path);

    printf("create yolov5-trt , instance = %p\n", trt_engine);

    // The raw pointer is handed to Python as an integer handle.
    return Py_BuildValue("K", (unsigned long long)trt_engine);
}

// Python: detect(handle: int, image: ndarray, conf_thresh: float = 0.45) -> str
// Runs detection on `image` with the detector identified by `handle` and
// returns the result text produced by yolov5_trt_detect().
// Raises TypeError on bad arguments and ValueError when `handle` is 0.
static PyObject *mpyDetect(PyObject *self, PyObject *args)
{
    PyObject *ndArray = NULL;
    float conf_thresh = 0.45;
    unsigned long long v = 0;

    // "|f" makes the threshold optional so the 0.45 default above is actually
    // reachable; three-argument calls behave as before.
    if (!PyArg_ParseTuple(args, "KO|f", &v, &ndArray, &conf_thresh))
        return NULL;  // exception already set by PyArg_ParseTuple

    // A zero handle is what create() returns on failure; reject it here
    // rather than passing a NULL pointer into the native library.
    if (v == 0) {
        PyErr_SetString(PyExc_ValueError, "invalid engine handle (0)");
        return NULL;
    }

    Mat mat = pbcvt::fromNDArrayToMat(ndArray);

    void *trt_engine = (void *)v;

    const char *ret = yolov5_trt_detect(trt_engine, mat, conf_thresh);

    // The text is copied into a new Python str, so it stays valid after the
    // native buffer is overwritten by the next detect call.
    return Py_BuildValue("s", ret);
}

// Python: destroy(handle: int) -> None
// Releases the detector identified by `handle`. A handle of 0 is ignored.
// Raises TypeError (set by PyArg_ParseTuple) on bad arguments.
static PyObject * mPyDestroy(PyObject *self, PyObject *args)
{
    unsigned long long v = 0;

    if (!PyArg_ParseTuple(args, "K", &v))
        return NULL;  // exception already set by PyArg_ParseTuple

    // %llu matches unsigned long long (the previous %lu did not).
    printf(" destroy engine , engine = %llu\n", v);

    void *engine = (void *)v;
    if (engine != NULL)
        yolov5_trt_destroy(engine);

    // Py_BuildValue("O", NULL) returns NULL and raises SystemError, which made
    // every successful destroy() call fail; return None properly instead.
    Py_RETURN_NONE;
}

// Method table of the TRTYolov5 module: maps the Python-visible names
// create / detect / destroy to their C implementations. All three take
// positional arguments only (METH_VARARGS). The all-NULL entry terminates
// the table.
static PyMethodDef TRTYolov5MeThods[] = {
    {"create", mpyCreate, METH_VARARGS, "Create the engine."},
    {"detect", mpyDetect, METH_VARARGS, "use the engine to detect image"},    
    {"destroy", mPyDestroy, METH_VARARGS, "destroy the engine"},        
    {NULL, NULL, 0, NULL}
};

// Module definition handed to PyModule_Create() in PyInit_TRTYolov5.
// The module name must match the suffix of the PyInit_ function so that
// `import TRTYolov5` finds the init entry point.
static struct PyModuleDef TRTYolov5Module = {
    PyModuleDef_HEAD_INIT,
    "TRTYolov5",     /* name of module */
    "",          /* module documentation, may be NULL */
    -1,          /* size of per-interpreter state of the module, or -1 if the module keeps state in global variables. */
    TRTYolov5MeThods
};

// Module entry point invoked by the interpreter on `import TRTYolov5`.
// Logs a message and returns the module object built from TRTYolov5Module
// (NULL if PyModule_Create fails).
PyMODINIT_FUNC PyInit_TRTYolov5(void) {
    PyObject *module = NULL;

    printf("init module ... \n");
    module = PyModule_Create(&TRTYolov5Module);

    return module;
}

終於, python 下可以快速執行 yolov5s TensorRT modules 了

import cv2 
import TRTYolov5 as trt_yolov5

# create() returns an integer handle for the loaded TensorRT engine.
handle = trt_yolov5.create('../yolov5s.engine')

frame = cv2.imread('/workspace/data/x3.jpg')

# detect() returns the detections as a text string.
detections = trt_yolov5.detect(handle, frame, 0.45)

#trt_yolov5.destroy(handle)

print(detections)

 

最後看下 jetson nano 下 實時執行效果: 可以 40ms 一幀的速度來執行檢測應用。

yolov5 jetson nano tensorRT model for hand detect

 

相關文章