Commit 84ff978c authored by Micaela Verucchi

Merge with master, all tests passed



Signed-off-by: Micaela Verucchi <micaelaverucchi@gmail.com>
parents 4ca69836 c306b368
@@ -12,5 +12,11 @@ build/
*.hdf5
*.pk
*.table
cmake-build-release/
demo/COCO_val2017
demo/BDD100K_val
/.vs
cmake-build-minsizerel/*
scripts/COCO_val2017/*
scripts/COCO_val2017.zip
scripts/all_labels.txt
\ No newline at end of file
cmake_minimum_required(VERSION 3.15)
project (tkDNN)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
if(UNIX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -fPIC -Wno-deprecated-declarations -Wno-unused-variable ")
endif()
if(WIN32)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_FLAGS "/O2 /FS /EHsc")
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif(WIN32)
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include/tkDNN)
# project specific flags
@@ -10,7 +17,13 @@ if(DEBUG)
add_definitions(-DDEBUG)
endif()
if(TKDNN_PATH)
message("SET TKDNN_PATH:"${TKDNN_PATH})
add_definitions(-DTKDNN_PATH="${TKDNN_PATH}")
else()
add_definitions(-DTKDNN_PATH="${CMAKE_CURRENT_SOURCE_DIR}")
endif()
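# Note: TKDNN_PATH can be overridden at configure time, e.g.
#   cmake .. -DTKDNN_PATH=/path/to/tkDNN
# (the path above is only an example)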
#-------------------------------------------------------------------------------
# CUDA
@@ -28,19 +41,21 @@ include_directories(${CUDNN_INCLUDE_DIR})
file(GLOB tkdnn_CUSRC "src/kernels/*.cu" "src/sorting.cu")
cuda_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include ${CUDA_INCLUDE_DIRS} ${CUDNN_INCLUDE_DIRS})
cuda_add_library(kernels SHARED ${tkdnn_CUSRC})
target_link_libraries(kernels ${CUDA_CUBLAS_LIBRARIES})
#-------------------------------------------------------------------------------
# External Libraries
#-------------------------------------------------------------------------------
find_package(Eigen3 REQUIRED)
message("Eigen DIR: " ${EIGEN3_INCLUDE_DIR})
include_directories(${EIGEN3_INCLUDE_DIR})
find_package(OpenCV REQUIRED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DOPENCV")
find_package(yaml-cpp REQUIRED)
#-------------------------------------------------------------------------------
# Build Libraries
@@ -48,7 +63,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DOPENCV")
file(GLOB tkdnn_SRC "src/*.cpp")
set(tkdnn_LIBS kernels ${CUDA_LIBRARIES} ${CUDA_CUBLAS_LIBRARIES} ${CUDNN_LIBRARIES} ${OpenCV_LIBS} yaml-cpp)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include ${CUDA_INCLUDE_DIRS} ${OPENCV_INCLUDE_DIRS} ${NVINFER_INCLUDES})
add_library(tkDNN SHARED ${tkdnn_SRC})
target_link_libraries(tkDNN ${tkdnn_LIBS})
@@ -77,6 +92,7 @@ foreach(test_SRC ${darknet_SRC})
set(test_NAME test_${test_NAME})
add_executable(${test_NAME} ${test_SRC})
target_link_libraries(${test_NAME} tkDNN)
install(TARGETS ${test_NAME} DESTINATION bin)
endforeach()
# MOBILENET
@@ -136,7 +152,10 @@ target_link_libraries(seg_demo tkDNN)
message("install dir:" ${CMAKE_INSTALL_PREFIX})
install(DIRECTORY include/ DESTINATION include/)
install(TARGETS tkDNN kernels DESTINATION lib)
install(TARGETS test_simple test_mnist test_mnistRT test_rtinference demo map_demo DESTINATION bin)
install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/cmake/" # source directory
DESTINATION "share/tkDNN/cmake/" # target directory
)
install(DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/tests/" # source directory
DESTINATION "share/tkDNN/tests" # target directory
)
1) error C2131 @ Yolo3Detection.cpp(97): the expression doesn't evaluate to a constant, caused by a read of a variable outside its lifetime.
\ No newline at end of file
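For context, C2131 typically comes from a stack array sized by a runtime value, which MSVC rejects. A minimal sketch of the pattern and a portable fix (illustrative names, not the actual tkDNN code):
```cpp
#include <vector>

void postprocess(int nDets) {
    // float scores[nDets];           // error C2131 on MSVC: nDets is not a compile-time constant
    std::vector<float> scores(nDets); // portable fix: buffer sized at runtime
}
```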
# tkDNN
tkDNN is a Deep Neural Network library built with cuDNN and TensorRT primitives, specifically designed to work on NVIDIA Jetson boards. It has been tested on TK1 (branch cudnn2), TX1, TX2, AGX Xavier, Nano and several discrete GPUs.
The main goal of this project is to exploit NVIDIA boards as much as possible to obtain the best inference performance. It does not allow training.
If you use tkDNN in your research, please cite the [following paper](https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=9212130&casa_token=sQTJXi7tJNoAAAAA:BguH9xCIY48MxbtDS3LXzIXzO-9sWArm7Hd7y7BwaLmqRuM_Gx8bOYizFPNMNtpo5K0kB-P-). For use in commercial solutions, write to gattifrancesco@hotmail.it and micaela.verucchi@unimore.it or refer to https://hipert.unimore.it/ .
```
@inproceedings{verucchi2020systematic,
title={A Systematic Assessment of Embedded Neural Networks for Object Detection},
author={Verucchi, Micaela and Brilli, Gianluca and Sapienza, Davide and Verasani, Mattia and Arena, Marco and Gatti, Francesco and Capotondi, Alessandro and Cavicchioli, Roberto and Bertogna, Marko and Solieri, Marco},
booktitle={2020 25th IEEE International Conference on Emerging Technologies and Factory Automation (ETFA)},
volume={1},
pages={937--944},
year={2020},
organization={IEEE}
}
```
### What's new (20 July 2021)
- [x] Support for semantic segmentation [README](readme/README_seg.md)
- [ ] Support for TensorRT 8 (WIP)
## FPS Results
Inference FPS of yolov4 with tkDNN, averaged over 1200 images with the same dimension as the input size, on:
* RTX 2080Ti (CUDA 10.2, TensorRT 7.0.0, cuDNN 7.6.5);
* AGX Xavier, Jetpack 4.3 (CUDA 10.0, cuDNN 7.6.3, TensorRT 6.0.1);
* Xavier NX, Jetpack 4.4 (CUDA 10.2, cuDNN 8.0.0, TensorRT 7.1.0);
* Tx2, Jetpack 4.2 (CUDA 10.0, cuDNN 7.3.1, TensorRT 5.0.6);
* Jetson Nano, Jetpack 4.4 (CUDA 10.2, cuDNN 8.0.0, TensorRT 7.1.0).
| Platform | Network | FP32, B=1 | FP32, B=4 | FP16, B=1 | FP16, B=4 | INT8, B=1 | INT8, B=4 |
| :------: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
| RTX 2080Ti | yolo4 320 | 118.59 | 237.31 | 207.81 | 443.32 | 262.37 | 530.93 |
| RTX 2080Ti | yolo4 416 | 104.81 | 162.86 | 169.06 | 293.78 | 206.93 | 353.26 |
| RTX 2080Ti | yolo4 512 | 92.98 | 132.43 | 140.36 | 215.17 | 165.35 | 254.96 |
| RTX 2080Ti | yolo4 608 | 63.77 | 81.53 | 111.39 | 152.89 | 127.79 | 184.72 |
| AGX Xavier | yolo4 320 | 26.78 | 32.05 | 57.14 | 79.05 | 73.15 | 97.56 |
| AGX Xavier | yolo4 416 | 19.96 | 21.52 | 41.01 | 49.00 | 50.81 | 60.61 |
| AGX Xavier | yolo4 512 | 16.58 | 16.98 | 31.12 | 33.84 | 37.82 | 41.28 |
| AGX Xavier | yolo4 608 | 9.45 | 10.13 | 21.92 | 23.36 | 27.05 | 28.93 |
| Xavier NX | yolo4 320 | 14.56 | 16.25 | 30.14 | 41.15 | 42.13 | 53.42 |
| Xavier NX | yolo4 416 | 10.02 | 10.60 | 22.43 | 25.59 | 29.08 | 32.94 |
| Xavier NX | yolo4 512 | 8.10 | 8.32 | 15.78 | 17.13 | 20.51 | 22.46 |
| Xavier NX | yolo4 608 | 5.26 | 5.18 | 11.54 | 12.06 | 15.09 | 15.82 |
| Tx2 | yolo4 320 | 11.18 | 12.07 | 15.32 | 16.31 | - | - |
| Tx2 | yolo4 416 | 7.30 | 7.58 | 9.45 | 9.90 | - | - |
| Tx2 | yolo4 512 | 5.96 | 5.95 | 7.22 | 7.23 | - | - |
| Tx2 | yolo4 608 | 3.63 | 3.65 | 4.67 | 4.70 | - | - |
| Nano | yolo4 320 | 4.23 | 4.55 | 6.14 | 6.53 | - | - |
| Nano | yolo4 416 | 2.88 | 3.00 | 3.90 | 4.04 | - | - |
| Nano | yolo4 512 | 2.32 | 2.34 | 3.02 | 3.04 | - | - |
| Nano | yolo4 608 | 1.40 | 1.41 | 1.92 | 1.93 | - | - |
## mAP Results
Results on COCO val 2017 (5k images), on an RTX 2080Ti, with conf threshold = 0.001:
| Network | tkDNN MAP(0.5:0.95), CodaLab | tkDNN AP50, CodaLab | darknet MAP(0.5:0.95), CodaLab | darknet AP50, CodaLab | tkDNN map MAP(0.5:0.95) | tkDNN map AP50 |
| -------------------- | :-----------: | :-------: | :-----------: | :---------: | :-----------: | :-------: |
| Yolov3 (416x416) | 0.381 | 0.675 | 0.380 | 0.675 | 0.372 | 0.663 |
| yolov4 (416x416) | 0.468 | 0.705 | 0.471 | 0.710 | 0.459 | 0.695 |
| yolov3tiny (416x416) | 0.096 | 0.202 | 0.096 | 0.201 | 0.093 | 0.198 |
| yolov4tiny (416x416) | 0.202 | 0.400 | 0.201 | 0.400 | 0.197 | 0.395 |
| Cnet-dla34 (512x512) | 0.366 | 0.543 | \- | \- | 0.361 | 0.535 |
| mv2SSD (512x512) | 0.226 | 0.381 | \- | \- | 0.223 | 0.378 |
## Index
- [tkDNN](#tkdnn)
@@ -58,6 +84,14 @@ Inference FPS of yolov4 with tkDNN, average of 1200 images with the same dimesio
- [mAP demo](#map-demo)
- [Existing tests and supported networks](#existing-tests-and-supported-networks)
- [References](#references)
- [tkDNN on Windows 10 (experimental)](#tkdnn-on-windows-10-experimental)
- [Dependencies-Windows](#dependencies-windows)
- [Compiling tkDNN on Windows](#compiling-tkdnn-on-windows)
- [Run the demo on Windows](#run-the-demo-on-windows)
- [FP16 inference windows](#fp16-inference-windows)
- [INT8 inference windows](#int8-inference-windows)
- [Known issues with tkDNN on Windows](#known-issues-with-tkdnn-on-windows)
@@ -155,7 +189,7 @@ tkDNN implement and easy parser for darknet cfg files, a network can be converte
tk::dnn::Network *net = tk::dnn::darknetParser("yolov4.cfg", "yolov4/layers", "coco.names");
net->print();
```
All models from darknet are now parsed directly from the cfg; you still need to export the weights with the tools described in the previous section.
<details>
<summary>Supported layers</summary>
convolutional
@@ -173,19 +207,30 @@ All models from darknet are now parsed directly from cfg, you still need to expo
relu
leaky
mish
logistic
</details>
## Run the demo
This is an example using yolov4.
To run an object detection demo, first create the .rt file by running:
```
rm yolo4_fp32.rt # be sure to delete(or move) old tensorRT files
./test_yolo4 # run the yolo test (is slow)
```
If the creation fails, check the error by enabling TensorRT debugging as follows:
```
cmake .. -DDEBUG=True
make
```
Once you have successfully created your rt file, run the demo:
```
./demo yolo4_fp32.rt ../demo/yolo_test.mp4 y
```
In general the demo program takes 7 parameters:
```
./demo <network-rt-file> <path-to-video> <kind-of-network> <number-of-classes> <n-batches> <show-flag> <conf-thresh>
```
where
* ```<network-rt-file>``` is the rt file generated by a test
@@ -194,9 +239,11 @@ where
* ```<number-of-classes>``` is the number of classes the network is trained on
* ```<n-batches>``` number of batches to use in inference (N.B. you should first export TKDNN_BATCHSIZE to the required n_batches and recreate the rt file for the network).
* ```<show-flag>``` if set to 0 the demo will not show the visualization but will save the video as result.mp4 (only when n-batches == 1)
* ```<conf-thresh>``` confidence threshold for the detector. Only bounding boxes with confidence greater than conf-thresh will be displayed.
N.B. FP32 inference is used by default.
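For instance, an invocation with every parameter spelled out (the values here are only illustrative) could be:
```
./demo yolo4_fp32.rt ../demo/yolo_test.mp4 y 80 1 1 0.3
```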
![demo](https://user-images.githubusercontent.com/11562617/72547657-540e7800-388d-11ea-83c6-49dfea2a0607.gif)
### FP16 inference
@@ -221,7 +268,7 @@ You should provide image_list.txt and label_list.txt, using training images. How
```
bash scripts/download_validation.sh COCO
```
to automatically download the COCO2017 validation set (inside the demo folder) and create the needed files. Use BDD instead of COCO to download the BDD validation set.
Then a complete example using yolo3 and COCO dataset would be:
```
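# sketch of the remaining steps, mirroring the Windows INT8 example later
# in this README (file names assumed)
export TKDNN_MODE=INT8
export TKDNN_CALIB_LABEL_PATH=../demo/COCO_val2017/all_labels.txt
export TKDNN_CALIB_IMG_PATH=../demo/COCO_val2017/all_images.txt
rm yolo3_int8.rt   # be sure to delete (or move) old tensorRT files
./test_yolo3       # run the yolo test (is slow)
./demo yolo3_int8.rt ../demo/yolo_test.mp4 y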
@@ -243,8 +290,8 @@ N.B.
export TKDNN_BATCHSIZE=2
# build tensorRT files
```
This will create a TensorRT file with the desired **max** batch size.
The test will still run with a batch of 1, but the created TensorRT engine can handle the desired batch size.
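Putting it together, a sketch of a multi-batch run (the network and batch size are only examples):
```
export TKDNN_BATCHSIZE=4
rm yolo4_fp32.rt   # recreate the engine so it is built with the new max batch size
./test_yolo4
./demo yolo4_fp32.rt ../demo/yolo_test.mp4 y 80 4 0
```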
### Test batch Inference
This will test the network with random input and check if the output of each batch is the same.
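A hedged example, assuming the rt file was created as above and that the test takes the engine file plus the number of batches, as in the repository's other examples:
```
./test_rtinference yolo4_fp32.rt 4
```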
@@ -290,7 +337,7 @@ cd build
./map_demo dla34_cnet_FP32.rt c ../demo/COCO_val2017/all_labels.txt ../demo/config.yaml
```
This demo also creates a json file named ```net_name_COCO_res.json``` containing all the detections computed. The detections are in COCO format, the correct format to submit the results to [CodaLab COCO detection challenge](https://competitions.codalab.org/competitions/20794#participate).
## Existing tests and supported networks
@@ -317,6 +364,98 @@ This demo also creates a json file named ```net_name_COCO_res.json``` containing
| resnet101_cnet | Centernet (Resnet101 backend)<sup>4</sup> | [COCO 2017](http://cocodataset.org/) | 80 | 512x512 | [weights](https://cloud.hipert.unimore.it/s/5BTjHMWBcJk8g3i/download) |
| csresnext50-panet-spp | Cross Stage Partial Network <sup>7</sup> | [COCO 2014](http://cocodataset.org/) | 80 | 416x416 | [weights](https://cloud.hipert.unimore.it/s/Kcs4xBozwY4wFx8/download) |
| yolo4 | Yolov4 <sup>8</sup> | [COCO 2017](http://cocodataset.org/) | 80 | 416x416 | [weights](https://cloud.hipert.unimore.it/s/d97CFzYqCPCp5Hg/download) |
| yolo4_berkeley | Yolov4 <sup>8</sup> | [BDD100K ](https://bair.berkeley.edu/blog/2018/05/30/bdd/) | 10 | 540x320 | [weights](https://cloud.hipert.unimore.it/s/nkWFa5fgb4NTdnB/download) |
| yolo4tiny | Yolov4 tiny <sup>9</sup> | [COCO 2017](http://cocodataset.org/) | 80 | 416x416 | [weights](https://cloud.hipert.unimore.it/s/iRnc4pSqmx78gJs/download) |
| yolo4x | Yolov4x-mish <sup>9</sup> | [COCO 2017](http://cocodataset.org/) | 80 | 640x640 | [weights](https://cloud.hipert.unimore.it/s/5MFjtNtgbDGdJEo/download) |
| yolo4x-cps | Scaled Yolov4 <sup>10</sup> | [COCO 2017](http://cocodataset.org/) | 80 | 512x512 | [weights](https://cloud.hipert.unimore.it/s/AfzHE4BfTeEm2gH/download) |
## tkDNN on Windows 10 (experimental)
### Dependencies-Windows
This branch should work on every NVIDIA GPU supported on Windows, with the following dependencies:
* WINDOWS 10 1803 or HIGHER
* CUDA 10.0 (recommended: CUDA 11.2)
* CUDNN 7.6 (recommended: CUDNN 8.1.1)
* TENSORRT 6.0.1 (recommended: TENSORRT 7.2.3.4)
* OPENCV 3.4 (recommended: OPENCV 4.2.0)
* MSVC 16.7
* YAML-CPP
* EIGEN3
* 7ZIP (ADD TO PATH)
* NINJA 1.10
All the above-mentioned dependencies except 7ZIP can be installed using Microsoft's [VCPKG](https://github.com/microsoft/vcpkg.git).
After bootstrapping VCPKG, the dependencies can be built and installed with the following commands:
```
opencv4(normal) - vcpkg.exe install opencv4[tbb,jpeg,tiff,opengl,openmp,png,ffmpeg,eigen]:x64-windows yaml-cpp:x64-windows eigen3:x64-windows --x-install-root=C:\opt --x-buildtrees-root=C:\temp_vcpkg_build
opencv4(cuda) - vcpkg.exe install opencv4[cuda,nonfree,contrib,eigen,tbb,jpeg,tiff,opengl,openmp,png,ffmpeg]:x64-windows yaml-cpp:x64-windows eigen3:x64-windows --x-install-root=C:\opt --x-buildtrees-root=C:\temp_vcpkg_build
```
To build opencv4 with the CUDA and cuDNN versions corresponding to your CUDA version, vcpkg's cudnn portfile needs to be modified by adding ```$ENV{CUDA_PATH}``` at lines 16 and 17 of portfile.cmake.
After VCPKG finishes building and installing all the packages, delete C:\temp_vcpkg_build and add C:\opt\x64-windows\bin and C:\opt\x64-windows\debug\bin to PATH.
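For the current shell session this can be done with the following sketch (cmd syntax; adjust the paths if you used a different install root):
```
set PATH=%PATH%;C:\opt\x64-windows\bin;C:\opt\x64-windows\debug\bin
```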
### Compiling tkDNN on Windows
tkDNN is built with CMake (3.15+) and Ninja on Windows. MSBuild and NMake Makefiles are drastically slower than Ninja when compiling the library.
```
git clone https://github.com/ceccocats/tkDNN.git
cd tkDNN
mkdir build
cd build
cmake -DCMAKE_BUILD_TYPE=Release -G"Ninja" ..
ninja -j4
```
### Run the demo on Windows
This example uses yolo4tiny.\
To run the object detection demo, first create the .rt file by running:
```
.\test_yolo4tiny.exe
```
Once the rt file has been successfully created, run the demo using the following command:
```
.\demo.exe yolo4tiny_fp32.rt ..\demo\yolo_test.mp4 y
```
For general info on the demo parameters, check the Run the demo section above.
To run test_all_tests.sh on Windows, use Git Bash or MSYS2.
### FP16 inference windows
This feature is untested on Windows. To run the object detection demo with FP16 inference, follow these steps (example with yolo4tiny):
```
set TKDNN_MODE=FP16
del /f yolo4tiny_fp16.rt
.\test_yolo4tiny.exe
.\demo.exe yolo4tiny_fp16.rt ..\demo\yolo_test.mp4
```
### INT8 inference windows
To run the object detection demo with INT8 inference (example with yolo4tiny):
```
set TKDNN_MODE=INT8
set TKDNN_CALIB_LABEL_PATH=..\demo\COCO_val2017\all_labels.txt
set TKDNN_CALIB_IMG_PATH=..\demo\COCO_val2017\all_images.txt
rem be sure to delete (or move) old tensorRT files
del /f yolo4tiny_int8.rt
rem run the yolo test (it is slow)
.\test_yolo4tiny.exe
.\demo.exe yolo4tiny_int8.rt ..\demo\yolo_test.mp4 y
```
### Known issues with tkDNN on Windows
Mobilenet and Centernet demos work properly only when built with MSVC 16.7 in Release mode; when built in Debug mode, one might encounter OpenCV assert errors with these networks.
All darknet models work properly with the demo using MSVC versions 16.7-16.9.
It is recommended to use NVIDIA driver 465+; CUDA "unknown error" failures have been observed when using older drivers on Pascal (SM 61) devices.
## References
@@ -329,3 +468,5 @@ This demo also creates a json file named ```net_name_COCO_res.json``` containing
6. He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
7. Wang, Chien-Yao, et al. "CSPNet: A New Backbone that can Enhance Learning Capability of CNN." arXiv preprint arXiv:1911.11929 (2019).
8. Bochkovskiy, Alexey, Chien-Yao Wang, and Hong-Yuan Mark Liao. "YOLOv4: Optimal Speed and Accuracy of Object Detection." arXiv preprint arXiv:2004.10934 (2020).
9. Bochkovskiy, Alexey, "Yolo v4, v3 and v2 for Windows and Linux" (https://github.com/AlexeyAB/darknet)
10. Wang, Chien-Yao, Alexey Bochkovskiy, and Hong-Yuan Mark Liao. "Scaled-YOLOv4: Scaling Cross Stage Partial Network." arXiv preprint arXiv:2011.08036 (2020).
@@ -3,5 +3,5 @@ map_points : 101 #number of recall points (0 for all, 101 for COCO, 11 Pascal
map_levels : 10 #number of IoU step for the AP
map_step : 0.05 #step of IoU
IoU_thresh : 0.5 #starting IoU threshold
conf_thresh : 0.001 #threshold on the confidence of the bbox
verbose : false #print on screen information
#include <iostream>
#include <signal.h>
#include <stdlib.h> /* srand, rand */
//#include <unistd.h>
#include <mutex>
#include "CenternetDetection.h"
@@ -22,10 +22,15 @@ int main(int argc, char *argv[]) {
signal(SIGINT, sig_handler);
std::string net = "yolo3_berkeley.rt";
std::string net = "yolo4tiny_fp32.rt";
if(argc > 1)
net = argv[1];
#ifdef __linux__
std::string input = "../demo/yolo_test.mp4";
#elif _WIN32
std::string input = "..\\..\\..\\demo\\yolo_test.mp4";
#endif
if(argc > 2)
input = argv[2];
char ntype = 'y';
@@ -40,6 +45,9 @@ int main(int argc, char *argv[]) {
bool show = true;
if(argc > 6)
show = atoi(argv[6]);
float conf_thresh=0.3;
if(argc > 7)
conf_thresh = atof(argv[7]);
if(n_batch < 1 || n_batch > 64)
FatalError("Batch dim not supported");
@@ -69,7 +77,7 @@ int main(int argc, char *argv[]) {
FatalError("Network type not allowed (3rd parameter)\n");
}
detNN->init(net, n_classes, n_batch, conf_thresh);
gRun = true;
@@ -128,7 +136,7 @@ int main(int argc, char *argv[]) {
double mean = 0;
std::cout<<COL_GREENB<<"\n\nTime stats:\n";
std::cout<<"Min: "<<*std::min_element(detNN->stats.begin(), detNN->stats.end())/n_batch<<" ms\n";
std::cout<<"Min: "<<*std::min_element(detNN->stats.begin(), detNN->stats.end())/n_batch<<" ms\n";
std::cout<<"Max: "<<*std::max_element(detNN->stats.begin(), detNN->stats.end())/n_batch<<" ms\n";
for(int i=0; i<detNN->stats.size(); i++) mean += detNN->stats[i]; mean /= detNN->stats.size();
std::cout<<"Avg: "<<mean/n_batch<<" ms\t"<<1000/(mean/n_batch)<<" FPS\n"<<COL_END;
......
@@ -2,7 +2,10 @@
#include <iostream>
#include <signal.h>
#include <stdlib.h> /* srand, rand */
#ifdef __linux__
#include <unistd.h>
#endif
#include <mutex>
#include "utils.h"
@@ -105,7 +108,7 @@ int main(int argc, char *argv[])
default:
FatalError("Network type not allowed (3rd parameter)\n");
}
detNN->init(net, n_classes, 1, conf_thresh);
//read images
std::ifstream all_labels(labels_path);
......
FROM ceccocats/tkdnn:latest
LABEL maintainer "Francesco Gatti"
RUN cd && git clone https://github.com/ceccocats/tkDNN.git && cd tkDNN && mkdir build && cd build \
&& cmake .. && make -j12
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
LABEL maintainer "Francesco Gatti"
ADD nv-tensorrt-repo-ubuntu1804-cuda10.2-trt7.0.0.11-ga-20191216_1-1_amd64.deb /tmp/trt.deb
RUN apt-get update && dpkg -i /tmp/trt.deb && rm /tmp/trt.deb && apt-get update
RUN apt install -y libnvinfer7=7.0.0-1+cuda10.2 libnvinfer-dev=7.0.0-1+cuda10.2
RUN DEBIAN_FRONTEND=noninteractive apt install -y git wget libeigen3-dev libyaml-cpp-dev
RUN cd /tmp && \
wget https://github.com/Kitware/CMake/releases/download/v3.17.3/cmake-3.17.3-Linux-x86_64.sh && \
chmod +x cmake-3.17.3-Linux-x86_64.sh && \
./cmake-3.17.3-Linux-x86_64.sh --prefix=/usr/local --exclude-subdir --skip-license && \
rm ./cmake-3.17.3-Linux-x86_64.sh
RUN echo "INSTALL OPENCV"
RUN apt-get install -y build-essential \
unzip \
pkg-config \
libjpeg-dev \
libpng-dev \
libtiff-dev \
libavcodec-dev \
libavformat-dev \
libswscale-dev \
libv4l-dev \
libxvidcore-dev \
libx264-dev \
libgtk-3-dev \
libatlas-base-dev \
gfortran \
libgstreamer1.0-dev \
libgstreamer-plugins-base1.0-dev \
libdc1394-22-dev \
libavresample-dev
RUN cd && wget https://github.com/opencv/opencv/archive/4.3.0.tar.gz && tar -xf 4.3.0.tar.gz && rm *.tar.gz
RUN cd && wget https://github.com/opencv/opencv_contrib/archive/4.3.0.tar.gz && tar -xf 4.3.0.tar.gz && rm *.tar.gz
RUN cd && \
cd opencv-4.3.0 && mkdir build && cd build && \
cmake -D CMAKE_BUILD_TYPE=RELEASE \
-D CMAKE_INSTALL_PREFIX=/usr/local \
-D INSTALL_PYTHON_EXAMPLES=OFF \
-D INSTALL_C_EXAMPLES=OFF \
-D OPENCV_EXTRA_MODULES_PATH=$HOME/opencv_contrib-4.3.0/modules \
-D BUILD_EXAMPLES=OFF \
-D WITH_CUDA=ON \
-D CUDA_ARCH_BIN=7.2 \
-D CUDA_ARCH_PTX="" \
-D ENABLE_FAST_MATH=ON \
-D CUDA_FAST_MATH=ON \
-D WITH_CUBLAS=ON \
-D WITH_LIBV4L=ON \
-D WITH_GSTREAMER=ON \
-D WITH_GSTREAMER_0_10=OFF \
-D WITH_TBB=ON \
../ && make -j12 && make install
RUN apt clean
# Use the prebuilt image
```
# build image
docker build -t tkdnn:build -f Dockerfile .
```
# Build Base Docker image
```
# make nvidia-docker work
# follow this guide: https://github.com/NVIDIA/nvidia-docker
# download TensorRT
# from: https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/7.0/7.0.0.11/local_repo/nv-tensorrt-repo-ubuntu1804-cuda10.2-trt7.0.0.11-ga-20191216_1-1_amd64.deb
# build image
docker build -t ceccocats/tkdnn:latest -f Dockerfile.base .
# run image
docker run -ti --gpus all --rm ceccocats/tkdnn:latest bash
```
@@ -73,7 +73,7 @@ public:
CenternetDetection() {};
~CenternetDetection() {};
bool init(const std::string& tensor_path, const int n_classes=80, const int n_batches=1, const float conf_thresh=0.3);
void preprocess(cv::Mat &frame, const int bi=0);
void postprocess(const int bi=0,const bool mAP=false);
};
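A minimal usage sketch based on this signature and on how demo.cpp calls ```init``` above (the engine file name and the ```tk::dnn``` namespace are assumed from the rest of the repository):
```cpp
#include "CenternetDetection.h"

int main() {
    // mirrors detNN->init(net, n_classes, n_batch, conf_thresh) from demo.cpp
    tk::dnn::CenternetDetection cnet;
    cnet.init("dla34_cnet_FP32.rt", 80, 1, 0.3f); // engine path, classes, batches, conf threshold
    return 0;
}
```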
......
@@ -11,6 +11,7 @@ namespace tk { namespace dnn {
int channels = 3;
int batch_normalize=0;
int groups = 1;
int group_id = 0;
int filters=1;
int size_x=1;
int size_y=1;
@@ -23,7 +24,10 @@ namespace tk { namespace dnn {
int num = 1;
int pad = 0;
int coords = 4;
int nms_kind = 0;
int new_coords= 0;
float scale_xy = 1;
float nms_thresh = 0.45;
std::vector<int> layers;
std::string activation = "linear";
......
@@ -4,7 +4,10 @@
#include <iostream>
#include <signal.h>
#include <stdlib.h>
#ifdef __linux__
#include <unistd.h>
#endif
#include <mutex>
#include "utils.h"
@@ -14,7 +17,7 @@
#include "tkdnn.h"
//#define OPENCV_CUDACONTRIB //if OPENCV has been compiled with CUDA and contrib.
#ifdef OPENCV_CUDACONTRIB
#include <opencv2/cudawarping.hpp>
@@ -76,15 +79,15 @@ class DetectionNN {
~DetectionNN(){};
/**
* Method used to initialize the class, allocate memory and compute
* needed data.
*
* @param tensor_path path to the rt file of the NN.
* @param n_classes number of classes for the given dataset.
* @param n_batches maximum number of batches to use in inference