Commit db0f8a4d authored by Micaela Verucchi

Support YOLOv4: add the Mish activation (CUDA kernel and TensorRT plugin), scale_x_y support in the Yolo layer/plugin, and the yolo4 test network.



Signed-off-by: Micaela Verucchi <micaelaverucchi@gmail.com>
parent a874fad2
......@@ -112,6 +112,9 @@ target_link_libraries(test_yolo3_berkeley tkDNN)
add_executable(test_yolo3_flir tests/yolo3_flir/yolo3_flir.cpp)
target_link_libraries(test_yolo3_flir tkDNN)
add_executable(test_yolo4 tests/yolo4/yolo4.cpp)
target_link_libraries(test_yolo4 tkDNN)
add_executable(test_mobilenetv2ssd tests/mobilenetv2ssd/mobilenetv2ssd.cpp)
target_link_libraries(test_mobilenetv2ssd tkDNN)
......
......@@ -223,6 +223,7 @@ cd build
| resnet101 | Resnet 101<sup>6</sup> | [COCO 2014](http://cocodataset.org/) | 80 | 224x224 | weights |
| resnet101_cnet | Centernet (Resnet101 backend)<sup>4</sup> | [COCO 2017](http://cocodataset.org/) | 80 | 512x512 | [weights](https://cloud.hipert.unimore.it/s/5BTjHMWBcJk8g3i/download) |
| csresnext50-panet-spp | Cross Stage Partial Network <sup>7</sup> | [COCO 2014](http://cocodataset.org/) | 80 | 416x416 | [weights](https://cloud.hipert.unimore.it/s/Kcs4xBozwY4wFx8/download) |
| yolo4 | Yolov4 <sup>8</sup> | [COCO 2017](http://cocodataset.org/) | 80 | 416x416 | [weights](https://cloud.hipert.unimore.it/s/d97CFzYqCPCp5Hg/download) |
## References
......@@ -234,3 +235,4 @@ cd build
5. Sandler, Mark, et al. "Mobilenetv2: Inverted residuals and linear bottlenecks." Proceedings of the IEEE conference on computer vision and pattern recognition. 2018.
6. He, Kaiming, et al. "Deep residual learning for image recognition." Proceedings of the IEEE conference on computer vision and pattern recognition. 2016.
7. Wang, Chien-Yao, et al. "CSPNet: A New Backbone that can Enhance Learning Capability of CNN." arXiv preprint arXiv:1911.11929 (2019).
8. Bochkovskiy, Alexey, Chien-Yao Wang, and Hong-Yuan Mark Liao. "YOLOv4: Optimal Speed and Accuracy of Object Detection." arXiv preprint arXiv:2004.10934 (2020).
......@@ -18,6 +18,7 @@ enum layerType_t {
LAYER_ACTIVATION,
LAYER_ACTIVATION_CRELU,
LAYER_ACTIVATION_LEAKY,
LAYER_ACTIVATION_MISH,
LAYER_FLATTEN,
LAYER_RESHAPE,
LAYER_MULADD,
......@@ -66,6 +67,7 @@ public:
case LAYER_ACTIVATION: return "Activation";
case LAYER_ACTIVATION_CRELU: return "ActivationCReLU";
case LAYER_ACTIVATION_LEAKY: return "ActivationLeaky";
case LAYER_ACTIVATION_MISH: return "ActivationMish";
case LAYER_FLATTEN: return "Flatten";
case LAYER_RESHAPE: return "Reshape";
case LAYER_MULADD: return "MulAdd";
......@@ -168,7 +170,8 @@ public:
*/
typedef enum {
ACTIVATION_ELU = 100,
ACTIVATION_LEAKY = 101
ACTIVATION_LEAKY = 101,
ACTIVATION_MISH = 102
} tkdnnActivationMode_t;
/**
......@@ -187,6 +190,8 @@ public:
return LAYER_ACTIVATION_CRELU;
else if (act_mode == ACTIVATION_LEAKY)
return LAYER_ACTIVATION_LEAKY;
else if (act_mode == ACTIVATION_MISH)
return LAYER_ACTIVATION_MISH;
else
return LAYER_ACTIVATION;
};
......@@ -561,13 +566,14 @@ public:
int sort_class;
};
Yolo(Network *net, int classes, int num, std::string fname_weights, int n_masks=3);
Yolo(Network *net, int classes, int num, std::string fname_weights, int n_masks=3, float scale_xy=1);
virtual ~Yolo();
virtual layerType_t getLayerType() { return LAYER_YOLO; };
int classes, num, n_masks;
dnnType *mask_h, *mask_d; //anchors
dnnType *bias_h, *bias_d; //anchors
float scaleXY;
std::vector<std::string> classesNames;
virtual dnnType* infer(dataDim_t &dim, dnnType* srcData);
......
......@@ -25,6 +25,7 @@ template<typename T> T readBUF(const char*& buffer)
using namespace nvinfer1;
#include "pluginsRT/ActivationLeakyRT.h"
#include "pluginsRT/ActivationReLUCeilingRT.h"
#include "pluginsRT/ActivationMishRT.h"
#include "pluginsRT/ReorgRT.h"
#include "pluginsRT/RegionRT.h"
//#include "pluginsRT/RouteRT.h"
......
......@@ -8,6 +8,7 @@ void activationLEAKYForward(dnnType *srcData, dnnType *dstData, int size, cudaSt
void activationReLUCeilingForward(dnnType *srcData, dnnType *dstData, int size, const float ceiling, cudaStream_t stream = cudaStream_t(0));
void activationLOGISTICForward(dnnType *srcData, dnnType *dstData, int size, cudaStream_t stream = cudaStream_t(0));
void activationSIGMOIDForward(dnnType *srcData, dnnType *dstData, int size, cudaStream_t stream = cudaStream_t(0));
void activationMishForward(dnnType* srcData, dnnType* dstData, int size, cudaStream_t stream = cudaStream_t(0));
void fill(dnnType *data, int size, dnnType val, cudaStream_t stream = cudaStream_t(0));
......@@ -45,4 +46,6 @@ void dcnV2CudaForward(cublasStatus_t stat, cublasHandle_t handle,
const int in_n, const int in_c, const int in_h, const int in_w,
const int out_n, const int out_c, const int out_h, const int out_w,
const int dst_dim, cudaStream_t stream = cudaStream_t(0));
void scalAdd(dnnType* dstData, int size, float alpha, float beta, int inc, cudaStream_t stream = cudaStream_t(0));
#endif //KERNELS_H
#include <cassert>
#include "../kernels.h"
class ActivationMishRT : public IPlugin {
public:
ActivationMishRT() {
}
~ActivationMishRT(){
}
int getNbOutputs() const override {
return 1;
}
Dims getOutputDimensions(int index, const Dims* inputs, int nbInputDims) override {
return inputs[0];
}
void configure(const Dims* inputDims, int nbInputs, const Dims* outputDims, int nbOutputs, int maxBatchSize) override {
size = 1;
for(int i=0; i<outputDims[0].nbDims; i++)
size *= outputDims[0].d[i];
}
int initialize() override {
return 0;
}
virtual void terminate() override {
}
virtual size_t getWorkspaceSize(int maxBatchSize) const override {
return 0;
}
virtual int enqueue(int batchSize, const void*const * inputs, void** outputs, void* workspace, cudaStream_t stream) override {
activationMishForward((dnnType*)reinterpret_cast<const dnnType*>(inputs[0]),
reinterpret_cast<dnnType*>(outputs[0]), batchSize*size, stream);
return 0;
}
virtual size_t getSerializationSize() override {
return 1*sizeof(int);
}
virtual void serialize(void* buffer) override {
char *buf = reinterpret_cast<char*>(buffer);
tk::dnn::writeBUF(buf, size);
}
int size;
};
......@@ -8,11 +8,12 @@ class YoloRT : public IPlugin {
public:
YoloRT(int classes, int num, tk::dnn::Yolo *yolo = nullptr, int n_masks=3) {
YoloRT(int classes, int num, tk::dnn::Yolo *yolo = nullptr, int n_masks=3, float scale_xy=1) {
this->classes = classes;
this->num = num;
this->n_masks = n_masks;
this->scaleXY = scale_xy;
mask = new dnnType[n_masks];
bias = new dnnType[num*n_masks*2];
......@@ -64,6 +65,8 @@ public:
for(int n = 0; n < n_masks; ++n){
int index = entry_index(b, n*w*h, 0);
activationLOGISTICForward(srcData + index, dstData + index, 2*w*h, stream);
if (this->scaleXY != 1) scalAdd(dstData + index, 2 * w*h, this->scaleXY, -0.5*(this->scaleXY - 1), 1);
index = entry_index(b, n*w*h, 4);
activationLOGISTICForward(srcData + index, dstData + index, (1+classes)*w*h, stream);
......@@ -76,7 +79,7 @@ public:
virtual size_t getSerializationSize() override {
return 6*sizeof(int) + n_masks*sizeof(dnnType) + num*n_masks*2*sizeof(dnnType) + YOLORT_CLASSNAME_W*classes*sizeof(char);
return 6*sizeof(int) + sizeof(float) + n_masks*sizeof(dnnType) + num*n_masks*2*sizeof(dnnType) + YOLORT_CLASSNAME_W*classes*sizeof(char);
}
virtual void serialize(void* buffer) override {
......@@ -87,6 +90,7 @@ public:
tk::dnn::writeBUF(buf, c);
tk::dnn::writeBUF(buf, h);
tk::dnn::writeBUF(buf, w);
tk::dnn::writeBUF(buf, scaleXY);
for(int i=0; i<n_masks; i++)
tk::dnn::writeBUF(buf, mask[i]);
for(int i=0; i<n_masks*2*num; i++)
......@@ -104,6 +108,7 @@ public:
int c, h, w;
int classes, num, n_masks;
float scaleXY;
std::vector<std::string> classesNames;
dnnType *mask;
......
......@@ -71,6 +71,7 @@ do
./test_imuodom &>> $out_file
print_output $? imuodom
test_net yolo4
test_net resnet101_cnet
test_net yolo3
test_net yolo3_flir
......@@ -87,7 +88,7 @@ do
test_net yolo_voc
test_net dla34_cnet
test_net yolo3_coco4
done
echo "If errors occured, check logfile $out_file"
......@@ -48,6 +48,10 @@ dnnType* Activation::infer(dataDim_t &dim, dnnType* srcData) {
if(act_mode == ACTIVATION_LEAKY) {
activationLEAKYForward(srcData, dstData, dim.tot());
}
else if(act_mode == ACTIVATION_MISH) {
activationMishForward(srcData, dstData, dim.tot());
} else {
dnnType alpha = dnnType(1);
dnnType beta = dnnType(0);
......
......@@ -225,7 +225,7 @@ ILayer* NetworkRT::convert_layer(ITensor *input, Layer *l) {
return convert_layer(input, (Conv2d*) l);
if(type == LAYER_POOLING)
return convert_layer(input, (Pooling*) l);
if(type == LAYER_ACTIVATION || type == LAYER_ACTIVATION_CRELU || type == LAYER_ACTIVATION_LEAKY)
if(type == LAYER_ACTIVATION || type == LAYER_ACTIVATION_CRELU || type == LAYER_ACTIVATION_LEAKY || type == LAYER_ACTIVATION_MISH)
return convert_layer(input, (Activation*) l);
if(type == LAYER_SOFTMAX)
return convert_layer(input, (Softmax*) l);
......@@ -413,7 +413,14 @@ ILayer* NetworkRT::convert_layer(ITensor *input, Activation *l) {
IPluginLayer *lRT = networkRT->addPlugin(&input, 1, *plugin);
checkNULL(lRT);
return lRT;
} else {
}
else if(l->act_mode == ACTIVATION_MISH) {
IPlugin *plugin = new ActivationMishRT();
IPluginLayer *lRT = networkRT->addPlugin(&input, 1, *plugin);
checkNULL(lRT);
return lRT;
}
else {
FatalError("this Activation mode is not yet implemented");
return NULL;
}
......@@ -518,7 +525,7 @@ ILayer* NetworkRT::convert_layer(ITensor *input, Yolo *l) {
//std::cout<<"convert Yolo\n";
//std::cout<<"New plugin YOLO\n";
IPlugin *plugin = new YoloRT(l->classes, l->num, l);
IPlugin *plugin = new YoloRT(l->classes, l->num, l, l->n_masks, l->scaleXY);
IPluginLayer *lRT = networkRT->addPlugin(&input, 1, *plugin);
checkNULL(lRT);
return lRT;
......@@ -637,6 +644,11 @@ IPlugin* PluginFactory::createPlugin(const char* layerName, const void* serialDa
a->size = readBUF<int>(buf);
return a;
}
if(name.find("ActivationMish") == 0) {
ActivationMishRT *a = new ActivationMishRT();
a->size = readBUF<int>(buf);
return a;
}
if(name.find("ActivationCReLU") == 0) {
ActivationReLUCeiling *a = new ActivationReLUCeiling(readBUF<float>(buf));
a->size = readBUF<int>(buf);
......@@ -728,6 +740,7 @@ IPlugin* PluginFactory::createPlugin(const char* layerName, const void* serialDa
r->c = readBUF<int>(buf);
r->h = readBUF<int>(buf);
r->w = readBUF<int>(buf);
r->scaleXY = readBUF<float>(buf);
for(int i=0; i<r->n_masks; i++)
r->mask[i] = readBUF<dnnType>(buf);
for(int i=0; i<r->n_masks*2*r->num; i++)
......
......@@ -11,12 +11,13 @@
namespace tk { namespace dnn {
Yolo::Yolo(Network *net, int classes, int num, std::string fname_weights, int n_masks) :
Yolo::Yolo(Network *net, int classes, int num, std::string fname_weights, int n_masks, float scale_xy) :
Layer(net) {
this->classes = classes;
this->num = num;
this->n_masks = n_masks;
this->scaleXY = scale_xy;
// load anchors
if(fname_weights != "") {
......@@ -74,6 +75,8 @@ dnnType* Yolo::infer(dataDim_t &dim, dnnType* srcData) {
for(int n = 0; n < n_masks; ++n){
int index = entry_index(b, n*dim.w*dim.h, 0, classes, input_dim, output_dim);
activationLOGISTICForward(srcData + index, dstData + index, 2*dim.w*dim.h);
if (this->scaleXY != 1) scalAdd(dstData + index, 2 * dim.w*dim.h, this->scaleXY, -0.5*(this->scaleXY - 1), 1);
index = entry_index(b, n*dim.w*dim.h, 4, classes, input_dim, output_dim);
activationLOGISTICForward(srcData + index, dstData + index, (1+classes)*dim.w*dim.h);
......
#include "kernels.h"
#include <math.h>
#define MISH_THRESHOLD 20
__device__ float tanh_activate_kernel(float x){return (2/(1 + expf(-2*x)) - 1);}
__device__ float softplus_kernel(float x, float threshold = 20) {
if (x > threshold) return x; // too large
else if (x < -threshold) return expf(x); // too small
return logf(expf(x) + 1);
}
// https://github.com/digantamisra98/Mish
// https://github.com/AlexeyAB/darknet/blob/master/src/activation_kernels.cu
__global__
void activation_mish(dnnType *input, dnnType *output, int size) {
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if (i < size)
output[i] = input[i] * tanh_activate_kernel( softplus_kernel(input[i], MISH_THRESHOLD));
}
/**
Mish activation function
*/
void activationMishForward(dnnType* srcData, dnnType* dstData, int size, cudaStream_t stream)
{
int blocks = (size+255)/256;
int threads = 256;
activation_mish<<<blocks, threads, 0, stream>>>(srcData, dstData, size);
}
\ No newline at end of file
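For reference, the kernel above computes Mish with the thresholded softplus from the AlexeyAB darknet implementation it links to; in math form (MISH_THRESHOLD = 20):

```latex
\mathrm{mish}(x) = x \cdot \tanh\big(\mathrm{softplus}(x)\big),
\qquad
\mathrm{softplus}(x) =
\begin{cases}
  x              & x > 20 \\
  e^{x}          & x < -20 \\
  \ln(1 + e^{x}) & \text{otherwise}
\end{cases}
```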
#include "kernels.h"
#include <math.h>
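// In-place affine transform with stride inc: dst[i*inc] = alpha * dst[i*inc] + beta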
__global__ void scal_add_kernel(dnnType* dstData, int size, float alpha, float beta, int inc)
{
int i = (blockIdx.x + blockIdx.y*gridDim.x) * blockDim.x + threadIdx.x;
if (i < size) dstData[i*inc] = dstData[i*inc] * alpha + beta;
}
void scalAdd(dnnType* dstData, int size, float alpha, float beta, int inc, cudaStream_t stream)
{
int blocks = (size+255)/256;
int threads = 256;
scal_add_kernel<<<blocks, threads, 0, stream>>>(dstData, size, alpha, beta, inc);
}
\ No newline at end of file
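Both Yolo::infer and YoloRT::enqueue call scalAdd right after the logistic activation of the x/y channels; this is YOLOv4's scale_x_y adjustment. Written out, with s the scale_x_y value from the network's [yolo] blocks (exposed here as the scale_xy constructor argument, default 1):

```latex
% scalAdd(dst, n, s, -0.5*(s-1), 1) applied to the sigmoid outputs
\hat{t}_{x,y} = s \cdot \sigma(t_{x,y}) - \frac{s - 1}{2}
```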
This diff is collapsed.
[net]
# Testing
batch=1
subdivisions=1
# Training
#batch=64
#subdivisions=8
width=416
height=416
channels=3
momentum=0.949
decay=0.0005
angle=0
saturation = 1.5
exposure = 1.5
hue=.1
learning_rate=0.00261
burn_in=1000
max_batches = 500500
policy=steps
steps=400000,450000
scales=.1,.1
#cutmix=1
mosaic=1
#:104x104 54:52x52 85:26x26 104:13x13 for 416
[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=64
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=32
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-7
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=128
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=64
size=1
stride=1
pad=1
activation=mish
[route]
layers = -1,-10
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
# Downsample
[convolutional]
batch_normalize=1
filters=256
size=3
stride=2
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[route]
layers = -2
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=mish
[shortcut]
from=-3
activation=linear
[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=mish
[convolutional]
batch_normalize=1
filters=128
size=3
stride=1