2021年6月15日 星期二

Darknet with CUDA

Ubuntu22.04.1
環境: Xubuntu20.04

重灌電腦後,就裝了cuda 11, 安裝流程
$ /usr/bin/nvidia-smi

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 465.27       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  Off  | 00000000:01:00.0 N/A |                  N/A |
| N/A   54C    P0    N/A /  N/A |    405MiB /  2002MiB |     N/A      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+
後來編darknet (cuda+opencv4.x+gpu)時
出現no kernel image is available for execution on the device
才發現,我電腦顯卡GeForce GTX 765M (見下方deviceQuery輸出) 的Compute Capability為3.0, 查表
只能用 arch=compute_30,code=sm_30
所以就把cuda11給移除 (sudo apt-get remove --auto-remove cuda-11)
重新安裝cuda10
但後來nvidia-smi還是顯示CUDA Version: 11.3@@ (這個欄位是驅動程式支援的最高CUDA版本,不是已安裝的toolkit版本; 實際安裝的套件用 dpkg -l | grep cuda 確認如下)
ii  cuda-10-0                                 10.0.130-1                          amd64        CUDA 10.0 meta-package
ii  cuda-command-line-tools-10-0              10.0.130-1                          amd64        CUDA command-line tools
ii  cuda-compiler-10-0                        10.0.130-1                          amd64        CUDA compiler
ii  cuda-cublas-10-0                          10.0.130-1                          amd64        CUBLAS native runtime libraries
ii  cuda-cublas-dev-10-0                      10.0.130-1                          amd64        CUBLAS native dev links, headers
...
...

bashrc設定
export PATH=$PATH:/usr/local/cuda/bin
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64:/opt/cuda/cuda_10.0/lib64/
export CUDADIR=/usr/local/cuda
export CUDA_HOME=/usr/local/cuda

下載cudnn(需另外註冊)
必需配合 CUDA 的版本

Download cuDNN v7.6.5 (November 5th, 2019), for CUDA 10.0


下載Darknet
$ git clone https://github.com/pjreddie/darknet.git
修改Makefile (OpenCV 4.x需事先安裝好)
GPU=1
CUDNN=1
OPENCV=1
OPENMP=0
DEBUG=0

# GeForce GTX 765M 的 Compute Capability 為 3.0
ARCH= -gencode arch=compute_30,code=sm_30
#      -gencode arch=compute_20,code=[sm_20,sm_21] \ This one is deprecated?

# This is what I use, uncomment if you know your arch and want to specify
# ARCH= -gencode arch=compute_52,code=compute_52

VPATH=./src/:./examples
SLIB=libdarknet.so
ALIB=libdarknet.a
EXEC=darknet
OBJDIR=./obj/

CC=gcc
CPP=g++
NVCC=nvcc 
AR=ar
ARFLAGS=rcs
OPTS=-Ofast
LDFLAGS= -lm -pthread 
COMMON= -Iinclude/ -Isrc/ -I/opt/cuda/cuda_10.0/include/
CFLAGS=-Wall -Wno-unused-result -Wno-unknown-pragmas -Wfatal-errors -fPIC

ifeq ($(OPENMP), 1) 
CFLAGS+= -fopenmp
endif

ifeq ($(DEBUG), 1) 
OPTS=-O0 -g
endif

CFLAGS+=$(OPTS)

ifeq ($(OPENCV), 1) 
COMMON+= -DOPENCV
CFLAGS+= -DOPENCV
LDFLAGS+= `pkg-config --libs opencv4` -lstdc++
COMMON+= `pkg-config --cflags opencv4` 
endif

ifeq ($(GPU), 1) 
COMMON+= -DGPU -I/usr/local/cuda/include/
CFLAGS+= -DGPU
LDFLAGS+= -L/opt/cuda/cuda_10.0/lib64 -lcuda -lcudart -lcublas -lcurand
LDFLAGS+= -L/usr/local/cuda/lib64 -lcuda -lcudart -lcublas -lcurand
endif

ifeq ($(CUDNN), 1) 
COMMON+= -DCUDNN 
CFLAGS+= -DCUDNN
LDFLAGS+= -lcudnn
endif
...
...
...
...
安裝gcc 7 (CUDA10 不支援大於7版)
$ sudo apt-get install gcc-7 g++-7
$ sudo ln -s /usr/bin/gcc-7 /usr/local/cuda/bin/gcc
$ sudo ln -s /usr/bin/g++-7 /usr/local/cuda/bin/g++
...
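...
修改 src/convolutional_layer.c 的 cudnn_convolutional_setup() (cuDNN 8 已移除 CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, 改用 cudnnFindConvolution*Algorithm 挑選演算法)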
#define PRINT_CUDNN_ALGO 0
#define MEMORY_LIMIT 2000000000
...
...
     #if CUDNN_MAJOR >= 7
     cudnnSetConvolutionGroupCount(l->convDesc, l->groups);
     #else
     if(l->groups > 1){
         error("CUDNN < 7 doesn't support groups, please upgrade!");
     }
     #endif
 
 
    #if CUDNN_MAJOR >= 8
    int returnedAlgoCount;
    cudnnConvolutionFwdAlgoPerf_t       fw_results[2 * CUDNN_CONVOLUTION_FWD_ALGO_COUNT];
    cudnnConvolutionBwdDataAlgoPerf_t   bd_results[2 * CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT];
    cudnnConvolutionBwdFilterAlgoPerf_t bf_results[2 * CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT];
 
    cudnnFindConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->weightDesc,
            l->convDesc,
            l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_ALGO_COUNT,
            &returnedAlgoCount,
	    fw_results);
    for(int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex){
        #if PRINT_CUDNN_ALGO > 0
        printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
               cudnnGetErrorString(fw_results[algoIndex].status),
               fw_results[algoIndex].algo, fw_results[algoIndex].time,
               (unsigned long long)fw_results[algoIndex].memory);
        #endif
        if( fw_results[algoIndex].memory < MEMORY_LIMIT ){
            l->fw_algo = fw_results[algoIndex].algo;
            break;
	}
    }
 
    cudnnFindConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->weightDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT,
            &returnedAlgoCount,
            bd_results);
    for(int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex){
        #if PRINT_CUDNN_ALGO > 0
        printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
               cudnnGetErrorString(bd_results[algoIndex].status),
               bd_results[algoIndex].algo, bd_results[algoIndex].time,
               (unsigned long long)bd_results[algoIndex].memory);
        #endif
        if( bd_results[algoIndex].memory < MEMORY_LIMIT ){
            l->bd_algo = bd_results[algoIndex].algo;
            break;
        }
    }
 
    cudnnFindConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dweightDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT,
            &returnedAlgoCount,
            bf_results);
    for(int algoIndex = 0; algoIndex < returnedAlgoCount; ++algoIndex){
        #if PRINT_CUDNN_ALGO > 0
        printf("^^^^ %s for Algo %d: %f time requiring %llu memory\n",
               cudnnGetErrorString(bf_results[algoIndex].status),
               bf_results[algoIndex].algo, bf_results[algoIndex].time,
               (unsigned long long)bf_results[algoIndex].memory);
        #endif
        if( bf_results[algoIndex].memory < MEMORY_LIMIT ){
            l->bf_algo = bf_results[algoIndex].algo;
            break;
        }
    }
 
    #else
...
...
...
...
測試
$ wget https://pjreddie.com/media/files/yolov3.weights
$ ./darknet detect cfg/yolov3.cfg yolov3.weights data/dog.jpg
...
...
...
CUDA Error: out of memory
darknet: ./src/cuda.c:36: check_error: Assertion `0' failed.

解決方式: 修改cfg降低GPU記憶體用量 (每次送進GPU的張數為 batch/subdivisions, 輸入尺寸 width/height 越小也越省記憶體)
$ vim cfg/yolov3.cfg
...
...
batch=64
subdivisions=64
width=416
height=416
...
...


碰到的問題
01. opencv2/opencv.hpp: No such file or directory
02. -lcudnn not found
03. cudnn.h: No such file or directory
04. gcc versions later than 7 are not supported by CUDA 10
05. error: 'CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT' undeclared
06. image_opencv.cpp:12:1: error: ‘IplImage’ does not name a type
07. CUDA Error: out of memory
08. Unsupported gpu architecture 'compute_30'
09. no kernel image is available for execution on the device


查看顯卡資訊的指令,但要先裝好cuda...
deviceQuery

$ cd /usr/local/cuda/samples
$ make
$ bin/x86_64/linux/release/deviceQuery


$ ./deviceQuery
./deviceQuery Starting...

CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "NVIDIA GeForce GTX 765M"
  CUDA Driver Version / Runtime Version          11.3 / 10.0
  CUDA Capability Major/Minor version number:    3.0
  Total amount of global memory:                 2002 MBytes (2099511296 bytes)
  ( 4) Multiprocessors, (192) CUDA Cores/MP:     768 CUDA Cores
  GPU Max Clock rate:                            863 MHz (0.86 GHz)
  Memory Clock rate:                             2004 Mhz
  Memory Bus Width:                              128-bit
...
...
...

沒有留言:

張貼留言