From ea8c965471a30fe05c68514d8e2045adb110d9ec Mon Sep 17 00:00:00 2001 From: cathyzhang222 Date: Wed, 9 Nov 2022 20:15:33 +0800 Subject: [PATCH 1/2] [Feature] Support Nms with cambricon MLU590 backend support 590 for nms --- mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu | 33 +++++++++++++++++---- mmcv/ops/csrc/common/pytorch_mlu_helper.hpp | 20 +++++++++++++ mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp | 11 +++++-- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu index fb6185048a..dcc722d854 100644 --- a/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu +++ b/mmcv/ops/csrc/common/mlu/nms_mlu_kernel.mlu @@ -234,7 +234,7 @@ __mlu_func__ void nms_detection_ux( IN_DT *score_data, const IN_DT *boxes_data, const Addr input_ram, const int input_num_boxes, const int max_output_size, const float thresh_iou, const float thresh_score, const float offset, - const int output_mode, const int algo) { + const int output_mode, const int algo, char *cdma_gdram) { exit_flag[0] = 0; IN_DT *sram = (IN_DT *)sram_buffer; @@ -321,7 +321,25 @@ __mlu_func__ void nms_detection_ux( __memcpy(sram, max_box, REDUCE_NUM * sizeof(IN_DT), NRAM2SRAM); } __sync_all(); -#if __BANG_ARCH__ <= 372 +#if __BANG_ARCH__ >= 590 + __memcpy((char *)cdma_gdram + REDUCE_NUM * clusterId * sizeof(IN_DT), sram, + REDUCE_NUM * sizeof(IN_DT), SRAM2GDRAM); + __sync_all(); + if (clusterId == 0 && coreId == 0) { + __bang_write_zero(inter_x1, NMS_SIZE); + __memcpy((char *)inter_x1, (char *)cdma_gdram, sizeof(IN_DT), GDRAM2NRAM, + sizeof(IN_DT), REDUCE_NUM * sizeof(IN_DT), clusterDim - 1); + __bang_max(max_box, inter_x1, NMS_SIZE); + int max_cluster = (sizeof(IN_DT) == sizeof(half)) + ? 
((uint16_t *)max_box)[1] + : ((uint32_t *)max_box)[1]; + __memcpy((char *)cdma_gdram, + (char *)cdma_gdram + max_cluster * REDUCE_NUM * sizeof(IN_DT), + REDUCE_NUM * sizeof(IN_DT), GDRAM2GDRAM); + } + __sync_all(); + __memcpy(max_box, cdma_gdram, REDUCE_NUM * sizeof(IN_DT), GDRAM2NRAM); +#else findGlobalMaxBox(max_box, sram, inter_x1); #endif @@ -380,6 +398,7 @@ __mlu_global__ void MLUUionXKernelNMS( int input_dwidth = (data_type_input == CNRT_FLOAT32) ? 4 : 2; int32_t *exit_flag = (int32_t *)((char *)workspace + INFO_NUM * input_num_boxes * input_dwidth); + char *cdma_addr = (char *)exit_flag + sizeof(int32_t); int reduce_sram_size = NFU_ALIGN_SIZE * REDUCE_NUM * input_dwidth; int availbale_sram_size = SIZE_SRAM_BUF - reduce_sram_size; @@ -409,24 +428,26 @@ __mlu_global__ void MLUUionXKernelNMS( nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output, score_data, boxes_data, input_ram, input_num_boxes, max_output_size, iou_threshold, confidence_threshold, - offset, output_mode, algo); + offset, output_mode, algo, cdma_addr); } else { nms_detection_ux(exit_flag, output_box_num, (uint32_t *)output, (half *)score_data, (half *)boxes_data, input_ram, input_num_boxes, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo); + confidence_threshold, offset, output_mode, algo, + cdma_addr); } } else { if (data_type_input == CNRT_FLOAT32) { nms_detection_ux(exit_flag, output_box_num, (float *)output, score_data, boxes_data, input_ram, input_num_boxes, max_output_size, iou_threshold, confidence_threshold, offset, output_mode, - algo); + algo, cdma_addr); } else { nms_detection_ux(exit_flag, output_box_num, (half *)output, (half *)score_data, (half *)boxes_data, input_ram, input_num_boxes, max_output_size, iou_threshold, - confidence_threshold, offset, output_mode, algo); + confidence_threshold, offset, output_mode, algo, + cdma_addr); } } ((uint32_t *)result_num)[0] = output_box_num; diff --git a/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp 
b/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp index 3e1141ec21..e49572ca84 100644 --- a/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp +++ b/mmcv/ops/csrc/common/pytorch_mlu_helper.hpp @@ -36,6 +36,26 @@ inline int32_t getJobLimitCapability() { return (int32_t)ctx_conf_param.unionLimit; } +inline int32_t getCoreNumOfJobLimitCapability() { + switch (getJobLimitCapability()) { + default: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * + getJobLimitCapability(); + case CN_KERNEL_CLASS_BLOCK: + return 1; + case CN_KERNEL_CLASS_UNION: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + case CN_KERNEL_CLASS_UNION2: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 2; + case CN_KERNEL_CLASS_UNION4: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 4; + case CN_KERNEL_CLASS_UNION8: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 8; + case CN_KERNEL_CLASS_UNION16: + return torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster) * 16; + } +} + #endif // MMCV_WITH_MLU #endif // PYTORCH_MLU_HELPER_HPP_ diff --git a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp index a45a510e89..4979a229fd 100644 --- a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp +++ b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (C) 2021 by Cambricon. + * Copyright (C) 2021 Cambricon. 
 * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF @@ -34,6 +34,7 @@ static cnnlStatus_t policyFunc(cnrtDim3_t *k_dim, cnrtFunctionType_t *k_type, int &core_num_per_class, const int input_box_num) { uint32_t core_dim = torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + uint32_t cluster_number = torch_mlu::getDeviceAttr(cnrtAttrClusterCount); uint32_t job_limit = getJobLimitCapability(); uint32_t core_number = job_limit; @@ -116,7 +117,11 @@ Tensor NMSMLUKernelLauncher(Tensor boxes, Tensor scores, float iou_threshold, } else { space_size = input_num_boxes * sizeof(float) * info_num + sizeof(float); } - + // __BANG_ARCH__ is only defined for device (cncc) compilation, so this + // MLU590 cdma reduce buffer must be reserved unconditionally on the host. + int cluster_num = getCoreNumOfJobLimitCapability() / + torch_mlu::getDeviceAttr(cnrtAttrMcorePerCluster); + space_size += cluster_num * sizeof(float) * 7; auto workspace = at::empty(space_size, boxes.options().dtype(at::kByte)); // get compute queue @@ -148,4 +153,4 @@ Tensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) { } Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset); -REGISTER_DEVICE_IMPL(nms_impl, MLU, nms_mlu); +REGISTER_DEVICE_IMPL(nms_impl, MLU, nms_mlu); \ No newline at end of file From 705caa604b54f3ca144d9fe3eb28047e3e55c00e Mon Sep 17 00:00:00 2001 From: cathyzhang222 Date: Thu, 10 Nov 2022 11:15:01 +0800 Subject: [PATCH 2/2] add blank --- mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp index 4979a229fd..e2f4322a02 100644 --- a/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp +++ b/mmcv/ops/csrc/pytorch/mlu/nms_mlu.cpp @@ -153,4 +153,4 @@ Tensor nms_mlu(Tensor boxes, Tensor scores, float iou_threshold, int offset) { } Tensor nms_impl(Tensor boxes, Tensor scores, float iou_threshold, int offset); -REGISTER_DEVICE_IMPL(nms_impl, MLU, nms_mlu); \ No newline at 
end of file +REGISTER_DEVICE_IMPL(nms_impl, MLU, nms_mlu);