当前位置:   article > 正文

Ascend C 自定义PRelu算子

Ascend C 自定义PRelu算子

本文分享自华为云社区《Ascend C 自定义PRelu算子》,作者: jackwangcumt。

1 PRelu算子概述

PReLU是 Parametric Rectified Linear Unit 的缩写,首次由何恺明团队提出,和LeakyReLU非常类似,是ReLU的改进版本,在几乎没有增加额外参数的前提下既可以提升模型的拟合能力,又能减小过拟合风险。PReLU的数学表达式我们可以参考pytorch中PReLU的描述(PReLU — PyTorch 2.1 documentation):

cke_138.png

2 Ascend C自定义算子

基于Ascend C进行自定义算子开发之前,需要成功基于昇腾设备安装相关的驱动、固件以及开发者套件。我之前安装的开发者套件版本过低,编译运行官方的Sample部分示例会报错,因此,需要重新安装一个8.0新版本,依次用root执行如下命令:

  1. # 卸载 cann-toolkit_7.0.RC1
  2. root@atlas500ai:/home/kzroot/mysoft# ./Ascend-cann-toolkit_7.0.RC1_linux-aarch64.run --uninstall
  3. # 清空遗留文件
  4. rm -rf /usr/local/Ascend/ascend-toolkit/*
  5. # 安装 cann-toolkit_8.0.RC1.alpha002
  6. ./Ascend-cann-toolkit_8.0.RC1.alpha002_linux-aarch64.run --install --install-for-all --quiet
  7. #安装依赖protobuf
  8. pip3 install protobuf==3.20.0

在一个目录下新建单算子工程描述文件 PReluCustom.json ,内容参考如下:

  1. [
  2. {
  3. "op": "PReluCustom",
  4. "language": "cpp",
  5. "input_desc": [
  6. {
  7. "name": "x",
  8. "param_type": "required",
  9. "format": [
  10. "ND"
  11. ],
  12. "type": [
  13. "float"
  14. ]
  15. }
  16. ],
  17. "output_desc": [
  18. {
  19. "name": "y",
  20. "param_type": "required",
  21. "format": [
  22. "ND"
  23. ],
  24. "type": [
  25. "float"
  26. ]
  27. }
  28. ],
  29. "attr": [
  30. {
  31. "name": "alpha",
  32. "param_type": "optional",
  33. "type": "float",
  34. "default_value": "0.002"
  35. }
  36. ]
  37. }
  38. ]

用开发者套件中内置的算子工程生成工具msopgen ,通过描述文件自动生成单算子工程代码目录:

  1. /usr/local/Ascend/ascend-toolkit/8.0.RC1.alpha002/python/site-packages/bin/msopgen gen -i ./PReluCustom.json
  2. -c ai_core-Ascend310P3 -lan cpp -out ./PReluCustom

执行成功后,会基于C++语言生成单算子工程代码目录PReluCustom,其中包含的CMakePresets.json文件,有几个重要的配置项,特别是开发者套件安装的路径ASCEND_CANN_PACKAGE_PATH,需要根据本地情况进行修改(我这里是 /usr/local/Ascend/ascend-toolkit/latest),否则会出现编译错误。我修改后的部分代码如下:

  1. {
  2. "version": 1,
  3. "cmakeMinimumRequired": {
  4. "major": 3,
  5. "minor": 19,
  6. "patch": 0
  7. },
  8. "configurePresets": [
  9. {
  10. "name": "default",
  11. "displayName": "Default Config",
  12. "description": "Default build using Unix Makefiles generator",
  13. "generator": "Unix Makefiles",
  14. "binaryDir": "${sourceDir}/build_out",
  15. "cacheVariables": {
  16. "CMAKE_BUILD_TYPE": {
  17. "type": "STRING",
  18. "value": "Release"
  19. },
  20. "ENABLE_SOURCE_PACKAGE": {
  21. "type": "BOOL",
  22. "value": "True"
  23. },
  24. "ENABLE_BINARY_PACKAGE": {
  25. "type": "BOOL",
  26. "value": "True"
  27. },
  28. "ASCEND_COMPUTE_UNIT": {
  29. "type": "STRING",
  30. "value": "ascend310p"
  31. },
  32. "ENABLE_TEST": {
  33. "type": "BOOL",
  34. "value": "True"
  35. },
  36. "vendor_name": {
  37. "type": "STRING",
  38. "value": "customize"
  39. },
  40. "ASCEND_CANN_PACKAGE_PATH": {
  41. "type": "PATH",
  42. "value": "/usr/local/Ascend/ascend-toolkit/latest"
  43. },
  44. "ASCEND_PYTHON_EXECUTABLE": {
  45. "type": "STRING",
  46. "value": "python3"
  47. },
  48. "CMAKE_INSTALL_PREFIX": {
  49. "type": "PATH",
  50. "value": "${sourceDir}/build_out"
  51. },
  52. "ENABLE_CROSS_COMPILE": {
  53. "type": "BOOL",
  54. "value": "False"
  55. },
  56. "CMAKE_CROSS_PLATFORM_COMPILER": {
  57. "type": "PATH",
  58. "value": "/usr/bin/aarch64-linux-gnu-g++"
  59. }
  60. }
  61. }
  62. ]
  63. }

其中的vendor_name 可以根据自己的情况进行修改,默认的算子部署后会放于customize 目录下,这里可以修改,比如改成jackwangcumt。而且单算子工程每次部署会进行覆盖,因此,这里需要注意一下。生成的op_kernel目录下的p_relu_custom.cpp文件,重点的算子计算为:

  1. __aicore__ inline void Compute(int32_t progress)   // per-tile PReLU: y = max(x, 0) + alpha * min(x, 0)
  2. {
  3. // deque input tensors from VECIN queue
  4. LocalTensor<float> xLocal = inQueueX.DeQue<float>();
  5. LocalTensor<float> yLocal = outQueueY.AllocTensor<float>();   // output tile allocated from VECOUT queue
  6. LocalTensor<float> tmpTensor1 = tmpBuffer1.Get<float>();      // scratch buffer holding the non-negative part
  7. float inputVal = 0.0;   // threshold 0 used by Maxs/Mins below
  8. Maxs(tmpTensor1, xLocal, inputVal, this->tileLength); // x >= 0 --> x   (tmpTensor1 = max(x, 0))
  9. // x < 0
  10. Mins(xLocal, xLocal, inputVal, this->tileLength);    // in place: xLocal = min(x, 0)
  11. Muls(xLocal, xLocal, this->alpha, this->tileLength); // scale the negative part by alpha
  12. Add(yLocal, xLocal, tmpTensor1, this->tileLength);   // combine both parts into the final result
  13. outQueueY.EnQue<float>(yLocal);
  14. // free input tensors for reuse
  15. inQueueX.FreeTensor(xLocal);
  16. }

这里通过内置的原生算子来分别处理输入x<0和x>=0两个部分的数据处理,再通过Add将两个部分合并,得到最终的数据。在op_host目录下的p_relu_custom_tiling.h代码如下所示:

  1. #include "register/tilingdata_base.h"
  2. namespace optiling {
  3. BEGIN_TILING_DATA_DEF(TilingData)                // tiling fields passed from host TilingFunc to the kernel
  4. TILING_DATA_FIELD_DEF(uint32_t, totalLength);    // total element count of input x (set from GetShapeSize)
  5. TILING_DATA_FIELD_DEF(uint32_t, tileNum);        // tile count used to split the data (set to TILE_NUM)
  6. TILING_DATA_FIELD_DEF(float, alpha);             // PReLU negative-slope coefficient (from the "alpha" attr)
  7. END_TILING_DATA_DEF;
  8. REGISTER_TILING_DATA_CLASS(PReluCustom, TilingData)   // bind this TilingData layout to the PReluCustom op
  9. }

op_host 目录下的 p_relu_custom.cpp 核心代码如下所示:

  1. #include "p_relu_custom_tiling.h"
  2. #include "register/op_def_registry.h"
  3. namespace optiling {
  4. const uint32_t BLOCK_DIM = 8;   // block dim passed to SetBlockDim for the kernel launch
  5. const uint32_t TILE_NUM = 16 ; // this value may affect whether the test passes
  6. static ge::graphStatus TilingFunc(gert::TilingContext* context)   // host-side tiling: fills TilingData and workspace sizes
  7. {
  8. TilingData tiling;
  9. uint32_t totalLength = context->GetInputTensor(0)->GetShapeSize();   // total element count of input x
  10. const gert::RuntimeAttrs *attrs = context->GetAttrs();
  11. const float *alpha = attrs->GetAttrPointer<float>(0);   // attr index 0: "alpha" (negative slope)
  12. context->SetBlockDim(BLOCK_DIM);
  13. tiling.set_totalLength(totalLength);
  14. tiling.set_tileNum(TILE_NUM);
  15. tiling.set_alpha(*alpha);
  16. tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());   // serialize tiling data for the kernel
  17. context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
  18. size_t *currentWorkspace = context->GetWorkspaceSizes(1);
  19. currentWorkspace[0] = 0;   // this op needs no extra device workspace
  20. return ge::GRAPH_SUCCESS;
  21. }
  22. }
  23. namespace ge {
  24. static ge::graphStatus InferShape(gert::InferShapeContext* context)   // elementwise op: output shape equals input shape
  25. {
  26. const gert::Shape* x1_shape = context->GetInputShape(0);
  27. gert::Shape* y_shape = context->GetOutputShape(0);
  28. *y_shape = *x1_shape;
  29. return GRAPH_SUCCESS;
  30. }
  31. }
  32. namespace ops {
  33. class PReluCustom : public OpDef {   // op prototype: declares I/O dtypes/formats, attrs, tiling and shape inference
  34. public:
  35. explicit PReluCustom(const char* name) : OpDef(name)
  36. {
  37. this->Input("x")
  38. .ParamType(REQUIRED)
  39. .DataType({ge::DT_FLOAT})
  40. .Format({ge::FORMAT_ND})
  41. .UnknownShapeFormat({ge::FORMAT_ND});
  42. this->Output("y")
  43. .ParamType(REQUIRED)
  44. .DataType({ge::DT_FLOAT})
  45. .Format({ge::FORMAT_ND})
  46. .UnknownShapeFormat({ge::FORMAT_ND});
  47. this->Attr("alpha").AttrType(OPTIONAL).Float(0.002);   // optional attr, default 0.002 (matches the op JSON)
  48. this->SetInferShape(ge::InferShape);
  49. this->AICore()
  50. .SetTiling(optiling::TilingFunc);
  51. this->AICore().AddConfig("ascend310p");   // target SoC: Ascend 310P
  52. }
  53. };
  54. OP_ADD(PReluCustom);   // register the op definition with the framework
  55. }

执行如下命令,编译算子工程:

root@atlas500ai:/home/kzroot/mysoft/myAscendC/PReluSample/PReluCustom# bash build.sh 

cke_139.png

若控制台输出 Self-extractable archive "custom_opp_ubuntu_aarch64.run" successfully created. 则表明编译成功。执行如下命令进行算子部署:

PReluCustom# ./build_out/custom_opp_ubuntu_aarch64.run

cke_140.png

3 Ascend C自定义算子验证

基于Ascend C 自定义算子需要进行正确性验证,这里新建一个AclNNInvocation目录(可以参考官方示例中的相关内容),目录结构如下所示:

cke_141.png

其中的gen_data.py用于生成测试的输入和输出数据,verify_result.py用于验证精度。gen_data.py内容如下所示:

  1. import numpy as np
  2. import os
  3. def gen_golden_data_simple():   # generate the test input and the golden (reference) output files
  4. alpha = np.array(0.002, dtype=np.float32)   # must match the op's default "alpha" attr value
  5. input_x = np.random.uniform(-100, 100, [8, 200, 1024]).astype(np.float32)   # random test input
  6. golden = np.where(input_x >= 0, input_x, input_x * alpha).astype(np.float32)   # reference PReLU result
  7. os.system("mkdir -p input")
  8. os.system("mkdir -p output")
  9. input_x.tofile("./input/input_x.bin")
  10. golden.tofile("./output/golden.bin")
  11. if __name__ == "__main__":
  12. gen_golden_data_simple()

src目录下的CMakeLists.txt中有一个路径变量可能需要修改,即 set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api") ,默认情况下不需要修改,它需要和vendor_name保持一致。执行如下命令进行测试:

PReluSample/AclNNInvocation# bash run.sh

cke_142.png

点击关注,第一时间了解华为云新鲜技术~

声明:本文内容由网友自发贡献,不代表【wpsshop博客】立场,版权归原作者所有,本站不承担相应法律责任。如您发现有侵权的内容,请联系我们。转载请注明出处:https://www.wpsshop.cn/w/知新_RL/article/detail/390030
推荐阅读
相关标签
  

闽ICP备14008679号