更多样例

【免费下载链接】asc-devkit
本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言，原生支持C和C++标准规范，主要由类库和语言扩展层构成，提供多层级API，满足多维场景算子开发诉求。
项目地址: https://gitcode.com/cann/asc-devkit

通过tensor高维切分计算接口中的mask连续模式，实现数据非连续计算。
uint64_t mask = 64; // 每个迭代内只计算前64个数
AscendC::Add(dstLocal, src0Local, src1Local, mask, 4, { 1, 1, 1, 8, 8, 8 });
结果示例如下：
输入数据src0Local: [1 2 3 ... 512]
输入数据src1Local: [513 514 515 ... 1024]
输出数据dstLocal: [514 516 518 ... 640 undefined ... undefined 770 772 774 ... 896 undefined ... undefined 1026 1028 1030 ... 1152 undefined ... undefined 1282 1284 1286 ... 1408 undefined ... undefined]

通过tensor高维切分计算接口中的mask逐比特模式，实现数据非连续计算。
uint64_t mask[2] = { UINT64_MAX, 0 }; // mask[0]满，mask[1]空，每次只计算前64个数
AscendC::Add(dstLocal, src0Local, src1Local, mask, 4, { 1, 1, 1, 8, 8, 8 });
结果示例如下：
输入数据src0Local: [1 2 3 ... 512]
输入数据src1Local: [513 514 515 ... 1024]
输出数据dstLocal: [514 516 518 ... 640 undefined ... undefined 770 772 774 ... 896 undefined ... undefined 1026 1028 1030 ... 1152 undefined ... undefined 1282 1284 1286 ... 1408 undefined ... undefined]

通过控制tensor高维切分计算接口的Repeat Stride参数，实现数据非连续计算。
uint64_t mask = 128;
// repeatTime设置为2，表示一共需要进行2次迭代
// src0BlkStride, src1BlkStride设置为1，表示每个迭代内src0参与计算的数据地址间隔为1个DataBlock
// src0RepStride设置为16，表示相邻迭代之间src0起始地址间隔为16个datablock
AscendC::Add(dstLocal, src0Local, src1Local, mask, 2, { 1, 1, 1, 8, 16, 8 });
结果示例如下：
输入数据src0Local: [1 2 3 ... 512]
输入数据src1Local: [513 514 515 ... 1024]
输出数据dstLocal: [514 516 518 ... 768 898 900 902 ... 1150 1152 undefined ... undefined]

通过控制tensor高维切分计算接口的DataBlock Stride和Repeat Stride参数，实现数据非连续计算。
uint64_t mask = 128;
// repeatTime设置为2，表示一共需要进行2次迭代
// src0BlkStride设置为2，表示每个迭代内src0参与计算的数据地址间隔为2个datablock
// src0RepStride设置为16，表示相邻迭代之间src0起始地址间隔为16个datablock
AscendC::Add(dstLocal, src0Local, src1Local, mask, 2, { 1, 2, 1, 8, 16, 8 });
结果示例如下：
输入数据src0Local: [1 2 3 ... 512]
输入数据src1Local: [513 514 515 ... 1024]
输出数据dstLocal: [514 516 518 ... 544 562 564 566 ... 592 610 612 614 ... 640 658 660 662 ... 688 706 708 710 ... 736 754 756 758 ... 784 802 804 806 ... 832 850 852 854 ... 
880 898 900 902 ... 928 946 948 950 ... 976 994 996 998 ... 1024 1042 1044 1046 ... 1072 1090 1092 1094 ... 1120 1138 1140 1142 ... 1168 1186 1188 1190 ... 1216 1234 1236 1238 ... 1264 undefined ... undefined]

需要传入标量参数的API使用样例。
#include "kernel_operator.h"

constexpr int32_t BUFFER_NUM = 2;

class KernelBinaryScalar {
public:
    __aicore__ inline KernelBinaryScalar() {}
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, float scalar, uint32_t totalLength, uint32_t tileNum)
    {
        this->blockLength = totalLength / AscendC::GetBlockNum();
        this->scalar = scalar;
        this->tileNum = tileNum;
        ASSERT(tileNum != 0 && "tile num can not be zero!");
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
        xGm.SetGlobalBuffer((__gm__ DTYPE_X*)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        zGm.SetGlobalBuffer((__gm__ DTYPE_Z*)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X));
        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z));
    }
    __aicore__ inline void Process()
    {
        int32_t loopCount = this->tileNum * BUFFER_NUM;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }

private:
    __aicore__ inline void CopyIn(int32_t progress)
    {
        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
        inQueueX.EnQue(xLocal);
    }
    __aicore__ inline void Compute(int32_t progress)
    {
        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
        AscendC::Adds(zLocal, xLocal, (DTYPE_X)scalar, this->tileLength);
        outQueueZ.EnQue<DTYPE_Z>(zLocal);
        inQueueX.FreeTensor(xLocal);
    }
    __aicore__ inline void CopyOut(int32_t progress)
    {
        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.DeQue<DTYPE_Z>();
        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
        outQueueZ.FreeTensor(zLocal);
    }

private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
    AscendC::GlobalTensor<DTYPE_X> xGm;
    AscendC::GlobalTensor<DTYPE_Z> zGm;
    float scalar;
    uint32_t blockLength;
    uint32_t tileNum;
    uint32_t tileLength;
};

extern "C" __global__ __aicore__ void binary_scalar_simple_kernel(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
{
    GET_TILING_DATA(tilingData, tiling);
    KernelBinaryScalar op;
    op.Init(x, z, tilingData.scalar, tilingData.totalLength, tilingData.tileNum);
    if (TILING_KEY_IS(1)) {
        op.Process();
    }
}

【免费下载链接】asc-devkit
本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言，原生支持C和C++标准规范，主要由类库和语言扩展层构成，提供多层级API，满足多维场景算子开发诉求。
项目地址: https://gitcode.com/cann/asc-devkit

创作声明: 本文部分内容由AI辅助生成(AIGC)，仅供参考
CANN/asc-devkit SIMD基础算术示例
发布时间:2026/5/20 12:58:30
更多样例

【免费下载链接】asc-devkit
本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言，原生支持C和C++标准规范，主要由类库和语言扩展层构成，提供多层级API，满足多维场景算子开发诉求。
项目地址: https://gitcode.com/cann/asc-devkit

通过tensor高维切分计算接口中的mask连续模式，实现数据非连续计算。
uint64_t mask = 64; // 每个迭代内只计算前64个数
AscendC::Add(dstLocal, src0Local, src1Local, mask, 4, { 1, 1, 1, 8, 8, 8 });
结果示例如下：
输入数据src0Local: [1 2 3 ... 512]
输入数据src1Local: [513 514 515 ... 1024]
输出数据dstLocal: [514 516 518 ... 640 undefined ... undefined 770 772 774 ... 896 undefined ... undefined 1026 1028 1030 ... 1152 undefined ... undefined 1282 1284 1286 ... 1408 undefined ... undefined]

通过tensor高维切分计算接口中的mask逐比特模式，实现数据非连续计算。
uint64_t mask[2] = { UINT64_MAX, 0 }; // mask[0]满，mask[1]空，每次只计算前64个数
AscendC::Add(dstLocal, src0Local, src1Local, mask, 4, { 1, 1, 1, 8, 8, 8 });
结果示例如下：
输入数据src0Local: [1 2 3 ... 512]
输入数据src1Local: [513 514 515 ... 1024]
输出数据dstLocal: [514 516 518 ... 640 undefined ... undefined 770 772 774 ... 896 undefined ... undefined 1026 1028 1030 ... 1152 undefined ... undefined 1282 1284 1286 ... 1408 undefined ... undefined]

通过控制tensor高维切分计算接口的Repeat Stride参数，实现数据非连续计算。
uint64_t mask = 128;
// repeatTime设置为2，表示一共需要进行2次迭代
// src0BlkStride, src1BlkStride设置为1，表示每个迭代内src0参与计算的数据地址间隔为1个DataBlock
// src0RepStride设置为16，表示相邻迭代之间src0起始地址间隔为16个datablock
AscendC::Add(dstLocal, src0Local, src1Local, mask, 2, { 1, 1, 1, 8, 16, 8 });
结果示例如下：
输入数据src0Local: [1 2 3 ... 512]
输入数据src1Local: [513 514 515 ... 1024]
输出数据dstLocal: [514 516 518 ... 768 898 900 902 ... 1150 1152 undefined ... undefined]

通过控制tensor高维切分计算接口的DataBlock Stride和Repeat Stride参数，实现数据非连续计算。
uint64_t mask = 128;
// repeatTime设置为2，表示一共需要进行2次迭代
// src0BlkStride设置为2，表示每个迭代内src0参与计算的数据地址间隔为2个datablock
// src0RepStride设置为16，表示相邻迭代之间src0起始地址间隔为16个datablock
AscendC::Add(dstLocal, src0Local, src1Local, mask, 2, { 1, 2, 1, 8, 16, 8 });
结果示例如下：
输入数据src0Local: [1 2 3 ... 512]
输入数据src1Local: [513 514 515 ... 1024]
输出数据dstLocal: [514 516 518 ... 544 562 564 566 ... 592 610 612 614 ... 640 658 660 662 ... 688 706 708 710 ... 736 754 756 758 ... 784 802 804 806 ... 832 850 852 854 ... 
880 898 900 902 ... 928 946 948 950 ... 976 994 996 998 ... 1024 1042 1044 1046 ... 1072 1090 1092 1094 ... 1120 1138 1140 1142 ... 1168 1186 1188 1190 ... 1216 1234 1236 1238 ... 1264 undefined ... undefined]

需要传入标量参数的API使用样例。
#include "kernel_operator.h"

constexpr int32_t BUFFER_NUM = 2;

class KernelBinaryScalar {
public:
    __aicore__ inline KernelBinaryScalar() {}
    __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, float scalar, uint32_t totalLength, uint32_t tileNum)
    {
        this->blockLength = totalLength / AscendC::GetBlockNum();
        this->scalar = scalar;
        this->tileNum = tileNum;
        ASSERT(tileNum != 0 && "tile num can not be zero!");
        this->tileLength = this->blockLength / tileNum / BUFFER_NUM;
        xGm.SetGlobalBuffer((__gm__ DTYPE_X*)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        zGm.SetGlobalBuffer((__gm__ DTYPE_Z*)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength);
        pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X));
        pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z));
    }
    __aicore__ inline void Process()
    {
        int32_t loopCount = this->tileNum * BUFFER_NUM;
        for (int32_t i = 0; i < loopCount; i++) {
            CopyIn(i);
            Compute(i);
            CopyOut(i);
        }
    }

private:
    __aicore__ inline void CopyIn(int32_t progress)
    {
        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
        AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength);
        inQueueX.EnQue(xLocal);
    }
    __aicore__ inline void Compute(int32_t progress)
    {
        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
        AscendC::Adds(zLocal, xLocal, (DTYPE_X)scalar, this->tileLength);
        outQueueZ.EnQue<DTYPE_Z>(zLocal);
        inQueueX.FreeTensor(xLocal);
    }
    __aicore__ inline void CopyOut(int32_t progress)
    {
        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.DeQue<DTYPE_Z>();
        AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength);
        outQueueZ.FreeTensor(zLocal);
    }

private:
    AscendC::TPipe pipe;
    AscendC::TQue<AscendC::TPosition::VECIN, BUFFER_NUM> inQueueX;
    AscendC::TQue<AscendC::TPosition::VECOUT, BUFFER_NUM> outQueueZ;
    AscendC::GlobalTensor<DTYPE_X> xGm;
    AscendC::GlobalTensor<DTYPE_Z> zGm;
    float scalar;
    uint32_t blockLength;
    uint32_t tileNum;
    uint32_t tileLength;
};

extern "C" __global__ __aicore__ void binary_scalar_simple_kernel(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
{
    GET_TILING_DATA(tilingData, tiling);
    KernelBinaryScalar op;
    op.Init(x, z, tilingData.scalar, tilingData.totalLength, tilingData.tileNum);
    if (TILING_KEY_IS(1)) {
        op.Process();
    }
}

【免费下载链接】asc-devkit
本项目是CANN 推出的昇腾AI处理器专用的算子程序开发语言，原生支持C和C++标准规范，主要由类库和语言扩展层构成，提供多层级API，满足多维场景算子开发诉求。
项目地址: https://gitcode.com/cann/asc-devkit

创作声明: 本文部分内容由AI辅助生成(AIGC)，仅供参考