Ascend C運算元開發指南
Ascend C的特點
C/C++原生程式設計:Ascend C原生支援C和C++標準規範。
遮蔽硬體差異:程式設計模型遮蔽了硬體差異,提高了程式碼的通用性。
API封裝:類庫API封裝,既保證易用性,又兼顧高效性。
孿生除錯:支援在CPU側模擬NPU側的行為,便於除錯。
開發基本流程
環境準備:
安裝CANN開發套件包,根據機器CPU架構下載對應的版本。
示例(AArch64架構):
bash
複製程式碼
# Download the AArch64 toolkit package. Every later command must use this
# same file name (the x86_64 name does not exist after this download).
wget -O Ascend-cann-toolkit_8.0.RC1.alpha002_linux-aarch64.run <下載連結>
chmod +x Ascend-cann-toolkit_8.0.RC1.alpha002_linux-aarch64.run
# Run the package's --check step before installing it.
./Ascend-cann-toolkit_8.0.RC1.alpha002_linux-aarch64.run --check
sudo ./Ascend-cann-toolkit_8.0.RC1.alpha002_linux-aarch64.run --install
# Load the toolkit's environment settings into the current shell.
source /usr/local/Ascend/ascend-toolkit/set_env.sh
運算元分析:
分析運算元的數學表示式、輸入輸出資料型別和計算邏輯。
例如,Add運算元的數學表示式為 $z = x + y$,輸入輸出資料型別為half(float16),支援的shape為(8, 2048)。
核函式開發(以Add運算元為例):
獲取樣例程式碼目錄quick-start,依次開發add_custom.cpp、main.cpp、gen_data.py三個檔案。
核函式實現(add_custom.cpp):
cpp
複製程式碼
extern "C" global aicore void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) {
KernelAdd op;
op.Init(x, y, z);
op.Process();
}
// Host-side wrapper that launches the add_custom kernel with the given
// blockDim, l2ctrl and stream, forwarding the three buffer pointers.
// NOTE(review): the <<<...>>> launch syntax is presumably only accepted by the
// device toolchain; a CPU-debug build may need this function guarded by a
// preprocessor check — confirm against the sample project.
void add_custom_do(uint32_t blockDim, void* l2ctrl, void* stream, uint8_t* x, uint8_t* y, uint8_t* z) {
add_custom<<<blockDim, l2ctrl, stream>>>(x, y, z);
}
運算元類實現(KernelAdd):
cpp
複製程式碼
class KernelAdd {
public:
aicore inline KernelAdd() {}
aicore inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) {
// 初始化程式碼
}
aicore inline void Process() {
// 核心處理函式
}
private:
// 各階段函式定義
aicore inline void CopyIn(int32_t progress) {}
aicore inline void Compute(int32_t progress) {}
aicore inline void CopyOut(int32_t progress) {}
private:
TPipe pipe;
TQue<QuePosition::VECIN, BUFFER_NUM> inQueueX, inQueueY;
TQue<QuePosition::VECOUT, BUFFER_NUM> outQueueZ;
GlobalTensor
};
Process函式:
cpp
複製程式碼
aicore inline void Process() {
constexpr int32_t loopCount = TILE_NUM * BUFFER_NUM;
for (int32_t i = 0; i < loopCount; i++) {
CopyIn(i);
Compute(i);
CopyOut(i);
}
}
CopyIn函式:
cpp
複製程式碼
aicore inline void CopyIn(int32_t progress) {
LocalTensor
LocalTensor
DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH);
DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH);
inQueueX.EnQue(xLocal);
inQueueY.EnQue(yLocal);
}
Compute函式:
cpp
複製程式碼
aicore inline void Compute(int32_t progress) {
LocalTensor
LocalTensor
LocalTensor
Add(zLocal, xLocal, yLocal, TILE_LENGTH);
outQueueZ.EnQue
inQueueX.FreeTensor(xLocal);
inQueueY.FreeTensor(yLocal);
}
CopyOut函式:
cpp
複製程式碼
aicore inline void CopyOut(int32_t progress) {
LocalTensor
DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH);
outQueueZ.FreeTensor(zLocal);
}
執行驗證(main.cpp):
CPU側驗證:
cpp
複製程式碼
// Allocate the buffers and call the kernel function on the CPU side.
// NOTE(review): inputByteSize, outputByteSize and blockDim are assumed to be
// defined earlier in main.cpp — confirm.
// GmAlloc results are cast to uint8_t* (a pointer), and x, y, z are all
// pointers; casting to plain uint8_t would truncate the address to one byte.
uint8_t* x = (uint8_t*)AscendC::GmAlloc(inputByteSize);
uint8_t* y = (uint8_t*)AscendC::GmAlloc(inputByteSize);
uint8_t* z = (uint8_t*)AscendC::GmAlloc(outputByteSize);

// Load both inputs from the files written by gen_data.py.
ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize);
ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize);

// Set the kernel mode to AIV_MODE, then run add_custom through the
// CPU-side launch macro.
AscendC::SetKernelMode(KernelMode::AIV_MODE);
ICPU_RUN_KF(add_custom, blockDim, x, y, z);

// Save the result, then release the three buffers.
WriteFile("./output/output_z.bin", z, outputByteSize);
AscendC::GmFree((void *)x);
AscendC::GmFree((void *)y);
AscendC::GmFree((void *)z);
NPU側驗證:
cpp
複製程式碼
// Initialize AscendCL, select device 0, and create a context and a stream.
// NOTE(review): inputByteSize, outputByteSize and blockDim are assumed to be
// defined earlier in main.cpp — confirm.
CHECK_ACL(aclInit(nullptr));
aclrtContext context;
int32_t deviceId = 0;
CHECK_ACL(aclrtSetDevice(deviceId));
CHECK_ACL(aclrtCreateContext(&context, deviceId));
aclrtStream stream = nullptr;
CHECK_ACL(aclrtCreateStream(&stream));

// Allocate host and device buffers. The allocators receive the address of the
// pointer to fill, so the argument is cast to (void **); a (void) cast would
// discard the argument instead of passing it.
uint8_t *xHost, *yHost, *zHost;
uint8_t *xDevice, *yDevice, *zDevice;
CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputByteSize));
CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputByteSize));
CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize));
CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));

// Load the inputs on the host and copy them host -> device.
ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize);
ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize);
CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));

// Launch the kernel on the stream and wait for it to finish before reading z.
add_custom_do(blockDim, nullptr, stream, xDevice, yDevice, zDevice);
CHECK_ACL(aclrtSynchronizeStream(stream));

// Copy the result device -> host and write it to disk.
CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
WriteFile("./output/output_z.bin", zHost, outputByteSize);

// Release the buffers, then the stream, the context, the device and AscendCL.
CHECK_ACL(aclrtFree(xDevice));
CHECK_ACL(aclrtFree(yDevice));
CHECK_ACL(aclrtFree(zDevice));
CHECK_ACL(aclrtFreeHost(xHost));
CHECK_ACL(aclrtFreeHost(yHost));
CHECK_ACL(aclrtFreeHost(zHost));
CHECK_ACL(aclrtDestroyStream(stream));
CHECK_ACL(aclrtDestroyContext(context));
CHECK_ACL(aclrtResetDevice(deviceId));
CHECK_ACL(aclFinalize());
資料生成(gen_data.py):
python
複製程式碼
import os

import numpy as np


def gen_golden_data_simple():
    """Generate random float16 inputs and the expected Add result as .bin files.

    Writes two (8, 2048) float16 arrays drawn uniformly from [-100, 100) to
    ./input/input_x.bin and ./input/input_y.bin, and their element-wise sum
    (as float16) to ./output/golden.bin.
    """
    input_x = np.random.uniform(-100, 100, [8, 2048]).astype(np.float16)
    input_y = np.random.uniform(-100, 100, [8, 2048]).astype(np.float16)
    golden = (input_x + input_y).astype(np.float16)

    # tofile() does not create missing directories, so make sure they exist.
    os.makedirs("./input", exist_ok=True)
    os.makedirs("./output", exist_ok=True)

    input_x.tofile("./input/input_x.bin")
    input_y.tofile("./input/input_y.bin")
    golden.tofile("./output/golden.bin")


# The script guard needs the dunder names; a bare `name == "main"` raises
# NameError instead of running the generator.
if __name__ == "__main__":
    gen_golden_data_simple()
執行驗證:
設定環境變數:
bash
複製程式碼
# Point ASCEND_HOME_DIR at the toolkit's "latest" directory, under the same
# /usr/local/Ascend/ascend-toolkit prefix whose set_env.sh was sourced during setup.
export ASCEND_HOME_DIR=/usr/local/Ascend/ascend-toolkit/latest
執行指令碼:
bash
複製程式碼
# Replace <soc_version> and <run_mode> with real values before running.
# NOTE(review): run_mode presumably selects CPU-side or NPU-side verification — confirm against run.sh.
bash run.sh <soc_version> <run_mode>
透過以上步驟,即可完成Ascend C運算元的開發和驗證。