下面的 CUDA 程式碼,為什麼在指定 compute capability 8.0 之後,在 A100 上執行的結果不正確?
nvcc a.cu # 正確
nvcc a.cu -gencode arch=compute_80,code=sm_80 # 結果不正確
// a.cu
#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>
// Scalar element type shared by the constant table, the kernel argument and the host buffers.
using ValueType = float;
// Ten-element table declared in __constant__ memory; main() fills it with
// cudaMemcpyToSymbol and aa_kernel reads it.
__constant__ ValueType col_idx[10];
// Debug kernel: every thread prints its threadIdx.x, then the threads with
// index 0..9 copy the matching entry of the __constant__ table col_idx into x.
// Only threadIdx.x is used for indexing, so x must hold at least 10 elements;
// threads with index >= 10 write nothing.
__global__ void aa_kernel(ValueType *x) {
    const int tid = threadIdx.x;
    printf("idx = %d\n", tid);

    // col_idx has exactly ten entries; higher-numbered threads have no work.
    if (tid >= 10) {
        return;
    }
    x[tid] = col_idx[tid];
}
// Host driver: fills col_idx with 1, 2, 4, ..., 512, launches aa_kernel with
// one block of 32 threads on a user-created stream, copies the device buffer
// back and prints its first 12 elements.
//
// Every CUDA runtime call and the kernel launch itself are checked. Without
// these checks a failed launch (for example, a binary that contains no code
// the installed device/driver can run) is silent: the kernel never executes
// and the program just prints whatever the device buffer happened to contain.
//
// Returns 0 on success; exits with EXIT_FAILURE after printing the CUDA error
// name and message if any runtime call fails.
int main() {
    constexpr int kTableSize = 10;  // number of entries in col_idx
    constexpr int kThreads = 32;    // threads in the single block / elements in x_d
    constexpr int kPrinted = 12;    // how many results are printed

    // Report the failing call with the runtime's own diagnostic and stop;
    // continuing after a failure would only print meaningless data.
    auto check = [](cudaError_t status, const char *what) {
        if (status != cudaSuccess) {
            fprintf(stderr, "%s failed: %s (%s)\n", what,
                    cudaGetErrorName(status), cudaGetErrorString(status));
            exit(EXIT_FAILURE);
        }
    };

    // Host copy of the table: powers of two starting at 1.
    ValueType col_h[kTableSize];
    col_h[0] = ValueType(1);
    for (int i = 1; i < kTableSize; i++) col_h[i] = 2 * col_h[i - 1];

    cudaStream_t stream;
    check(cudaStreamCreate(&stream), "cudaStreamCreate");

    ValueType *x_d = nullptr;
    check(cudaMalloc(&x_d, kThreads * sizeof(ValueType)), "cudaMalloc");

    // The kernel writes only the first kTableSize elements, but more than that
    // is printed below. Zero the whole buffer (on the same stream as the
    // kernel, so it is ordered before it) so the remaining elements are
    // well-defined instead of uninitialized device memory.
    check(cudaMemsetAsync(x_d, 0, kThreads * sizeof(ValueType), stream),
          "cudaMemsetAsync");

    check(cudaMemcpyToSymbol(col_idx, col_h, kTableSize * sizeof(ValueType)),
          "cudaMemcpyToSymbol");

    aa_kernel<<<1, kThreads, 0, stream>>>(x_d);
    // Launch errors (bad configuration, no kernel image for this device, ...)
    // are reported only through cudaGetLastError().
    check(cudaGetLastError(), "aa_kernel launch");
    // Errors raised while the kernel runs surface at the next synchronization.
    check(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    ValueType x[kThreads] = {0};
    check(cudaMemcpy(x, x_d, kThreads * sizeof(ValueType), cudaMemcpyDeviceToHost),
          "cudaMemcpy");

    for (int i = 0; i < kPrinted; i++) printf("%f\n", x[i]);

    check(cudaStreamDestroy(stream), "cudaStreamDestroy");
    check(cudaFree(x_d), "cudaFree");
    return 0;
}