CUDA3.0的C++支援詳解
CUDA3特性中最吸引人的一條就是其對C++的支援,今天兄弟就切實的嘗試了一下,所有的程式碼都以向量相加為例子,體會分享如下:
函式過載與預設引數
允許在同一個類中出現多個同名但引數列表不同的過程或函式,執行時才確定呼叫函式的入口地址。記得在2.3中就已經開始支援了,但是一直都沒有嘗試,今天就試了一下,感覺還是不錯的,呵呵!程式碼如下:
其中定義的DEFAULT用於測試預設引數;同時還試驗了一下__restrict__關鍵字,但是有個限制,如果大家試驗一下相信很容易知道,呵呵!大家試試,看看是不是能夠出正確結果。
運算子過載
程式碼如下:
yyfn是一個結構體,其有兩個域,一個是money,一個是girl,運算子過載的使用使得程式清晰簡潔了不少。但是運算子過載應當不能用於核心,因為運算子過載要返回值。
模板
CUDA很早以前就已經支援這個,所以就不詳細說了,給了個例子,大家體會一下,呵呵!
函式物件
在C++中,函式物件廣泛用於演算法中,CUDA應該是為以後支援更多的演算法準備的。程式碼如下 :
本來還想試驗一下複雜的類機制,但是仔細一想,還是不打擊自己了,呵呵!
函式過載與預設引數
允許在同一個類中出現多個同名但引數列表不同的過程或函式,執行時才確定呼叫函式的入口地址。記得在2.3中就已經開始支援了,但是一直都沒有嘗試,今天就試了一下,感覺還是不錯的,呵呵!程式碼如下:
CODE:
// Demo: CUDA C++ function overloading and default arguments.
// Vector add with int and float overloads of add(); when DEFAULT is
// defined the int kernel calls add() with its default arguments (0,0),
// so the int results are all 0.
#include <stdio.h>
#define DEFAULT
#define BLOCKSIZE 256
// Device overload for int, with default arguments.
__device__ int add(int a=0,int b=0){
    return (a+b);
}
// Device overload for float.
__device__ float add(float a,float b){
    return (a+b);
}
// Element-wise int vector add; expects a 1-D launch covering num elements.
__global__ void add(const int* __restrict__ a,const int* __restrict__ b,int* __restrict__ c,const unsigned int num){
    const unsigned int id=blockDim.x*blockIdx.x+threadIdx.x;
    if(id<num){ // tail guard: the grid may overshoot num
#ifdef DEFAULT
        c[id]=add(); // default arguments: every element becomes 0+0
#else
        c[id]=add(a[id],b[id]);
#endif
    }
}
// Element-wise float vector add.
__global__ void add(const float* __restrict__ a,const float* __restrict__ b,float* __restrict__ c,const unsigned int num){
    const unsigned int id=blockIdx.x*blockDim.x+threadIdx.x;
    if(id<num){
        c[id]=add(a[id],b[id]);
    }
}
// Host driver: allocates pinned host + device buffers, runs both kernels,
// prints every 20th result, then frees everything.
int test(const unsigned int num){
    int *a;
    int *b;
    float *af;
    float *bf;
    // Pinned host memory so the cudaMemcpyAsync calls below are truly async.
    cudaMallocHost((void**)&a,num*sizeof(int));
    cudaMallocHost((void**)&b,num*sizeof(int));
    cudaMallocHost((void**)&af,num*sizeof(float));
    cudaMallocHost((void**)&bf,num*sizeof(float));
    for(unsigned int i=0;i<num;i++){
        a[i]=1;
        b[i]=2;
        af[i]=1.1f;
        bf[i]=2.2f;
    }
    int *d_a;
    cudaMalloc((void**)&d_a,num*sizeof(int));
    cudaMemcpyAsync(d_a,a,num*sizeof(int),cudaMemcpyHostToDevice,0);
    float *d_af;
    cudaMalloc((void**)&d_af,num*sizeof(float));
    cudaMemcpyAsync(d_af,af,num*sizeof(float),cudaMemcpyHostToDevice,0);
    int *d_b;
    cudaMalloc((void**)&d_b,num*sizeof(int));
    cudaMemcpyAsync(d_b,b,num*sizeof(int),cudaMemcpyHostToDevice,0);
    float *d_bf;
    cudaMalloc((void**)&d_bf,num*sizeof(float));
    // BUG FIX: original used num*sizeof(int) here; d_bf holds floats.
    cudaMemcpyAsync(d_bf,bf,num*sizeof(float),cudaMemcpyHostToDevice,0);
    int *c;
    cudaMallocHost((void**)&c,num*sizeof(int));
    float *cf;
    cudaMallocHost((void**)&cf,num*sizeof(float));
    int *d_c;
    cudaMalloc((void**)&d_c,num*sizeof(int));
    float *d_cf;
    cudaMalloc((void**)&d_cf,num*sizeof(float));
    // Ceil-division grid so every element is covered.
    add<<<(num+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE>>>(d_a,d_b,d_c,num);
    cudaMemcpyAsync(c,d_c,num*sizeof(int),cudaMemcpyDeviceToHost,0);
    add<<<(num+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE>>>(d_af,d_bf,d_cf,num);
    cudaMemcpyAsync(cf,d_cf,num*sizeof(float),cudaMemcpyDeviceToHost,0);
    // BUG FIX: the D2H copies are async; sync before the host reads c/cf.
    cudaDeviceSynchronize();
    for(unsigned int i=0;i<num;i++){
        if((i+1)%20==0) // sample every 20th element
            printf("%d ",c[i]);
    }
    printf(".......................................................\n");
    for(unsigned int i=0;i<num;i++){
        if((i+1)%20==0)
            printf("%f ",cf[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFreeHost(af);
    cudaFreeHost(bf);
    cudaFreeHost(cf);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_af);
    cudaFree(d_bf);
    cudaFree(d_cf);
    return 0;
}
int main(){
    test(10000);
}
// Demo (repeated listing): CUDA C++ function overloading and default
// arguments. Reconstructed to match the first listing of this program:
// the scrape dropped the #ifdef, the a[i]=1 initializer, the launch
// configurations, and the every-20th-element print guards.
#include <stdio.h>
#define DEFAULT
#define BLOCKSIZE 256
// Device overload for int, with default arguments.
__device__ int add(int a=0,int b=0){
    return (a+b);
}
// Device overload for float.
__device__ float add(float a,float b){
    return (a+b);
}
// Element-wise int vector add; expects a 1-D launch covering num elements.
__global__ void add(const int* __restrict__ a,const int* __restrict__ b,int* __restrict__ c,const unsigned int num){
    const unsigned int id=blockDim.x*blockIdx.x+threadIdx.x;
    if(id<num){ // tail guard: the grid may overshoot num
#ifdef DEFAULT
        c[id]=add(); // default arguments: every element becomes 0+0
#else
        c[id]=add(a[id],b[id]);
#endif
    }
}
// Element-wise float vector add.
__global__ void add(const float* __restrict__ a,const float* __restrict__ b,float* __restrict__ c,const unsigned int num){
    const unsigned int id=blockIdx.x*blockDim.x+threadIdx.x;
    if(id<num){
        c[id]=add(a[id],b[id]);
    }
}
// Host driver: allocates pinned host + device buffers, runs both kernels,
// prints every 20th result, then frees everything.
int test(const unsigned int num){
    int *a;
    int *b;
    float *af;
    float *bf;
    // Pinned host memory so the cudaMemcpyAsync calls below are truly async.
    cudaMallocHost((void**)&a,num*sizeof(int));
    cudaMallocHost((void**)&b,num*sizeof(int));
    cudaMallocHost((void**)&af,num*sizeof(float));
    cudaMallocHost((void**)&bf,num*sizeof(float));
    for(unsigned int i=0;i<num;i++){
        a[i]=1;
        b[i]=2;
        af[i]=1.1f;
        bf[i]=2.2f;
    }
    int *d_a;
    cudaMalloc((void**)&d_a,num*sizeof(int));
    cudaMemcpyAsync(d_a,a,num*sizeof(int),cudaMemcpyHostToDevice,0);
    float *d_af;
    cudaMalloc((void**)&d_af,num*sizeof(float));
    cudaMemcpyAsync(d_af,af,num*sizeof(float),cudaMemcpyHostToDevice,0);
    int *d_b;
    cudaMalloc((void**)&d_b,num*sizeof(int));
    cudaMemcpyAsync(d_b,b,num*sizeof(int),cudaMemcpyHostToDevice,0);
    float *d_bf;
    cudaMalloc((void**)&d_bf,num*sizeof(float));
    // BUG FIX: original used num*sizeof(int) here; d_bf holds floats.
    cudaMemcpyAsync(d_bf,bf,num*sizeof(float),cudaMemcpyHostToDevice,0);
    int *c;
    cudaMallocHost((void**)&c,num*sizeof(int));
    float *cf;
    cudaMallocHost((void**)&cf,num*sizeof(float));
    int *d_c;
    cudaMalloc((void**)&d_c,num*sizeof(int));
    float *d_cf;
    cudaMalloc((void**)&d_cf,num*sizeof(float));
    // Ceil-division grid so every element is covered.
    add<<<(num+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE>>>(d_a,d_b,d_c,num);
    cudaMemcpyAsync(c,d_c,num*sizeof(int),cudaMemcpyDeviceToHost,0);
    add<<<(num+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE>>>(d_af,d_bf,d_cf,num);
    cudaMemcpyAsync(cf,d_cf,num*sizeof(float),cudaMemcpyDeviceToHost,0);
    // BUG FIX: the D2H copies are async; sync before the host reads c/cf.
    cudaDeviceSynchronize();
    for(unsigned int i=0;i<num;i++){
        if((i+1)%20==0) // sample every 20th element
            printf("%d ",c[i]);
    }
    printf(".......................................................\n");
    for(unsigned int i=0;i<num;i++){
        if((i+1)%20==0)
            printf("%f ",cf[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFreeHost(af);
    cudaFreeHost(bf);
    cudaFreeHost(cf);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_af);
    cudaFree(d_bf);
    cudaFree(d_cf);
    return 0;
}
int main(){
    test(10000);
}
其中定義的DEFAULT用於測試預設引數;同時還試驗了一下__restrict__關鍵字,但是有個限制,如果大家試驗一下相信很容易知道,呵呵!大家試試,看看是不是能夠出正確結果。
運算子過載
程式碼如下:
CODE:
// Demo: CUDA C++ operator overloading on a user-defined struct.
// Adds two arrays of yyfn {money, girl} element-wise via operator+.
#include <stdio.h>
#define BLOCKSIZE 256
// 8-byte-aligned POD so one yyfn can be moved as a single 64-bit load/store.
typedef struct __align__(8){
    int money;
    int girl;
}yyfn;
// Component-wise addition, overloaded as a __device__ operator.
__device__ yyfn operator+ (yyfn& one,yyfn& two){
    yyfn temp;
    temp.money=one.money+two.money;
    temp.girl=one.girl+two.girl;
    return (temp);
}
// Element-wise struct vector add; expects a 1-D launch covering num elements.
__global__ void add(yyfn *a,yyfn *b,yyfn *c,const unsigned int num){
    const unsigned int id=blockDim.x*blockIdx.x+threadIdx.x;
    if(id<num){ // tail guard: the grid may overshoot num
        c[id]=(a[id]+b[id]);
    }
}
// Host driver: allocates pinned host + device buffers, runs the kernel,
// prints every 20th result, then frees everything.
int test(const unsigned int num){
    yyfn *a;
    yyfn *b;
    cudaMallocHost((void**)&a,num*sizeof(yyfn)); // pinned for async copies
    cudaMallocHost((void**)&b,num*sizeof(yyfn));
    for(unsigned int i=0;i<num;i++){
        a[i].money=1;
        a[i].girl=1;
        b[i].money=2;
        b[i].girl=1;
    }
    yyfn *d_a;
    cudaMalloc((void**)&d_a,num*sizeof(yyfn));
    cudaMemcpyAsync(d_a,a,num*sizeof(yyfn),cudaMemcpyHostToDevice,0);
    yyfn *d_b;
    cudaMalloc((void**)&d_b,num*sizeof(yyfn));
    cudaMemcpyAsync(d_b,b,num*sizeof(yyfn),cudaMemcpyHostToDevice,0);
    yyfn *c;
    cudaMallocHost((void**)&c,num*sizeof(yyfn));
    yyfn *d_c;
    cudaMalloc((void**)&d_c,num*sizeof(yyfn));
    // Ceil-division grid so every element is covered.
    add<<<(num+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE>>>(d_a,d_b,d_c,num);
    cudaMemcpyAsync(c,d_c,num*sizeof(yyfn),cudaMemcpyDeviceToHost,0);
    // BUG FIX: the D2H copy is async; sync before the host reads c.
    cudaDeviceSynchronize();
    for(unsigned int i=0;i<num;i++){
        if((i+1)%20==0) // sample every 20th element
            printf("money=%d,girl=%d ",c[i].money,c[i].girl);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
int main(){
    test(10000);
}
// Demo (repeated listing): CUDA C++ operator overloading on a struct.
// Reconstructed to match the first listing: the scrape dropped the kernel
// body, the a[i].money=1 initializer, the launch configuration, and the
// every-20th-element print guard.
#include <stdio.h>
#define BLOCKSIZE 256
// 8-byte-aligned POD so one yyfn can be moved as a single 64-bit load/store.
typedef struct __align__(8){
    int money;
    int girl;
}yyfn;
// Component-wise addition, overloaded as a __device__ operator.
__device__ yyfn operator+ (yyfn& one,yyfn& two){
    yyfn temp;
    temp.money=one.money+two.money;
    temp.girl=one.girl+two.girl;
    return (temp);
}
// Element-wise struct vector add; expects a 1-D launch covering num elements.
__global__ void add(yyfn *a,yyfn *b,yyfn *c,const unsigned int num){
    const unsigned int id=blockDim.x*blockIdx.x+threadIdx.x;
    if(id<num){ // tail guard: the grid may overshoot num
        c[id]=(a[id]+b[id]);
    }
}
// Host driver: allocates pinned host + device buffers, runs the kernel,
// prints every 20th result, then frees everything.
int test(const unsigned int num){
    yyfn *a;
    yyfn *b;
    cudaMallocHost((void**)&a,num*sizeof(yyfn)); // pinned for async copies
    cudaMallocHost((void**)&b,num*sizeof(yyfn));
    for(unsigned int i=0;i<num;i++){
        a[i].money=1;
        a[i].girl=1;
        b[i].money=2;
        b[i].girl=1;
    }
    yyfn *d_a;
    cudaMalloc((void**)&d_a,num*sizeof(yyfn));
    cudaMemcpyAsync(d_a,a,num*sizeof(yyfn),cudaMemcpyHostToDevice,0);
    yyfn *d_b;
    cudaMalloc((void**)&d_b,num*sizeof(yyfn));
    cudaMemcpyAsync(d_b,b,num*sizeof(yyfn),cudaMemcpyHostToDevice,0);
    yyfn *c;
    cudaMallocHost((void**)&c,num*sizeof(yyfn));
    yyfn *d_c;
    cudaMalloc((void**)&d_c,num*sizeof(yyfn));
    // Ceil-division grid so every element is covered.
    add<<<(num+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE>>>(d_a,d_b,d_c,num);
    cudaMemcpyAsync(c,d_c,num*sizeof(yyfn),cudaMemcpyDeviceToHost,0);
    // BUG FIX: the D2H copy is async; sync before the host reads c.
    cudaDeviceSynchronize();
    for(unsigned int i=0;i<num;i++){
        if((i+1)%20==0) // sample every 20th element
            printf("money=%d,girl=%d ",c[i].money,c[i].girl);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
int main(){
    test(10000);
}
yyfn是一個結構體,其有兩個域,一個是money,一個是girl,運算子過載的使用使得程式清晰簡潔了不少。但是運算子過載應當不能用於核心,因為運算子過載要返回值。
模板
CUDA很早以前就已經支援這個,所以就不詳細說了,給了個例子,大家體會一下,呵呵!
CODE:
// Demo: CUDA C++ templated kernel. Generic element-wise vector add;
// any T with operator+ works. Instantiated here with T = int.
#include <stdio.h>
#define BLOCKSIZE 256
// Generic element-wise add; expects a 1-D launch covering num elements.
template<typename T>
__global__ void add(const T* __restrict__ a,const T* __restrict__ b,T* __restrict__ c,const unsigned int num){
    const unsigned int id=blockDim.x*blockIdx.x+threadIdx.x;
    if(id<num){ // tail guard: the grid may overshoot num
        c[id]=(a[id]+b[id]);
    }
}
// Host driver: allocates pinned host + device buffers, runs the kernel,
// prints every 20th result, then frees everything.
int test(const unsigned int num){
    int *a;
    int *b;
    cudaMallocHost((void**)&a,num*sizeof(int)); // pinned for async copies
    cudaMallocHost((void**)&b,num*sizeof(int));
    for(unsigned int i=0;i<num;i++){
        a[i]=1;
        b[i]=2;
    }
    int *d_a;
    cudaMalloc((void**)&d_a,num*sizeof(int));
    cudaMemcpyAsync(d_a,a,num*sizeof(int),cudaMemcpyHostToDevice,0);
    int *d_b;
    cudaMalloc((void**)&d_b,num*sizeof(int));
    cudaMemcpyAsync(d_b,b,num*sizeof(int),cudaMemcpyHostToDevice,0);
    int *c;
    cudaMallocHost((void**)&c,num*sizeof(int));
    int *d_c;
    cudaMalloc((void**)&d_c,num*sizeof(int));
    // Ceil-division grid; T is deduced as int from the arguments.
    add<<<(num+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE>>>(d_a,d_b,d_c,num);
    cudaMemcpyAsync(c,d_c,num*sizeof(int),cudaMemcpyDeviceToHost,0);
    // BUG FIX: the D2H copy is async; sync before the host reads c.
    cudaDeviceSynchronize();
    for(unsigned int i=0;i<num;i++){
        if((i+1)%20==0) // sample every 20th element
            printf("%d ",c[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
int main(){
    test(10000);
}
// Demo (repeated listing): CUDA C++ templated kernel. Reconstructed to
// match the first listing: the scrape dropped the template parameter list,
// the kernel body, the a[i]=1 initializer, the launch configuration, and
// the every-20th-element print guard.
#include <stdio.h>
#define BLOCKSIZE 256
// Generic element-wise add; expects a 1-D launch covering num elements.
template<typename T>
__global__ void add(const T* __restrict__ a,const T* __restrict__ b,T* __restrict__ c,const unsigned int num){
    const unsigned int id=blockDim.x*blockIdx.x+threadIdx.x;
    if(id<num){ // tail guard: the grid may overshoot num
        c[id]=(a[id]+b[id]);
    }
}
// Host driver: allocates pinned host + device buffers, runs the kernel,
// prints every 20th result, then frees everything.
int test(const unsigned int num){
    int *a;
    int *b;
    cudaMallocHost((void**)&a,num*sizeof(int)); // pinned for async copies
    cudaMallocHost((void**)&b,num*sizeof(int));
    for(unsigned int i=0;i<num;i++){
        a[i]=1;
        b[i]=2;
    }
    int *d_a;
    cudaMalloc((void**)&d_a,num*sizeof(int));
    cudaMemcpyAsync(d_a,a,num*sizeof(int),cudaMemcpyHostToDevice,0);
    int *d_b;
    cudaMalloc((void**)&d_b,num*sizeof(int));
    cudaMemcpyAsync(d_b,b,num*sizeof(int),cudaMemcpyHostToDevice,0);
    int *c;
    cudaMallocHost((void**)&c,num*sizeof(int));
    int *d_c;
    cudaMalloc((void**)&d_c,num*sizeof(int));
    // Ceil-division grid; T is deduced as int from the arguments.
    add<<<(num+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE>>>(d_a,d_b,d_c,num);
    cudaMemcpyAsync(c,d_c,num*sizeof(int),cudaMemcpyDeviceToHost,0);
    // BUG FIX: the D2H copy is async; sync before the host reads c.
    cudaDeviceSynchronize();
    for(unsigned int i=0;i<num;i++){
        if((i+1)%20==0) // sample every 20th element
            printf("%d ",c[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
int main(){
    test(10000);
}
函式物件
在C++中,函式物件廣泛用於演算法中,CUDA應該是為以後支援更多的演算法準備的。程式碼如下 :
CODE:
// Demo: CUDA C++ function object (functor) passed by value to a kernel,
// STL-algorithm style. The kernel applies op element-wise.
#include <stdio.h>
#define BLOCKSIZE 256
// Stateless functor with a templated device call operator.
class Add{
public:
    template<typename T>
    __device__ T operator() (T& a,T& b) const {
        return (a+b);
    }
};
// Element-wise vector add via the functor; 1-D launch covering num elements.
__global__ void add(const int* __restrict__ a,const int* __restrict__ b,int* __restrict__ c,const unsigned int num,Add op){
    const unsigned int id=blockDim.x*blockIdx.x+threadIdx.x;
    if(id<num){ // tail guard: the grid may overshoot num
        c[id]=op(a[id],b[id]);
    }
}
// Host driver: allocates pinned host + device buffers, runs the kernel,
// prints every 20th result, then frees everything.
int test(const unsigned int num){
    int *a;
    int *b;
    cudaMallocHost((void**)&a,num*sizeof(int)); // pinned for async copies
    cudaMallocHost((void**)&b,num*sizeof(int));
    for(unsigned int i=0;i<num;i++){
        a[i]=1;
        b[i]=2;
    }
    int *d_a;
    cudaMalloc((void**)&d_a,num*sizeof(int));
    cudaMemcpyAsync(d_a,a,num*sizeof(int),cudaMemcpyHostToDevice,0);
    int *d_b;
    cudaMalloc((void**)&d_b,num*sizeof(int));
    cudaMemcpyAsync(d_b,b,num*sizeof(int),cudaMemcpyHostToDevice,0);
    int *c;
    cudaMallocHost((void**)&c,num*sizeof(int));
    int *d_c;
    cudaMalloc((void**)&d_c,num*sizeof(int));
    // Ceil-division grid; the functor is passed by value as a kernel argument.
    add<<<(num+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE>>>(d_a,d_b,d_c,num,Add());
    cudaMemcpyAsync(c,d_c,num*sizeof(int),cudaMemcpyDeviceToHost,0);
    // BUG FIX: the D2H copy is async; sync before the host reads c.
    cudaDeviceSynchronize();
    for(unsigned int i=0;i<num;i++){
        if((i+1)%20==0) // sample every 20th element
            printf("%d ",c[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
int main(){
    test(10000);
}
// Demo (repeated listing): CUDA C++ function object passed to a kernel.
// Reconstructed to match the first listing: the scrape dropped the template
// parameter list, the kernel body, the a[i]=1 initializer, the launch
// configuration, and the every-20th-element print guard.
#include <stdio.h>
#define BLOCKSIZE 256
// Stateless functor with a templated device call operator.
class Add{
public:
    template<typename T>
    __device__ T operator() (T& a,T& b) const {
        return (a+b);
    }
};
// Element-wise vector add via the functor; 1-D launch covering num elements.
__global__ void add(const int* __restrict__ a,const int* __restrict__ b,int* __restrict__ c,const unsigned int num,Add op){
    const unsigned int id=blockDim.x*blockIdx.x+threadIdx.x;
    if(id<num){ // tail guard: the grid may overshoot num
        c[id]=op(a[id],b[id]);
    }
}
// Host driver: allocates pinned host + device buffers, runs the kernel,
// prints every 20th result, then frees everything.
int test(const unsigned int num){
    int *a;
    int *b;
    cudaMallocHost((void**)&a,num*sizeof(int)); // pinned for async copies
    cudaMallocHost((void**)&b,num*sizeof(int));
    for(unsigned int i=0;i<num;i++){
        a[i]=1;
        b[i]=2;
    }
    int *d_a;
    cudaMalloc((void**)&d_a,num*sizeof(int));
    cudaMemcpyAsync(d_a,a,num*sizeof(int),cudaMemcpyHostToDevice,0);
    int *d_b;
    cudaMalloc((void**)&d_b,num*sizeof(int));
    cudaMemcpyAsync(d_b,b,num*sizeof(int),cudaMemcpyHostToDevice,0);
    int *c;
    cudaMallocHost((void**)&c,num*sizeof(int));
    int *d_c;
    cudaMalloc((void**)&d_c,num*sizeof(int));
    // Ceil-division grid; the functor is passed by value as a kernel argument.
    add<<<(num+BLOCKSIZE-1)/BLOCKSIZE,BLOCKSIZE>>>(d_a,d_b,d_c,num,Add());
    cudaMemcpyAsync(c,d_c,num*sizeof(int),cudaMemcpyDeviceToHost,0);
    // BUG FIX: the D2H copy is async; sync before the host reads c.
    cudaDeviceSynchronize();
    for(unsigned int i=0;i<num;i++){
        if((i+1)%20==0) // sample every 20th element
            printf("%d ",c[i]);
    }
    cudaFreeHost(a);
    cudaFreeHost(b);
    cudaFreeHost(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
int main(){
    test(10000);
}
本來還想試驗一下複雜的類機制,但是仔細一想,還是不打擊自己了,呵呵!
來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/23057064/viewspace-630474/,如需轉載,請註明出處,否則將追究法律責任。
相關文章
- 詳解C++引用C++
- C++ stl容器詳解C++
- c++ vector用法詳解C++
- C/C++的轉義字元詳解C++字元
- 關於C++中物件與類的詳解及其作用詳解C++物件
- C++引用型別詳解C++型別
- 詳解C++完美轉發C++
- C++中map的使用詳解說明C++
- C++運算子過載詳解C++
- c++ 智慧指標用法詳解C++指標
- C++之string型別詳解C++型別
- C++ typeid關鍵字詳解C++
- 匿名函式(lambda)詳解 C++函式C++
- C++中指標與引用詳解C++指標
- 詳解C++中繼承的基本內容C++中繼繼承
- C++指標的概念解讀 超詳細C++指標
- 詳解C++的模板中typename關鍵字的用法C++
- C/C++語言精髓 *和&詳解C++
- C++移動語義 詳細講解【Cherno C++教程】C++
- 詳解c++指標的指標和指標的引用C++指標
- 關於C++類的定義和物件的建立詳解C++物件
- 詳解C++中的多型和虛擬函式C++多型函式
- C++拷貝建構函式詳解C++函式
- C++六種記憶體序詳解C++記憶體
- C++ Break、Continue 和 陣列操作詳解C++陣列
- C/C++記憶體對齊詳解C++記憶體
- C++ 單例類别範本(詳解)C++單例
- C++類和物件是什麼?C++類和物件詳解C++物件
- visual studio原生支援C++的含義C++
- NDT演算法詳解與C++實現演算法C++
- 整合測試——Spring TestContext框架支援詳細解說SpringContext框架
- C++ 指標和迭代器支援的操作C++指標
- NOIP 2017初賽普及組C/C++答案詳解C++
- CMU15445 之 Project#0 - C++ Primer 詳解ProjectC++
- C++ STL:std::unorderd_map 物理結構詳解C++
- C++實現二分法詳解C++
- Python安裝MySQL庫詳解(解決Microsoft Visual C++ 9.0 is required )PythonMySqlROSC++UI
- 詳解 ARM64 核心中對 52 位虛擬地址的支援
- C++ Qt開發:Charts折線圖繪製詳解C++QT