cuda 流

拾墨、發表於2024-03-31

如下圖,將多個執行相同核函式的任務透過 CUDA 流使它們併發執行,提升效率。

這很像 CPU 的流水線。

想讓下面這個核函式執行兩次,每次使用不同的引數。
我們需要用到 CUDA 的流來併發地執行,以提升效率。

// Averages three neighbouring samples from each of the two inputs, then
// stores the mean of the two averages into c. Launched as a 1-D grid of
// 1-D blocks; expects at least N threads in total (N is a file-level macro).
__global__ void kernel( int *a, int *b, int *c ) {
    // Flat global thread index over the 1-D launch.
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        // Neighbour indices wrap within a 256-element window, so reads
        // stay in-bounds regardless of idx.
        int idx1 = (idx + 1) % 256;
        int idx2 = (idx + 2) % 256;
        // Per-input 3-point average, accumulated in float.
        float   as = (a[idx] + a[idx1] + a[idx2]) / 3.0f;
        float   bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f;
        // Mean of the two averages, truncated back to int on store.
        c[idx] = (as + bs) / 2;
    }
}

實現:

#include<iostream>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

# define N (1024*1024)
# define FULL_DATA_SIZE (N*20)

// For each of the first N elements: take a 3-point average from `a` and
// from `b` (neighbours wrap inside a 256-element window), then write the
// mean of those two averages to `c`. Expects a 1-D launch with >= N threads.
__global__ void kernel(int* a, int* b, int* c)
{
	int tid = blockIdx.x * blockDim.x + threadIdx.x;
	if (tid >= N)
		return;  // tail guard: the grid may overshoot N

	// Wrapped neighbour indices — always within [0, 256).
	int next1 = (tid + 1) % 256;
	int next2 = (tid + 2) % 256;

	// Average three samples from each input in float precision.
	float avgA = (a[tid] + a[next1] + a[next2]) / 3.0f;
	float avgB = (b[tid] + b[next1] + b[next2]) / 3.0f;

	// Mean of the two averages; truncates back to int on store.
	c[tid] = (avgA + avgB) / 2;
}

// Demonstrates copy/compute overlap with two CUDA streams: the input is
// processed in chunks, each iteration feeding one N-element chunk to each
// stream so H2D copies, kernels, and D2H copies from the two streams can
// overlap on the hardware copy/compute engines.
int main(void)
{
    // Abort with a readable message when a CUDA runtime call fails.
    // Without this every failure in the original code was silent.
    auto checkCuda = [](cudaError_t err, const char* what) {
        if (err != cudaSuccess) {
            std::cerr << what << " failed: " << cudaGetErrorString(err) << std::endl;
            std::exit(EXIT_FAILURE);
        }
    };

    // Stream overlap only helps on devices whose copy engine can run
    // concurrently with kernels; warn (but continue) if it cannot.
    int device = 0;
    cudaDeviceProp prop;
    checkCuda(cudaGetDevice(&device), "cudaGetDevice");
    checkCuda(cudaGetDeviceProperties(&prop, device), "cudaGetDeviceProperties");
    if (!prop.deviceOverlap) {
        std::cerr << "Warning: device does not support copy/compute overlap; "
                     "streams will not speed this up." << std::endl;
    }

    cudaStream_t stream0, stream1;
    int* host_a, * host_b, * host_c;
    int* dev_a0, * dev_b0, * dev_c0;   // chunk-sized device buffers for stream0
    int* dev_a1, * dev_b1, * dev_c1;   // chunk-sized device buffers for stream1

    // Create the two streams the work will be interleaved across.
    checkCuda(cudaStreamCreate(&stream0), "cudaStreamCreate(stream0)");
    checkCuda(cudaStreamCreate(&stream1), "cudaStreamCreate(stream1)");

    // One N-element chunk of device storage per input/output, per stream;
    // the buffers are reused every loop iteration (stream ordering makes
    // that safe within each stream).
    checkCuda(cudaMalloc((void**)&dev_a0, N * sizeof(int)), "cudaMalloc(dev_a0)");
    checkCuda(cudaMalloc((void**)&dev_b0, N * sizeof(int)), "cudaMalloc(dev_b0)");
    checkCuda(cudaMalloc((void**)&dev_c0, N * sizeof(int)), "cudaMalloc(dev_c0)");
    checkCuda(cudaMalloc((void**)&dev_a1, N * sizeof(int)), "cudaMalloc(dev_a1)");
    checkCuda(cudaMalloc((void**)&dev_b1, N * sizeof(int)), "cudaMalloc(dev_b1)");
    checkCuda(cudaMalloc((void**)&dev_c1, N * sizeof(int)), "cudaMalloc(dev_c1)");

    // Pinned (page-locked) host memory is required for cudaMemcpyAsync to be
    // truly asynchronous, and transfers faster than pageable memory.
    checkCuda(cudaHostAlloc((void**)&host_a, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault), "cudaHostAlloc(host_a)");
    checkCuda(cudaHostAlloc((void**)&host_b, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault), "cudaHostAlloc(host_b)");
    checkCuda(cudaHostAlloc((void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault), "cudaHostAlloc(host_c)");

    // Fill the inputs with arbitrary data.
    for (int i = 0; i < FULL_DATA_SIZE; i++) {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    // Process 2*N elements per iteration: one N-element chunk per stream.
    // Work is enqueued breadth-first (both a-copies, both b-copies, both
    // kernels, both c-copies) so the two streams' transfers and kernels
    // can overlap instead of serializing behind each other.
    for (int i = 0; i < FULL_DATA_SIZE; i += N * 2)
    {
        checkCuda(cudaMemcpyAsync(dev_a0, host_a + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0), "H2D a0");
        checkCuda(cudaMemcpyAsync(dev_a1, host_a + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1), "H2D a1");

        checkCuda(cudaMemcpyAsync(dev_b0, host_b + i, N * sizeof(int), cudaMemcpyHostToDevice, stream0), "H2D b0");
        checkCuda(cudaMemcpyAsync(dev_b1, host_b + i + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1), "H2D b1");

        kernel<<<N / 256, 256, 0, stream0>>>(dev_a0, dev_b0, dev_c0);
        kernel<<<N / 256, 256, 0, stream1>>>(dev_a1, dev_b1, dev_c1);
        // Kernel launches do not return an error; fetch launch-config errors here.
        checkCuda(cudaGetLastError(), "kernel launch");

        checkCuda(cudaMemcpyAsync(host_c + i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0), "D2H c0");
        checkCuda(cudaMemcpyAsync(host_c + i + N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1), "D2H c1");
    }

    // Block until all queued work in both streams has finished;
    // host_c is only safe to read after this point.
    checkCuda(cudaStreamSynchronize(stream0), "cudaStreamSynchronize(stream0)");
    checkCuda(cudaStreamSynchronize(stream1), "cudaStreamSynchronize(stream1)");

    // Release pinned host memory, device buffers, and the streams.
    cudaFreeHost(host_a);
    cudaFreeHost(host_b);
    cudaFreeHost(host_c);
    cudaFree(dev_a0);
    cudaFree(dev_b0);
    cudaFree(dev_c0);
    cudaFree(dev_a1);
    cudaFree(dev_b1);
    cudaFree(dev_c1);
    cudaStreamDestroy(stream0);
    cudaStreamDestroy(stream1);

    return 0;
}