Using the P2P and UVA Features in CUDA 4.0
CUDA 4.0 was recently made available to registered developers, adding a number of new features. Among them, Peer-to-Peer (P2P) memory access and the Unified Virtual Address Space (UVA) have attracted the most attention. This post walks through the simpleP2P example from the SDK, which demonstrates how to use both features. The code is as follows:

/*
* Copyright 1993-2011 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
/*
* This sample demonstrates a combination of Peer-to-Peer (P2P) and Unified
* Virtual Address Space (UVA) features new to SDK 4.0
*/
#include <stdlib.h>
#include <stdio.h>
#include <cutil_inline.h>
const char *sSDKsample = "simpleP2P";
__global__ void SimpleKernel(float *src, float *dst)
{
// Just a dummy kernel, doing enough for us to verify that everything
// worked
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
dst[idx] = src[idx] * 2.0f;
}
int main(int argc, char **argv)
{
printf("[%s] starting...n", sSDKsample);
// Number of GPUs
printf("Checking for multiple GPUs...n");
int gpu_n;
cutilSafeCall(cudaGetDeviceCount(&gpu_n));
printf("CUDA-capable device count: %in", gpu_n);
if (gpu_n < 2)
{
printf("Two or more Tesla(s) with (SM 2.0) class GPUs are required for %s.n", sSDKsample);
printf("Waiving test.n");
printf("PASSEDn");
exit(EXIT_SUCCESS);
}
// Query device properties
cudaDeviceProp prop_0, prop_1;
cutilSafeCall(cudaGetDeviceProperties(&prop_0, 0));
cutilSafeCall(cudaGetDeviceProperties(&prop_1, 1));
// Check for TCC
#ifdef _WIN32
if (prop_0.tccDriver == 0 || prop_1.tccDriver == 0)
{
printf("Need to have both GPUs running under TCC driver to use P2P / UVA functionality.n");
printf("PASSEDn");
exit(EXIT_SUCCESS);
}
#endif // WIN32
// Check possibility for peer access
printf("Checking for peer access...n");
int can_access_peer_0_1, can_access_peer_1_0;
cutilSafeCall(cudaDeviceCanAccessPeer(&can_access_peer_0_1, 0, 1));
cutilSafeCall(cudaDeviceCanAccessPeer(&can_access_peer_1_0, 1, 0));
if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0)
{
printf("Two or more Tesla(s) with (SM 2.0) class GPUs are required for %s.n", sSDKsample);
printf("Peer access is not available between GPU0 GPU1, waiving test.n");
printf("PASSEDn");
exit(EXIT_SUCCESS);
}
// Enable peer access
printf("Enabling peer access...n");
cutilSafeCall(cudaSetDevice(0));
cutilSafeCall(cudaDeviceEnablePeerAccess(1, 0));
cutilSafeCall(cudaSetDevice(1));
cutilSafeCall(cudaDeviceEnablePeerAccess(0, 0));
// Check that we got UVA on both devices
printf("Checking for UVA...n");
const bool has_uva = prop_0.unifiedAddressing && prop_1.unifiedAddressing;
if (has_uva == false)
{
printf("At least one of the two GPUs has no UVA supportn");
}
// Allocate buffers
const size_t buf_size = 1024 * 1024 * 16 * sizeof(float);
printf("Allocating buffers (%iMB on GPU0, GPU1 and Host)...n", int(buf_size / 1024 / 1024));
cutilSafeCall(cudaSetDevice(0));
float* g0;
cutilSafeCall(cudaMalloc(&g0, buf_size));
cutilSafeCall(cudaSetDevice(1));
float* g1;
cutilSafeCall(cudaMalloc(&g1, buf_size));
float* h0;
if (has_uva)
cutilSafeCall(cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA
else
cutilSafeCall(cudaHostAlloc(&h0, buf_size, cudaHostAllocPortable));
float *g0_peer, *g1_peer;
if (has_uva == false)
{
// Need explicit mapping without UVA
cutilSafeCall(cudaSetDevice(0));
cutilSafeCall(cudaPeerRegister(g1, 1, cudaPeerRegisterMapped));
cutilSafeCall(cudaPeerGetDevicePointer((void **) &g1_peer, g1, 1, 0));
cutilSafeCall(cudaSetDevice(1));
cutilSafeCall(cudaPeerRegister(g0, 0, cudaPeerRegisterMapped));
cutilSafeCall(cudaPeerGetDevicePointer((void **) &g0_peer, g0, 0, 0));
}
// Create CUDA event handles
printf("Creating event handles...n");
cudaEvent_t start_event, stop_event;
float time_memcpy;
int eventflags = cudaEventBlockingSync;
cutilSafeCall(cudaEventCreateWithFlags(&start_event, eventflags));
cutilSafeCall(cudaEventCreateWithFlags(&stop_event, eventflags));
// P2P memcopy() benchmark
cutilSafeCall(cudaEventRecord(start_event, 0));
for (int i=0; i<100; i++)
{
// With UVA we don't need to specify source and target devices, the
// runtime figures this out by itself from the pointers
if (has_uva)
{
// Ping-pong copy between GPUs
if (i % 2 == 0)
cutilSafeCall(cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault));
else
cutilSafeCall(cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault));
}
else
{
// Ping-pong copy between GPUs
if (i % 2 == 0)
cutilSafeCall(cudaMemcpyPeer(g1, 1, g0, 0, buf_size));
else
cutilSafeCall(cudaMemcpyPeer(g0, 0, g1, 1, buf_size));
}
}
cutilSafeCall(cudaEventRecord(stop_event, 0));
cutilSafeCall(cudaEventSynchronize(stop_event));
cutilSafeCall(cudaEventElapsedTime(&time_memcpy, start_event, stop_event));
printf("cudaMemcpyPeer / cudaMemcpy between GPU0 and GPU1: %.2fGB/sn",
(1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f);
// Prepare host buffer and copy to GPU 0
printf("Preparing host buffer and memcpy to GPU0...n");
for (int i=0; i
h0[i] = float(i % 4096);
}
cutilSafeCall(cudaSetDevice(0));
if (has_uva)
cutilSafeCall(cudaMemcpy(g0, h0, buf_size, cudaMemcpyDefault));
else
cutilSafeCall(cudaMemcpy(g0, h0, buf_size, cudaMemcpyHostToDevice));
// Kernel launch configuration
const dim3 threads(512, 1);
const dim3 blocks((buf_size / sizeof(float)) / threads.x, 1);
// Run kernel on GPU 1, reading input from the GPU 0 buffer, writing
// output to the GPU 1 buffer
printf("Run kernel on GPU1, taking source data from GPU0 and writing to GPU1...n");
cutilSafeCall(cudaSetDevice(1));
if (has_uva)
SimpleKernel<<
else
SimpleKernel<<
// Run kernel on GPU 0, reading input from the GPU 1 buffer, writing
// output to the GPU 0 buffer
printf("Run kernel on GPU0, taking source data from GPU1 and writing to GPU0...n");
cutilSafeCall(cudaSetDevice(0));
if (has_uva)
SimpleKernel<<
else
SimpleKernel<<
// Copy data back to host and verify
printf("Copy data back to host from GPU0 and verify...n");
if (has_uva)
cutilSafeCall(cudaMemcpy(h0, g0, buf_size, cudaMemcpyDefault));
else
cutilSafeCall(cudaMemcpy(h0, g0, buf_size, cudaMemcpyHostToDevice));
int error_count = 0;
for (int i = 0; i < buf_size / sizeof(float); i++)
{
// Re-generate input data and apply 2x '* 2.0f' computation of both
// kernel runs
if (h0[i] != float(i % 4096) * 2.0f * 2.0f)
{
printf("Verification error, element %in", i);
if (error_count++ > 10)
break;
}
}
printf((error_count == 0) ? "PASSED\n" : "FAILED\n");
// Disable peer access (also unregisters memory for non-UVA cases)
printf("Enabling peer access...n");
cutilSafeCall(cudaSetDevice(0));
cutilSafeCall(cudaDeviceDisablePeerAccess(1));
cutilSafeCall(cudaSetDevice(1));
cutilSafeCall(cudaDeviceDisablePeerAccess(0));
// Cleanup and shutdown
printf("Shutting down...n");
cutilSafeCall(cudaEventDestroy(start_event));
cutilSafeCall(cudaEventDestroy(stop_event));
cutilSafeCall(cudaSetDevice(0));
cutilSafeCall(cudaFree(g0));
cutilSafeCall(cudaSetDevice(1));
cutilSafeCall(cudaFree(g1));
cutilSafeCall(cudaFreeHost(h0));
cudaDeviceReset();
cutilExit(argc, argv);
}
Of particular note is the peer-access check:

// Check possibility for peer access
printf("Checking for peer access...\n");
int can_access_peer_0_1, can_access_peer_1_0;
cutilSafeCall(cudaDeviceCanAccessPeer(&can_access_peer_0_1, 0, 1));
cutilSafeCall(cudaDeviceCanAccessPeer(&can_access_peer_1_0, 1, 0));
if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0)
{
printf("Two or more Tesla(s) with (SM 2.0) class GPUs are required for %s.n", sSDKsample);
printf("Peer access is not available between GPU0 GPU1, waiving test.n");
printf("PASSEDn");
exit(EXIT_SUCCESS);
}
As this code shows, only Fermi-architecture Tesla cards currently support the P2P feature. Because UVA requires a single unified address space, the program must be compiled as a 64-bit binary in order to build and run successfully. Moreover, where UVA is supported (in practice, any card that supports P2P should also support UVA), cudaMemcpyDefault can replace the explicit direction flags such as cudaMemcpyHostToDevice, and kernel launches no longer need a separately obtained mapped address for each device's memory, which greatly reduces the amount of code that has to be written.
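To make that contrast concrete, here is a minimal standalone sketch of the UVA-only path. It is not part of the SDK sample: the ScaleKernel name and the buffer size are invented for illustration, and cutil-style error checking is omitted for brevity. It assumes two P2P-capable (SM 2.0) GPUs and a 64-bit build, e.g. nvcc -m64 -arch=sm_20 uva_sketch.cu.

#include <stdio.h>

__global__ void ScaleKernel(const float *src, float *dst, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = src[i] * 2.0f;
}

int main(void)
{
    const int n = 1 << 20;                         // illustrative buffer size
    const size_t bytes = n * sizeof(float);

    // Enable peer access in both directions (error checking omitted).
    cudaSetDevice(0);
    cudaDeviceEnablePeerAccess(1, 0);
    cudaSetDevice(1);
    cudaDeviceEnablePeerAccess(0, 0);

    float *g0, *g1, *h;
    cudaSetDevice(0);
    cudaMalloc(&g0, bytes);
    cudaSetDevice(1);
    cudaMalloc(&g1, bytes);
    cudaMallocHost(&h, bytes);                     // pinned; automatically portable under UVA
    for (int i = 0; i < n; i++)
        h[i] = 1.0f;

    // Under UVA a single flag, cudaMemcpyDefault, covers every direction:
    // the runtime infers host/device and source/destination from the pointers.
    cudaMemcpy(g0, h, bytes, cudaMemcpyDefault);   // host -> GPU0
    cudaMemcpy(g1, g0, bytes, cudaMemcpyDefault);  // GPU0 -> GPU1 over P2P

    // A kernel running on GPU1 can dereference GPU0's buffer directly;
    // no cudaPeerRegister / cudaPeerGetDevicePointer step is needed.
    cudaSetDevice(1);
    ScaleKernel<<<(n + 255) / 256, 256>>>(g0, g1, n);

    cudaMemcpy(h, g1, bytes, cudaMemcpyDefault);   // GPU1 -> host
    printf("h[0] = %f (expected 2.0)\n", h[0]);

    cudaSetDevice(0);
    cudaFree(g0);
    cudaSetDevice(1);
    cudaFree(g1);
    cudaFreeHost(h);
    return 0;
}

Compare this with the non-UVA branch of the sample above, which needs cudaMemcpyPeer with explicit device IDs plus a registration step for every peer buffer; the UVA version drops both.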
This article originally appeared on the CSDN blog; please credit the source when reposting: http://blog.csdn.net/dreampursue/archive/2011/03/17/6256426.aspx