CUDA一維紋理記憶體
紋理一詞來源於GPU圖形世界,GPU通用平行計算“盜用”了紋理一詞,定義了一個紋理記憶體的概念。紋理記憶體快取在裝置上,在某些情況下能減少對記憶體的請求並降低記憶體頻寬的使用,是專門為那些在記憶體訪問模式中存在大量空間區域性的圖形應用而設計,意味著一個執行緒讀取的位置可能與鄰近執行緒讀取的位置“非常接近”。對於GPU核心而言,紋理記憶體是唯讀記憶體,並且只有通過特殊的紋理API才能對其訪問。
紋理記憶體依維度可分為一維、二維(以及三維)紋理記憶體。理解紋理記憶體最好的方式是丟掉“紋理”兩個字:紋理記憶體本質上是一塊記憶體,是GPU在特定應用中對一維、二維變數的特殊宣告定義以及特殊使用,這種特殊使用能夠減少記憶體流量,提升運算效能。
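紋理變數(引用)必須宣告為檔案作用域內的全域性變數,這裡先探討一下一維紋理記憶體的使用方法。(注意:本文使用的紋理引用API,即texture<>、cudaBindTexture()、cudaUnbindTexture(),在較新的CUDA版本中已被棄用,並自CUDA 12.0起移除;使用新版工具包時需改用紋理物件cudaTextureObject_t。)一維紋理記憶體的關鍵操作如下: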
- 1、用texture<型別>型別宣告。
如宣告一個unsigned char 型的一維紋理tex1,格式為:
texture<unsigned char,1,cudaReadModeElementType> tex1;
- 2、通過cudaBindTexture()繫結到紋理記憶體中,並關聯到對應的資料上。
如將unsigned char型別的dev_A繫結到一維紋理tex1上,格式為:
cudaBindTexture(0,tex1,dev_A);
注意繫結本身並不會搬移資料:它只是把已經位於裝置記憶體中的dev_A與紋理引用tex1關聯起來,之後核函式通過tex1讀取時才會經過紋理快取。由於紋理引用是檔案作用域內的全域性變數,在核函式中就可以直接訪問,不再需要把指標作為引數額外傳入。
- 3、 通過tex1Dfetch()來讀取紋理記憶體中的資料。
紋理記憶體是一種特殊的記憶體,需要使用特定的紋理API來訪問其中的資料。如訪問tex1陣列的第3個元素,格式為:
tex1Dfetch(tex1,2);
- 4、 通過cudaUnbindTexture()取消繫結紋理記憶體。
紋理記憶體使用完之後需要取消繫結。取消繫結只是解除紋理引用與資料之間的關聯,並不會釋放裝置記憶體,底層的記憶體仍需另外呼叫cudaFree()釋放。如解除紋理tex1的繫結,格式為:
cudaUnbindTexture(tex1);
考慮一個簡單的應用,把一個長度是100的向量A中的資料拷貝到一個向量B中,使用普通CPU程式設計實現如下:
- #include <iostream>
- using namespace std;
- #define _length 100
// Copies the first `length` elements of listSource into listTarget on the CPU.
// Both arrays must hold at least `length` elements; a non-positive `length`
// copies nothing.
void Copy_CPU(unsigned int * listSource, unsigned int * listTarget, int length)
{
    const unsigned int * reader = listSource;
    unsigned int * writer = listTarget;

    // Walk both arrays in lock-step, one element per iteration.
    for (int remaining = length; remaining > 0; --remaining)
    {
        *writer++ = *reader++;
    }
}
- int main()
- {
- unsigned int * listSource = new unsigned int[_length];
- unsigned int * listTarget = new unsigned int[_length];
- //賦值
- for (int i = 0; i < _length; i++)
- {
- listSource[i] = i;
- }
- //呼叫CPU複製函式
- Copy_CPU(listSource, listTarget, _length);
- cout << "原始資料: ";
- for (int i = 0; i < _length; i++)
- {
- cout << listSource[i] << " ";
- }
- cout << endl << endl << "通過CPU拷貝的資料: ";
- for (int i = 0; i < _length; i++)
- {
- cout << listTarget[i] << " ";
- }
- getchar();
- }
-
#include <iostream>
-
-
using
namespace
std;
-
-
#define _length 100
-
-
// Copies the first `length` elements of listSource into listTarget on the CPU.
// Both arrays must hold at least `length` elements; a non-positive `length`
// copies nothing.
void Copy_CPU(unsigned int * listSource, unsigned int * listTarget, int length)
{
    const unsigned int * reader = listSource;
    unsigned int * writer = listTarget;

    // Walk both arrays in lock-step, one element per iteration.
    for (int remaining = length; remaining > 0; --remaining)
    {
        *writer++ = *reader++;
    }
}
-
-
int main()
-
{
-
unsigned
int * listSource =
new
unsigned
int[_length];
-
unsigned
int * listTarget =
new
unsigned
int[_length];
-
-
//賦值
-
for (
int i =
0; i < _length; i++)
-
{
-
listSource[i] = i;
-
}
-
-
//呼叫CPU複製函式
-
Copy_CPU(listSource, listTarget, _length);
-
-
cout <<
"原始資料: ";
-
for (
int i =
0; i < _length; i++)
-
{
-
cout << listSource[i] <<
" ";
-
}
-
cout <<
endl <<
endl <<
"通過CPU拷貝的資料: ";
-
for (
int i =
0; i < _length; i++)
-
{
-
cout << listTarget[i] <<
" ";
-
}
-
getchar();
-
}
執行結果:
使用GPU程式設計,普通變數程式設計實現:
- #include"cuda_runtime.h"
- #include"device_launch_parameters.h"
- #include<iostream>
- #define _length 100
- using namespace std;
- //宣告要呼叫的Copy_GPU函式
- extern "C" void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length);
// Demo entry point: fills a host array with 0.._length-1, copies it through
// the GPU with Copy_GPU, prints both arrays, then waits for Enter.
// Returns int as standard C++ requires (`void main` is rejected by
// conforming host compilers).
int main(int argc, char** argv)
{
    unsigned int *listSource = new unsigned int[_length];
    // Value-initialised to zero so the printout is well defined even when
    // the GPU copy could not be performed.
    unsigned int *listTarget = new unsigned int[_length]();

    // Fill the source with its own indices so a correct copy is easy to spot.
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Copy_GPU launches the device kernel and writes the result to listTarget.
    Copy_GPU(listSource, listTarget, _length);

    cout << "原始資料: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }
    cout << endl << endl << "通過GPU普通記憶體拷貝的資料: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    // Keep the console open until the user presses Enter. cin.get() comes
    // from <iostream>, which is already included; getchar() needs <cstdio>.
    cin.get();

    // Release the heap arrays allocated above.
    delete[] listSource;
    delete[] listTarget;
    return 0;
}
// Device kernel: each thread copies one element of listSource to the same
// position of listTarget. Expects a 1D grid of 1D blocks; threads whose
// global index is not below `size` do nothing.
__global__ void Blending_Texture(unsigned int* listSource, unsigned int* listTarget, int size)
{
    // Global element index derived from the block and thread ids.
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    if (gid >= size)
    {
        return;
    }
    listTarget[gid] = listSource[gid];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Copies `length` unsigned ints from the host array listSource to the host
// array listTarget by way of the GPU: upload, run Blending_Texture, download.
//
// listSource / listTarget : host arrays holding at least `length` elements.
// length                  : number of elements to copy; <= 0 is a no-op.
//
// A failing CUDA call is reported on stderr and the remaining steps are
// skipped, in which case listTarget may not have been filled. The device
// buffers are released on every path.
void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length)
{
    if (listSource == nullptr || listTarget == nullptr || length <= 0)
    {
        return;  // nothing to copy, and a zero-block launch would be invalid
    }

    const size_t data_size = static_cast<size_t>(length) * sizeof(unsigned int);

    // Round the block count UP so a length that is not a multiple of the
    // block size is still fully covered (the kernel guards the surplus).
    const int threadsPerBlock = 128;
    const int blockCount = (length - 1) / threadsPerBlock + 1;

    unsigned int *dev_Source = nullptr;
    unsigned int *dev_Target = nullptr;

    // Allocate device buffers and upload the source data.
    bool ok = cudaOk(cudaMalloc((void**)&dev_Source, data_size), "cudaMalloc(dev_Source)")
           && cudaOk(cudaMalloc((void**)&dev_Target, data_size), "cudaMalloc(dev_Target)")
           && cudaOk(cudaMemcpy(dev_Source, listSource, data_size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok)
    {
        Blending_Texture<<<blockCount, threadsPerBlock>>>(dev_Source, dev_Target, length);

        // A launch does not return a status; query it explicitly. The
        // blocking cudaMemcpy afterwards also surfaces execution errors.
        ok = cudaOk(cudaGetLastError(), "Blending_Texture launch")
          && cudaOk(cudaMemcpy(listTarget, dev_Target, data_size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    // cudaFree accepts a null pointer, so this is safe after a failed cudaMalloc.
    cudaFree(dev_Source);
    cudaFree(dev_Target);
}
-
#include"cuda_runtime.h"
-
#include"device_launch_parameters.h"
-
#include<iostream>
-
-
#define _length 100
-
-
using
namespace
std;
-
-
//宣告要呼叫的Copy_GPU函式
-
extern
"C"
void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length);
-
-
// Demo entry point: fills a host array with 0.._length-1, copies it through
// the GPU with Copy_GPU, prints both arrays, then waits for Enter.
// Returns int as standard C++ requires (`void main` is rejected by
// conforming host compilers).
int main(int argc, char** argv)
{
    unsigned int *listSource = new unsigned int[_length];
    // Value-initialised to zero so the printout is well defined even when
    // the GPU copy could not be performed.
    unsigned int *listTarget = new unsigned int[_length]();

    // Fill the source with its own indices so a correct copy is easy to spot.
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Copy_GPU launches the device kernel and writes the result to listTarget.
    Copy_GPU(listSource, listTarget, _length);

    cout << "原始資料: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }
    cout << endl << endl << "通過GPU普通記憶體拷貝的資料: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    // Keep the console open until the user presses Enter. cin.get() comes
    // from <iostream>, which is already included; getchar() needs <cstdio>.
    cin.get();

    // Release the heap arrays allocated above.
    delete[] listSource;
    delete[] listTarget;
    return 0;
}
-
-
// Device kernel: each thread copies one element of listSource to the same
// position of listTarget. Expects a 1D grid of 1D blocks; threads whose
// global index is not below `size` do nothing.
__global__ void Blending_Texture(unsigned int* listSource, unsigned int* listTarget, int size)
{
    // Global element index derived from the block and thread ids.
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    if (gid >= size)
    {
        return;
    }
    listTarget[gid] = listSource[gid];
}
-
-
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Copies `length` unsigned ints from the host array listSource to the host
// array listTarget by way of the GPU: upload, run Blending_Texture, download.
//
// listSource / listTarget : host arrays holding at least `length` elements.
// length                  : number of elements to copy; <= 0 is a no-op.
//
// A failing CUDA call is reported on stderr and the remaining steps are
// skipped, in which case listTarget may not have been filled. The device
// buffers are released on every path.
void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length)
{
    if (listSource == nullptr || listTarget == nullptr || length <= 0)
    {
        return;  // nothing to copy, and a zero-block launch would be invalid
    }

    const size_t data_size = static_cast<size_t>(length) * sizeof(unsigned int);

    // Round the block count UP so a length that is not a multiple of the
    // block size is still fully covered (the kernel guards the surplus).
    const int threadsPerBlock = 128;
    const int blockCount = (length - 1) / threadsPerBlock + 1;

    unsigned int *dev_Source = nullptr;
    unsigned int *dev_Target = nullptr;

    // Allocate device buffers and upload the source data.
    bool ok = cudaOk(cudaMalloc((void**)&dev_Source, data_size), "cudaMalloc(dev_Source)")
           && cudaOk(cudaMalloc((void**)&dev_Target, data_size), "cudaMalloc(dev_Target)")
           && cudaOk(cudaMemcpy(dev_Source, listSource, data_size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok)
    {
        Blending_Texture<<<blockCount, threadsPerBlock>>>(dev_Source, dev_Target, length);

        // A launch does not return a status; query it explicitly. The
        // blocking cudaMemcpy afterwards also surfaces execution errors.
        ok = cudaOk(cudaGetLastError(), "Blending_Texture launch")
          && cudaOk(cudaMemcpy(listTarget, dev_Target, data_size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    // cudaFree accepts a null pointer, so this is safe after a failed cudaMalloc.
    cudaFree(dev_Source);
    cudaFree(dev_Target);
}
執行結果:
使用GPU程式設計,一維紋理變數程式設計實現:
- #include"cuda_runtime.h"
- #include"device_launch_parameters.h"
- #include<iostream>
- #define _length 100
- using namespace std;
- // File-scope texture reference: device data is bound to it and the kernel reads through it with tex1Dfetch.
- texture<unsigned int, 1, cudaReadModeElementType> rT1;
- //宣告要呼叫的Copy_GPU函式
- extern "C" void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length);
// Demo entry point: fills a host array with 0.._length-1, copies it through
// GPU texture memory with Copy_GPU, prints both arrays, then waits for Enter.
// Returns int as standard C++ requires (`void main` is rejected by
// conforming host compilers).
int main(int argc, char** argv)
{
    unsigned int *listSource = new unsigned int[_length];
    // Value-initialised to zero so the printout is well defined even when
    // the GPU copy could not be performed.
    unsigned int *listTarget = new unsigned int[_length]();

    // Fill the source with its own indices so a correct copy is easy to spot.
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Copy_GPU launches the device kernel and writes the result to listTarget.
    Copy_GPU(listSource, listTarget, _length);

    cout << "原始資料: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }
    cout << endl << endl << "通過GPU紋理記憶體拷貝的資料: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    // Keep the console open until the user presses Enter. cin.get() comes
    // from <iostream>, which is already included; getchar() needs <cstdio>.
    cin.get();

    // Release the heap arrays allocated above.
    delete[] listSource;
    delete[] listTarget;
    return 0;
}
// Device kernel: each thread fetches one element through the texture
// reference rT1 and stores it at the same position of listTarget. Expects a
// 1D grid of 1D blocks; threads whose global index is not below `size` do
// nothing.
__global__ void Blending_Texture(unsigned int* listTarget, int size)
{
    // Global element index derived from the block and thread ids.
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    if (gid >= size)
    {
        return;
    }

    // The source data is reached only through the texture fetch.
    listTarget[gid] = tex1Dfetch(rT1, gid);
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Copies `length` unsigned ints from the host array listSource to the host
// array listTarget by way of the GPU, reading the source through the texture
// reference rT1: upload, bind, run Blending_Texture, download, unbind.
//
// listSource / listTarget : host arrays holding at least `length` elements.
// length                  : number of elements to copy; <= 0 is a no-op.
//
// A failing CUDA call is reported on stderr and the remaining steps are
// skipped, in which case listTarget may not have been filled. The texture
// binding and the device buffers are released on every path.
void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length)
{
    if (listSource == nullptr || listTarget == nullptr || length <= 0)
    {
        return;  // nothing to copy, and a zero-block launch would be invalid
    }

    const size_t data_size = static_cast<size_t>(length) * sizeof(unsigned int);

    // Round the block count UP so a length that is not a multiple of the
    // block size is still fully covered (the kernel guards the surplus).
    const int threadsPerBlock = 128;
    const int blockCount = (length - 1) / threadsPerBlock + 1;

    unsigned int *dev_Source = nullptr;
    unsigned int *dev_Target = nullptr;
    bool textureBound = false;

    // Allocate device buffers and upload the source data.
    bool ok = cudaOk(cudaMalloc((void**)&dev_Source, data_size), "cudaMalloc(dev_Source)")
           && cudaOk(cudaMalloc((void**)&dev_Target, data_size), "cudaMalloc(dev_Target)")
           && cudaOk(cudaMemcpy(dev_Source, listSource, data_size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok)
    {
        // Bind exactly the uploaded range to the texture reference.
        textureBound = cudaOk(cudaBindTexture(0, rT1, dev_Source, data_size), "cudaBindTexture");
        ok = textureBound;
    }

    if (ok)
    {
        Blending_Texture<<<blockCount, threadsPerBlock>>>(dev_Target, length);

        // A launch does not return a status; query it explicitly. The
        // blocking cudaMemcpy afterwards also surfaces execution errors.
        ok = cudaOk(cudaGetLastError(), "Blending_Texture launch")
          && cudaOk(cudaMemcpy(listTarget, dev_Target, data_size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    // Unbinding only detaches the texture; the buffers are freed separately.
    if (textureBound)
    {
        cudaUnbindTexture(rT1);
    }

    // cudaFree accepts a null pointer, so this is safe after a failed cudaMalloc.
    cudaFree(dev_Source);
    cudaFree(dev_Target);
}
-
#include"cuda_runtime.h"
-
#include"device_launch_parameters.h"
-
#include<iostream>
-
-
#define _length 100
-
-
using
namespace
std;
-
-
// File-scope texture reference: device data is bound to it and the kernel reads through it with tex1Dfetch.
-
texture<
unsigned
int,
1, cudaReadModeElementType> rT1;
-
-
//宣告要呼叫的Copy_GPU函式
-
extern
"C"
void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length);
-
-
// Demo entry point: fills a host array with 0.._length-1, copies it through
// GPU texture memory with Copy_GPU, prints both arrays, then waits for Enter.
// Returns int as standard C++ requires (`void main` is rejected by
// conforming host compilers).
int main(int argc, char** argv)
{
    unsigned int *listSource = new unsigned int[_length];
    // Value-initialised to zero so the printout is well defined even when
    // the GPU copy could not be performed.
    unsigned int *listTarget = new unsigned int[_length]();

    // Fill the source with its own indices so a correct copy is easy to spot.
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Copy_GPU launches the device kernel and writes the result to listTarget.
    Copy_GPU(listSource, listTarget, _length);

    cout << "原始資料: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }
    cout << endl << endl << "通過GPU紋理記憶體拷貝的資料: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    // Keep the console open until the user presses Enter. cin.get() comes
    // from <iostream>, which is already included; getchar() needs <cstdio>.
    cin.get();

    // Release the heap arrays allocated above.
    delete[] listSource;
    delete[] listTarget;
    return 0;
}
-
-
// Device kernel: each thread fetches one element through the texture
// reference rT1 and stores it at the same position of listTarget. Expects a
// 1D grid of 1D blocks; threads whose global index is not below `size` do
// nothing.
__global__ void Blending_Texture(unsigned int* listTarget, int size)
{
    // Global element index derived from the block and thread ids.
    const int gid = threadIdx.x + blockDim.x * blockIdx.x;

    if (gid >= size)
    {
        return;
    }

    // The source data is reached only through the texture fetch.
    listTarget[gid] = tex1Dfetch(rT1, gid);
}
-
-
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Copies `length` unsigned ints from the host array listSource to the host
// array listTarget by way of the GPU, reading the source through the texture
// reference rT1: upload, bind, run Blending_Texture, download, unbind.
//
// listSource / listTarget : host arrays holding at least `length` elements.
// length                  : number of elements to copy; <= 0 is a no-op.
//
// A failing CUDA call is reported on stderr and the remaining steps are
// skipped, in which case listTarget may not have been filled. The texture
// binding and the device buffers are released on every path.
void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length)
{
    if (listSource == nullptr || listTarget == nullptr || length <= 0)
    {
        return;  // nothing to copy, and a zero-block launch would be invalid
    }

    const size_t data_size = static_cast<size_t>(length) * sizeof(unsigned int);

    // Round the block count UP so a length that is not a multiple of the
    // block size is still fully covered (the kernel guards the surplus).
    const int threadsPerBlock = 128;
    const int blockCount = (length - 1) / threadsPerBlock + 1;

    unsigned int *dev_Source = nullptr;
    unsigned int *dev_Target = nullptr;
    bool textureBound = false;

    // Allocate device buffers and upload the source data.
    bool ok = cudaOk(cudaMalloc((void**)&dev_Source, data_size), "cudaMalloc(dev_Source)")
           && cudaOk(cudaMalloc((void**)&dev_Target, data_size), "cudaMalloc(dev_Target)")
           && cudaOk(cudaMemcpy(dev_Source, listSource, data_size, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok)
    {
        // Bind exactly the uploaded range to the texture reference.
        textureBound = cudaOk(cudaBindTexture(0, rT1, dev_Source, data_size), "cudaBindTexture");
        ok = textureBound;
    }

    if (ok)
    {
        Blending_Texture<<<blockCount, threadsPerBlock>>>(dev_Target, length);

        // A launch does not return a status; query it explicitly. The
        // blocking cudaMemcpy afterwards also surfaces execution errors.
        ok = cudaOk(cudaGetLastError(), "Blending_Texture launch")
          && cudaOk(cudaMemcpy(listTarget, dev_Target, data_size, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    // Unbinding only detaches the texture; the buffers are freed separately.
    if (textureBound)
    {
        cudaUnbindTexture(rT1);
    }

    // cudaFree accepts a null pointer, so this is safe after a failed cudaMalloc.
    cudaFree(dev_Source);
    cudaFree(dev_Target);
}
執行結果:
再舉一個使用CUDA+OpenCV程式設計,實現複製一幅影像的例子:
- #include"cuda_runtime.h"
- #include<iostream>
- #include<highgui/highgui.hpp>
- #include<imgproc/imgproc.hpp>
- #define DIM 512 //影像尺寸
- using namespace std;
- using namespace cv;
- // 1D texture reference over unsigned char elements; the kernel reads it with tex1Dfetch.
- texture<unsigned char, 1, cudaReadModeElementType> rT1;
// Device kernel: copies one pixel (3 consecutive bytes) per thread from the
// texture reference rT1 into imageTarget.
//
// Expects a 2D grid of 2D blocks covering a DIM x DIM image stored row by row
// with 3 bytes per pixel. Threads that fall outside the image return without
// touching memory, so an oversized grid cannot write out of bounds.
__global__ void Kernel_Copy(unsigned char* imageTarget)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x >= DIM || y >= DIM)
    {
        return;
    }

    // Byte offset of this pixel's first channel. The row stride is the image
    // width, not the launch width, so the index stays right for any grid.
    const int base = (x + y * DIM) * 3;

    // Copy the three channel bytes of the pixel.
    for (int channel = 0; channel < 3; ++channel)
    {
        imageTarget[base + channel] = tex1Dfetch(rT1, base + channel);
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Demo entry point: loads D:\lena.jpg, resizes it to DIM x DIM, copies it on
// the GPU through the 1D texture reference rT1 and shows the result.
// Returns 0 on success and 1 when the image cannot be read or a CUDA call fails.
int main(int argc, char** argv)
{
    // The launch below uses DIM / 16 blocks per axis, which only covers the
    // whole image when DIM is a multiple of the 16x16 block size.
    static_assert(DIM % 16 == 0, "DIM must be a multiple of 16");

    Mat image = imread("D:\\lena.jpg");
    if (image.empty())
    {
        // resize() on an empty Mat would throw; fail with a clear message.
        std::cerr << "Could not read image D:\\lena.jpg" << std::endl;
        return 1;
    }

    Mat imageSource;
    resize(image, imageSource, Size(DIM, DIM));  // scale to the working size
    Mat imageTarget = Mat(Size(DIM, DIM), CV_8UC3, Scalar::all(0));

    // 3 bytes per pixel.
    const size_t imageBytes = static_cast<size_t>(3) * imageSource.rows * imageSource.cols;

    unsigned char *dev_imageSource = nullptr;
    unsigned char *dev_imageTarget = nullptr;
    bool textureBound = false;

    // Allocate device buffers and upload both images.
    bool ok = cudaOk(cudaMalloc((void**)&dev_imageSource, imageBytes), "cudaMalloc(dev_imageSource)")
           && cudaOk(cudaMalloc((void**)&dev_imageTarget, imageBytes), "cudaMalloc(dev_imageTarget)")
           && cudaOk(cudaMemcpy(dev_imageSource, imageSource.data, imageBytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(source, host to device)")
           && cudaOk(cudaMemcpy(dev_imageTarget, imageTarget.data, imageBytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(target, host to device)");

    if (ok)
    {
        // Bind exactly the uploaded source bytes to the texture reference.
        textureBound = cudaOk(cudaBindTexture(0, rT1, dev_imageSource, imageBytes), "cudaBindTexture");
        ok = textureBound;
    }

    if (ok)
    {
        dim3 grids(DIM / 16, DIM / 16);
        dim3 threads(16, 16);
        Kernel_Copy<<<grids, threads>>>(dev_imageTarget);

        // A launch does not return a status; query it explicitly. The
        // blocking cudaMemcpy afterwards also surfaces execution errors.
        ok = cudaOk(cudaGetLastError(), "Kernel_Copy launch")
          && cudaOk(cudaMemcpy(imageTarget.data, dev_imageTarget, imageBytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    // Unbinding only detaches the texture; the buffers are freed separately.
    if (textureBound)
    {
        cudaUnbindTexture(rT1);
    }

    // Free BOTH buffers exactly once. cudaFree accepts a null pointer.
    cudaFree(dev_imageSource);
    cudaFree(dev_imageTarget);

    if (!ok)
    {
        return 1;
    }

    imshow("CUDA紋理記憶體使用示例", imageTarget);
    waitKey();
    return 0;
}
-
#include"cuda_runtime.h"
-
#include<iostream>
-
#include<highgui/highgui.hpp>
-
#include<imgproc/imgproc.hpp>
-
-
#define DIM 512 //影像尺寸
-
-
using
namespace
std;
-
using
namespace cv;
-
-
// 1D texture reference over unsigned char elements; the kernel reads it with tex1Dfetch.
-
texture<
unsigned
char,
1, cudaReadModeElementType> rT1;
-
-
// Device kernel: copies one pixel (3 consecutive bytes) per thread from the
// texture reference rT1 into imageTarget.
//
// Expects a 2D grid of 2D blocks covering a DIM x DIM image stored row by row
// with 3 bytes per pixel. Threads that fall outside the image return without
// touching memory, so an oversized grid cannot write out of bounds.
__global__ void Kernel_Copy(unsigned char* imageTarget)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    if (x >= DIM || y >= DIM)
    {
        return;
    }

    // Byte offset of this pixel's first channel. The row stride is the image
    // width, not the launch width, so the index stays right for any grid.
    const int base = (x + y * DIM) * 3;

    // Copy the three channel bytes of the pixel.
    for (int channel = 0; channel < 3; ++channel)
    {
        imageTarget[base + channel] = tex1Dfetch(rT1, base + channel);
    }
}
-
-
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Demo entry point: loads D:\lena.jpg, resizes it to DIM x DIM, copies it on
// the GPU through the 1D texture reference rT1 and shows the result.
// Returns 0 on success and 1 when the image cannot be read or a CUDA call fails.
int main(int argc, char** argv)
{
    // The launch below uses DIM / 16 blocks per axis, which only covers the
    // whole image when DIM is a multiple of the 16x16 block size.
    static_assert(DIM % 16 == 0, "DIM must be a multiple of 16");

    Mat image = imread("D:\\lena.jpg");
    if (image.empty())
    {
        // resize() on an empty Mat would throw; fail with a clear message.
        std::cerr << "Could not read image D:\\lena.jpg" << std::endl;
        return 1;
    }

    Mat imageSource;
    resize(image, imageSource, Size(DIM, DIM));  // scale to the working size
    Mat imageTarget = Mat(Size(DIM, DIM), CV_8UC3, Scalar::all(0));

    // 3 bytes per pixel.
    const size_t imageBytes = static_cast<size_t>(3) * imageSource.rows * imageSource.cols;

    unsigned char *dev_imageSource = nullptr;
    unsigned char *dev_imageTarget = nullptr;
    bool textureBound = false;

    // Allocate device buffers and upload both images.
    bool ok = cudaOk(cudaMalloc((void**)&dev_imageSource, imageBytes), "cudaMalloc(dev_imageSource)")
           && cudaOk(cudaMalloc((void**)&dev_imageTarget, imageBytes), "cudaMalloc(dev_imageTarget)")
           && cudaOk(cudaMemcpy(dev_imageSource, imageSource.data, imageBytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(source, host to device)")
           && cudaOk(cudaMemcpy(dev_imageTarget, imageTarget.data, imageBytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(target, host to device)");

    if (ok)
    {
        // Bind exactly the uploaded source bytes to the texture reference.
        textureBound = cudaOk(cudaBindTexture(0, rT1, dev_imageSource, imageBytes), "cudaBindTexture");
        ok = textureBound;
    }

    if (ok)
    {
        dim3 grids(DIM / 16, DIM / 16);
        dim3 threads(16, 16);
        Kernel_Copy<<<grids, threads>>>(dev_imageTarget);

        // A launch does not return a status; query it explicitly. The
        // blocking cudaMemcpy afterwards also surfaces execution errors.
        ok = cudaOk(cudaGetLastError(), "Kernel_Copy launch")
          && cudaOk(cudaMemcpy(imageTarget.data, dev_imageTarget, imageBytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    // Unbinding only detaches the texture; the buffers are freed separately.
    if (textureBound)
    {
        cudaUnbindTexture(rT1);
    }

    // Free BOTH buffers exactly once. cudaFree accepts a null pointer.
    cudaFree(dev_imageSource);
    cudaFree(dev_imageTarget);

    if (!ok)
    {
        return 1;
    }

    imshow("CUDA紋理記憶體使用示例", imageTarget);
    waitKey();
    return 0;
}
執行結果:
相關文章
- CUDA記憶體介紹記憶體
- CUDA 有 unified memory 還需要記憶體優化嗎?Nifi記憶體優化
- 試試二維紋理對映
- 實時渲染不是夢:通過共享記憶體優化Flutter外接紋理的渲染效能記憶體優化Flutter
- linux記憶體管理(一)實體記憶體的組織和記憶體分配Linux記憶體
- 記憶體分配問題處理記憶體
- cocos2d-x 優化(紋理渲染優化、資源快取、記憶體優化)優化快取記憶體
- 1.記憶體優化(一)記憶體洩漏記憶體優化
- Linux記憶體不足的處理方法Linux記憶體
- SqlServer運維——最大伺服器記憶體SQLServer運維伺服器記憶體
- Java記憶體模型FAQ(一) 什麼是記憶體模型Java記憶體模型
- Redis記憶體——記憶體消耗(記憶體都去哪了?)Redis記憶體
- 記一次記憶體告警記憶體
- 【譯】JavaScript的記憶體管理和 4 種處理記憶體洩漏的方法JavaScript記憶體
- 多視角三維模型紋理對映 01模型
- 記憶體管理 記憶體管理概述記憶體
- 【記憶體管理】記憶體佈局記憶體
- Windows記憶體管理分析(一)Windows記憶體
- Java的記憶體 -JVM 記憶體管理Java記憶體JVM
- Go:記憶體管理與記憶體清理Go記憶體
- 聊聊 記憶體模型與記憶體序記憶體模型
- 造成記憶體洩漏的異常處理記憶體
- w10老顯示記憶體不足怎麼解決 w10記憶體總是顯示記憶體不足處理方法記憶體
- 記憶體管理篇——實體記憶體的管理記憶體
- Webgl 紋理Web
- 遊戲記憶體對比普通記憶體區別 遊戲記憶體和普通記憶體相差大嗎?遊戲記憶體
- 理解JVM(一):記憶體結構JVM記憶體
- 深入淺出記憶體馬(一)記憶體
- Tomcat 記憶體馬(一)Listener型Tomcat記憶體
- Redis-記憶體優化(一)Redis記憶體優化
- JS中的棧記憶體、堆記憶體JS記憶體
- Java記憶體區域和記憶體模型Java記憶體模型
- 直接記憶體和堆記憶體誰快記憶體
- 記憶體溢位和記憶體洩露記憶體溢位記憶體洩露
- 快取記憶體一致性協議MESI與記憶體屏障快取記憶體協議
- 【Java基礎】實體記憶體&虛擬記憶體Java記憶體
- 圖形學之紋理後續/WebGL多紋理處理Web
- 【記憶體洩漏和記憶體溢位】JavaScript之深入淺出理解記憶體洩漏和記憶體溢位記憶體溢位JavaScript