讀取大型稀疏矩陣&KNN演算法的OpenCL加速版本
https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/
一、讀取稀疏矩陣。先以一個簡單的稀疏矩陣為例,讀進這個稀疏矩陣
即這個稀疏矩陣本來其實有4行12列。前面第1列0不用管,是標籤,是不要讀進矩陣的。
#include <stdio.h>
//#include <iostream>
#include <fstream>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
void main(){
int width = 12, height = 4;
float *matrix = new float[width*height];
memset(matrix, 0,width*height * sizeof(float));
ifstream infile("simple_data.txt");
vector<string> vec_str;
string line;
size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos,lineLength;
//char *end;
int lineID=0;
while (getline(infile, line))
{
if (line.size() == 0)
{
continue;
}
for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
{
pos_maohao = line.find_first_of(":");
pos_kongge = line.find_first_of(" ");
if (pos_maohao == string::npos || pos_kongge == string::npos)
{
continue;
}
string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
string tempData = line.substr(pos_maohao+1, 8);
line = line.substr(pos_maohao + 9);
//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
int ind = atoi(tempIndex.c_str());
float data = atof(tempData.c_str());
cout << ind << ":"<< data << " ";
matrix[lineID*width + ind] = data;
}
lineID += 1;
cout << endl;
}
cout << endl << endl;
cout << "the matrix is..." << endl;
for (int i = 0; i < height;i++)
{
for (int j = 0; j < width;j++)
{
cout << matrix[i*width + j] << " ";
}
cout << endl;
}
infile.close();
delete[] matrix;
}
即可。一、寫好的第一個KNN版本(先只拿前k個距離 而不是前k個最小距離 因為只是先測試一下是否寫錯):
在算一個200(高height)X931(寬width)的矩陣matrix,對於每一行去算它與另199行的距離,(它與自己這一行的距離為0 不用算),對於每一行可以得到一個200維的向量,取向量的前k個結果作為最終的距離矩陣instanceMatrix的結果(應該是200Xk的矩陣),這k個結果的位置矩陣positionMatrix(附帶的,其實這個不用算) localsize是[1][512] globalsize是[100][1024]即有200個工作組 我是讓每個工作組去算一行與另199行的距離。結果距離矩陣instanceMatrix中只有第1列是正確的
另k-1列是無意義的值 位置矩陣positionMatrix中k-1列無意義的值。
cl部分:
__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort)
{
uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
__global float *row=matrix+width*currentGroupID;
for(uint anotherID=0;anotherID<height;anotherID++)
{
if(anotherID==currentGroupID)
{
localInstanceNoSort[anotherID]=0;
continue;
}
float sum=0;
const __global float *rowAnother=matrix+width*anotherID;
for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
{
sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
}
barrier(CLK_LOCAL_MEM_FENCE);
localSizeInstance[get_local_id(0)]=sum;
for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < stride)
{
localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
}
}
if (get_local_id(0) == 0)
{
localInstanceNoSort[anotherID] = localSizeInstance[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// no sort
if(get_local_id(0)<k)
{
instanceMatrix[currentGroupID*k+get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
positionMatrix[currentGroupID*k+get_local_id(0)]=get_local_id(0);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
main函式部分:
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;
const int width=931;
const int height=200;
const int k=5;
void readTxt(const char *txtname,float *matrix)
{
//int width = 931, height = 200;
//float *matrix = new float[width*height];
//memset(matrix, 0,width*height * sizeof(float));
//ifstream infile("fea_sparse.txt");
ifstream infile(txtname);
vector<string> vec_str;
string line;
size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos,lineLength;
//char *end;
int lineID=0;
while (getline(infile, line))
{
if (line.size() == 0)
{
continue;
}
for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
{
pos_maohao = line.find_first_of(":");
pos_kongge = line.find_first_of(" ");
if (pos_maohao == string::npos || pos_kongge == string::npos)
{
continue;
}
string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
string tempData = line.substr(pos_maohao+1, 8);
line = line.substr(pos_maohao + 9);
//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
int ind = atoi(tempIndex.c_str());
float data = atof(tempData.c_str());
//cout << ind << ":"<< data << " ";
matrix[lineID*width + ind] = data;
}
lineID += 1;
//cout << endl;
}
infile.close();
//delete[] matrix;
}
int main( int argc, const char** argv)
{
shrQAStart(argc, (char **)argv);
// set logfile name and start logs
shrSetLogFileName ("oclKnn4Robin.txt");
shrLog("%s Starting...\n\n", argv[0]);
//Get the NVIDIA platform
cl_platform_id cpPlatform;
cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckError(ciErrNum, CL_SUCCESS);
//Get the devices
cl_uint uiNumDevices;
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create the context
cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id device ;
clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
if (ciErrNum != CL_SUCCESS)
{
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
return ciErrNum;
}
//read kernel file and build it
size_t program_length;
char* source_path = shrFindFilePath("knn.cl", argv[0]);
oclCheckError(source_path != NULL, shrTRUE);
char *source = oclLoadProgSource(source_path, "", &program_length);
oclCheckError(source != NULL, shrTRUE);
// create the program
cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
// build the program
ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then return error
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
return(EXIT_FAILURE);
}
cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
//host data
float *matrix = new float[width*height];
const size_t matrixSize=width*height*sizeof(float);
memset(matrix, 0,matrixSize);
const char *txtfile="fea_sparse.txt";
readTxt(txtfile,matrix);
int *resultPosMatrix=new int[height*k];
size_t resultPosSize=height*k*sizeof(int);
memset(resultPosMatrix,0,resultPosSize);
float *resultInstanceMatrix=new float[height*k];
size_t resultInstanceSize=height*k*sizeof(float);
memset(resultInstanceMatrix,0,resultInstanceSize);
//send arguments to kernel file | CL_MEM_COPY_HOST_PTR
cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int), &width);
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
//ciErrNum |= clSetKernelArg(ckKernel, 6, k * sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
oclCheckError(ciErrNum, CL_SUCCESS);
size_t localsize[2]={1,512};
size_t globalsize[2]={100,1024};
clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
int *positionHost=(int*)alloca(resultPosSize);
memset(positionHost,0,resultPosSize);
float *instanceHost=(float*)alloca(resultInstanceSize);
memset(instanceHost,0,resultInstanceSize);
ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//save as txt to check.
ofstream posOutfile("position.txt",ios::out);
ofstream instanceOutfile("instance.txt",ios::out);
for(size_t r=0;r<height;r++)
{
for(size_t c=0;c<k;c++)
{
int data=positionHost[r*k+c];
float instanceData=instanceHost[r*k+c];
posOutfile<<data<<"\t";
instanceOutfile<<instanceData<<"\t";
}
posOutfile<<endl;
instanceOutfile<<endl;
}
posOutfile.close();
instanceOutfile.close();
delete [] matrix;
delete [] resultPosMatrix;
delete [] resultInstanceMatrix;
clReleaseCommandQueue(commandQueue);
clReleaseContext(cxGPUContext);
clReleaseProgram(cpProgram);
clReleaseKernel(ckKernel);
shrLog("%s Successfully end.\n\n", argv[0]);
return 0;
}
因為檢查了cl部分 自我感覺沒錯,但距離矩陣中只有第1列是正確的,後面4列均是無意義的值,所以請教大神。大神說是main出錯了,kernel的啟動方式那裡size_t localsize[2]={1,512}; 應該倒過來{512,1},當然相應的globalsize也要倒過來,不然每一維度上無法整除。這樣不改動kernel結果就可以正確。if(get_local_id(0)<k)
{
instanceMatrix[currentGroupID*k+get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
positionMatrix[currentGroupID*k+get_local_id(0)]=get_local_id(0);
}
或者如果不像大神那樣改動的話,就把這裡的所有0換成1 換個方向即可。結果一致。
ps:原來這個localsize[2]={M,N}實際是NXM 記住了。
ps:CL_DEVICE_MAX_WORK_ITEM_SIZES:1024X1024X64是對localsize每個維度的限制,即[0]<1024,[1]<1024,[2]<64, globalsize基本上是無限的,不用擔心。 我之前以為那個就是globalsize的限制。太傻。
ps:CL_DEVICE_MAX_WORK_GROUP_SIZE: 1024 是說每個CU上三個維度總共最多1024個工作組。就是說localsize[0]*[1]*[2]<=1024!!!
另外大神誇了我,這個kernel寫得很清晰,有進步。好開心!!!
這個版本與knn的區別就是取前k個最小的,而我取的直接就是前k個,這個後續再改就好了。
另外大神說,我這個忘記考慮一點了 不然可以節省一半的時間,除了我自己想到的“對每個group的511個執行緒拿當前行的地址是重複的511次 還有自己組的groupID也重複去拿了 還有別的行的地址也是重複拿了”這裡外,還有更重要的一點,我這個版本,第1行去算它與第8行的距離,第8行還會算一次它與第1行的距離,所以重複了一倍的計算!我目前想的改進就是“讓每個group代表行 只訪問後續的group代表行即可解決你說的那個重複一半的問題 不讓它向前追溯 ”當然這個是後話(等我把取前k個最小寫完 編譯執行都正確的情況下 再來對這個部分提高效能)。
另外大神還說了一個要注意事項:
即group 0讀取第888行的時候, 和group 777讀取888行的時候,可能前者導致的888行被載入到cache中, 到了777個group需要888行的時候,888行實際上已經不在cache中了.導致需要從視訊記憶體晶片被重複讀取到cache一次(儘量不是這種情況).
二、第二個KNN版本(即仍舊是200X931的稀疏矩陣,取前k個最小距離及其相應位置,先不要求這k個最小距離一定要按從小到大排列)
我本來用的是CUDA裡有的thrust下的sort.h,我想用sort_by_key()這個函式,於是在main.cpp前加入了#include <thrust/sort.h> 然後在cl檔案中直接用sort_by_key(),這樣距離和距離對應的位置可以跟著自動排序,但結果在編譯kernel時報錯:ptxas application ptx input, line16; fatal error :Parsing error near '[]': syntax error
ptxas: fatal error :Ptx assembly aborted due to errors
我檢查了下,並不關第16行或者附近的[]的問題,我想應該是與網上比較多的“ptx, line 150; fatal error :Parsing error near '-': syntax error”這個報錯就是因為路徑中文,與什麼150行的- 根本沒關係。我感覺我這個也與[]沒關係。但具體什麼原因 谷歌不到?!!!!!!
這個kernel:
__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort,
__local float *localTempInst199,__local int *localTempPos199)
{
///////the first step : calculate the distances!!!!
uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
__global float *row=matrix+width*currentGroupID;
for(uint anotherID=0;anotherID<height;anotherID++)
{
if(anotherID==currentGroupID)
{
localInstanceNoSort[anotherID]=0;
continue;
}
float sum=0;
const __global float *rowAnother=matrix+width*anotherID;
for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
{
sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
}
barrier(CLK_LOCAL_MEM_FENCE);
localSizeInstance[get_local_id(0)]=sum;
for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < stride)
{
localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
}
}
if (get_local_id(0) == 0)
{
localInstanceNoSort[anotherID] = localSizeInstance[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
///////erase the 0 instance (the instance from the current row to iteself).
if(get_local_id(0)<height)
{
if(get_local_id(0)<currentGroupID)
{
localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
localTempPos199[get_local_id(0)]=get_local_id(0);
}
else if(get_local_id(0)>currentGroupID)
{
localTempInst199[get_local_id(0)+1]=localInstanceNoSort[get_local_id(0)];
localTempPos199[get_local_id(0)]=get_local_id(0)+1;
}
else
{
;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
////////extract the first k datas!!!
//solution1:
//sort()! but it cannot work!!!
if(get_local_id(0)==0)
{
//thrust::sort(localTempInst199,localTempInst199+199);
//thrust::sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
}
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<k)
{
instanceMatrix[currentGroupID*k+get_local_id(0)]=localTempInst199[get_local_id(0)];
positionMatrix[currentGroupID*k+get_local_id(0)]=localTempPos199[get_local_id(0)];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
所以我想 如果這個函式sort_by_key()不報錯 那麼這個版本的knn是可以用的。
三、第三個KNN版本
思路(我們公司的大神提供的思路,比我自己想的另一個好很多):在想一個從N(非常非常大)個數裡拿出前k個最小值和對應位置(k<<N)。先用小一點的數實驗:N=199,k=5,thread_num=10,每個執行緒負責20個數(thread9其實只負責19個數)。對於某個thread,比如負責的數是:2,1,7,3,9,5,10,2,7,3,6,5.9,9,3.5,7.2,8,8.8,17,1,12。這20個數對應的位置分別是8,1,3,7,6,24,48,15,0,12,41,26,59,33,14,57,89,57,31,55. 先把前k個數放進一個data陣列裡[2,1,7,3,9] 同時將對應位置前k個放進pos陣列[8,1,3,7,6]。先算data的最大值max為9.然後這個執行緒將第6個數(第6個數為5)與max比較,如果小於max,則替換。(pos中對應地方也要替換)。結果data為[2,1,7,3,5],pos為[8,1,3,7,24]然後又計算這個data的最大值max=7,將第7個數(為10)與此max比較,大於max不用替換。繼續將第8個數2與max=7比較,小於它,又替換掉7,此時data變成[2,1,2,3,5] pos變成[8,1,15,7,24]....此執行緒遍歷到第20個數結束時,此執行緒將負責的20個數變成了只有k=5個。。。共10個執行緒,所以199個數變成了50個數。然後進行第2輪,需要三個執行緒,每個執行緒負責20個數。。。
按照這個思路,我開始寫cl檔案,寫完了但結果不正確:
__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort,
__local float *localTempInst199,__local int *localTempPos199,
__local float *localTempInst50,__local int *localTempPos50,
__local float *localTempInst15,__local int *localTempPos15,
__local float *data,__local int *pos )
{
///////the first step : calculate the distances!!!!
uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
__global float *row=matrix+width*currentGroupID;
for(uint anotherID=0;anotherID<height;anotherID++)
{
if(anotherID==currentGroupID)
{
localInstanceNoSort[anotherID]=0;
continue;
}
float sum=0;
const __global float *rowAnother=matrix+width*anotherID;
for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
{
sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
}
barrier(CLK_LOCAL_MEM_FENCE);
localSizeInstance[get_local_id(0)]=sum;
for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < stride)
{
localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
}
}
if (get_local_id(0) == 0)
{
localInstanceNoSort[anotherID] = localSizeInstance[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
///////erase the 0 instance (the instance from the current row to iteself).
if(get_local_id(0)<height)
{
if(get_local_id(0)<currentGroupID)
{
localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
localTempPos199[get_local_id(0)]=get_local_id(0);
}
else if(get_local_id(0)>currentGroupID)
{
localTempInst199[get_local_id(0)+1]=localInstanceNoSort[get_local_id(0)];
localTempPos199[get_local_id(0)]=get_local_id(0)+1;
}
else
{
;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
////////extract the first k datas!!!
/* //solution1:
//sort()! but it cannot work!!!
if(get_local_id(0)==0)
{
//thrust::sort(localTempInst199,localTempInst199+199);
//thrust::sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
}
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<k)
{
instanceMatrix[currentGroupID*k+get_local_id(0)]=localTempInst199[get_local_id(0)];
}
barrier(CLK_LOCAL_MEM_FENCE);
*/
//solution2:use these instead!
//use the 10 threads of a group ,every thread include 20 datas,k=5
//circle0 ,handle with 199 datas!
if(get_local_id(0)<10)
{
//copy the forward k data and position
size_t ind=0;
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst199[k0];
pos[ind]=localTempPos199[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==9 && the1st==199)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst199[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst199[the1st];
pos[tempMaxPos]=localTempPos199[the1st];
}
the1st+=1;
}
//result of circle0
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<3)
{
//copy the forward k data and position
size_t ind=0;
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst50[k0];
pos[ind]=localTempPos50[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==2 && the1st==49)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst50[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst50[the1st];
pos[tempMaxPos]=localTempPos50[the1st];
}
the1st+=1;
}
//result of circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<1)
{
//copy the forward k data and position
size_t ind=0;
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst15[k0];
pos[ind]=localTempPos15[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(the1st==15)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst15[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst15[the1st];
pos[tempMaxPos]=localTempPos15[the1st];
}
the1st+=1;
}
//result of circle2
for(size_t finalk0=0;finalk0<k;finalk0++)
{
instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
main.cpp檔案:
/*
* main.cpp
*breaf:knn algorithm with OpenCL accelaration
* Created on: 2017年6月13日
* Author: root
*/
//#define __CL_ENABLE_EXCEPTIONS
#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <thrust/sort.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;
const int width=931;
const int height=200;
const int k=5;
void readTxt(const char *txtname,float *matrix)
{
//int width = 931, height = 200;
//float *matrix = new float[width*height];
//memset(matrix, 0,width*height * sizeof(float));
//ifstream infile("fea_sparse.txt");
ifstream infile(txtname);
vector<string> vec_str;
string line;
size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos;//lineLength;
//char *end;
int lineID=0;
while (getline(infile, line))
{
if (line.size() == 0)
{
continue;
}
for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
{
pos_maohao = line.find_first_of(":");
pos_kongge = line.find_first_of(" ");
if (pos_maohao == string::npos || pos_kongge == string::npos)
{
continue;
}
string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
string tempData = line.substr(pos_maohao+1, 8);
line = line.substr(pos_maohao + 9);
//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
int ind = atoi(tempIndex.c_str());
float data = atof(tempData.c_str());
//cout << ind << ":"<< data << " ";
matrix[lineID*width + ind] = data;
}
lineID += 1;
//cout << endl;
}
infile.close();
//delete[] matrix;
}
int main( int argc, const char** argv)
{
shrQAStart(argc, (char **)argv);
// set logfile name and start logs
shrSetLogFileName ("oclKnn4Robin.txt");
shrLog("%s Starting...\n\n", argv[0]);
//Get the NVIDIA platform
cl_platform_id cpPlatform;
cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckError(ciErrNum, CL_SUCCESS);
//Get the devices
cl_uint uiNumDevices;
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create the context
cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id device ;
clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
if (ciErrNum != CL_SUCCESS)
{
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
return ciErrNum;
}
//read kernel file and build it
size_t program_length;
char* source_path = shrFindFilePath("knn.cl", argv[0]);
oclCheckError(source_path != NULL, shrTRUE);
char *source = oclLoadProgSource(source_path, "", &program_length);
oclCheckError(source != NULL, shrTRUE);
// create the program
cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
// build the program
ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then return error
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
return(EXIT_FAILURE);
}
cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
//host data
float *matrix = new float[width*height];
const size_t matrixSize=width*height*sizeof(float);
memset(matrix, 0,matrixSize);
const char *txtfile="fea_sparse.txt";
readTxt(txtfile,matrix);
int *resultPosMatrix=new int[height*k];
size_t resultPosSize=height*k*sizeof(int);
memset(resultPosMatrix,0,resultPosSize);
float *resultInstanceMatrix=new float[height*k];
size_t resultInstanceSize=height*k*sizeof(float);
memset(resultInstanceMatrix,0,resultInstanceSize);
//send arguments to kernel file | CL_MEM_COPY_HOST_PTR
cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int), &width);
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
//ciErrNum |= clSetKernelArg(ckKernel, 6, k * sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 8, 199*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 9, 199*sizeof(int), 0);
ciErrNum |= clSetKernelArg(ckKernel, 10, 50*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 11, 50*sizeof(int), 0);
ciErrNum |= clSetKernelArg(ckKernel, 12, 15*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 13, 15*sizeof(int), 0);
ciErrNum |= clSetKernelArg(ckKernel, 14, k*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 15, k*sizeof(int), 0);
oclCheckError(ciErrNum, CL_SUCCESS);
size_t localsize[2]={512,1};
size_t globalsize[2]={1024,200};
clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
int *positionHost=(int*)alloca(resultPosSize);
memset(positionHost,0,resultPosSize);
float *instanceHost=(float*)alloca(resultInstanceSize);
memset(instanceHost,0,resultInstanceSize);
ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//save as txt to check.
ofstream posOutfile("position.txt",ios::out);
ofstream instanceOutfile("instance.txt",ios::out);
for(size_t r=0;r<height;r++)
{
for(size_t c=0;c<k;c++)
{
int data=positionHost[r*k+c];
float instanceData=instanceHost[r*k+c];
posOutfile<<data<<"\t";
instanceOutfile<<instanceData<<"\t";
}
posOutfile<<endl;
instanceOutfile<<endl;
}
posOutfile.close();
instanceOutfile.close();
delete [] matrix;
delete [] resultPosMatrix;
delete [] resultInstanceMatrix;
clReleaseCommandQueue(commandQueue);
clReleaseContext(cxGPUContext);
clReleaseProgram(cpProgram);
clReleaseKernel(ckKernel);
shrLog("%s Successfully end.\n\n", argv[0]);
return 0;
}
但結果執行完不是正確的。
因為總共存在5處bug:host上1處,cl上4處(大神幫我看到4處 自己只檢查出1處)
1:host上的bug:
size_t globalsize[2]={1024,200};
我要的是200個,在紙上規劃好了結果這裡心急寫了200,實際是100,100*2=200。
2:cl上4處bug:
a:從200個距離中去掉與自己這一行的距離0時,我以為越過了那個空位,實際沒有,大神幫我點出此處,然後自己修改正確。
b:本來10個執行緒處理199個數,最後一個執行緒是處理19個數,我就寫的19而不是199!相應的後面我寫的9而不是49.後來才改過來。
c:第2個for中 break的條件應該是==50就break!
d:對於每個執行緒data和pos我設定的是__local型,每個執行緒都可以操作這個,導致data race!應該設定成每個執行緒獨享的而不是local。
修改好後全部正確並穩定:
修改後的cl檔案:
__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,
__local float *localSizeInstance,__local float *localInstanceNoSort,
__local float *localTempInst199,__local int *localTempPos199,
__local float *localTempInst50,__local int *localTempPos50,
__local float *localTempInst15,__local int *localTempPos15)
{
///////the first step : calculate the distances!!!!
uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
__global float *row=matrix+width*currentGroupID;
for(uint anotherID=0;anotherID<height;anotherID++)
{
if(anotherID==currentGroupID)
{
localInstanceNoSort[anotherID]=0;
continue;
}
float sum=0;
const __global float *rowAnother=matrix+width*anotherID;
for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
{
sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
}
barrier(CLK_LOCAL_MEM_FENCE);
localSizeInstance[get_local_id(0)]=sum;
for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < stride)
{
localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
}
}
if (get_local_id(0) == 0)
{
localInstanceNoSort[anotherID] = localSizeInstance[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
///////erase the 0 instance (the instance from the current row to iteself).
if(get_local_id(0)<height-1)
{
if(get_local_id(0)<currentGroupID)
{
localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
localTempPos199[get_local_id(0)]=get_local_id(0);
}
else
{
localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)+1];
localTempPos199[get_local_id(0)]=get_local_id(0)+1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
////////extract the first k min datas!!!
//use the 10 threads of a group ,every thread include 20 datas,k=5
//circle0 ,handle with 199 datas!
if(get_local_id(0)<10)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst199[k0];
pos[ind]=localTempPos199[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==9 && the1st>198)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst199[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst199[the1st];
pos[tempMaxPos]=localTempPos199[the1st];
}
the1st+=1;
}
//result of circle0
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/*
if(get_local_id(0)==0)
{
for(int r=0;r<50;r++)
{
int instanceData=localTempPos50[r];
printf("%d \n",instanceData);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
*/
//circle1
if(get_local_id(0)<3)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst50[k0];
pos[ind]=localTempPos50[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==2 && the1st==50)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst50[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst50[the1st];
pos[tempMaxPos]=localTempPos50[the1st];
}
the1st+=1;
}
//result of circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/*
if(currentGroupID<200 && get_local_id(0)==0)
{
printf("group:%d data:%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",(int)currentGroupID,localTempInst15[0],localTempInst15[1],localTempInst15[2],localTempInst15[3],localTempInst15[4],localTempInst15[5],localTempInst15[6],localTempInst15[7],localTempInst15[8],localTempInst15[9],localTempInst15[10],localTempInst15[11],localTempInst15[12],localTempInst15[13],localTempInst15[14]);
}
barrier(CLK_LOCAL_MEM_FENCE);
*/
//circle2
if(get_local_id(0)<1)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst15[k0];
pos[ind]=localTempPos15[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(the1st>14)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst15[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst15[the1st];
pos[tempMaxPos]=localTempPos15[the1st];
}
the1st+=1;
}
//result of circle2
for(size_t finalk0=0;finalk0<k;finalk0++)
{
instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/*
if(currentGroupID<200 && get_local_id(0)==0)
{
printf("group:%d data:%f %f %f %f %f\n\n",(int)currentGroupID,instanceMatrix[currentGroupID*k+0],instanceMatrix[currentGroupID*k+1],instanceMatrix[currentGroupID*k+2],instanceMatrix[currentGroupID*k+3],instanceMatrix[currentGroupID*k+4]);
}
barrier(CLK_LOCAL_MEM_FENCE);
*/
}
main.cpp檔案:
const int width=931;
const int height=200;
const int k=5;
void readTxt(const char *txtname,float *matrix)
{
//int width = 931, height = 200;
//float *matrix = new float[width*height];
//memset(matrix, 0,width*height * sizeof(float));
//ifstream infile("fea_sparse.txt");
ifstream infile(txtname);
vector<string> vec_str;
string line;
size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos;//lineLength;
//char *end;
int lineID=0;
while (getline(infile, line))
{
if (line.size() == 0)
{
continue;
}
for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
{
pos_maohao = line.find_first_of(":");
pos_kongge = line.find_first_of(" ");
if (pos_maohao == string::npos || pos_kongge == string::npos)
{
continue;
}
string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
string tempData = line.substr(pos_maohao+1, 8);
line = line.substr(pos_maohao + 9);
//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
int ind = atoi(tempIndex.c_str());
float data = atof(tempData.c_str());
//cout << ind << ":"<< data << " ";
matrix[lineID*width + ind] = data;
}
lineID += 1;
//cout << endl;
}
/*
cout << endl << endl;
cout << "the matrix is..." << endl;
for (int i = 0; i < height;i++)
{
for (int j = 0; j < width;j++)
{
cout << matrix[i*width + j] << " ";
}
cout << endl;
}
*/
infile.close();
//delete[] matrix;
}
int main( int argc, const char** argv)
{
shrQAStart(argc, (char **)argv);
// set logfile name and start logs
shrSetLogFileName ("oclKnn4Robin.txt");
shrLog("%s Starting...\n\n", argv[0]);
//Get the NVIDIA platform
cl_platform_id cpPlatform;
cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckError(ciErrNum, CL_SUCCESS);
//Get the devices
cl_uint uiNumDevices;
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create the context
cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id device ;
clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
if (ciErrNum != CL_SUCCESS)
{
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
return ciErrNum;
}
//read kernel file and build it
size_t program_length;
char* source_path = shrFindFilePath("knn.cl", argv[0]);
oclCheckError(source_path != NULL, shrTRUE);
char *source = oclLoadProgSource(source_path, "", &program_length);
oclCheckError(source != NULL, shrTRUE);
// create the program
cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
// build the program
ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then return error
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
return(EXIT_FAILURE);
}
cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
//host data
float *matrix = new float[width*height];
const size_t matrixSize=width*height*sizeof(float);
memset(matrix, 0,matrixSize);
const char *txtfile="fea_sparse.txt";
readTxt(txtfile,matrix);
int *resultPosMatrix=new int[height*k];
size_t resultPosSize=height*k*sizeof(int);
memset(resultPosMatrix,0,resultPosSize);
float *resultInstanceMatrix=new float[height*k];
size_t resultInstanceSize=height*k*sizeof(float);
memset(resultInstanceMatrix,0,resultInstanceSize);
cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int), &width);
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 8, 199*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 9, 199*sizeof(int), 0);
ciErrNum |= clSetKernelArg(ckKernel, 10, 50*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 11, 50*sizeof(int), 0);
ciErrNum |= clSetKernelArg(ckKernel, 12, 15*sizeof(float), 0);
ciErrNum |= clSetKernelArg(ckKernel, 13, 15*sizeof(int), 0);
oclCheckError(ciErrNum, CL_SUCCESS);
size_t localsize[2]={512,1};
size_t globalsize[2]={1024,100};
clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
int *positionHost=(int*)alloca(resultPosSize);
memset(positionHost,0,resultPosSize);
float *instanceHost=(float*)alloca(resultInstanceSize);
memset(instanceHost,0,resultInstanceSize);
ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//save as txt to check.
ofstream posOutfile("position.txt",ios::out);
ofstream instanceOutfile("instance.txt",ios::out);
for(size_t r=0;r<height;r++)
{
for(size_t c=0;c<k;c++)
{
int data=positionHost[r*k+c];
float instanceData=instanceHost[r*k+c];
posOutfile<<data<<"\t";
instanceOutfile<<instanceData<<"\t";
}
posOutfile<<endl;
instanceOutfile<<endl;
}
posOutfile.close();
instanceOutfile.close();
delete [] matrix;
delete [] resultPosMatrix;
delete [] resultInstanceMatrix;
clReleaseCommandQueue(commandQueue);
clReleaseContext(cxGPUContext);
clReleaseProgram(cpProgram);
clReleaseKernel(ckKernel);
shrLog("%s Successfully end.\n\n", argv[0]);
return 0;
}
(截圖不完全)距離結果和位置結果(即這一行與第幾行的距離):
原稀疏矩陣:
另外大神說:
四、第4個KNN版本(仍舊是200X931的小測資料,但是計算距離時採用向下追溯節約時間)
__kernel void knn200ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__global float *localInstanceNoSort0)
{
///////////the first step : calculate the distances!!!!
uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
__global float *row=matrix+width*currentGroupID;
//__global float localInstanceNoSort0[40000];
__local float localSizeInstance[512];
localInstanceNoSort0[currentGroupID*height+currentGroupID]=0;
for(uint anotherID=currentGroupID+1;anotherID<height;anotherID++)
{
float sum=0;
const __global float *rowAnother=matrix+width*anotherID;
for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
{
sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
}
barrier(CLK_LOCAL_MEM_FENCE);
localSizeInstance[get_local_id(0)]=sum;
for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < stride)
{
localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
}
}
if (get_local_id(0) == 0)
{
localInstanceNoSort0[currentGroupID*height+anotherID] = localSizeInstance[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
for(uint forward0=get_local_id(0);forward0<currentGroupID;forward0++)
{
localInstanceNoSort0[currentGroupID*height+forward0]=localInstanceNoSort0[forward0*height+currentGroupID];
}
barrier(CLK_LOCAL_MEM_FENCE);
//////////erase the 0 instance (the instance from the current row to iteself).
__local float localTempInst199[199];
__local int localTempPos199[199];
if(get_local_id(0)<height-1)
{
if(get_local_id(0)<currentGroupID)
{
localTempInst199[get_local_id(0)]=localInstanceNoSort0[currentGroupID*height+get_local_id(0)];
localTempPos199[get_local_id(0)]=get_local_id(0);
}
else
{
localTempInst199[get_local_id(0)]=localInstanceNoSort0[currentGroupID*height+get_local_id(0)+1];
localTempPos199[get_local_id(0)]=get_local_id(0)+1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//////////extract the first k min datas!!!
__local float localTempInst50[50];
__local int localTempPos50[50];
//use the 10 threads of a group ,every thread include 20 datas,k=5
//circle0 ,handle with 199 datas!
if(get_local_id(0)<10)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst199[k0];
pos[ind]=localTempPos199[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==9 && the1st>198)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst199[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst199[the1st];
pos[tempMaxPos]=localTempPos199[the1st];
}
the1st+=1;
}
//result of circle0
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/*
if(get_local_id(0)==0)
{
for(int r=0;r<50;r++)
{
int instanceData=localTempPos50[r];
printf("%d \n",instanceData);
}
}
barrier(CLK_LOCAL_MEM_FENCE);
*/
//circle1
__local float localTempInst15[15];
__local int localTempPos15[15];
if(get_local_id(0)<3)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst50[k0];
pos[ind]=localTempPos50[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==2 && the1st==50)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst50[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst50[the1st];
pos[tempMaxPos]=localTempPos50[the1st];
}
the1st+=1;
}
//result of circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/*
if(currentGroupID<200 && get_local_id(0)==0)
{
printf("group:%d data:%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",(int)currentGroupID,localTempInst15[0],localTempInst15[1],localTempInst15[2],localTempInst15[3],localTempInst15[4],localTempInst15[5],localTempInst15[6],localTempInst15[7],localTempInst15[8],localTempInst15[9],localTempInst15[10],localTempInst15[11],localTempInst15[12],localTempInst15[13],localTempInst15[14]);
}
barrier(CLK_LOCAL_MEM_FENCE);
*/
//circle2
if(get_local_id(0)<1)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst15[k0];
pos[ind]=localTempPos15[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(the1st>14)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst15[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst15[the1st];
pos[tempMaxPos]=localTempPos15[the1st];
}
the1st+=1;
}
//result of circle2
for(size_t finalk0=0;finalk0<k;finalk0++)
{
instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/*
if(currentGroupID<200 && get_local_id(0)==0)
{
printf("group:%d data:%f %f %f %f %f\n\n",(int)currentGroupID,instanceMatrix[currentGroupID*k+0],instanceMatrix[currentGroupID*k+1],instanceMatrix[currentGroupID*k+2],instanceMatrix[currentGroupID*k+3],instanceMatrix[currentGroupID*k+4]);
}
barrier(CLK_LOCAL_MEM_FENCE);
*/
}
結果與前面一致,是正確的。時間第3個版本是:13262us,而此版本是:4960us!!!!!!!!整整一倍多的時間啊!!!!其實還可以繼續優化的,不用每次比較k箇中的最大值,上次沒有變的最大值下次就不用比,這樣會更快!
大神說:她說的這個手工緩衝我下次再去想!
在寫這個時,我開始是對localInstanceNoSort0[]這個是local型,OpenCL中只有barrier(CLK_LOCAL_MEM_FENCE)讓同一個group內所有執行緒都到達這個柵欄點,再一起往下走。但opencl沒有CLK_GLOBAL_MEM_FENCE這種說法,所以我後面的group中要拿前面某個group的結果,如果前面那個group還沒有算完怎麼給這個group!而且即使拿到了,第2個group的結果是data[]陣列裡,而第199個group的結果也是data[]。只是這兩個data是並行,裡面的東西不一樣而已。但後面第199個group的data[]的前198個數要替換成第2個group的data[]的前198個數。雖然data是global型,但第199個group的data[0]要替換成第2個group的data[0],這無法做到啊因為分不清誰是誰的!所以我將這個變數改成了global型別,進行了一個變樣,不過實質是達到了這個目的,這樣就行了。也就是說幾個Work Group,那麼必須通過全域性變數進行通訊。
於是我開始是:__global float localInstanceNoSort0[40000]; 但報錯說OpenCL不支援這樣,然後我又在host端
ciErrNum |= clSetKernelArg(ckKernel, 6, 40000*sizeof(float), 0);
結果報錯說:<kernel>:error:global variables in function scope must have static storage class 所以後來我在host改成 正經的建立一個新陣列,建立此陣列的buffer然後寫進kernel的第7個引數!這樣才行了!所以我想應該是:kernel中要使用global型別的陣列變數,此變數必須是從host傳進來的實際變數,而不是像傳local型一樣給個大小賦0傳進kernel就好了!
五、第5個KNN版本(2002350X931的矩陣,即假設稀疏矩陣還原後的樣子)
__kernel void knn2002350ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localInstanceNoSort)
{
__local float localSizeInstance[931];
__local float localTempInst50[1024*5];
__local float localTempPos50[1024*5];
__local float localTempInst15[32*5];
__local float localTempPos15[32*5];
__local float localTempInst5[5*5];
__local float localTempPos5[5*5];
uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
for (size_t y = currentGroupID; y < height; y += get_local_size(0))
{
////////////the first step : calculate the distances!!!!
__global float *row=matrix+width*y;
localInstanceNoSort[y*height+y]=0;
for(uint i=y+1;i<height;i+=1)
{
float sum=0;
const __global float *rowAnother=matrix+width*i;
for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
{
sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
}
barrier(CLK_LOCAL_MEM_FENCE);
localSizeInstance[get_local_id(0)]=sum;
for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < stride)
{
localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
}
}
if (get_local_id(0) == 0)
{
localInstanceNoSort[y*height+i] = localSizeInstance[0];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
barrier(CLK_LOCAL_MEM_FENCE);
for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
{
localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////erase the 0 instance (the instance from the current row to iteself).
__local float localTempInst199[2002349];
__local int localTempPos199[2002349];
for(uint t=get_local_id(0);t<height-1;t+=get_local_size(0))
{
if(t<currentGroupID)
{
localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t];
localTempPos199[t]=t;
}
else
{
localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t+1];
localTempPos199[t]=t+1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
///////////////////extract the first k min datas!!!
//use the 1024 threads of a group ,every thread include 1956 datas,k=5
//circle0 ,handle with 2002349 datas!
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*1956;k0<get_local_id(0)*1956+k;k0++,ind++)
{
data[ind]=localTempInst199[k0];
pos[ind]=localTempPos199[k0];
}
int the1st=get_local_id(0)*1956+k;
int thelast=get_local_id(0)*1956+1956;
while(the1st<thelast)
{
if(get_local_id(0)==1023 && the1st>2002348)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst199[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst199[the1st];
pos[tempMaxPos]=localTempPos199[the1st];
}
the1st+=1;
}
//result of circle0
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
}
barrier(CLK_LOCAL_MEM_FENCE);
//circle1
if(get_local_id(0)<32)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*32*k;k0<get_local_id(0)*32*k+k;k0++,ind++)
{
data[ind]=localTempInst50[k0];
pos[ind]=localTempPos50[k0];
}
int the1st=get_local_id(0)*32*k+k;
int thelast=get_local_id(0)*32*k+32*k;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst50[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst50[the1st];
pos[tempMaxPos]=localTempPos50[the1st];
}
the1st+=1;
}
//result of circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//circle2
if(get_local_id(0)<5)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*32;k0<get_local_id(0)*32*+k;k0++,ind++)
{
data[ind]=localTempInst50[k0];
pos[ind]=localTempPos50[k0];
}
int the1st=get_local_id(0)*32+k;
int thelast=get_local_id(0)*32+32;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst15[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst15[the1st];
pos[tempMaxPos]=localTempPos15[the1st];
}
the1st+=1;
}
//result of circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//circle2
if(get_local_id(0)<1)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0);k0<get_local_id(0)+k;k0++,ind++)
{
data[ind]=localTempInst5[k0];
pos[ind]=localTempPos5[k0];
}
int the1st=k;
int thelast=25;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst15[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst15[the1st];
pos[tempMaxPos]=localTempPos15[the1st];
}
the1st+=1;
}
//result of circle2
for(size_t finalk0=0;finalk0<k;finalk0++)
{
instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
但這個kernel不夠好,改成下面這樣執行緒利用率更高:
__kernel void knn2002350ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localInstanceNoSort)
{
__local float localSizeInstance[931];
__local float localTempInst50[1024*5];
__local float localTempPos50[1024*5];
__local float localTempInst15[32*5];
__local float localTempPos15[32*5];
__local float localTempInst5[5*5];
__local float localTempPos5[5*5];
uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
for (size_t y = currentGroupID; y < height; y += get_local_size(0))
{
////////////the first step : calculate the distances!!!!
__global float *row=matrix+width*y;
localInstanceNoSort[y*height+y]=0;
for(uint i=get_local_id(0)+y+1;i<height;i+=get_local_size(0))
{
float sum=0;
const __global float *rowAnother=matrix+width*i;
for(uint j=0;j<width;j+=1)
{
sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
}
localInstanceNoSort[y*height+i] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
{
localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////erase the 0 instance (the instance from the current row to iteself).
__local float localTempInst199[2002349];
__local int localTempPos199[2002349];
for(uint t=get_local_id(0);t<height-1;t+=get_local_size(0))
{
if(t<currentGroupID)
{
localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t];
localTempPos199[t]=t;
}
else
{
localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t+1];
localTempPos199[t]=t+1;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
///////////////////extract the first k min datas!!!
//use the 1024 threads of a group ,every thread include 1956 datas,k=5
//circle0 ,handle with 2002349 datas!
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*1956;k0<get_local_id(0)*1956+k;k0++,ind++)
{
data[ind]=localTempInst199[k0];
pos[ind]=localTempPos199[k0];
}
int the1st=get_local_id(0)*1956+k;
int thelast=get_local_id(0)*1956+1956;
while(the1st<thelast)
{
if(get_local_id(0)==1023 && the1st>2002348)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst199[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst199[the1st];
pos[tempMaxPos]=localTempPos199[the1st];
}
the1st+=1;
}
//result of circle0
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
}
barrier(CLK_LOCAL_MEM_FENCE);
//circle1
if(get_local_id(0)<32)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*32*k;k0<get_local_id(0)*32*k+k;k0++,ind++)
{
data[ind]=localTempInst50[k0];
pos[ind]=localTempPos50[k0];
}
int the1st=get_local_id(0)*32*k+k;
int thelast=get_local_id(0)*32*k+32*k;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst50[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst50[the1st];
pos[tempMaxPos]=localTempPos50[the1st];
}
the1st+=1;
}
//result of circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//circle2
if(get_local_id(0)<5)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0)*32;k0<get_local_id(0)*32*+k;k0++,ind++)
{
data[ind]=localTempInst50[k0];
pos[ind]=localTempPos50[k0];
}
int the1st=get_local_id(0)*32+k;
int thelast=get_local_id(0)*32+32;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst15[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst15[the1st];
pos[tempMaxPos]=localTempPos15[the1st];
}
the1st+=1;
}
//result of circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
//circle2
if(get_local_id(0)<1)
{
//copy the forward k data and position
float data[5];
int pos[5];
size_t ind=0;
for(size_t k0=get_local_id(0);k0<get_local_id(0)+k;k0++,ind++)
{
data[ind]=localTempInst5[k0];
pos[ind]=localTempPos5[k0];
}
int the1st=k;
int thelast=25;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst15[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst15[the1st];
pos[tempMaxPos]=localTempPos15[the1st];
}
the1st+=1;
}
//result of circle2
for(size_t finalk0=0;finalk0<k;finalk0++)
{
instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
main:
#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <time.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;
const int width=931;
const int height=200;
const int height2=2002350;
const int k=5;
int64_t system_time()
{
struct timespec t;
clock_gettime(CLOCK_MONOTONIC,&t);
return (int64_t)(t.tv_sec)*1e9+t.tv_nsec;
}
int main( int argc, const char** argv)
{
shrQAStart(argc, (char **)argv);
// set logfile name and start logs
shrSetLogFileName ("oclKnn4Robin.txt");
shrLog("%s Starting...\n\n", argv[0]);
//Get the NVIDIA platform
cl_platform_id cpPlatform;
cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckError(ciErrNum, CL_SUCCESS);
//Get the devices
cl_uint uiNumDevices;
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create the context
cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id device ;
clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
if (ciErrNum != CL_SUCCESS)
{
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
return ciErrNum;
}
//read kernel file and build it
size_t program_length;
char* source_path = shrFindFilePath("knn2002350.cl", argv[0]);
oclCheckError(source_path != NULL, shrTRUE);
char *source = oclLoadProgSource(source_path, "", &program_length);
oclCheckError(source != NULL, shrTRUE);
// create the program
cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
// build the program
ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then return error
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
return(EXIT_FAILURE);
}
cl_kernel ckKernel = clCreateKernel(cpProgram, "knn2002350ForRobin", &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
//host data
float *matrix;
const int matrixSizeNoFloat=width*height2;
const size_t matrixSize=matrixSizeNoFloat*sizeof(float);
matrix=(float*)malloc(matrixSize);
shrFillArray(matrix,matrixSizeNoFloat);
size_t resultPosSize=height2*k*sizeof(int);
int *resultPosMatrix=(int*)malloc(resultPosSize);
memset(resultPosMatrix,0,resultPosSize);
size_t resultInstanceSize=height2*k*sizeof(float);
float *resultInstanceMatrix=(float*)malloc(resultInstanceSize);
shrFillArray(resultInstanceMatrix,height2*k);
size_t localsize[2]={1024,1};
size_t globalsize[2]={1024,1024};
cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int), &width);
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height2);
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
size_t NoSortSize=height2*height2*sizeof(float);
float *NoSort=(float*)malloc(NoSortSize);
memset(NoSort,0,NoSortSize);
cl_mem NoSort_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,NoSortSize,NoSort,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, NoSort_buffer, CL_FALSE, 0, NoSortSize, NoSort, 0, NULL, NULL);
ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&NoSort_buffer);
oclCheckError(ciErrNum, CL_SUCCESS);
clFinish(commandQueue);
int64_t time_start=system_time();
clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
clFinish(commandQueue);
int64_t time_end=system_time();
printf("kernel execute time: %f(us)\n",(time_end-time_start)/1e3);
int *positionHost=(int*)alloca(resultPosSize);
memset(positionHost,0,resultPosSize);
float *instanceHost=(float*)alloca(resultInstanceSize);
memset(instanceHost,0,resultInstanceSize);
ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//save as txt to check.
ofstream posOutfile("position2002350.txt",ios::out);
ofstream instanceOutfile("distance2002350.txt",ios::out);
for(size_t r=0;r<height2;r++)
{
for(size_t c=0;c<k;c++)
{
int data=positionHost[r*k+c];
float instanceData=instanceHost[r*k+c];
posOutfile<<data<<"\t";
instanceOutfile<<instanceData<<"\t";
}
posOutfile<<endl;
instanceOutfile<<endl;
}
posOutfile.close();
instanceOutfile.close();
free(matrix);
free(resultPosMatrix);
free(resultInstanceMatrix);
clReleaseCommandQueue(commandQueue);
clReleaseContext(cxGPUContext);
clReleaseProgram(cpProgram);
clReleaseKernel(ckKernel);
shrLog("%s Successfully end.\n\n", argv[0]);
return 0;
}
報錯:clBuildProgram():ptxas:error:Entry function use too much shared data!!!大神說:
__local float localTempInst199[2002349];
__local int localTempPos199[2002349];
用得太多了!應該將這多行分段完成!(效率不高)
於是拆分成一節一節:即將原矩陣分成4個矩陣傳進來,因為大神說:比如CL_DEVICE_MAX_MEM_ALLOC_SIZE:最大不能超過500M,而原本需要600MB的緩衝區,可以拆分成兩個300MB的緩衝區傳進kernel。
__kernel void knn2002350ForRobin(__global float *matrix,__global float *matrix2,__global float *matrix3,__global float *matrix4,
int width,int height,int k,__global int *positionMatrix,
__global float *instanceMatrix,__global float *localInstanceNoSort,
__local float *distanceK,__local float *positionK)
{
__local float localTempInst199[1024];
__local int localTempPos199[1024];
__local float localTempInst50[51*5+4];
__local float localTempPos50[51*5+4];
__local float localTempInst15[65];
__local int localTempPos15[65];
__local float localTempInst5[20];
__local int localTempPos5[20];
__local float localTempInst430[429];
__local int localTempPos430[429];
__local float localTempInst110[110];
__local int localTempPos110[110];
__local float localTempInst30[30];
__local int localTempPos30[30];
__local float localTempInst10[10];
__local int localTempPos10[10];
__local float localTempInst489[2445];
__local int localTempPos489[2445];
__local float localTempInst123[123*5];
__local int localTempPos123[123*5];
__local float localTempInst31[31*5];
__local int localTempPos31[31*5];
__local float localTempInst40[40];
__local int localTempPos40[40];
uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
for (size_t y = currentGroupID; y < height; y += get_local_size(0))
{
////////////the first step : calculate the distances!!!!
__global float *row;
if(currentGroupID<500000)
{
row=matrix+width*y;
}
else if(currentGroupID>=500000 && currentGroupID<1000000)
{
row=matrix2+width*(y-500000);
}
else if(currentGroupID>=1000000 && currentGroupID<1500000)
{
row=matrix3+width*(y-1000000);
}
else
{
row=matrix4+width*(y-1500000);
}
localInstanceNoSort[y*height+y]=0;
const __global float *rowAnother;
for(uint i=get_local_id(0)+y+1;i<height;i+=get_local_size(0))
{
float sum=0;
if(i<500000)
{
rowAnother=matrix+width*i;
}
else if(i>=500000 && i<1000000)
{
rowAnother=matrix2+width*(i-500000);
}
else if(i>=1000000 && i<1500000)
{
rowAnother=matrix3+width*(i-1000000);
}
else
{
rowAnother=matrix4+width*(i-1500000);
}
for(uint j=0;j<width;j+=1)
{
sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
}
localInstanceNoSort[y*height+i] = sum;
}
barrier(CLK_LOCAL_MEM_FENCE);
for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
{
localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
}
barrier(CLK_LOCAL_MEM_FENCE);
//////////////////////////////////////2002350=1955*1024+430
for(uint index=0;index<1955;index++)
{
//get 1024 datas from 2002350 every time .
uint t=get_local_id(0);
if(index*1024+t<currentGroupID)
{
localTempInst199[t]=localInstanceNoSort[currentGroupID*height+index*1024+t];
localTempPos199[t]=index*1024+t;
}
else
{
localTempInst199[t]=localInstanceNoSort[currentGroupID*height+index*1024+t+1];
localTempPos199[t]=index*1024+t+1;
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle0 ,handle with 1024 datas!
//use the 52 threads of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
float data[5];
int pos[5];
int ind=0;
if(get_local_id(0)<52)
{
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
if(k0<1024)
{
data[ind]=localTempInst199[k0];
pos[ind]=localTempPos199[k0];
}
else
{
break;
}
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==52)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst199[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst199[the1st];
pos[tempMaxPos]=localTempPos199[the1st];
}
the1st+=1;
}
//result of circle0
if(get_local_id(0)<51)
{
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
else
{
for(size_t finalk0=0;finalk0<k-1;finalk0++)
{
localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle1 ,handle with 514 datas!
//use the 13 threads of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<13)
{
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
if(k0<259)
{
data[ind]=localTempInst50[k0];
pos[ind]=localTempPos50[k0];
}
else
{
break;
}
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==12 && the1st>18)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempPos50[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst50[the1st];
pos[tempMaxPos]=localTempPos50[the1st];
}
the1st+=1;
}
//result of circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle2 ,handle with 65 datas!
//use the 4 threads of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<4)
{
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
if(k0<65)
{
data[ind]=localTempInst15[k0];
pos[ind]=localTempPos15[k0];
}
else
{
break;
}
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==4)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempPos15[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst15[the1st];
pos[tempMaxPos]=localTempPos15[the1st];
}
the1st+=1;
}
//result of circle2
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle3 ,handle with 20 datas!
//use the 1 thread of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<1)
{
for(size_t k0=0;k0<k;k0++,ind++)
{
data[ind]=localTempInst15[k0];
pos[ind]=localTempPos15[k0];
}
int the1st=k;
int thelast=20;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempPos5[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst5[the1st];
pos[tempMaxPos]=localTempPos5[the1st];
}
the1st+=1;
}
//result of circle2
for(size_t finalk0=0;finalk0<k;finalk0++)
{
distanceK[index*k+finalk0]=data[finalk0];
positionK[index*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
////////////////////////////get the last 430 or 429 datas from 2002350 .
uint t=get_local_id(0);
uint tt=1955*1024+t;
if(t<429)
{
if(currentGroupID<1955*1024)
{
localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt+1];
localTempPos430[t]=tt+1;
}
else
{
if(tt<currentGroupID)
{
localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt];
localTempPos430[t]=tt;
}
else
{
localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt+1];
localTempPos430[t]=tt+1;
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle0 ,handle with 430 or 429 datas!
//use the 22 threads of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
float data[5];
int pos[5];
int ind=0;
if(get_local_id(0)<22)
{
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst430[k0];
pos[ind]=localTempPos430[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==21 && the1st==429)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst430[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst430[the1st];
pos[tempMaxPos]=localTempPos430[the1st];
}
the1st+=1;
}
//result circle0
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst110[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos110[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle1 ,handle with 110 datas!
//use the 5 threads of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<6)
{
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=localTempInst110[k0];
pos[ind]=localTempPos110[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==5 && the1st==10)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst110[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst110[the1st];
pos[tempMaxPos]=localTempPos110[the1st];
}
the1st+=1;
}
//result circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst30[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos30[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle2 ,handle with 30 datas!
//use the 2 threads of a group ,every thread handles 15 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<2)
{
for(size_t k0=get_local_id(0)*15;k0<get_local_id(0)*15+k;k0++,ind++)
{
data[ind]=localTempInst110[k0];
pos[ind]=localTempPos110[k0];
}
int the1st=get_local_id(0)*15+k;
int thelast=get_local_id(0)*15+15;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempInst30[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst30[the1st];
pos[tempMaxPos]=localTempPos30[the1st];
}
the1st+=1;
}
//result circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst10[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos10[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle3 ,handle with 20 datas!
//use the 1 thread of a group ,every thread handles 10 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<1)
{
for(size_t k0=0;k0<k;k0++,ind++)
{
data[ind]=localTempInst10[k0];
pos[ind]=localTempPos10[k0];
}
int the1st=k;
int thelast=10;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(localTempPos10[the1st]<kmaxdata)
{
data[tempMaxPos]=localTempInst10[the1st];
pos[tempMaxPos]=localTempPos10[the1st];
}
the1st+=1;
}
//result of circle2
for(size_t finalk0=0;finalk0<k;finalk0++)
{
distanceK[1955*k+finalk0]=data[finalk0];
positionK[1955*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
////////////result of a group :1956*5 data
/////////////////////circle0,handle with 1956*5 datas!
//use the 489 thread of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<489)
{
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=distanceK[k0];
pos[ind]=positionK[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(distanceK[the1st]<kmaxdata)
{
data[tempMaxPos]=distanceK[the1st];
pos[tempMaxPos]=distanceK[the1st];
}
the1st+=1;
}
//result circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst489[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos489[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle1,handle with 489*5 datas!
//use the 123 thread of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<123)
{
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=distanceK[k0];
pos[ind]=positionK[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==122)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(distanceK[the1st]<kmaxdata)
{
data[tempMaxPos]=distanceK[the1st];
pos[tempMaxPos]=distanceK[the1st];
}
the1st+=1;
}
//result circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst123[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos123[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle2,handle with 123*5 datas!
//use the 31 thread of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<31)
{
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=distanceK[k0];
pos[ind]=positionK[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==30 && the1st>614)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(distanceK[the1st]<kmaxdata)
{
data[tempMaxPos]=distanceK[the1st];
pos[tempMaxPos]=distanceK[the1st];
}
the1st+=1;
}
//result circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst31[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos31[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle3,handle with 31*5 datas!
//use the 8 thread of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<8)
{
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=distanceK[k0];
pos[ind]=positionK[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
if(get_local_id(0)==7 && the1st>154)
{
break;
}
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(distanceK[the1st]<kmaxdata)
{
data[tempMaxPos]=distanceK[the1st];
pos[tempMaxPos]=distanceK[the1st];
}
the1st+=1;
}
//result circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst40[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos40[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle4,handle with 40 datas!
//use the 2 thread of a group ,every thread handles 20 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<2)
{
for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
{
data[ind]=distanceK[k0];
pos[ind]=positionK[k0];
}
int the1st=get_local_id(0)*20+k;
int thelast=get_local_id(0)*20+20;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(distanceK[the1st]<kmaxdata)
{
data[tempMaxPos]=distanceK[the1st];
pos[tempMaxPos]=distanceK[the1st];
}
the1st+=1;
}
//result circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
localTempInst10[get_local_id(0)*k+finalk0]=data[finalk0];
localTempPos10[get_local_id(0)*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
/////////////////////circle5,handle with 10 datas!
//use the 1 thread of a group ,every thread handles 10 datas,k=5
//copy the forward k data and position
//float data[5];
//int pos[5];
ind=0;
if(get_local_id(0)<1)
{
for(size_t k0=0;k0<k;k0++,ind++)
{
data[ind]=distanceK[k0];
pos[ind]=positionK[k0];
}
int the1st=k;
int thelast=10;
while(the1st<thelast)
{
//max from k
float kmaxdata=data[0];
int kmaxPos=pos[0];
int tempMaxPos=0;
for(size_t first=1;first<k;first++)
{
if(data[first]>kmaxdata)
{
kmaxdata=data[first];
kmaxPos=pos[first];
tempMaxPos=first;
}
}
if(distanceK[the1st]<kmaxdata)
{
data[tempMaxPos]=distanceK[the1st];
pos[tempMaxPos]=distanceK[the1st];
}
the1st+=1;
}
//result circle1
for(size_t finalk0=0;finalk0<k;finalk0++)
{
instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
}
#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <time.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;
const int width=931;
const int height=200;
const int height2=500000;
const int height3=2350;
//const int height2=2002350; 2002350=500000*4+2350
const int k=5;
int64_t system_time()
{
struct timespec t;
clock_gettime(CLOCK_MONOTONIC,&t);
return (int64_t)(t.tv_sec)*1e9+t.tv_nsec;
}
int main( int argc, const char** argv)
{
shrQAStart(argc, (char **)argv);
// set logfile name and start logs
shrSetLogFileName ("oclKnn4Robin.txt");
shrLog("%s Starting...\n\n", argv[0]);
//Get the NVIDIA platform
cl_platform_id cpPlatform;
cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckError(ciErrNum, CL_SUCCESS);
//Get the devices
cl_uint uiNumDevices;
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create the context
cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_device_id device ;
clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
if (ciErrNum != CL_SUCCESS)
{
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
return ciErrNum;
}
//read kernel file and build it
size_t program_length;
char* source_path = shrFindFilePath("knn2002350advance.cl", argv[0]);
oclCheckError(source_path != NULL, shrTRUE);
char *source = oclLoadProgSource(source_path, "", &program_length);
oclCheckError(source != NULL, shrTRUE);
// create the program
cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
// build the program
ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
printf("cannot build program successfully!\n");
size_t logSize;
ciErrNum = clGetProgramBuildInfo(cpProgram, device, CL_PROGRAM_BUILD_LOG,0, NULL, &logSize);
char *log = (char*)malloc(logSize);
ciErrNum = clGetProgramBuildInfo(cpProgram, device, CL_PROGRAM_BUILD_LOG,logSize, log, NULL);
printf("%s\n", log);
free(log);
exit(-1);
}
cl_kernel ckKernel = clCreateKernel(cpProgram, "knn2002350ForRobin", &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
//host data
float *matrix,*matrix2,*matrix3,*matrix4;
const int matrixSizeNoFloat=width*height2;
const size_t matrixSize=matrixSizeNoFloat*sizeof(float);
matrix=(float*)malloc(matrixSize);
memset(matrix,0,matrixSize);
matrix2=(float*)malloc(matrixSize);
memset(matrix2,0,matrixSize);
matrix3=(float*)malloc(matrixSize);
memset(matrix3,0,matrixSize);
const size_t matrixSize4=width*height3*sizeof(float);
matrix4=(float*)malloc(matrixSize4);
memset(matrix4,0,matrixSize4);
//shrFillArray(matrix,matrixSizeNoFloat);
size_t resultPosSize=height2*k*sizeof(int);
int *resultPosMatrix=(int*)malloc(resultPosSize);
memset(resultPosMatrix,0,resultPosSize);
size_t resultInstanceSize=height2*k*sizeof(float);
float *resultInstanceMatrix=(float*)malloc(resultInstanceSize);
shrFillArray(resultInstanceMatrix,height2*k);
size_t localsize[2]={1024,1};
size_t globalsize[2]={1024,1024};
cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_mem matrix2_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix2, &ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix2_buffer, CL_FALSE, 0, matrixSize, matrix2, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_mem matrix3_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix3, &ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix3_buffer, CL_FALSE, 0, matrixSize, matrix3, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
cl_mem matrix4_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize4, matrix4, &ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix4_buffer, CL_FALSE, 0, matrixSize4, matrix4, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
ciErrNum = clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&matrix2_buffer);
ciErrNum = clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&matrix3_buffer);
ciErrNum = clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&matrix4_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(int), &width);
ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int), &height2);
ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(int), &k);
ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(cl_mem), (void*)&result_pos_buffer);
ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(cl_mem), (void*)&result_ins_buffer);
size_t NoSortSize=height2*height2*sizeof(float);
float *NoSort=(float*)malloc(NoSortSize);
memset(NoSort,0,NoSortSize);
cl_mem NoSort_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,NoSortSize,NoSort,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, NoSort_buffer, CL_FALSE, 0, NoSortSize, NoSort, 0, NULL, NULL);
ciErrNum |= clSetKernelArg(ckKernel, 9, sizeof(cl_mem), (void*)&NoSort_buffer);
oclCheckError(ciErrNum, CL_SUCCESS);
size_t KDistanceSize=1956*5*sizeof(float);
float *KDistance=(float*)malloc(KDistanceSize);
memset(KDistance,0,KDistanceSize);
cl_mem KDistance_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,KDistanceSize,KDistance,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, KDistance_buffer, CL_FALSE, 0, KDistanceSize, KDistance, 0, NULL, NULL);
ciErrNum |= clSetKernelArg(ckKernel, 10, sizeof(cl_mem), (void*)&KDistance_buffer);
oclCheckError(ciErrNum, CL_SUCCESS);
size_t KPositinSize=1956*5*sizeof(float);
float *KPosition=(float*)malloc(KPositinSize);
memset(KPosition,0,KPositinSize);
cl_mem KPosition_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,KPositinSize,KPosition,&ciErrNum);
ciErrNum = clEnqueueWriteBuffer(commandQueue, KDistance_buffer, CL_FALSE, 0, KPositinSize, KPosition, 0, NULL, NULL);
ciErrNum |= clSetKernelArg(ckKernel, 11, sizeof(cl_mem), (void*)&KPosition_buffer);
oclCheckError(ciErrNum, CL_SUCCESS);
clFinish(commandQueue);
int64_t time_start=system_time();
clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
clFinish(commandQueue);
int64_t time_end=system_time();
printf("kernel execute time: %f(us)\n",(time_end-time_start)/1e3);
int *positionHost=(int*)alloca(resultPosSize);
memset(positionHost,0,resultPosSize);
float *instanceHost=(float*)alloca(resultInstanceSize);
memset(instanceHost,0,resultInstanceSize);
ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
//save as txt to check.
ofstream posOutfile("position2002350.txt",ios::out);
ofstream instanceOutfile("distance2002350.txt",ios::out);
for(size_t r=0;r<height2;r++)
{
for(size_t c=0;c<k;c++)
{
int data=positionHost[r*k+c];
float instanceData=instanceHost[r*k+c];
posOutfile<<data<<"\t";
instanceOutfile<<instanceData<<"\t";
}
posOutfile<<endl;
instanceOutfile<<endl;
}
posOutfile.close();
instanceOutfile.close();
free(matrix);
free(resultPosMatrix);
free(resultInstanceMatrix);
free(NoSort);
free(KDistance);
free(KPosition);
clReleaseCommandQueue(commandQueue);
clReleaseContext(cxGPUContext);
clReleaseProgram(cpProgram);
clReleaseKernel(ckKernel);
shrLog("%s Successfully end.\n\n", argv[0]);
return 0;
}
第一次WriteBuffer通過了,第二次WriteBuffer時返回 -4.超過限制?!
相關文章
- 稀疏矩陣矩陣
- 巨大的矩陣(矩陣加速)矩陣
- 演算法學習:矩陣快速冪/矩陣加速演算法矩陣
- 【scipy 基礎】--稀疏矩陣矩陣
- 資料結構:陣列,稀疏矩陣,矩陣的壓縮。應用:矩陣的轉置,矩陣相乘資料結構陣列矩陣
- 資料結構(一)-稀疏矩陣資料結構矩陣
- cuda 加速矩陣乘法矩陣
- 【精選】矩陣加速矩陣
- 序列(dp+矩陣加速)矩陣
- 巨大的數(dp+矩陣加速)矩陣
- 矩陣快速冪加速最短路矩陣
- 矩陣加速線性遞推矩陣
- 雜項——矩陣加速(進階)矩陣
- MKL稀疏矩陣運算示例及函式封裝矩陣函式封裝
- 幸運數(dp+矩陣加速)矩陣
- 動態dp & 矩陣加速遞推矩陣
- 演算法-陣列與矩陣演算法陣列矩陣
- 【小白學演算法】2. 稀疏陣列演算法陣列
- 稀疏陣列陣列
- 【資料結構與演算法】——稀疏陣列資料結構演算法陣列
- 演算法之DP——01矩陣演算法矩陣
- 資料結構與演算法之稀疏陣列資料結構演算法陣列
- 資料結構與演算法:稀疏陣列(一)資料結構演算法陣列
- 求任意矩陣的伴隨矩陣矩陣
- 稀疏矩陣之三元組壓縮儲存並且轉置矩陣
- 矩陣分解--超詳細解讀矩陣
- python 計算矩陣的相關演算法Python矩陣演算法
- 稀疏陣列、佇列陣列佇列
- 20_稀疏陣列陣列
- 鄰接矩陣、度矩陣矩陣
- 大神教你python 讀取檔案並把矩陣轉成numpy的兩種方法Python矩陣
- 奇異矩陣,非奇異矩陣,偽逆矩陣矩陣
- 機器學習中的矩陣向量求導(五) 矩陣對矩陣的求導機器學習矩陣求導
- 資料結構與演算法—稀疏陣列和佇列資料結構演算法陣列佇列
- MATLAB|讀取一個檔案,並將其不同內容儲存為不同的矩陣Matlab矩陣
- golang實現稀疏陣列Golang陣列
- 矩陣矩陣
- KNN演算法實驗KNN演算法
- 基於矩陣分解的協同過濾演算法矩陣演算法