讀取大型稀疏矩陣&KNN演算法的OpenCL加速版本

元氣少女緣結神發表於2017-06-14

https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/

一、讀取稀疏矩陣。先以一個簡單的稀疏矩陣為例,讀進這個稀疏矩陣

即這個稀疏矩陣本來其實有4行12列。前面第1列0不用管,是標籤,是不要讀進矩陣的。

#include <stdio.h>
//#include <iostream>
#include <fstream>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
void main(){
	int width = 12, height = 4;
	float *matrix = new float[width*height];
	memset(matrix, 0,width*height * sizeof(float));
	

	ifstream infile("simple_data.txt");
	vector<string> vec_str;
	string line;
	size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos,lineLength;
	//char *end;
	int lineID=0;
	while (getline(infile, line))
	{
		if (line.size() == 0)
		{
			continue;
		}
		for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
		{
			pos_maohao = line.find_first_of(":");
			pos_kongge = line.find_first_of(" ");
			if (pos_maohao == string::npos || pos_kongge == string::npos)
			{
				continue;
			}
			string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
			string tempData = line.substr(pos_maohao+1, 8);
			line = line.substr(pos_maohao + 9);

			//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
			int ind = atoi(tempIndex.c_str());
			float data = atof(tempData.c_str());
			cout << ind << ":"<< data << " ";
			matrix[lineID*width + ind] = data;
	    }
		lineID += 1;
		cout << endl;
	}
	
	cout << endl << endl;
	cout << "the matrix is..." << endl;
	for (int i = 0; i < height;i++)
	{
		for (int j = 0; j < width;j++)
		{
			cout << matrix[i*width + j] << " ";
		}
		cout << endl;
	}
	
	infile.close();
	delete[] matrix;
}
即可。

一、寫好的第一個KNN版本(先只拿前k個距離 而不是前k個最小距離 因為只是先測試一下是否寫錯):

       在算一個200(高height)X931(寬width)的矩陣matrix,對於每一行去算它與另199行的距離,(它與自己這一行的距離為0 不用算),對於每一行可以得到一個200維的向量,取向量的前k個結果作為最終的距離矩陣instanceMatrix的結果(應該是200Xk的矩陣),這k個結果的位置矩陣positionMatrix(附帶的,其實這個不用算)  localsize是[1][512]  globalsize是[100][1024]即有200個工作組  我是讓每個工作組去算一行與另199行的距離。結果距離矩陣instanceMatrix中只有第1列是正確的  另k-1列是無意義的值     位置矩陣positionMatrix中k-1列無意義的值。

      cl部分:

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort)
{
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	// no sort
	if(get_local_id(0)<k)
		{
			instanceMatrix[currentGroupID*k+get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			positionMatrix[currentGroupID*k+get_local_id(0)]=get_local_id(0);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
	
}
      main函式部分:

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int k=5;

void readTxt(const char *txtname,float *matrix)
{
		//int width = 931, height = 200;
		//float *matrix = new float[width*height];
		//memset(matrix, 0,width*height * sizeof(float));

		//ifstream infile("fea_sparse.txt");
		ifstream infile(txtname);
		vector<string> vec_str;
		string line;
		size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos,lineLength;
		//char *end;
		int lineID=0;
		while (getline(infile, line))
		{
			if (line.size() == 0)
			{
				continue;
			}
			for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
			{
				pos_maohao = line.find_first_of(":");
				pos_kongge = line.find_first_of(" ");
				if (pos_maohao == string::npos || pos_kongge == string::npos)
				{
					continue;
				}
				string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
				string tempData = line.substr(pos_maohao+1, 8);
				line = line.substr(pos_maohao + 9);

				//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
				int ind = atoi(tempIndex.c_str());
				float data = atof(tempData.c_str());
				//cout << ind << ":"<< data << " ";
				matrix[lineID*width + ind] = data;
		    }
			lineID += 1;
			//cout << endl;
		}

		infile.close();
		//delete[] matrix;
}

int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);
    // set logfile name and start logs
    shrSetLogFileName ("oclKnn4Robin.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    //Get the NVIDIA platform
    cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {
		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix = new float[width*height];
	 const size_t matrixSize=width*height*sizeof(float);
	 memset(matrix, 0,matrixSize);
	 const char *txtfile="fea_sparse.txt";
	 readTxt(txtfile,matrix);
	 int *resultPosMatrix=new int[height*k];
	 size_t resultPosSize=height*k*sizeof(int);
	 memset(resultPosMatrix,0,resultPosSize);
	 float *resultInstanceMatrix=new float[height*k];
	 size_t resultInstanceSize=height*k*sizeof(float);
	 memset(resultInstanceMatrix,0,resultInstanceSize);

	 //send arguments to kernel file  | CL_MEM_COPY_HOST_PTR
	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
	 //ciErrNum |= clSetKernelArg(ckKernel, 6, k * sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t localsize[2]={1,512};
	 size_t globalsize[2]={100,1024};
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //save as txt to check.
	 ofstream posOutfile("position.txt",ios::out);
	 ofstream instanceOutfile("instance.txt",ios::out);
	 for(size_t r=0;r<height;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 delete [] matrix;
	 delete [] resultPosMatrix;
	 delete [] resultInstanceMatrix;
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 return 0;
}
      因為檢查了cl部分 自我感覺沒錯,但距離矩陣中只有第1列是正確的,後面4列均是無意義的值,所以請教大神。大神說是main出錯了,kernel的啟動方式那裡size_t localsize[2]={1,512}; 應該倒過來{512,1},當然相應的globalsize也要倒過來,不然每一維度上無法整除。這樣不改動kernel結果就可以正確。
if(get_local_id(0)<k)
		{
			instanceMatrix[currentGroupID*k+get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			positionMatrix[currentGroupID*k+get_local_id(0)]=get_local_id(0);
		}
   或者如果不像大神那樣改動的話,就把這裡的所有0換成1  換個方向即可。結果一致。

    ps:原來這個localsize[2]={M,N}實際是NXM  記住了。

  ps:CL_DEVICE_MAX_WORK_ITEM_SIZES:1024X1024X64是對localsize每個維度的限制,即[0]<1024,[1]<1024,[2]<64, globalsize基本上是無限的,不用擔心。  我之前以為那個就是globalsize的限制。太傻。

  ps:CL_DEVICE_MAX_WORK_GROUP_SIZE:    1024  是說每個CU上三個維度總共最多1024個工作組。就是說localsize[0]*[1]*[2]<=1024!!!

  另外大神誇了我,這個kernel寫得很清晰,有進步。好開心!!! 

   這個版本與knn的區別就是取前k個最小的,而我取的直接就是前k個,這個後續再改就好了。  

   另外大神說,我這個忘記考慮一點了 不然可以節省一半的時間,除了我自己想到的“對每個group的511個執行緒拿當前行的地址是重複的511次 還有自己組的groupID也重複去拿了  還有別的行的地址也是重複拿了”這裡外,還有更重要的一點,我這個版本,第1行去算它與第8行的距離,第8行還會算一次它與第1行的距離,所以重複了一倍的計算!我目前想的改進就是“讓每個group代表行 只訪問後續的group代表行即可解決你說的那個重複一半的問題 不讓它向前追溯 ”當然這個是後話(等我把取前k個最小寫完 編譯執行都正確的情況下  再來對這個部分提高效能)。

   另外大神還說了一個要注意事項:

即group 0讀取第888行的時候, 和group 777讀取888行的時候,可能前者導致的888行被載入到cache中, 到了777個group需要888行的時候,888行實際上已經不在cache中了.導致需要從視訊記憶體晶片被重複讀取到cache一次(儘量不是這種情況).

二、第二個KNN版本(即仍舊是200X931的稀疏矩陣,取前k個最小距離及其相應位置,先不要求這k個最小距離一定要按從小到大排列)

我本來用的是CUDA裡有的thrust下的sort.h,我想用sort_by_key()這個函式,於是在main.cpp前加入了#include <thrust/sort.h> 然後在cl檔案中直接用sort_by_key(),這樣距離和距離對應的位置可以跟著自動排序,但結果在編譯kernel時報錯:ptxas application ptx input, line16; fatal error :Parsing error near '[]': syntax error

ptxas: fatal error :Ptx assembly aborted due to errors

我檢查了下,並不關第16行或者附近的[]的問題,我想應該是與網上比較多的“ptx, line 150; fatal error :Parsing error near '-': syntax error”這個報錯就是因為路徑中文,與什麼150行的- 根本沒關係。我感覺我這個也與[]沒關係。但具體什麼原因  谷歌不到?!!!!!!

       這個kernel:

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort,
								  __local float *localTempInst199,__local int *localTempPos199)	 
{
	///////the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	
	
	///////erase the 0 instance (the instance from the current row to iteself).
	if(get_local_id(0)<height)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else if(get_local_id(0)>currentGroupID)
		{
			localTempInst199[get_local_id(0)+1]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
		else
		{
			;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	////////extract the first k datas!!!
	  //solution1:
	//sort()! but it cannot work!!!
	if(get_local_id(0)==0)
	{
		//thrust::sort(localTempInst199,localTempInst199+199);
		//thrust::sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
		sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	if(get_local_id(0)<k)
	{
		instanceMatrix[currentGroupID*k+get_local_id(0)]=localTempInst199[get_local_id(0)];
		positionMatrix[currentGroupID*k+get_local_id(0)]=localTempPos199[get_local_id(0)];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
}

所以我想 如果這個函式sort_by_key()不報錯 那麼這個版本的knn是可以用的。

三、第三個KNN版本

    思路(我們公司的大神提供的思路,比我自己想的另一個好很多):在想一個從N(非常非常大)個數裡拿出前k個最小值和對應位置(k<<N)。先用小一點的數實驗:N=199,k=5,thread_num=10,每個執行緒負責20個數(thread9其實只負責19個數)。對於某個thread,比如負責的數是:2,1,7,3,9,5,10,2,7,3,6,5.9,9,3.5,7.2,8,8.8,17,1,12。這20個數對應的位置分別是8,1,3,7,6,24,48,15,0,12,41,26,59,33,14,57,89,57,31,55. 先把前k個數放進一個data陣列裡[2,1,7,3,9] 同時將對應位置前k個放進pos陣列[8,1,3,7,6]。先算data的最大值max為9.然後這個執行緒將第6個數(第6個數為5)與max比較,如果小於max,則替換。(pos中對應地方也要替換)。結果data為[2,1,7,3,5],pos為[8,1,3,7,24]然後又計算這個data的最大值max=7,將第7個數(為10)與此max比較,大於max不用替換。繼續將第8個數2與max=7比較,小於它,又替換掉7,此時data變成[2,1,2,3,5]  pos變成[8,1,15,7,24]....此執行緒遍歷到第20個數結束時,此執行緒將負責的20個數變成了只有k=5個。。。共10個執行緒,所以199個數變成了50個數。然後進行第2輪,需要三個執行緒,每個執行緒負責20個數。。。

   按照這個思路,我開始寫cl檔案,寫完了但結果不正確:

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort,
								  __local float *localTempInst199,__local int *localTempPos199,
								  __local float *localTempInst50,__local int *localTempPos50,
   							  __local float *localTempInst15,__local int *localTempPos15,
   							  __local float *data,__local int *pos )	 
{
	///////the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	
	
	///////erase the 0 instance (the instance from the current row to iteself).
	if(get_local_id(0)<height)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else if(get_local_id(0)>currentGroupID)
		{
			localTempInst199[get_local_id(0)+1]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
		else
		{
			;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	////////extract the first k datas!!!
	/*  //solution1:
	//sort()! but it cannot work!!!
	if(get_local_id(0)==0)
	{
		//thrust::sort(localTempInst199,localTempInst199+199);
		//thrust::sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
		sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	if(get_local_id(0)<k)
	{
		instanceMatrix[currentGroupID*k+get_local_id(0)]=localTempInst199[get_local_id(0)];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	
	
	//solution2:use these instead!
	//use the 10 threads of a group ,every thread include 20 datas,k=5
	//circle0 ,handle with 199 datas!
	if(get_local_id(0)<10)
	{
		//copy the forward k data and position
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst199[k0];
			pos[ind]=localTempPos199[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==9 && the1st==199)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst199[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst199[the1st];
				pos[tempMaxPos]=localTempPos199[the1st];
			}
			the1st+=1;
		}
		
		//result of circle0
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	if(get_local_id(0)<3)
	{
		//copy the forward k data and position
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst50[k0];
			pos[ind]=localTempPos50[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==2 && the1st==49)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst50[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst50[the1st];
				pos[tempMaxPos]=localTempPos50[the1st];
			}
			
			the1st+=1;
		}
		
		//result of circle1
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	if(get_local_id(0)<1)
	{
		//copy the forward k data and position
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst15[k0];
			pos[ind]=localTempPos15[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(the1st==15)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst15[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst15[the1st];
				pos[tempMaxPos]=localTempPos15[the1st];
			}
			
			the1st+=1;
		}
		//result of circle2
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
			positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
		}
		
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
}

main.cpp檔案:

/*
 * main.cpp
 *breaf:knn algorithm with OpenCL accelaration
 *  Created on: 2017年6月13日
 *      Author: root
 */
//#define __CL_ENABLE_EXCEPTIONS
#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <thrust/sort.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int k=5;

void readTxt(const char *txtname,float *matrix)
{
		//int width = 931, height = 200;
		//float *matrix = new float[width*height];
		//memset(matrix, 0,width*height * sizeof(float));

		//ifstream infile("fea_sparse.txt");
		ifstream infile(txtname);
		vector<string> vec_str;
		string line;
		size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos;//lineLength;
		//char *end;
		int lineID=0;
		while (getline(infile, line))
		{
			if (line.size() == 0)
			{
				continue;
			}
			for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
			{
				pos_maohao = line.find_first_of(":");
				pos_kongge = line.find_first_of(" ");
				if (pos_maohao == string::npos || pos_kongge == string::npos)
				{
					continue;
				}
				string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
				string tempData = line.substr(pos_maohao+1, 8);
				line = line.substr(pos_maohao + 9);

				//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
				int ind = atoi(tempIndex.c_str());
				float data = atof(tempData.c_str());
				//cout << ind << ":"<< data << " ";
				matrix[lineID*width + ind] = data;
		    }
			lineID += 1;
			//cout << endl;
		}
		infile.close();
		//delete[] matrix;
}

int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);
    // set logfile name and start logs
    shrSetLogFileName ("oclKnn4Robin.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    //Get the NVIDIA platform
    cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {

		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix = new float[width*height];
	 const size_t matrixSize=width*height*sizeof(float);
	 memset(matrix, 0,matrixSize);
	 const char *txtfile="fea_sparse.txt";
	 readTxt(txtfile,matrix);
	 int *resultPosMatrix=new int[height*k];
	 size_t resultPosSize=height*k*sizeof(int);
	 memset(resultPosMatrix,0,resultPosSize);
	 float *resultInstanceMatrix=new float[height*k];
	 size_t resultInstanceSize=height*k*sizeof(float);
	 memset(resultInstanceMatrix,0,resultInstanceSize);

	 //send arguments to kernel file  | CL_MEM_COPY_HOST_PTR
	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
	 //ciErrNum |= clSetKernelArg(ckKernel, 6, k * sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 8, 199*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 9, 199*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 10, 50*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 11, 50*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 12, 15*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 13, 15*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 14, k*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 15, k*sizeof(int), 0);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t localsize[2]={512,1};
	 size_t globalsize[2]={1024,200};
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	   //save as txt to check.
	 ofstream posOutfile("position.txt",ios::out);
	 ofstream instanceOutfile("instance.txt",ios::out);
	 for(size_t r=0;r<height;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 delete [] matrix;
	 delete [] resultPosMatrix;
	 delete [] resultInstanceMatrix;
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 return 0;
}
但結果執行完不是正確的。

因為總共存在5處bug:host上1處,cl上4處(大神幫我看到4處 自己只檢查出1處)

1:host上的bug:

size_t globalsize[2]={1024,200};

我要的是200個,在紙上規劃好了結果這裡心急寫了200,實際是100,100*2=200。

2:cl上4處bug:

a:從200個距離中去掉與自己這一行的距離0時,我以為越過了那個空位,實際沒有,大神幫我點出此處,然後自己修改正確。

      b:本來10個執行緒處理199個數,最後一個執行緒是處理19個數,我就寫的19而不是199!相應的後面我寫的9而不是49.後來才改過來。

      c:第2個for中 break的條件應該是==50就break!

      d:對於每個執行緒data和pos我設定的是__local型,每個執行緒都可以操作這個,導致data race!應該設定成每個執行緒獨享的而不是local。

修改好後全部正確並穩定:

修改後的cl檔案:

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,
								  __local float *localSizeInstance,__local float *localInstanceNoSort,
								  __local float *localTempInst199,__local int *localTempPos199,
								  __local float *localTempInst50,__local int *localTempPos50,
   							  __local float *localTempInst15,__local int *localTempPos15)	 
{
	///////the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	
	
	///////erase the 0 instance (the instance from the current row to iteself).
	if(get_local_id(0)<height-1)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)+1];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	////////extract the first k min datas!!!
	//use the 10 threads of a group ,every thread include 20 datas,k=5
	//circle0 ,handle with 199 datas!
	if(get_local_id(0)<10)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst199[k0];
			pos[ind]=localTempPos199[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==9 && the1st>198)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst199[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst199[the1st];
				pos[tempMaxPos]=localTempPos199[the1st];
			}
			the1st+=1;
		}
		
		//result of circle0
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(get_local_id(0)==0)
	{
		 for(int r=0;r<50;r++)
		 {
				 int instanceData=localTempPos50[r];
				 printf("%d  \n",instanceData);
		 }
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	
	
	//circle1
	if(get_local_id(0)<3)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst50[k0];
			pos[ind]=localTempPos50[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==2 && the1st==50)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst50[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst50[the1st];
				pos[tempMaxPos]=localTempPos50[the1st];
			}
			
			the1st+=1;
		}
		
		//result of circle1
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",(int)currentGroupID,localTempInst15[0],localTempInst15[1],localTempInst15[2],localTempInst15[3],localTempInst15[4],localTempInst15[5],localTempInst15[6],localTempInst15[7],localTempInst15[8],localTempInst15[9],localTempInst15[10],localTempInst15[11],localTempInst15[12],localTempInst15[13],localTempInst15[14]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	
	
	//circle2
	if(get_local_id(0)<1)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst15[k0];
			pos[ind]=localTempPos15[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(the1st>14)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst15[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst15[the1st];
				pos[tempMaxPos]=localTempPos15[the1st];
			}
			
			the1st+=1;
		}
		
		//result of circle2
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
			positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f\n\n",(int)currentGroupID,instanceMatrix[currentGroupID*k+0],instanceMatrix[currentGroupID*k+1],instanceMatrix[currentGroupID*k+2],instanceMatrix[currentGroupID*k+3],instanceMatrix[currentGroupID*k+4]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
}
main.cpp檔案:

const int width=931;
const int height=200;
const int k=5;

void readTxt(const char *txtname,float *matrix)
{
		//int width = 931, height = 200;
		//float *matrix = new float[width*height];
		//memset(matrix, 0,width*height * sizeof(float));

		//ifstream infile("fea_sparse.txt");
		ifstream infile(txtname);
		vector<string> vec_str;
		string line;
		size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos;//lineLength;
		//char *end;
		int lineID=0;
		while (getline(infile, line))
		{
			if (line.size() == 0)
			{
				continue;
			}
			for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
			{
				pos_maohao = line.find_first_of(":");
				pos_kongge = line.find_first_of(" ");
				if (pos_maohao == string::npos || pos_kongge == string::npos)
				{
					continue;
				}
				string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
				string tempData = line.substr(pos_maohao+1, 8);
				line = line.substr(pos_maohao + 9);

				//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
				int ind = atoi(tempIndex.c_str());
				float data = atof(tempData.c_str());
				//cout << ind << ":"<< data << " ";
				matrix[lineID*width + ind] = data;
		    }
			lineID += 1;
			//cout << endl;
		}

		/*
		cout << endl << endl;
		cout << "the matrix is..." << endl;
		for (int i = 0; i < height;i++)
		{
			for (int j = 0; j < width;j++)
			{
				cout << matrix[i*width + j] << " ";
			}
			cout << endl;
		}
		 */
		infile.close();
		//delete[] matrix;
}

int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);
    // set logfile name and start logs
    shrSetLogFileName ("oclKnn4Robin.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    //Get the NVIDIA platform
    cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {

		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix = new float[width*height];
	 const size_t matrixSize=width*height*sizeof(float);
	 memset(matrix, 0,matrixSize);
	 const char *txtfile="fea_sparse.txt";
	 readTxt(txtfile,matrix);
	 int *resultPosMatrix=new int[height*k];
	 size_t resultPosSize=height*k*sizeof(int);
	 memset(resultPosMatrix,0,resultPosSize);
	 float *resultInstanceMatrix=new float[height*k];
	 size_t resultInstanceSize=height*k*sizeof(float);
	 memset(resultInstanceMatrix,0,resultInstanceSize);

	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 8, 199*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 9, 199*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 10, 50*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 11, 50*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 12, 15*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 13, 15*sizeof(int), 0);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t localsize[2]={512,1};
	 size_t globalsize[2]={1024,100};
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);

	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 //save as txt to check.
	 ofstream posOutfile("position.txt",ios::out);
	 ofstream instanceOutfile("instance.txt",ios::out);
	 for(size_t r=0;r<height;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 delete [] matrix;
	 delete [] resultPosMatrix;
	 delete [] resultInstanceMatrix;
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 return 0;
}
(截圖不完全)距離結果和位置結果(即這一行與第幾行的距離):

 
原稀疏矩陣:


另外大神說:


四、第4個KNN版本(仍舊是200X931的小測資料,但是計算距離時採用向下追溯節約時間)

__kernel void knn200ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__global float *localInstanceNoSort0)	 
{
	///////////the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	//__global float localInstanceNoSort0[40000];
	__local float localSizeInstance[512];
	localInstanceNoSort0[currentGroupID*height+currentGroupID]=0;
	for(uint anotherID=currentGroupID+1;anotherID<height;anotherID++)
	{
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
			 barrier(CLK_LOCAL_MEM_FENCE);
			 if (get_local_id(0) < stride) 
			 {
			    localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
			 }
       	}	 
     	if (get_local_id(0) == 0)
       	 { 
     		localInstanceNoSort0[currentGroupID*height+anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	for(uint forward0=get_local_id(0);forward0<currentGroupID;forward0++)
	{
		localInstanceNoSort0[currentGroupID*height+forward0]=localInstanceNoSort0[forward0*height+currentGroupID];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	//////////erase the 0 instance (the instance from the current row to iteself).
	__local float localTempInst199[199];
	__local int localTempPos199[199];
	if(get_local_id(0)<height-1)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort0[currentGroupID*height+get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort0[currentGroupID*height+get_local_id(0)+1];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	//////////extract the first k min datas!!!
	 __local float localTempInst50[50];
	 __local int localTempPos50[50];
	//use the 10 threads of a group ,every thread include 20 datas,k=5
	//circle0 ,handle with 199 datas!
	if(get_local_id(0)<10)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst199[k0];
			pos[ind]=localTempPos199[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==9 && the1st>198)
			{
				break;
			}
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst199[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst199[the1st];
				pos[tempMaxPos]=localTempPos199[the1st];
			}
			the1st+=1;
		}
		//result of circle0
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(get_local_id(0)==0)
	{
		 for(int r=0;r<50;r++)
		 {
				 int instanceData=localTempPos50[r];
				 printf("%d  \n",instanceData);
		 }
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	//circle1
	 __local float localTempInst15[15];
	 __local int localTempPos15[15];
	if(get_local_id(0)<3)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst50[k0];
			pos[ind]=localTempPos50[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==2 && the1st==50)
			{
				break;
			}
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst50[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst50[the1st];
				pos[tempMaxPos]=localTempPos50[the1st];
			}
			the1st+=1;
		}
		//result of circle1
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",(int)currentGroupID,localTempInst15[0],localTempInst15[1],localTempInst15[2],localTempInst15[3],localTempInst15[4],localTempInst15[5],localTempInst15[6],localTempInst15[7],localTempInst15[8],localTempInst15[9],localTempInst15[10],localTempInst15[11],localTempInst15[12],localTempInst15[13],localTempInst15[14]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	//circle2
	if(get_local_id(0)<1)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst15[k0];
			pos[ind]=localTempPos15[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(the1st>14)
			{
				break;
			}
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst15[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst15[the1st];
				pos[tempMaxPos]=localTempPos15[the1st];
			}
			the1st+=1;
		}
		//result of circle2
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
			positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f\n\n",(int)currentGroupID,instanceMatrix[currentGroupID*k+0],instanceMatrix[currentGroupID*k+1],instanceMatrix[currentGroupID*k+2],instanceMatrix[currentGroupID*k+3],instanceMatrix[currentGroupID*k+4]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
}

結果與前面一致,是正確的。時間第3個版本是:13262us,而此版本是:4960us!!!!!!!!整整一倍多的時間啊!!!!其實還可以繼續優化的,不用每次比較k箇中的最大值,上次沒有變的最大值下次就不用比,這樣會更快!

大神說:她說的這個手工緩衝我下次再去想!

在寫這個時,我開始是對localInstanceNoSort0[]這個是local型,OpenCL中只有barrier(CLK_LOCAL_MEM_FENCE)讓同一個group內所有執行緒都到達這個柵欄點,再一起往下走。但opencl沒有CLK_GLOBAL_MEM_FENCE這種說法,所以我後面的group中要拿前面某個group的結果,如果前面那個group還沒有算完怎麼給這個group!而且即使拿到了,第2個group的結果是data[]陣列裡,而第199個group的結果也是data[]。只是這兩個data是並行,裡面的東西不一樣而已。但後面第199個group的data[]的前198個數要替換成第2個group的data[]的前198個數。雖然data是global型,但第199個group的data[0]要替換成第2個group的data[0],這無法做到啊因為分不清誰是誰的!所以我將這個變數改成了global型別,進行了一個變樣,不過實質是達到了這個目的,這樣就行了。也就是說個Work Group,那麼必須通過全域性變數進行通訊。

於是我開始是:__global float localInstanceNoSort0[40000]; 但報錯說OpenCL不支援這樣,然後我又在host端

ciErrNum |= clSetKernelArg(ckKernel, 6, 40000*sizeof(float), 0);

結果報錯說:<kernel>:error:global variables in function scope must have static storage class 所以後來我在host改成 正經的建立一個新陣列,建立此陣列的buffer然後寫進kernel的第7個引數!這樣才行了!所以我想應該是:kernel中要使用global型別的陣列變數,此變數必須是從host傳進來的實際變數,而不是像傳local型一樣給個大小賦0傳進kernel就好了!

五、第5個KNN版本(2002350X931的矩陣,即假設稀疏矩陣還原後的樣子)

__kernel void knn2002350ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localInstanceNoSort)	 
{
	__local float localSizeInstance[931];
	__local float localTempInst50[1024*5];
	__local float localTempPos50[1024*5];
	__local float localTempInst15[32*5];
	__local float localTempPos15[32*5];
	__local float localTempInst5[5*5];
	__local float localTempPos5[5*5];
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
	for (size_t y = currentGroupID; y < height; y += get_local_size(0))
	{
			////////////the first step : calculate the distances!!!!
			__global float *row=matrix+width*y;
			localInstanceNoSort[y*height+y]=0;
			for(uint i=y+1;i<height;i+=1)
			{
				float sum=0;
				const __global float *rowAnother=matrix+width*i;
			   for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
				{
					sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
				}
				barrier(CLK_LOCAL_MEM_FENCE);
				
				localSizeInstance[get_local_id(0)]=sum;
				for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
				{
		         barrier(CLK_LOCAL_MEM_FENCE);
		         if (get_local_id(0) < stride) 
		        	{
		            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
		        	}
		        }
		     	if (get_local_id(0) == 0)
		       	{ 
		     			localInstanceNoSort[y*height+i] = localSizeInstance[0];
		      	 }
		    	 barrier(CLK_LOCAL_MEM_FENCE);
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
			{
				localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			/////////////////erase the 0 instance (the instance from the current row to iteself).
			__local float localTempInst199[2002349];
			__local int localTempPos199[2002349];
			for(uint t=get_local_id(0);t<height-1;t+=get_local_size(0))
			{
				if(t<currentGroupID)
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t];
					localTempPos199[t]=t;
				}
				else
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t+1];
					localTempPos199[t]=t+1;
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			///////////////////extract the first k min datas!!!
			//use the 1024 threads of a group ,every thread include 1956 datas,k=5
			//circle0 ,handle with 2002349 datas!
			//copy the forward k data and position
			float data[5];
			int pos[5];
			size_t ind=0;
			for(size_t k0=get_local_id(0)*1956;k0<get_local_id(0)*1956+k;k0++,ind++)
			{
				data[ind]=localTempInst199[k0];
				pos[ind]=localTempPos199[k0];
			}
			int the1st=get_local_id(0)*1956+k;
			int thelast=get_local_id(0)*1956+1956;
			while(the1st<thelast)
			{
				if(get_local_id(0)==1023 && the1st>2002348)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst199[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst199[the1st];
					pos[tempMaxPos]=localTempPos199[the1st];
				}
				the1st+=1;
			}
			//result of circle0
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			//circle1
			if(get_local_id(0)<32)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32*k;k0<get_local_id(0)*32*k+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32*k+k;
				int thelast=get_local_id(0)*32*k+32*k;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst50[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst50[the1st];
						pos[tempMaxPos]=localTempPos50[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<5)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32;k0<get_local_id(0)*32*+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32+k;
				int thelast=get_local_id(0)*32+32;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<1)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0);k0<get_local_id(0)+k;k0++,ind++)
				{
					data[ind]=localTempInst5[k0];
					pos[ind]=localTempPos5[k0];
				}
				
				int the1st=k;
				int thelast=25;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
					positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
	}
}

但這個kernel不夠好,改成下面這樣執行緒利用率更高:

__kernel void knn2002350ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localInstanceNoSort)	 
{
	__local float localSizeInstance[931];
	__local float localTempInst50[1024*5];
	__local float localTempPos50[1024*5];
	__local float localTempInst15[32*5];
	__local float localTempPos15[32*5];
	__local float localTempInst5[5*5];
	__local float localTempPos5[5*5];
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
	for (size_t y = currentGroupID; y < height; y += get_local_size(0))
	{
			////////////the first step : calculate the distances!!!!
			__global float *row=matrix+width*y;
			localInstanceNoSort[y*height+y]=0;
			for(uint i=get_local_id(0)+y+1;i<height;i+=get_local_size(0))
			{
				float sum=0;
				const __global float *rowAnother=matrix+width*i;
			   for(uint j=0;j<width;j+=1)
				{
					sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
				}
				localInstanceNoSort[y*height+i] = sum;
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
			{
				localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			/////////////////erase the 0 instance (the instance from the current row to iteself).
			__local float localTempInst199[2002349];
			__local int localTempPos199[2002349];
			for(uint t=get_local_id(0);t<height-1;t+=get_local_size(0))
			{
				if(t<currentGroupID)
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t];
					localTempPos199[t]=t;
				}
				else
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t+1];
					localTempPos199[t]=t+1;
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			///////////////////extract the first k min datas!!!
			//use the 1024 threads of a group ,every thread include 1956 datas,k=5
			//circle0 ,handle with 2002349 datas!
			//copy the forward k data and position
			float data[5];
			int pos[5];
			size_t ind=0;
			for(size_t k0=get_local_id(0)*1956;k0<get_local_id(0)*1956+k;k0++,ind++)
			{
				data[ind]=localTempInst199[k0];
				pos[ind]=localTempPos199[k0];
			}
			int the1st=get_local_id(0)*1956+k;
			int thelast=get_local_id(0)*1956+1956;
			while(the1st<thelast)
			{
				if(get_local_id(0)==1023 && the1st>2002348)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst199[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst199[the1st];
					pos[tempMaxPos]=localTempPos199[the1st];
				}
				the1st+=1;
			}
			//result of circle0
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle1
			if(get_local_id(0)<32)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32*k;k0<get_local_id(0)*32*k+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32*k+k;
				int thelast=get_local_id(0)*32*k+32*k;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst50[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst50[the1st];
						pos[tempMaxPos]=localTempPos50[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<5)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32;k0<get_local_id(0)*32*+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32+k;
				int thelast=get_local_id(0)*32+32;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<1)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0);k0<get_local_id(0)+k;k0++,ind++)
				{
					data[ind]=localTempInst5[k0];
					pos[ind]=localTempPos5[k0];
				}
				
				int the1st=k;
				int thelast=25;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
					positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
		
	}
}
main:

#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <time.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int height2=2002350;
const int k=5;

int64_t system_time()
{
	struct timespec t;
	clock_gettime(CLOCK_MONOTONIC,&t);
	return (int64_t)(t.tv_sec)*1e9+t.tv_nsec;
}

int main( int argc, const char** argv)
{
	shrQAStart(argc, (char **)argv);
	// set logfile name and start logs
	shrSetLogFileName ("oclKnn4Robin.txt");
	shrLog("%s Starting...\n\n", argv[0]);
	//Get the NVIDIA platform
	cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn2002350.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {
		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knn2002350ForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix;
	 const int matrixSizeNoFloat=width*height2;
	 const size_t matrixSize=matrixSizeNoFloat*sizeof(float);
	 matrix=(float*)malloc(matrixSize);
	 shrFillArray(matrix,matrixSizeNoFloat);
	 size_t resultPosSize=height2*k*sizeof(int);
	 int *resultPosMatrix=(int*)malloc(resultPosSize);
	 memset(resultPosMatrix,0,resultPosSize);
	 size_t resultInstanceSize=height2*k*sizeof(float);
	 float *resultInstanceMatrix=(float*)malloc(resultInstanceSize);
	 shrFillArray(resultInstanceMatrix,height2*k);

	 size_t localsize[2]={1024,1};
	 size_t globalsize[2]={1024,1024};

	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height2);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);

	 size_t NoSortSize=height2*height2*sizeof(float);
	 float *NoSort=(float*)malloc(NoSortSize);
	 memset(NoSort,0,NoSortSize);
	 cl_mem NoSort_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,NoSortSize,NoSort,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, NoSort_buffer, CL_FALSE, 0, NoSortSize, NoSort, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&NoSort_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 clFinish(commandQueue);
	 int64_t time_start=system_time();
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
	 clFinish(commandQueue);
	 int64_t time_end=system_time();
	 printf("kernel execute time: %f(us)\n",(time_end-time_start)/1e3);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 //save as txt to check.
	 ofstream posOutfile("position2002350.txt",ios::out);
	 ofstream instanceOutfile("distance2002350.txt",ios::out);
	 for(size_t r=0;r<height2;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 free(matrix);
	 free(resultPosMatrix);
	 free(resultInstanceMatrix);
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 
	 return 0;
}
報錯:clBuildProgram():ptxas:error:Entry function use too much shared data!!!

大神說:

__local float localTempInst199[2002349];
			__local int localTempPos199[2002349];
用得太多了!應該將這多行分段完成!(效率不高)


於是拆分成一節一節:即將原矩陣分成4個矩陣傳進來,因為大神說:比如CL_DEVICE_MAX_MEM_ALLOC_SIZE:最大不能超過500M,而原本需要600MB的緩衝區,可以拆分成兩個300MB的緩衝區傳進kernel。

 (1)NV的卡總是一次可以分配最多1/4的視訊記憶體大小,所以你如果使用2GB視訊記憶體的卡,一次能分配在512MB左右。此時可以考慮換一個較大視訊記憶體的卡(例如4GB視訊記憶體的, 可以分配1GB一次)。
(2)CUDA沒有此限制,可以一次性分配將近全部的(如果你是2GB的卡,能分配0MB 到 大約2GB,一次性)。可以考慮轉換到CUDA.
   我也不再一下算某行與另2002350的距離,而是算其與1024個行的距離,利用迴圈遍歷完2002350!

__kernel void knn2002350ForRobin(__global float *matrix,__global float *matrix2,__global float *matrix3,__global float *matrix4,
											int width,int height,int k,__global int *positionMatrix,
											__global float *instanceMatrix,__global float *localInstanceNoSort,
											__local float *distanceK,__local float *positionK)	 
{
	__local float localTempInst199[1024];
	__local int localTempPos199[1024];
	__local float localTempInst50[51*5+4];
	__local float localTempPos50[51*5+4];
	__local float localTempInst15[65];
	__local int localTempPos15[65];
	__local float localTempInst5[20];
	__local int localTempPos5[20];
	
	__local float localTempInst430[429];
	__local int localTempPos430[429];
	__local float localTempInst110[110];
	__local int localTempPos110[110];
	__local float localTempInst30[30];
	__local int localTempPos30[30];
	__local float localTempInst10[10];
	__local int localTempPos10[10];
	__local float localTempInst489[2445];
	__local int localTempPos489[2445];
	__local float localTempInst123[123*5];
	__local int localTempPos123[123*5];
	__local float localTempInst31[31*5];
	__local int localTempPos31[31*5];
	__local float localTempInst40[40];
	__local int localTempPos40[40];

	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
	for (size_t y = currentGroupID; y < height; y += get_local_size(0))
	{
		////////////the first step : calculate the distances!!!!
		__global float *row;
		if(currentGroupID<500000)
		{
			row=matrix+width*y;
		}
		else if(currentGroupID>=500000 && currentGroupID<1000000)
		{
			row=matrix2+width*(y-500000);
		}
		else if(currentGroupID>=1000000 && currentGroupID<1500000)
		{
			row=matrix3+width*(y-1000000);
		}
		else
		{
			row=matrix4+width*(y-1500000);
		}
		localInstanceNoSort[y*height+y]=0;
		const __global float *rowAnother;
		for(uint i=get_local_id(0)+y+1;i<height;i+=get_local_size(0))
		{
			float sum=0;
			if(i<500000)
			{
				rowAnother=matrix+width*i;
			}
			else if(i>=500000 && i<1000000)
			{
				rowAnother=matrix2+width*(i-500000);
			}
			else if(i>=1000000 && i<1500000)
			{
				rowAnother=matrix3+width*(i-1000000);
			}
			else
			{
				rowAnother=matrix4+width*(i-1500000);
			}
			
		   for(uint j=0;j<width;j+=1)
			{
				sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
			}
			localInstanceNoSort[y*height+i] = sum;
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
		{
			localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		//////////////////////////////////////2002350=1955*1024+430
		for(uint index=0;index<1955;index++)
		{
			//get 1024 datas from 2002350 every time .
			uint t=get_local_id(0);
			if(index*1024+t<currentGroupID)
			{
				localTempInst199[t]=localInstanceNoSort[currentGroupID*height+index*1024+t];
				localTempPos199[t]=index*1024+t;
			}
			else
			{
				localTempInst199[t]=localInstanceNoSort[currentGroupID*height+index*1024+t+1];
				localTempPos199[t]=index*1024+t+1;
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			/////////////////////circle0 ,handle with 1024 datas!
			//use the 52 threads of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			float data[5];
			int pos[5];
			int ind=0;
			if(get_local_id(0)<52)
			{
				for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
				{
					if(k0<1024)
					{
						data[ind]=localTempInst199[k0];
						pos[ind]=localTempPos199[k0];
					}
					else
					{
						break;
					}
				}
			
				int the1st=get_local_id(0)*20+k;
				int thelast=get_local_id(0)*20+20;
				while(the1st<thelast)
				{
					if(get_local_id(0)==52)
					{
						break;
					}
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempInst199[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst199[the1st];
						pos[tempMaxPos]=localTempPos199[the1st];
					}
					the1st+=1;
				}
				//result of circle0
				if(get_local_id(0)<51)
				{
					for(size_t finalk0=0;finalk0<k;finalk0++)
					{
						localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
						localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
					}
				}
				else
				{
					for(size_t finalk0=0;finalk0<k-1;finalk0++)
					{
						localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
						localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
					}
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			
			/////////////////////circle1 ,handle with 514 datas!
			//use the 13 threads of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			//float data[5];
			//int pos[5];
			ind=0;
			if(get_local_id(0)<13)
			{
				for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
				{
					if(k0<259)
					{
						data[ind]=localTempInst50[k0];
						pos[ind]=localTempPos50[k0];
					}
					else
					{
						break;
					}
				}
			
				int the1st=get_local_id(0)*20+k;
				int thelast=get_local_id(0)*20+20;
				while(the1st<thelast)
				{
					if(get_local_id(0)==12 && the1st>18)
					{
						break;
					}
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempPos50[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst50[the1st];
						pos[tempMaxPos]=localTempPos50[the1st];
					}
					the1st+=1;
				}
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			
			/////////////////////circle2 ,handle with 65 datas!
			//use the 4 threads of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			//float data[5];
			//int pos[5];
			ind=0;
			if(get_local_id(0)<4)
			{
				for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
				{
					if(k0<65)
					{
						data[ind]=localTempInst15[k0];
						pos[ind]=localTempPos15[k0];
					}
					else
					{
						break;
					}
				}
			
				int the1st=get_local_id(0)*20+k;
				int thelast=get_local_id(0)*20+20;
				while(the1st<thelast)
				{
					if(get_local_id(0)==4)
					{
						break;
					}
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempPos15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					the1st+=1;
				}
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			
			/////////////////////circle3 ,handle with 20 datas!
			//use the 1 thread of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			//float data[5];
			//int pos[5];
			ind=0;
			if(get_local_id(0)<1)
			{
				for(size_t k0=0;k0<k;k0++,ind++)
				{
					data[ind]=localTempInst15[k0];
					pos[ind]=localTempPos15[k0];
				}
			
				int the1st=k;
				int thelast=20;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempPos5[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst5[the1st];
						pos[tempMaxPos]=localTempPos5[the1st];
					}
					the1st+=1;
				}
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					distanceK[index*k+finalk0]=data[finalk0];
					positionK[index*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
		}
		
		////////////////////////////get the last 430 or 429 datas from 2002350 .
		uint t=get_local_id(0);
		uint tt=1955*1024+t;
		if(t<429)
		{
			if(currentGroupID<1955*1024)
			{
				
				localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt+1];
				localTempPos430[t]=tt+1;
			}
			else
			{
				if(tt<currentGroupID)
				{
					localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt];
					localTempPos430[t]=tt;
				}
				else
				{
					localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt+1];
					localTempPos430[t]=tt+1;
				}
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle0 ,handle with  430 or 429 datas!
		//use the 22 threads of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		float data[5];
		int pos[5];
		int ind=0;
		if(get_local_id(0)<22)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=localTempInst430[k0];
				pos[ind]=localTempPos430[k0];
			}
		
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==21 && the1st==429)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst430[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst430[the1st];
					pos[tempMaxPos]=localTempPos430[the1st];
				}
				the1st+=1;
			}
			//result circle0
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst110[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos110[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle1 ,handle with  110 datas!
		//use the 5 threads of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<6)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=localTempInst110[k0];
				pos[ind]=localTempPos110[k0];
			}
		
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==5 && the1st==10)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst110[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst110[the1st];
					pos[tempMaxPos]=localTempPos110[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst30[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos30[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle2 ,handle with  30 datas!
		//use the 2 threads of a group ,every thread handles 15 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<2)
		{
			for(size_t k0=get_local_id(0)*15;k0<get_local_id(0)*15+k;k0++,ind++)
			{
				data[ind]=localTempInst110[k0];
				pos[ind]=localTempPos110[k0];
			}
		
			int the1st=get_local_id(0)*15+k;
			int thelast=get_local_id(0)*15+15;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst30[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst30[the1st];
					pos[tempMaxPos]=localTempPos30[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst10[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos10[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle3 ,handle with 20 datas!
		//use the 1 thread of a group ,every thread handles 10 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<1)
		{
			for(size_t k0=0;k0<k;k0++,ind++)
			{
				data[ind]=localTempInst10[k0];
				pos[ind]=localTempPos10[k0];
			}
		
			int the1st=k;
			int thelast=10;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempPos10[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst10[the1st];
					pos[tempMaxPos]=localTempPos10[the1st];
				}
				the1st+=1;
			}
			//result of circle2
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				distanceK[1955*k+finalk0]=data[finalk0];
				positionK[1955*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		////////////result of a group :1956*5 data
		/////////////////////circle0,handle with 1956*5 datas!
		//use the 489 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<489)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
		
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst489[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos489[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		/////////////////////circle1,handle with 489*5 datas!
		//use the 123 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<123)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==122)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst123[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos123[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		/////////////////////circle2,handle with 123*5 datas!
		//use the 31 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<31)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==30 && the1st>614)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst31[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos31[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle3,handle with 31*5 datas!
		//use the 8 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<8)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==7 && the1st>154)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst40[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos40[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		/////////////////////circle4,handle with 40 datas!
		//use the 2 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<2)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst10[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos10[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle5,handle with 10 datas!
		//use the 1 thread of a group ,every thread handles 10 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<1)
		{
			for(size_t k0=0;k0<k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=k;
			int thelast=10;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
				positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
	}
}


#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <time.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int height2=500000;
const int height3=2350;
//const int height2=2002350; 2002350=500000*4+2350
const int k=5;

int64_t system_time()
{
	struct timespec t;
	clock_gettime(CLOCK_MONOTONIC,&t);
	return (int64_t)(t.tv_sec)*1e9+t.tv_nsec;
}

int main( int argc, const char** argv)
{
	
	shrQAStart(argc, (char **)argv);
	// set logfile name and start logs
	shrSetLogFileName ("oclKnn4Robin.txt");
	shrLog("%s Starting...\n\n", argv[0]);
	//Get the NVIDIA platform
	cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn2002350advance.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {
		 printf("cannot build program successfully!\n");
		 size_t logSize;
		 ciErrNum = clGetProgramBuildInfo(cpProgram, device, CL_PROGRAM_BUILD_LOG,0, NULL, &logSize);
		 char *log = (char*)malloc(logSize);
		 ciErrNum = clGetProgramBuildInfo(cpProgram, device, CL_PROGRAM_BUILD_LOG,logSize, log, NULL);
		 printf("%s\n", log);
		 free(log);
		 exit(-1);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knn2002350ForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix,*matrix2,*matrix3,*matrix4;
	 const int matrixSizeNoFloat=width*height2;
	 const size_t matrixSize=matrixSizeNoFloat*sizeof(float);
	 matrix=(float*)malloc(matrixSize);
	 memset(matrix,0,matrixSize);
	 matrix2=(float*)malloc(matrixSize);
	 memset(matrix2,0,matrixSize);
	 matrix3=(float*)malloc(matrixSize);
	 memset(matrix3,0,matrixSize);
	 const size_t matrixSize4=width*height3*sizeof(float);
	 matrix4=(float*)malloc(matrixSize4);
	 memset(matrix4,0,matrixSize4);
	 //shrFillArray(matrix,matrixSizeNoFloat);
	 size_t resultPosSize=height2*k*sizeof(int);
	 int *resultPosMatrix=(int*)malloc(resultPosSize);
	 memset(resultPosMatrix,0,resultPosSize);
	 size_t resultInstanceSize=height2*k*sizeof(float);
	 float *resultInstanceMatrix=(float*)malloc(resultInstanceSize);
	 shrFillArray(resultInstanceMatrix,height2*k);

	 size_t localsize[2]={1024,1};
	 size_t globalsize[2]={1024,1024};

	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 cl_mem matrix2_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix2, &ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix2_buffer, CL_FALSE, 0, matrixSize, matrix2, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_mem matrix3_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix3, &ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix3_buffer, CL_FALSE, 0, matrixSize, matrix3, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_mem matrix4_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize4, matrix4, &ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix4_buffer, CL_FALSE, 0, matrixSize4, matrix4, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum  = clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&matrix2_buffer);
	 ciErrNum  = clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&matrix3_buffer);
	 ciErrNum  = clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&matrix4_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int), &height2);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(cl_mem), (void*)&result_ins_buffer);

	 size_t NoSortSize=height2*height2*sizeof(float);
	 float *NoSort=(float*)malloc(NoSortSize);
	 memset(NoSort,0,NoSortSize);
	 cl_mem NoSort_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,NoSortSize,NoSort,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, NoSort_buffer, CL_FALSE, 0, NoSortSize, NoSort, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 9, sizeof(cl_mem), (void*)&NoSort_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t KDistanceSize=1956*5*sizeof(float);
	 float *KDistance=(float*)malloc(KDistanceSize);
	 memset(KDistance,0,KDistanceSize);
	 cl_mem KDistance_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,KDistanceSize,KDistance,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, KDistance_buffer, CL_FALSE, 0, KDistanceSize, KDistance, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 10, sizeof(cl_mem), (void*)&KDistance_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t KPositinSize=1956*5*sizeof(float);
	 float *KPosition=(float*)malloc(KPositinSize);
	 memset(KPosition,0,KPositinSize);
	 cl_mem KPosition_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,KPositinSize,KPosition,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, KDistance_buffer, CL_FALSE, 0, KPositinSize, KPosition, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 11, sizeof(cl_mem), (void*)&KPosition_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 clFinish(commandQueue);
	 int64_t time_start=system_time();
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
	 clFinish(commandQueue);
	 int64_t time_end=system_time();
	 printf("kernel execute time: %f(us)\n",(time_end-time_start)/1e3);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 //save as txt to check.
	 ofstream posOutfile("position2002350.txt",ios::out);
	 ofstream instanceOutfile("distance2002350.txt",ios::out);
	 for(size_t r=0;r<height2;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 free(matrix);
	 free(resultPosMatrix);
	 free(resultInstanceMatrix);
	 free(NoSort);
	 free(KDistance);
	 free(KPosition);
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);


	 return 0;
}


第一次WriteBuffer通過了,第二次WriteBuffer時返回 -4.超過限制?!





相關文章