讀取大型稀疏矩陣&KNN演算法的OpenCL加速版本

元氣少女緣結神發表於2017-06-14

https://www.eriksmistad.no/getting-started-with-opencl-and-gpu-computing/

一、讀取稀疏矩陣。先以一個簡單的稀疏矩陣為例，讀進這個稀疏矩陣

即這個稀疏矩陣本來其實有4行12列。前面第1列0不用管，是標籤，是不要讀進矩陣的。

#include <stdio.h>
//#include <iostream>
#include <fstream>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>
using namespace std;
using namespace cv;
void main(){
	int width = 12, height = 4;
	float *matrix = new float[width*height];
	memset(matrix, 0,width*height * sizeof(float));
	

	ifstream infile("simple_data.txt");
	vector<string> vec_str;
	string line;
	size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos,lineLength;
	//char *end;
	int lineID=0;
	while (getline(infile, line))
	{
		if (line.size() == 0)
		{
			continue;
		}
		for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
		{
			pos_maohao = line.find_first_of(":");
			pos_kongge = line.find_first_of(" ");
			if (pos_maohao == string::npos || pos_kongge == string::npos)
			{
				continue;
			}
			string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
			string tempData = line.substr(pos_maohao+1, 8);
			line = line.substr(pos_maohao + 9);

			//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
			int ind = atoi(tempIndex.c_str());
			float data = atof(tempData.c_str());
			cout << ind << ":"<< data << " ";
			matrix[lineID*width + ind] = data;
	    }
		lineID += 1;
		cout << endl;
	}
	
	cout << endl << endl;
	cout << "the matrix is..." << endl;
	for (int i = 0; i < height;i++)
	{
		for (int j = 0; j < width;j++)
		{
			cout << matrix[i*width + j] << " ";
		}
		cout << endl;
	}
	
	infile.close();
	delete[] matrix;
}

即可。

一、寫好的第一個KNN版本(先只拿前k個距離而不是前k個最小距離因為只是先測試一下是否寫錯)：

在算一個200(高height)X931(寬width)的矩陣matrix，對於每一行去算它與另199行的距離，(它與自己這一行的距離為0 不用算)，對於每一行可以得到一個200維的向量，取向量的前k個結果作為最終的距離矩陣instanceMatrix的結果(應該是200Xk的矩陣)，這k個結果的位置矩陣positionMatrix(附帶的，其實這個不用算) localsize是[1][512] globalsize是[100][1024]即有200個工作組我是讓每個工作組去算一行與另199行的距離。結果距離矩陣instanceMatrix中只有第1列是正確的另k-1列是無意義的值位置矩陣positionMatrix中k-1列無意義的值。

cl部分：

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort)
{
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	// no sort
	if(get_local_id(0)<k)
		{
			instanceMatrix[currentGroupID*k+get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			positionMatrix[currentGroupID*k+get_local_id(0)]=get_local_id(0);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
	
}

main函式部分：

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int k=5;

void readTxt(const char *txtname,float *matrix)
{
		//int width = 931, height = 200;
		//float *matrix = new float[width*height];
		//memset(matrix, 0,width*height * sizeof(float));

		//ifstream infile("fea_sparse.txt");
		ifstream infile(txtname);
		vector<string> vec_str;
		string line;
		size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos,lineLength;
		//char *end;
		int lineID=0;
		while (getline(infile, line))
		{
			if (line.size() == 0)
			{
				continue;
			}
			for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
			{
				pos_maohao = line.find_first_of(":");
				pos_kongge = line.find_first_of(" ");
				if (pos_maohao == string::npos || pos_kongge == string::npos)
				{
					continue;
				}
				string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
				string tempData = line.substr(pos_maohao+1, 8);
				line = line.substr(pos_maohao + 9);

				//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
				int ind = atoi(tempIndex.c_str());
				float data = atof(tempData.c_str());
				//cout << ind << ":"<< data << " ";
				matrix[lineID*width + ind] = data;
		    }
			lineID += 1;
			//cout << endl;
		}

		infile.close();
		//delete[] matrix;
}

int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);
    // set logfile name and start logs
    shrSetLogFileName ("oclKnn4Robin.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    //Get the NVIDIA platform
    cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {
		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix = new float[width*height];
	 const size_t matrixSize=width*height*sizeof(float);
	 memset(matrix, 0,matrixSize);
	 const char *txtfile="fea_sparse.txt";
	 readTxt(txtfile,matrix);
	 int *resultPosMatrix=new int[height*k];
	 size_t resultPosSize=height*k*sizeof(int);
	 memset(resultPosMatrix,0,resultPosSize);
	 float *resultInstanceMatrix=new float[height*k];
	 size_t resultInstanceSize=height*k*sizeof(float);
	 memset(resultInstanceMatrix,0,resultInstanceSize);

	 //send arguments to kernel file  | CL_MEM_COPY_HOST_PTR
	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
	 //ciErrNum |= clSetKernelArg(ckKernel, 6, k * sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t localsize[2]={1,512};
	 size_t globalsize[2]={100,1024};
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //save as txt to check.
	 ofstream posOutfile("position.txt",ios::out);
	 ofstream instanceOutfile("instance.txt",ios::out);
	 for(size_t r=0;r<height;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 delete [] matrix;
	 delete [] resultPosMatrix;
	 delete [] resultInstanceMatrix;
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 return 0;
}

因為檢查了cl部分自我感覺沒錯，但距離矩陣中只有第1列是正確的，後面4列均是無意義的值，所以請教大神。大神說是main出錯了，kernel的啟動方式那裡size_t localsize[2]={1,512}; 應該倒過來{512，1},當然相應的globalsize也要倒過來，不然每一維度上無法整除。這樣不改動kernel結果就可以正確。

if(get_local_id(0)<k)
		{
			instanceMatrix[currentGroupID*k+get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			positionMatrix[currentGroupID*k+get_local_id(0)]=get_local_id(0);
		}

或者如果不像大神那樣改動的話，就把這裡的所有0換成1 換個方向即可。結果一致。

ps：原來這個localsize[2]={M,N}實際是NXM 記住了。

ps：CL_DEVICE_MAX_WORK_ITEM_SIZES：1024X1024X64是對localsize每個維度的限制，即[0]<1024，[1]<1024，[2]<64， globalsize基本上是無限的，不用擔心。 我之前以為那個就是globalsize的限制。太傻。

ps：CL_DEVICE_MAX_WORK_GROUP_SIZE: 1024 是說每個CU上三個維度總共最多1024個工作組。就是說localsize[0]*[1]*[2]<=1024！！！

另外大神誇了我，這個kernel寫得很清晰，有進步。好開心！！！

這個版本與knn的區別就是取前k個最小的，而我取的直接就是前k個，這個後續再改就好了。

另外大神說，我這個忘記考慮一點了不然可以節省一半的時間，除了我自己想到的“對每個group的511個執行緒拿當前行的地址是重複的511次還有自己組的groupID也重複去拿了還有別的行的地址也是重複拿了”這裡外，還有更重要的一點，我這個版本，第1行去算它與第8行的距離，第8行還會算一次它與第1行的距離，所以重複了一倍的計算！我目前想的改進就是“讓每個group代表行只訪問後續的group代表行即可解決你說的那個重複一半的問題不讓它向前追溯 ”當然這個是後話（等我把取前k個最小寫完編譯執行都正確的情況下再來對這個部分提高效能）。

另外大神還說了一個要注意事項：

即group 0讀取第888行的時候, 和group 777讀取888行的時候,可能前者導致的888行被載入到cache中, 到了777個group需要888行的時候,888行實際上已經不在cache中了.導致需要從視訊記憶體晶片被重複讀取到cache一次(儘量不是這種情況).

二、第二個KNN版本(即仍舊是200X931的稀疏矩陣，取前k個最小距離及其相應位置，先不要求這k個最小距離一定要按從小到大排列)

我本來用的是CUDA裡有的thrust下的sort.h，我想用sort_by_key()這個函式，於是在main.cpp前加入了#include <thrust/sort.h> 然後在cl檔案中直接用sort_by_key()，這樣距離和距離對應的位置可以跟著自動排序，但結果在編譯kernel時報錯：ptxas application ptx input, line16; fatal error :Parsing error near '[]': syntax error

ptxas: fatal error :Ptx assembly aborted due to errors

我檢查了下，並不關第16行或者附近的[]的問題，我想應該是與網上比較多的“ptx, line 150; fatal error :Parsing error near '-': syntax error”這個報錯就是因為路徑中文，與什麼150行的- 根本沒關係。我感覺我這個也與[]沒關係。但具體什麼原因谷歌不到？！！！！！！

這個kernel：

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort,
								  __local float *localTempInst199,__local int *localTempPos199)	 
{
	///////the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	
	
	///////erase the 0 instance (the instance from the current row to iteself).
	if(get_local_id(0)<height)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else if(get_local_id(0)>currentGroupID)
		{
			localTempInst199[get_local_id(0)+1]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
		else
		{
			;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	////////extract the first k datas!!!
	  //solution1:
	//sort()! but it cannot work!!!
	if(get_local_id(0)==0)
	{
		//thrust::sort(localTempInst199,localTempInst199+199);
		//thrust::sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
		sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	if(get_local_id(0)<k)
	{
		instanceMatrix[currentGroupID*k+get_local_id(0)]=localTempInst199[get_local_id(0)];
		positionMatrix[currentGroupID*k+get_local_id(0)]=localTempPos199[get_local_id(0)];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
}

所以我想如果這個函式sort_by_key()不報錯那麼這個版本的knn是可以用的。

三、第三個KNN版本

思路(我們公司的大神提供的思路，比我自己想的另一個好很多)：在想一個從N(非常非常大)個數裡拿出前k個最小值和對應位置(k<<N)。先用小一點的數實驗：N=199，k=5,thread_num=10,每個執行緒負責20個數(thread9其實只負責19個數)。對於某個thread，比如負責的數是:2,1，7，3，9，5，10，2，7，3，6，5.9，9，3.5，7.2，8，8.8，17，1，12。這20個數對應的位置分別是8，1，3，7，6，24，48，15，0，12，41，26，59，33，14，57，89，57，31，55. 先把前k個數放進一個data陣列裡[2，1，7，3，9] 同時將對應位置前k個放進pos陣列[8，1，3，7，6]。先算data的最大值max為9.然後這個執行緒將第6個數(第6個數為5)與max比較，如果小於max，則替換。(pos中對應地方也要替換)。結果data為[2，1，7，3，5],pos為[8，1，3，7，24]然後又計算這個data的最大值max=7，將第7個數(為10)與此max比較，大於max不用替換。繼續將第8個數2與max=7比較，小於它，又替換掉7，此時data變成[2，1，2，3，5] pos變成[8，1，15，7，24]....此執行緒遍歷到第20個數結束時，此執行緒將負責的20個數變成了只有k=5個。。。共10個執行緒，所以199個數變成了50個數。然後進行第2輪，需要三個執行緒，每個執行緒負責20個數。。。

按照這個思路，我開始寫cl檔案，寫完了但結果不正確：

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localSizeInstance,__local float *localInstanceNoSort,
								  __local float *localTempInst199,__local int *localTempPos199,
								  __local float *localTempInst50,__local int *localTempPos50,
   							  __local float *localTempInst15,__local int *localTempPos15,
   							  __local float *data,__local int *pos )	 
{
	///////the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	
	
	///////erase the 0 instance (the instance from the current row to iteself).
	if(get_local_id(0)<height)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else if(get_local_id(0)>currentGroupID)
		{
			localTempInst199[get_local_id(0)+1]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
		else
		{
			;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	////////extract the first k datas!!!
	/*  //solution1:
	//sort()! but it cannot work!!!
	if(get_local_id(0)==0)
	{
		//thrust::sort(localTempInst199,localTempInst199+199);
		//thrust::sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
		sort_by_key(localTempInst199,localTempInst199+199,localTempPos199);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	if(get_local_id(0)<k)
	{
		instanceMatrix[currentGroupID*k+get_local_id(0)]=localTempInst199[get_local_id(0)];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	
	
	//solution2:use these instead!
	//use the 10 threads of a group ,every thread include 20 datas,k=5
	//circle0 ,handle with 199 datas!
	if(get_local_id(0)<10)
	{
		//copy the forward k data and position
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst199[k0];
			pos[ind]=localTempPos199[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==9 && the1st==199)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst199[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst199[the1st];
				pos[tempMaxPos]=localTempPos199[the1st];
			}
			the1st+=1;
		}
		
		//result of circle0
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	if(get_local_id(0)<3)
	{
		//copy the forward k data and position
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst50[k0];
			pos[ind]=localTempPos50[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==2 && the1st==49)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst50[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst50[the1st];
				pos[tempMaxPos]=localTempPos50[the1st];
			}
			
			the1st+=1;
		}
		
		//result of circle1
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	if(get_local_id(0)<1)
	{
		//copy the forward k data and position
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst15[k0];
			pos[ind]=localTempPos15[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(the1st==15)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst15[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst15[the1st];
				pos[tempMaxPos]=localTempPos15[the1st];
			}
			
			the1st+=1;
		}
		//result of circle2
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
			positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
		}
		
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
}

main.cpp檔案：

/*
 * main.cpp
 *breaf:knn algorithm with OpenCL accelaration
 *  Created on: 2017年6月13日
 *      Author: root
 */
//#define __CL_ENABLE_EXCEPTIONS
#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <thrust/sort.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int k=5;

void readTxt(const char *txtname,float *matrix)
{
		//int width = 931, height = 200;
		//float *matrix = new float[width*height];
		//memset(matrix, 0,width*height * sizeof(float));

		//ifstream infile("fea_sparse.txt");
		ifstream infile(txtname);
		vector<string> vec_str;
		string line;
		size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos;//lineLength;
		//char *end;
		int lineID=0;
		while (getline(infile, line))
		{
			if (line.size() == 0)
			{
				continue;
			}
			for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
			{
				pos_maohao = line.find_first_of(":");
				pos_kongge = line.find_first_of(" ");
				if (pos_maohao == string::npos || pos_kongge == string::npos)
				{
					continue;
				}
				string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
				string tempData = line.substr(pos_maohao+1, 8);
				line = line.substr(pos_maohao + 9);

				//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
				int ind = atoi(tempIndex.c_str());
				float data = atof(tempData.c_str());
				//cout << ind << ":"<< data << " ";
				matrix[lineID*width + ind] = data;
		    }
			lineID += 1;
			//cout << endl;
		}
		infile.close();
		//delete[] matrix;
}

int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);
    // set logfile name and start logs
    shrSetLogFileName ("oclKnn4Robin.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    //Get the NVIDIA platform
    cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {

		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix = new float[width*height];
	 const size_t matrixSize=width*height*sizeof(float);
	 memset(matrix, 0,matrixSize);
	 const char *txtfile="fea_sparse.txt";
	 readTxt(txtfile,matrix);
	 int *resultPosMatrix=new int[height*k];
	 size_t resultPosSize=height*k*sizeof(int);
	 memset(resultPosMatrix,0,resultPosSize);
	 float *resultInstanceMatrix=new float[height*k];
	 size_t resultInstanceSize=height*k*sizeof(float);
	 memset(resultInstanceMatrix,0,resultInstanceSize);

	 //send arguments to kernel file  | CL_MEM_COPY_HOST_PTR
	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
	 //ciErrNum |= clSetKernelArg(ckKernel, 6, k * sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 8, 199*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 9, 199*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 10, 50*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 11, 50*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 12, 15*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 13, 15*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 14, k*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 15, k*sizeof(int), 0);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t localsize[2]={512,1};
	 size_t globalsize[2]={1024,200};
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	   //save as txt to check.
	 ofstream posOutfile("position.txt",ios::out);
	 ofstream instanceOutfile("instance.txt",ios::out);
	 for(size_t r=0;r<height;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 delete [] matrix;
	 delete [] resultPosMatrix;
	 delete [] resultInstanceMatrix;
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 return 0;
}

但結果執行完不是正確的。

因為總共存在5處bug：host上1處，cl上4處（大神幫我看到4處自己只檢查出1處）

1：host上的bug：

size_t globalsize[2]={1024,200};

我要的是200個，在紙上規劃好了結果這裡心急寫了200，實際是100,100*2=200。

2：cl上4處bug：

a：從200個距離中去掉與自己這一行的距離0時，我以為越過了那個空位，實際沒有，大神幫我點出此處，然後自己修改正確。

b：本來10個執行緒處理199個數，最後一個執行緒是處理19個數，我就寫的19而不是199！相應的後面我寫的9而不是49.後來才改過來。

c：第2個for中 break的條件應該是==50就break！

d：對於每個執行緒data和pos我設定的是__local型，每個執行緒都可以操作這個，導致data race！應該設定成每個執行緒獨享的而不是local。

修改好後全部正確並穩定：

修改後的cl檔案：

__kernel void knnForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,
								  __local float *localSizeInstance,__local float *localInstanceNoSort,
								  __local float *localTempInst199,__local int *localTempPos199,
								  __local float *localTempInst50,__local int *localTempPos50,
   							  __local float *localTempInst15,__local int *localTempPos15)	 
{
	///////the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	
	for(uint anotherID=0;anotherID<height;anotherID++)
	{
		if(anotherID==currentGroupID)
		{
			localInstanceNoSort[anotherID]=0;
			continue;
		}
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
         barrier(CLK_LOCAL_MEM_FENCE);
         if (get_local_id(0) < stride) 
        	{
            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
        	}
        }
        
     	if (get_local_id(0) == 0)
       	{ 
     			localInstanceNoSort[anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	
	
	///////erase the 0 instance (the instance from the current row to iteself).
	if(get_local_id(0)<height-1)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort[get_local_id(0)+1];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	
	
	////////extract the first k min datas!!!
	//use the 10 threads of a group ,every thread include 20 datas,k=5
	//circle0 ,handle with 199 datas!
	if(get_local_id(0)<10)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst199[k0];
			pos[ind]=localTempPos199[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==9 && the1st>198)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst199[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst199[the1st];
				pos[tempMaxPos]=localTempPos199[the1st];
			}
			the1st+=1;
		}
		
		//result of circle0
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(get_local_id(0)==0)
	{
		 for(int r=0;r<50;r++)
		 {
				 int instanceData=localTempPos50[r];
				 printf("%d  \n",instanceData);
		 }
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	
	
	//circle1
	if(get_local_id(0)<3)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst50[k0];
			pos[ind]=localTempPos50[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==2 && the1st==50)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst50[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst50[the1st];
				pos[tempMaxPos]=localTempPos50[the1st];
			}
			
			the1st+=1;
		}
		
		//result of circle1
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",(int)currentGroupID,localTempInst15[0],localTempInst15[1],localTempInst15[2],localTempInst15[3],localTempInst15[4],localTempInst15[5],localTempInst15[6],localTempInst15[7],localTempInst15[8],localTempInst15[9],localTempInst15[10],localTempInst15[11],localTempInst15[12],localTempInst15[13],localTempInst15[14]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	
	
	//circle2
	if(get_local_id(0)<1)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst15[k0];
			pos[ind]=localTempPos15[k0];
		}
		
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(the1st>14)
			{
				break;
			}
		
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			
			if(localTempInst15[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst15[the1st];
				pos[tempMaxPos]=localTempPos15[the1st];
			}
			
			the1st+=1;
		}
		
		//result of circle2
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
			positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f\n\n",(int)currentGroupID,instanceMatrix[currentGroupID*k+0],instanceMatrix[currentGroupID*k+1],instanceMatrix[currentGroupID*k+2],instanceMatrix[currentGroupID*k+3],instanceMatrix[currentGroupID*k+4]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
}

main.cpp檔案：

const int width=931;
const int height=200;
const int k=5;

void readTxt(const char *txtname,float *matrix)
{
		//int width = 931, height = 200;
		//float *matrix = new float[width*height];
		//memset(matrix, 0,width*height * sizeof(float));

		//ifstream infile("fea_sparse.txt");
		ifstream infile(txtname);
		vector<string> vec_str;
		string line;
		size_t curpos, pos_maohao, pos_kongge, pos_enter, prepos;//lineLength;
		//char *end;
		int lineID=0;
		while (getline(infile, line))
		{
			if (line.size() == 0)
			{
				continue;
			}
			for (size_t maohao1 = 0; maohao1 < line.size(); maohao1 = pos_maohao+7)
			{
				pos_maohao = line.find_first_of(":");
				pos_kongge = line.find_first_of(" ");
				if (pos_maohao == string::npos || pos_kongge == string::npos)
				{
					continue;
				}
				string tempIndex = line.substr(pos_kongge + 1, pos_maohao - pos_kongge-1);
				string tempData = line.substr(pos_maohao+1, 8);
				line = line.substr(pos_maohao + 9);

				//int ind = static_cast<int>(strtol(tempIndex.c_str(), &end, 10));
				int ind = atoi(tempIndex.c_str());
				float data = atof(tempData.c_str());
				//cout << ind << ":"<< data << " ";
				matrix[lineID*width + ind] = data;
		    }
			lineID += 1;
			//cout << endl;
		}

		/*
		cout << endl << endl;
		cout << "the matrix is..." << endl;
		for (int i = 0; i < height;i++)
		{
			for (int j = 0; j < width;j++)
			{
				cout << matrix[i*width + j] << " ";
			}
			cout << endl;
		}
		 */
		infile.close();
		//delete[] matrix;
}

int main( int argc, const char** argv)
{
    shrQAStart(argc, (char **)argv);
    // set logfile name and start logs
    shrSetLogFileName ("oclKnn4Robin.txt");
    shrLog("%s Starting...\n\n", argv[0]);
    //Get the NVIDIA platform
    cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {

		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knnForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix = new float[width*height];
	 const size_t matrixSize=width*height*sizeof(float);
	 memset(matrix, 0,matrixSize);
	 const char *txtfile="fea_sparse.txt";
	 readTxt(txtfile,matrix);
	 int *resultPosMatrix=new int[height*k];
	 size_t resultPosSize=height*k*sizeof(int);
	 memset(resultPosMatrix,0,resultPosSize);
	 float *resultInstanceMatrix=new float[height*k];
	 size_t resultInstanceSize=height*k*sizeof(float);
	 memset(resultInstanceMatrix,0,resultInstanceSize);

	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, 512*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, height*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 8, 199*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 9, 199*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 10, 50*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 11, 50*sizeof(int), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 12, 15*sizeof(float), 0);
	 ciErrNum |= clSetKernelArg(ckKernel, 13, 15*sizeof(int), 0);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t localsize[2]={512,1};
	 size_t globalsize[2]={1024,100};
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);

	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 //save as txt to check.
	 ofstream posOutfile("position.txt",ios::out);
	 ofstream instanceOutfile("instance.txt",ios::out);
	 for(size_t r=0;r<height;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 delete [] matrix;
	 delete [] resultPosMatrix;
	 delete [] resultInstanceMatrix;
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 return 0;
}

(截圖不完全)距離結果和位置結果(即這一行與第幾行的距離)：

原稀疏矩陣：

另外大神說：

四、第4個KNN版本(仍舊是200X931的小測資料，但是計算距離時採用向下追溯節約時間)

__kernel void knn200ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__global float *localInstanceNoSort0)	 
{
	///////////the first step : calculate the distances!!!!
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0); //0--199
	__global float *row=matrix+width*currentGroupID;
	//__global float localInstanceNoSort0[40000];
	__local float localSizeInstance[512];
	localInstanceNoSort0[currentGroupID*height+currentGroupID]=0;
	for(uint anotherID=currentGroupID+1;anotherID<height;anotherID++)
	{
		float sum=0;
		const __global float *rowAnother=matrix+width*anotherID;
		for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
		{
			sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		localSizeInstance[get_local_id(0)]=sum;
		for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
		{
			 barrier(CLK_LOCAL_MEM_FENCE);
			 if (get_local_id(0) < stride) 
			 {
			    localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
			 }
       	}	 
     	if (get_local_id(0) == 0)
       	 { 
     		localInstanceNoSort0[currentGroupID*height+anotherID] = localSizeInstance[0];
      	 }
    	 barrier(CLK_LOCAL_MEM_FENCE);
	}
	for(uint forward0=get_local_id(0);forward0<currentGroupID;forward0++)
	{
		localInstanceNoSort0[currentGroupID*height+forward0]=localInstanceNoSort0[forward0*height+currentGroupID];
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	//////////erase the 0 instance (the instance from the current row to iteself).
	__local float localTempInst199[199];
	__local int localTempPos199[199];
	if(get_local_id(0)<height-1)
	{
		if(get_local_id(0)<currentGroupID)
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort0[currentGroupID*height+get_local_id(0)];
			localTempPos199[get_local_id(0)]=get_local_id(0);
		}
		else
		{
			localTempInst199[get_local_id(0)]=localInstanceNoSort0[currentGroupID*height+get_local_id(0)+1];
			localTempPos199[get_local_id(0)]=get_local_id(0)+1;
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	//////////extract the first k min datas!!!
	 __local float localTempInst50[50];
	 __local int localTempPos50[50];
	//use the 10 threads of a group ,every thread include 20 datas,k=5
	//circle0 ,handle with 199 datas!
	if(get_local_id(0)<10)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst199[k0];
			pos[ind]=localTempPos199[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==9 && the1st>198)
			{
				break;
			}
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst199[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst199[the1st];
				pos[tempMaxPos]=localTempPos199[the1st];
			}
			the1st+=1;
		}
		//result of circle0
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(get_local_id(0)==0)
	{
		 for(int r=0;r<50;r++)
		 {
				 int instanceData=localTempPos50[r];
				 printf("%d  \n",instanceData);
		 }
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	//circle1
	 __local float localTempInst15[15];
	 __local int localTempPos15[15];
	if(get_local_id(0)<3)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst50[k0];
			pos[ind]=localTempPos50[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(get_local_id(0)==2 && the1st==50)
			{
				break;
			}
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst50[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst50[the1st];
				pos[tempMaxPos]=localTempPos50[the1st];
			}
			the1st+=1;
		}
		//result of circle1
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
			localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f %f %f %f %f %f %f %f %f %f %f\n",(int)currentGroupID,localTempInst15[0],localTempInst15[1],localTempInst15[2],localTempInst15[3],localTempInst15[4],localTempInst15[5],localTempInst15[6],localTempInst15[7],localTempInst15[8],localTempInst15[9],localTempInst15[10],localTempInst15[11],localTempInst15[12],localTempInst15[13],localTempInst15[14]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
	//circle2
	if(get_local_id(0)<1)
	{
		//copy the forward k data and position
		float data[5];
		int pos[5];
		size_t ind=0;
		for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
		{
			data[ind]=localTempInst15[k0];
			pos[ind]=localTempPos15[k0];
		}
		int the1st=get_local_id(0)*20+k;
		int thelast=get_local_id(0)*20+20;
		while(the1st<thelast)
		{
			if(the1st>14)
			{
				break;
			}
			//max from k
			float kmaxdata=data[0];
			int kmaxPos=pos[0];
			int tempMaxPos=0;
			for(size_t first=1;first<k;first++)
			{
				if(data[first]>kmaxdata)
				{
					kmaxdata=data[first];
					kmaxPos=pos[first];
					tempMaxPos=first;
				}
			}
			if(localTempInst15[the1st]<kmaxdata)
			{
				data[tempMaxPos]=localTempInst15[the1st];
				pos[tempMaxPos]=localTempPos15[the1st];
			}
			the1st+=1;
		}
		//result of circle2
		for(size_t finalk0=0;finalk0<k;finalk0++)
		{
			instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
			positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
		}
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	/*
	if(currentGroupID<200 && get_local_id(0)==0)
	{
	  printf("group:%d data:%f %f %f %f %f\n\n",(int)currentGroupID,instanceMatrix[currentGroupID*k+0],instanceMatrix[currentGroupID*k+1],instanceMatrix[currentGroupID*k+2],instanceMatrix[currentGroupID*k+3],instanceMatrix[currentGroupID*k+4]);
	}
	barrier(CLK_LOCAL_MEM_FENCE);
	*/
}

結果與前面一致，是正確的。時間第3個版本是:13262us，而此版本是:4960us！！！！！！！！整整一倍多的時間啊！！！！其實還可以繼續優化的，不用每次比較k箇中的最大值，上次沒有變的最大值下次就不用比，這樣會更快！

大神說：她說的這個手工緩衝我下次再去想！

在寫這個時，我開始是對localInstanceNoSort0[]這個是local型，OpenCL中只有barrier(CLK_LOCAL_MEM_FENCE)讓同一個group內所有執行緒都到達這個柵欄點，再一起往下走。但opencl沒有CLK_GLOBAL_MEM_FENCE這種說法，所以我後面的group中要拿前面某個group的結果，如果前面那個group還沒有算完怎麼給這個group！而且即使拿到了，第2個group的結果是data[]陣列裡，而第199個group的結果也是data[]。只是這兩個data是並行，裡面的東西不一樣而已。但後面第199個group的data[]的前198個數要替換成第2個group的data[]的前198個數。雖然data是global型，但第199個group的data[0]要替換成第2個group的data[0]，這無法做到啊因為分不清誰是誰的！所以我將這個變數改成了global型別，進行了一個變樣，不過實質是達到了這個目的，這樣就行了。也就是說幾個Work Group，那麼必須通過全域性變數進行通訊。

於是我開始是：__global float localInstanceNoSort0[40000]; 但報錯說OpenCL不支援這樣，然後我又在host端

ciErrNum |= clSetKernelArg(ckKernel, 6, 40000*sizeof(float), 0);

結果報錯說：<kernel>:error:global variables in function scope must have static storage class 所以後來我在host改成正經的建立一個新陣列，建立此陣列的buffer然後寫進kernel的第7個引數！這樣才行了！所以我想應該是：kernel中要使用global型別的陣列變數，此變數必須是從host傳進來的實際變數，而不是像傳local型一樣給個大小賦0傳進kernel就好了！

五、第5個KNN版本(2002350X931的矩陣，即假設稀疏矩陣還原後的樣子)

__kernel void knn2002350ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localInstanceNoSort)	 
{
	__local float localSizeInstance[931];
	__local float localTempInst50[1024*5];
	__local float localTempPos50[1024*5];
	__local float localTempInst15[32*5];
	__local float localTempPos15[32*5];
	__local float localTempInst5[5*5];
	__local float localTempPos5[5*5];
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
	for (size_t y = currentGroupID; y < height; y += get_local_size(0))
	{
			////////////the first step : calculate the distances!!!!
			__global float *row=matrix+width*y;
			localInstanceNoSort[y*height+y]=0;
			for(uint i=y+1;i<height;i+=1)
			{
				float sum=0;
				const __global float *rowAnother=matrix+width*i;
			   for(uint j=get_local_id(0);j<width;j+=get_local_size(0))
				{
					sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
				}
				barrier(CLK_LOCAL_MEM_FENCE);
				
				localSizeInstance[get_local_id(0)]=sum;
				for (uint stride = get_local_size(0) / 2; stride > 0; stride /= 2)
				{
		         barrier(CLK_LOCAL_MEM_FENCE);
		         if (get_local_id(0) < stride) 
		        	{
		            localSizeInstance[get_local_id(0)] += localSizeInstance[get_local_id(0) + stride];
		        	}
		        }
		     	if (get_local_id(0) == 0)
		       	{ 
		     			localInstanceNoSort[y*height+i] = localSizeInstance[0];
		      	 }
		    	 barrier(CLK_LOCAL_MEM_FENCE);
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
			{
				localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			/////////////////erase the 0 instance (the instance from the current row to iteself).
			__local float localTempInst199[2002349];
			__local int localTempPos199[2002349];
			for(uint t=get_local_id(0);t<height-1;t+=get_local_size(0))
			{
				if(t<currentGroupID)
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t];
					localTempPos199[t]=t;
				}
				else
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t+1];
					localTempPos199[t]=t+1;
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			///////////////////extract the first k min datas!!!
			//use the 1024 threads of a group ,every thread include 1956 datas,k=5
			//circle0 ,handle with 2002349 datas!
			//copy the forward k data and position
			float data[5];
			int pos[5];
			size_t ind=0;
			for(size_t k0=get_local_id(0)*1956;k0<get_local_id(0)*1956+k;k0++,ind++)
			{
				data[ind]=localTempInst199[k0];
				pos[ind]=localTempPos199[k0];
			}
			int the1st=get_local_id(0)*1956+k;
			int thelast=get_local_id(0)*1956+1956;
			while(the1st<thelast)
			{
				if(get_local_id(0)==1023 && the1st>2002348)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst199[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst199[the1st];
					pos[tempMaxPos]=localTempPos199[the1st];
				}
				the1st+=1;
			}
			//result of circle0
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			//circle1
			if(get_local_id(0)<32)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32*k;k0<get_local_id(0)*32*k+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32*k+k;
				int thelast=get_local_id(0)*32*k+32*k;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst50[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst50[the1st];
						pos[tempMaxPos]=localTempPos50[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<5)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32;k0<get_local_id(0)*32*+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32+k;
				int thelast=get_local_id(0)*32+32;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<1)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0);k0<get_local_id(0)+k;k0++,ind++)
				{
					data[ind]=localTempInst5[k0];
					pos[ind]=localTempPos5[k0];
				}
				
				int the1st=k;
				int thelast=25;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
					positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
	}
}

但這個kernel不夠好，改成下面這樣執行緒利用率更高：

__kernel void knn2002350ForRobin(__global float *matrix,int width,int height,int k,__global int *positionMatrix,__global float *instanceMatrix,__local float *localInstanceNoSort)	 
{
	__local float localSizeInstance[931];
	__local float localTempInst50[1024*5];
	__local float localTempPos50[1024*5];
	__local float localTempInst15[32*5];
	__local float localTempPos15[32*5];
	__local float localTempInst5[5*5];
	__local float localTempPos5[5*5];
	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
	for (size_t y = currentGroupID; y < height; y += get_local_size(0))
	{
			////////////the first step : calculate the distances!!!!
			__global float *row=matrix+width*y;
			localInstanceNoSort[y*height+y]=0;
			for(uint i=get_local_id(0)+y+1;i<height;i+=get_local_size(0))
			{
				float sum=0;
				const __global float *rowAnother=matrix+width*i;
			   for(uint j=0;j<width;j+=1)
				{
					sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
				}
				localInstanceNoSort[y*height+i] = sum;
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
			{
				localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			/////////////////erase the 0 instance (the instance from the current row to iteself).
			__local float localTempInst199[2002349];
			__local int localTempPos199[2002349];
			for(uint t=get_local_id(0);t<height-1;t+=get_local_size(0))
			{
				if(t<currentGroupID)
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t];
					localTempPos199[t]=t;
				}
				else
				{
					localTempInst199[t]=localInstanceNoSort[currentGroupID*height+t+1];
					localTempPos199[t]=t+1;
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			///////////////////extract the first k min datas!!!
			//use the 1024 threads of a group ,every thread include 1956 datas,k=5
			//circle0 ,handle with 2002349 datas!
			//copy the forward k data and position
			float data[5];
			int pos[5];
			size_t ind=0;
			for(size_t k0=get_local_id(0)*1956;k0<get_local_id(0)*1956+k;k0++,ind++)
			{
				data[ind]=localTempInst199[k0];
				pos[ind]=localTempPos199[k0];
			}
			int the1st=get_local_id(0)*1956+k;
			int thelast=get_local_id(0)*1956+1956;
			while(the1st<thelast)
			{
				if(get_local_id(0)==1023 && the1st>2002348)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst199[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst199[the1st];
					pos[tempMaxPos]=localTempPos199[the1st];
				}
				the1st+=1;
			}
			//result of circle0
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle1
			if(get_local_id(0)<32)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32*k;k0<get_local_id(0)*32*k+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32*k+k;
				int thelast=get_local_id(0)*32*k+32*k;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst50[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst50[the1st];
						pos[tempMaxPos]=localTempPos50[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<5)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0)*32;k0<get_local_id(0)*32*+k;k0++,ind++)
				{
					data[ind]=localTempInst50[k0];
					pos[ind]=localTempPos50[k0];
				}
				
				int the1st=get_local_id(0)*32+k;
				int thelast=get_local_id(0)*32+32;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			//circle2
			if(get_local_id(0)<1)
			{
				//copy the forward k data and position
				float data[5];
				int pos[5];
				size_t ind=0;
				for(size_t k0=get_local_id(0);k0<get_local_id(0)+k;k0++,ind++)
				{
					data[ind]=localTempInst5[k0];
					pos[ind]=localTempPos5[k0];
				}
				
				int the1st=k;
				int thelast=25;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					
					if(localTempInst15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					
					the1st+=1;
				}
				
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
					positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
		
	}
}

main：

#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <time.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int height2=2002350;
const int k=5;

int64_t system_time()
{
	struct timespec t;
	clock_gettime(CLOCK_MONOTONIC,&t);
	return (int64_t)(t.tv_sec)*1e9+t.tv_nsec;
}

int main( int argc, const char** argv)
{
	shrQAStart(argc, (char **)argv);
	// set logfile name and start logs
	shrSetLogFileName ("oclKnn4Robin.txt");
	shrLog("%s Starting...\n\n", argv[0]);
	//Get the NVIDIA platform
	cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn2002350.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {
		 // write out standard error, Build Log and PTX, then return error
		 shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
		 oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
		 oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclknn.ptx");
		 return(EXIT_FAILURE);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knn2002350ForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix;
	 const int matrixSizeNoFloat=width*height2;
	 const size_t matrixSize=matrixSizeNoFloat*sizeof(float);
	 matrix=(float*)malloc(matrixSize);
	 shrFillArray(matrix,matrixSizeNoFloat);
	 size_t resultPosSize=height2*k*sizeof(int);
	 int *resultPosMatrix=(int*)malloc(resultPosSize);
	 memset(resultPosMatrix,0,resultPosSize);
	 size_t resultInstanceSize=height2*k*sizeof(float);
	 float *resultInstanceMatrix=(float*)malloc(resultInstanceSize);
	 shrFillArray(resultInstanceMatrix,height2*k);

	 size_t localsize[2]={1024,1};
	 size_t globalsize[2]={1024,1024};

	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(int), &height2);
	 ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(cl_mem), (void*)&result_ins_buffer);

	 size_t NoSortSize=height2*height2*sizeof(float);
	 float *NoSort=(float*)malloc(NoSortSize);
	 memset(NoSort,0,NoSortSize);
	 cl_mem NoSort_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,NoSortSize,NoSort,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, NoSort_buffer, CL_FALSE, 0, NoSortSize, NoSort, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(cl_mem), (void*)&NoSort_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 clFinish(commandQueue);
	 int64_t time_start=system_time();
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
	 clFinish(commandQueue);
	 int64_t time_end=system_time();
	 printf("kernel execute time: %f(us)\n",(time_end-time_start)/1e3);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 //save as txt to check.
	 ofstream posOutfile("position2002350.txt",ios::out);
	 ofstream instanceOutfile("distance2002350.txt",ios::out);
	 for(size_t r=0;r<height2;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 free(matrix);
	 free(resultPosMatrix);
	 free(resultInstanceMatrix);
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);
	 
	 return 0;
}

報錯：clBuildProgram():ptxas:error:Entry function use too much shared data!!!

大神說：

__local float localTempInst199[2002349];
			__local int localTempPos199[2002349];

用得太多了！應該將這多行分段完成！(效率不高)

於是拆分成一節一節：即將原矩陣分成4個矩陣傳進來，因為大神說：比如CL_DEVICE_MAX_MEM_ALLOC_SIZE:最大不能超過500M,而原本需要600MB的緩衝區，可以拆分成兩個300MB的緩衝區傳進kernel。

（1）NV的卡總是一次可以分配最多1/4的視訊記憶體大小，所以你如果使用2GB視訊記憶體的卡，一次能分配在512MB左右。此時可以考慮換一個較大視訊記憶體的卡（例如4GB視訊記憶體的，可以分配1GB一次）。

（2）CUDA沒有此限制，可以一次性分配將近全部的（如果你是2GB的卡，能分配0MB 到大約2GB，一次性）。可以考慮轉換到CUDA.

我也不再一下算某行與另2002350的距離，而是算其與1024個行的距離，利用迴圈遍歷完2002350!

__kernel void knn2002350ForRobin(__global float *matrix,__global float *matrix2,__global float *matrix3,__global float *matrix4,
											int width,int height,int k,__global int *positionMatrix,
											__global float *instanceMatrix,__global float *localInstanceNoSort,
											__local float *distanceK,__local float *positionK)	 
{
	__local float localTempInst199[1024];
	__local int localTempPos199[1024];
	__local float localTempInst50[51*5+4];
	__local float localTempPos50[51*5+4];
	__local float localTempInst15[65];
	__local int localTempPos15[65];
	__local float localTempInst5[20];
	__local int localTempPos5[20];
	
	__local float localTempInst430[429];
	__local int localTempPos430[429];
	__local float localTempInst110[110];
	__local int localTempPos110[110];
	__local float localTempInst30[30];
	__local int localTempPos30[30];
	__local float localTempInst10[10];
	__local int localTempPos10[10];
	__local float localTempInst489[2445];
	__local int localTempPos489[2445];
	__local float localTempInst123[123*5];
	__local int localTempPos123[123*5];
	__local float localTempInst31[31*5];
	__local int localTempPos31[31*5];
	__local float localTempInst40[40];
	__local int localTempPos40[40];

	uint currentGroupID=get_group_id(1)*get_num_groups(0)+get_group_id(0);
	for (size_t y = currentGroupID; y < height; y += get_local_size(0))
	{
		////////////the first step : calculate the distances!!!!
		__global float *row;
		if(currentGroupID<500000)
		{
			row=matrix+width*y;
		}
		else if(currentGroupID>=500000 && currentGroupID<1000000)
		{
			row=matrix2+width*(y-500000);
		}
		else if(currentGroupID>=1000000 && currentGroupID<1500000)
		{
			row=matrix3+width*(y-1000000);
		}
		else
		{
			row=matrix4+width*(y-1500000);
		}
		localInstanceNoSort[y*height+y]=0;
		const __global float *rowAnother;
		for(uint i=get_local_id(0)+y+1;i<height;i+=get_local_size(0))
		{
			float sum=0;
			if(i<500000)
			{
				rowAnother=matrix+width*i;
			}
			else if(i>=500000 && i<1000000)
			{
				rowAnother=matrix2+width*(i-500000);
			}
			else if(i>=1000000 && i<1500000)
			{
				rowAnother=matrix3+width*(i-1000000);
			}
			else
			{
				rowAnother=matrix4+width*(i-1500000);
			}
			
		   for(uint j=0;j<width;j+=1)
			{
				sum+=(row[j]-rowAnother[j])*(row[j]-rowAnother[j]);
			}
			localInstanceNoSort[y*height+i] = sum;
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		for(uint forward0=get_local_id(0);forward0<y;forward0+=get_local_size(0))
		{
			localInstanceNoSort[y*height+forward0]=localInstanceNoSort[forward0*height+y];
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		//////////////////////////////////////2002350=1955*1024+430
		for(uint index=0;index<1955;index++)
		{
			//get 1024 datas from 2002350 every time .
			uint t=get_local_id(0);
			if(index*1024+t<currentGroupID)
			{
				localTempInst199[t]=localInstanceNoSort[currentGroupID*height+index*1024+t];
				localTempPos199[t]=index*1024+t;
			}
			else
			{
				localTempInst199[t]=localInstanceNoSort[currentGroupID*height+index*1024+t+1];
				localTempPos199[t]=index*1024+t+1;
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			/////////////////////circle0 ,handle with 1024 datas!
			//use the 52 threads of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			float data[5];
			int pos[5];
			int ind=0;
			if(get_local_id(0)<52)
			{
				for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
				{
					if(k0<1024)
					{
						data[ind]=localTempInst199[k0];
						pos[ind]=localTempPos199[k0];
					}
					else
					{
						break;
					}
				}
			
				int the1st=get_local_id(0)*20+k;
				int thelast=get_local_id(0)*20+20;
				while(the1st<thelast)
				{
					if(get_local_id(0)==52)
					{
						break;
					}
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempInst199[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst199[the1st];
						pos[tempMaxPos]=localTempPos199[the1st];
					}
					the1st+=1;
				}
				//result of circle0
				if(get_local_id(0)<51)
				{
					for(size_t finalk0=0;finalk0<k;finalk0++)
					{
						localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
						localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
					}
				}
				else
				{
					for(size_t finalk0=0;finalk0<k-1;finalk0++)
					{
						localTempInst50[get_local_id(0)*k+finalk0]=data[finalk0];
						localTempPos50[get_local_id(0)*k+finalk0]=pos[finalk0];
					}
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			
			/////////////////////circle1 ,handle with 514 datas!
			//use the 13 threads of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			//float data[5];
			//int pos[5];
			ind=0;
			if(get_local_id(0)<13)
			{
				for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
				{
					if(k0<259)
					{
						data[ind]=localTempInst50[k0];
						pos[ind]=localTempPos50[k0];
					}
					else
					{
						break;
					}
				}
			
				int the1st=get_local_id(0)*20+k;
				int thelast=get_local_id(0)*20+20;
				while(the1st<thelast)
				{
					if(get_local_id(0)==12 && the1st>18)
					{
						break;
					}
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempPos50[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst50[the1st];
						pos[tempMaxPos]=localTempPos50[the1st];
					}
					the1st+=1;
				}
				//result of circle1
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst15[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos15[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			
			/////////////////////circle2 ,handle with 65 datas!
			//use the 4 threads of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			//float data[5];
			//int pos[5];
			ind=0;
			if(get_local_id(0)<4)
			{
				for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
				{
					if(k0<65)
					{
						data[ind]=localTempInst15[k0];
						pos[ind]=localTempPos15[k0];
					}
					else
					{
						break;
					}
				}
			
				int the1st=get_local_id(0)*20+k;
				int thelast=get_local_id(0)*20+20;
				while(the1st<thelast)
				{
					if(get_local_id(0)==4)
					{
						break;
					}
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempPos15[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst15[the1st];
						pos[tempMaxPos]=localTempPos15[the1st];
					}
					the1st+=1;
				}
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					localTempInst5[get_local_id(0)*k+finalk0]=data[finalk0];
					localTempPos5[get_local_id(0)*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
			
			
			
			/////////////////////circle3 ,handle with 20 datas!
			//use the 1 thread of a group ,every thread handles 20 datas,k=5
			//copy the forward k data and position
			//float data[5];
			//int pos[5];
			ind=0;
			if(get_local_id(0)<1)
			{
				for(size_t k0=0;k0<k;k0++,ind++)
				{
					data[ind]=localTempInst15[k0];
					pos[ind]=localTempPos15[k0];
				}
			
				int the1st=k;
				int thelast=20;
				while(the1st<thelast)
				{
					//max from k
					float kmaxdata=data[0];
					int kmaxPos=pos[0];
					int tempMaxPos=0;
					for(size_t first=1;first<k;first++)
					{
						if(data[first]>kmaxdata)
						{
							kmaxdata=data[first];
							kmaxPos=pos[first];
							tempMaxPos=first;
						}
					}
					if(localTempPos5[the1st]<kmaxdata)
					{
						data[tempMaxPos]=localTempInst5[the1st];
						pos[tempMaxPos]=localTempPos5[the1st];
					}
					the1st+=1;
				}
				//result of circle2
				for(size_t finalk0=0;finalk0<k;finalk0++)
				{
					distanceK[index*k+finalk0]=data[finalk0];
					positionK[index*k+finalk0]=pos[finalk0];
				}
			}
			barrier(CLK_LOCAL_MEM_FENCE);
		}
		
		////////////////////////////get the last 430 or 429 datas from 2002350 .
		uint t=get_local_id(0);
		uint tt=1955*1024+t;
		if(t<429)
		{
			if(currentGroupID<1955*1024)
			{
				
				localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt+1];
				localTempPos430[t]=tt+1;
			}
			else
			{
				if(tt<currentGroupID)
				{
					localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt];
					localTempPos430[t]=tt;
				}
				else
				{
					localTempInst430[t]=localInstanceNoSort[currentGroupID*height+tt+1];
					localTempPos430[t]=tt+1;
				}
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle0 ,handle with  430 or 429 datas!
		//use the 22 threads of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		float data[5];
		int pos[5];
		int ind=0;
		if(get_local_id(0)<22)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=localTempInst430[k0];
				pos[ind]=localTempPos430[k0];
			}
		
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==21 && the1st==429)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst430[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst430[the1st];
					pos[tempMaxPos]=localTempPos430[the1st];
				}
				the1st+=1;
			}
			//result circle0
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst110[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos110[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle1 ,handle with  110 datas!
		//use the 5 threads of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<6)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=localTempInst110[k0];
				pos[ind]=localTempPos110[k0];
			}
		
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==5 && the1st==10)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst110[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst110[the1st];
					pos[tempMaxPos]=localTempPos110[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst30[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos30[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle2 ,handle with  30 datas!
		//use the 2 threads of a group ,every thread handles 15 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<2)
		{
			for(size_t k0=get_local_id(0)*15;k0<get_local_id(0)*15+k;k0++,ind++)
			{
				data[ind]=localTempInst110[k0];
				pos[ind]=localTempPos110[k0];
			}
		
			int the1st=get_local_id(0)*15+k;
			int thelast=get_local_id(0)*15+15;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempInst30[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst30[the1st];
					pos[tempMaxPos]=localTempPos30[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst10[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos10[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle3 ,handle with 20 datas!
		//use the 1 thread of a group ,every thread handles 10 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<1)
		{
			for(size_t k0=0;k0<k;k0++,ind++)
			{
				data[ind]=localTempInst10[k0];
				pos[ind]=localTempPos10[k0];
			}
		
			int the1st=k;
			int thelast=10;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(localTempPos10[the1st]<kmaxdata)
				{
					data[tempMaxPos]=localTempInst10[the1st];
					pos[tempMaxPos]=localTempPos10[the1st];
				}
				the1st+=1;
			}
			//result of circle2
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				distanceK[1955*k+finalk0]=data[finalk0];
				positionK[1955*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		////////////result of a group :1956*5 data
		/////////////////////circle0,handle with 1956*5 datas!
		//use the 489 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<489)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
		
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst489[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos489[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		/////////////////////circle1,handle with 489*5 datas!
		//use the 123 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<123)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==122)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst123[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos123[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		/////////////////////circle2,handle with 123*5 datas!
		//use the 31 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<31)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==30 && the1st>614)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst31[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos31[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle3,handle with 31*5 datas!
		//use the 8 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<8)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				if(get_local_id(0)==7 && the1st>154)
				{
					break;
				}
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst40[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos40[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		/////////////////////circle4,handle with 40 datas!
		//use the 2 thread of a group ,every thread handles 20 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<2)
		{
			for(size_t k0=get_local_id(0)*20;k0<get_local_id(0)*20+k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=get_local_id(0)*20+k;
			int thelast=get_local_id(0)*20+20;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				localTempInst10[get_local_id(0)*k+finalk0]=data[finalk0];
				localTempPos10[get_local_id(0)*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
		
		/////////////////////circle5,handle with 10 datas!
		//use the 1 thread of a group ,every thread handles 10 datas,k=5
		//copy the forward k data and position
		//float data[5];
		//int pos[5];
		ind=0;
		if(get_local_id(0)<1)
		{
			for(size_t k0=0;k0<k;k0++,ind++)
			{
				data[ind]=distanceK[k0];
				pos[ind]=positionK[k0];
			}
			int the1st=k;
			int thelast=10;
			while(the1st<thelast)
			{
				//max from k
				float kmaxdata=data[0];
				int kmaxPos=pos[0];
				int tempMaxPos=0;
				for(size_t first=1;first<k;first++)
				{
					if(data[first]>kmaxdata)
					{
						kmaxdata=data[first];
						kmaxPos=pos[first];
						tempMaxPos=first;
					}
				}
				if(distanceK[the1st]<kmaxdata)
				{
					data[tempMaxPos]=distanceK[the1st];
					pos[tempMaxPos]=distanceK[the1st];
				}
				the1st+=1;
			}
			//result circle1
			for(size_t finalk0=0;finalk0<k;finalk0++)
			{
				instanceMatrix[currentGroupID*k+finalk0]=data[finalk0];
				positionMatrix[currentGroupID*k+finalk0]=pos[finalk0];
			}
		}
		barrier(CLK_LOCAL_MEM_FENCE);
		
	}
}

#include <stdio.h>
#include <stdlib.h>
//#include <CL/cl.hpp>
#include <CL/cl.h>
#include <time.h>
#include <fstream>
#include <vector>
#include "oclUtils.h"
#include "shrQATest.h"
using namespace std;

const int width=931;
const int height=200;
const int height2=500000;
const int height3=2350;
//const int height2=2002350; 2002350=500000*4+2350
const int k=5;

int64_t system_time()
{
	struct timespec t;
	clock_gettime(CLOCK_MONOTONIC,&t);
	return (int64_t)(t.tv_sec)*1e9+t.tv_nsec;
}

int main( int argc, const char** argv)
{
	
	shrQAStart(argc, (char **)argv);
	// set logfile name and start logs
	shrSetLogFileName ("oclKnn4Robin.txt");
	shrLog("%s Starting...\n\n", argv[0]);
	//Get the NVIDIA platform
	cl_platform_id cpPlatform;
	 cl_int ciErrNum = oclGetPlatformID(&cpPlatform);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Get the devices
	 cl_uint uiNumDevices;
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
	 ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	//Create the context
	 cl_context cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_device_id device ;
	 clGetDeviceIDs(cpPlatform,CL_DEVICE_TYPE_GPU,1,&device,NULL);
	 cl_command_queue commandQueue = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
	 if (ciErrNum != CL_SUCCESS)
	{
			shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
			return ciErrNum;
	}

	 //read kernel file and build it
	 size_t program_length;
	 char* source_path = shrFindFilePath("knn2002350advance.cl", argv[0]);
	 oclCheckError(source_path != NULL, shrTRUE);
	 char *source = oclLoadProgSource(source_path, "", &program_length);
	 oclCheckError(source != NULL, shrTRUE);
	 // create the program
	 cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **)&source, &program_length, &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 // build the program
	 ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
	 if (ciErrNum != CL_SUCCESS)
	 {
		 printf("cannot build program successfully!\n");
		 size_t logSize;
		 ciErrNum = clGetProgramBuildInfo(cpProgram, device, CL_PROGRAM_BUILD_LOG,0, NULL, &logSize);
		 char *log = (char*)malloc(logSize);
		 ciErrNum = clGetProgramBuildInfo(cpProgram, device, CL_PROGRAM_BUILD_LOG,logSize, log, NULL);
		 printf("%s\n", log);
		 free(log);
		 exit(-1);
	 }
	 cl_kernel ckKernel = clCreateKernel(cpProgram, "knn2002350ForRobin", &ciErrNum);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 //host data
	 float *matrix,*matrix2,*matrix3,*matrix4;
	 const int matrixSizeNoFloat=width*height2;
	 const size_t matrixSize=matrixSizeNoFloat*sizeof(float);
	 matrix=(float*)malloc(matrixSize);
	 memset(matrix,0,matrixSize);
	 matrix2=(float*)malloc(matrixSize);
	 memset(matrix2,0,matrixSize);
	 matrix3=(float*)malloc(matrixSize);
	 memset(matrix3,0,matrixSize);
	 const size_t matrixSize4=width*height3*sizeof(float);
	 matrix4=(float*)malloc(matrixSize4);
	 memset(matrix4,0,matrixSize4);
	 //shrFillArray(matrix,matrixSizeNoFloat);
	 size_t resultPosSize=height2*k*sizeof(int);
	 int *resultPosMatrix=(int*)malloc(resultPosSize);
	 memset(resultPosMatrix,0,resultPosSize);
	 size_t resultInstanceSize=height2*k*sizeof(float);
	 float *resultInstanceMatrix=(float*)malloc(resultInstanceSize);
	 shrFillArray(resultInstanceMatrix,height2*k);

	 size_t localsize[2]={1024,1};
	 size_t globalsize[2]={1024,1024};

	 cl_mem matrix_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix, &ciErrNum);
	 cl_mem result_pos_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultPosSize,resultPosMatrix,&ciErrNum);
	 cl_mem result_ins_buffer=clCreateBuffer(cxGPUContext,CL_MEM_WRITE_ONLY,resultInstanceSize,resultInstanceMatrix,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix_buffer, CL_FALSE, 0, matrixSize, matrix, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 cl_mem matrix2_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix2, &ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix2_buffer, CL_FALSE, 0, matrixSize, matrix2, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_mem matrix3_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize, matrix3, &ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix3_buffer, CL_FALSE, 0, matrixSize, matrix3, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 cl_mem matrix4_buffer=clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY ,matrixSize4, matrix4, &ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, matrix4_buffer, CL_FALSE, 0, matrixSize4, matrix4, 0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 ciErrNum  = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&matrix_buffer);
	 ciErrNum  = clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&matrix2_buffer);
	 ciErrNum  = clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&matrix3_buffer);
	 ciErrNum  = clSetKernelArg(ckKernel, 3, sizeof(cl_mem), (void*)&matrix4_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(int),  &width);
	 ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int), &height2);
	 ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(int), &k);
	 ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(cl_mem), (void*)&result_pos_buffer);
	 ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(cl_mem), (void*)&result_ins_buffer);

	 size_t NoSortSize=height2*height2*sizeof(float);
	 float *NoSort=(float*)malloc(NoSortSize);
	 memset(NoSort,0,NoSortSize);
	 cl_mem NoSort_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,NoSortSize,NoSort,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, NoSort_buffer, CL_FALSE, 0, NoSortSize, NoSort, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 9, sizeof(cl_mem), (void*)&NoSort_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t KDistanceSize=1956*5*sizeof(float);
	 float *KDistance=(float*)malloc(KDistanceSize);
	 memset(KDistance,0,KDistanceSize);
	 cl_mem KDistance_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,KDistanceSize,KDistance,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, KDistance_buffer, CL_FALSE, 0, KDistanceSize, KDistance, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 10, sizeof(cl_mem), (void*)&KDistance_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 size_t KPositinSize=1956*5*sizeof(float);
	 float *KPosition=(float*)malloc(KPositinSize);
	 memset(KPosition,0,KPositinSize);
	 cl_mem KPosition_buffer=clCreateBuffer(cxGPUContext,CL_MEM_READ_WRITE,KPositinSize,KPosition,&ciErrNum);
	 ciErrNum = clEnqueueWriteBuffer(commandQueue, KDistance_buffer, CL_FALSE, 0, KPositinSize, KPosition, 0, NULL, NULL);
	 ciErrNum |= clSetKernelArg(ckKernel, 11, sizeof(cl_mem), (void*)&KPosition_buffer);
	 oclCheckError(ciErrNum, CL_SUCCESS);

	 clFinish(commandQueue);
	 int64_t time_start=system_time();
	 clEnqueueNDRangeKernel(commandQueue, ckKernel, 2, NULL, globalsize, localsize,0, NULL, NULL);
	 clFinish(commandQueue);
	 int64_t time_end=system_time();
	 printf("kernel execute time: %f(us)\n",(time_end-time_start)/1e3);

	 int *positionHost=(int*)alloca(resultPosSize);
	 memset(positionHost,0,resultPosSize);
	 float *instanceHost=(float*)alloca(resultInstanceSize);
	 memset(instanceHost,0,resultInstanceSize);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_pos_buffer, CL_TRUE, 0,resultPosSize, positionHost,0, NULL, NULL);
	 ciErrNum = clEnqueueReadBuffer(commandQueue, result_ins_buffer, CL_TRUE, 0,resultInstanceSize, instanceHost,0, NULL, NULL);
	 oclCheckError(ciErrNum, CL_SUCCESS);
	 //save as txt to check.
	 ofstream posOutfile("position2002350.txt",ios::out);
	 ofstream instanceOutfile("distance2002350.txt",ios::out);
	 for(size_t r=0;r<height2;r++)
	 {
		 for(size_t c=0;c<k;c++)
		 {
			 int data=positionHost[r*k+c];
			 float instanceData=instanceHost[r*k+c];
			 posOutfile<<data<<"\t";
			 instanceOutfile<<instanceData<<"\t";
		 }
		 posOutfile<<endl;
		 instanceOutfile<<endl;
	 }
	 posOutfile.close();
	 instanceOutfile.close();

	 free(matrix);
	 free(resultPosMatrix);
	 free(resultInstanceMatrix);
	 free(NoSort);
	 free(KDistance);
	 free(KPosition);
	 clReleaseCommandQueue(commandQueue);
	 clReleaseContext(cxGPUContext);
	 clReleaseProgram(cpProgram);
	 clReleaseKernel(ckKernel);
	 shrLog("%s Successfully end.\n\n", argv[0]);


	 return 0;
}

第一次WriteBuffer通過了，第二次WriteBuffer時返回 -4.超過限制？！

稀疏矩陣
2020-10-15
矩陣
巨大的矩陣（矩陣加速）
2024-08-16
矩陣
演算法學習：矩陣快速冪/矩陣加速
2024-08-11
演算法矩陣
【scipy 基礎】--稀疏矩陣
2023-11-23
矩陣
資料結構：陣列，稀疏矩陣，矩陣的壓縮。應用：矩陣的轉置，矩陣相乘
2020-10-28
資料結構陣列矩陣
資料結構(一)-稀疏矩陣
2020-09-13
資料結構矩陣
cuda 加速矩陣乘法
2024-03-15
矩陣
【精選】矩陣加速
2024-06-07
矩陣
序列（dp+矩陣加速）
2024-08-18
矩陣
巨大的數（dp+矩陣加速）
2024-08-16
矩陣
矩陣快速冪加速最短路
2024-11-04
矩陣
矩陣加速線性遞推
2024-08-19
矩陣
雜項——矩陣加速（進階）
2024-09-17
矩陣
MKL稀疏矩陣運算示例及函式封裝
2023-04-23
矩陣函式封裝
幸運數（dp+矩陣加速）
2024-08-18
矩陣
動態dp & 矩陣加速遞推
2024-08-19
矩陣
演算法-陣列與矩陣
2021-06-30
演算法陣列矩陣
【小白學演算法】2. 稀疏陣列
2021-03-09
演算法陣列
稀疏陣列
2022-03-04
陣列
【資料結構與演算法】——稀疏陣列
2021-03-12
資料結構演算法陣列
演算法之DP——01矩陣
2020-10-04
演算法矩陣
資料結構與演算法之稀疏陣列
2020-10-05
資料結構演算法陣列
資料結構與演算法：稀疏陣列（一）
2020-11-08
資料結構演算法陣列
求任意矩陣的伴隨矩陣
2024-06-18
矩陣
稀疏矩陣之三元組壓縮儲存並且轉置
2020-11-12
矩陣
矩陣分解--超詳細解讀
2020-10-03
矩陣
python 計算矩陣的相關演算法
2020-09-26
Python矩陣演算法
稀疏陣列、佇列
2020-11-20
陣列佇列
20_稀疏陣列
2024-07-01
陣列
鄰接矩陣、度矩陣
2021-12-07
矩陣
大神教你python 讀取檔案並把矩陣轉成numpy的兩種方法
2020-07-12
Python矩陣
奇異矩陣，非奇異矩陣，偽逆矩陣
2020-09-29
矩陣
機器學習中的矩陣向量求導(五) 矩陣對矩陣的求導
2019-05-27
機器學習矩陣求導
資料結構與演算法—稀疏陣列和佇列
2019-07-15
資料結構演算法陣列佇列
MATLAB|讀取一個檔案，並將其不同內容儲存為不同的矩陣
2020-12-07
Matlab矩陣
golang實現稀疏陣列
2020-10-04
Golang陣列
矩陣
2024-04-28
矩陣
KNN演算法實驗
2024-03-31
KNN演算法
基於矩陣分解的協同過濾演算法
2024-04-11
矩陣演算法

讀取大型稀疏矩陣&KNN演算法的OpenCL加速版本

相關文章