OpenMP並行優化高斯樸素貝葉斯演算法 - 通過身高、體重和肺活量推測性別(機器學習)

admiz發表於2020-11-20

目錄

OpenMP並行設計

用OpenMP進行共享記憶體平行計算

使用pragma omp parallel for提高訓練速度

程式程式碼

結果


OpenMP並行設計

在序列程式碼的基礎上,OpenMP通過# pragma omp parallel for num_threads(thread_count) 指令,對所有涉及大型陣列的for迴圈進行多執行緒處理,以此達到並行效果。

用OpenMP進行共享記憶體平行計算

使用pragma omp parallel for提高訓練速度

由於本專案的序列程式需要多次通過for(i=0;i<dataLen;i++)迴圈遍歷整個陣列來求值,因此十分適合使用pragma omp parallel for指令來進行共享記憶體程式設計。該指令表示所有執行緒平分迴圈的迭代,每個執行緒大約處理(dataLen / thread_count)個資料,並以這些資料進行迴圈計算,各執行緒再將資料返回主執行緒,與MPI的MPI_Scatterv()函式異曲同工。

pragma omp parallel for指令後面的子句有:num_threads(thread_count)、reduction(+:maleNum)、shared(dataSet,dataLen) 、private(i)。num_threads(thread_count)表示執行緒數,reduction(+:maleNum)表示maleNum是歸約變數,最後要將所有執行緒計算的maleNum累加。shared(dataSet,dataLen) 表示dataSet陣列和dataLen是所有執行緒都可以共享的,private(i)則表示每個執行緒的i都是私有的,不受其它執行緒影響。

參考程式碼如下(求sum[2]為例):

/*初始化,reduction內的變數一定要有初始值*/

double A,B;

A=0;B=0;

double sum[2]={0,0};

# pragma omp parallel for num_threads(thread_count) \

reduction(+:A) reduction(+:B)\

shared(dataSet,dataLen) private(i)

for(i=0;i<dataLen;i++){ ;

A+=dataSet[i*EIGEN_NUM+1];

B+=dataSet[i*EIGEN_NUM+2];}

# pragma omp barrier

sum[0]=A;sum[1]=B;

程式程式碼

#include <cassert>
#include <climits>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>
#include <time.h>
#include<omp.h>


// Value of pi used by the Gaussian density computation.
#define PI 3.1415926535898

// Size in bytes of the line buffer handed to fgets when reading the CSV file.
#define MAX_LINE 20
// Maximum number of rows: sizes the dataSet allocation and caps the read loop.
#define DATA_LEN 11000000

// Number of float values stored per row (row stride of dataSet).
// The code below reads index 0 as the gender code and indices 1..3 as
// height, weight and vital capacity.
#define EIGEN_NUM 4

// Earlier static-array form: float dataSet[DATA_LEN * EIGEN_NUM];
// Heap buffer for the whole data set, allocated during static initialisation.
// NOTE: the malloc result is not checked for NULL at this point.
float (*dataSet)=(float(*))malloc(sizeof(float)*DATA_LEN*EIGEN_NUM);  

int dataLen;// number of rows actually loaded into dataSet
double maleNum=0;// number of rows whose gender code is 1
double femaleNum=0;// number of rows whose gender code is 2

int main(int argc, char **argv) {

	int i=0;
	int j=0;

	double start = omp_get_wtime( );


	/************************讀取檔案************************/

	char buf[MAX_LINE];		//緩衝區
	FILE *fp;				//檔案指標s
	int len;				//行字元個數

		//讀取檔案
		const char* fileLocation="E:\\test\\addVitalCapacityData.csv";
		fp = fopen(fileLocation,"r");
		if(fp  == NULL)
		{
			perror("fp  == NULL");
			exit (1) ;
		}

		//逐行讀取及寫入陣列
		char *token;
		const char s[2] = ",";
		while(fgets(buf,MAX_LINE,fp) != NULL && i< DATA_LEN)
		{
			len = strlen(buf);
			//刪去換行符
			buf[len-1] = '\0';
			//分割字串
			token = strtok(buf, s);
			//繼續分割字串
			j = 0;
			while( token != NULL ) 
			{
				dataSet[i*EIGEN_NUM + j]=atof(token);
				token = strtok(NULL, s);
				j = j+1;
			 }
			i = i + 1;
		}
		dataLen=i;
		printf("%d行4列的資料讀取完畢\n",dataLen);
		fclose(fp);

		

		double readTime = omp_get_wtime( );



	/************************開始OpenMP計算************************/

	int thread_count = strtol(argv[1],NULL,10);

	/***********計算高斯分佈***********/
	char *maenInf[6]={"maleLength","maleWeight","maleVC","femaleLength","femaleWeight","femaleVC"};

	double A,B,C,D,E,F,G;
	A=0;B=0;C=0;D=0;E=0;F=0;G=0;

	double sum[6]={0,0,0,0,0,0};
	double mean[6]={0,0,0,0,0,0};


#	pragma omp parallel for num_threads(thread_count) \
	reduction(+:maleNum) reduction(+:femaleNum) \
	reduction(+:A) reduction(+:B) reduction(+:C)\
	reduction(+:D) reduction(+:E) reduction(+:F)\
	shared(dataSet,dataLen) private(i)
	for(i=0;i<dataLen;i++)
	{
		if(dataSet[i*EIGEN_NUM]==1)
		{
			maleNum=maleNum+1;
			A+=dataSet[i*EIGEN_NUM+1];
			B+=dataSet[i*EIGEN_NUM+2];
			C+=dataSet[i*EIGEN_NUM+3];
		}
		else if(dataSet[i*EIGEN_NUM]==2)
		{
			femaleNum=femaleNum+1;
			D+=dataSet[i*EIGEN_NUM+1];
			E+=dataSet[i*EIGEN_NUM+2];
			F+=dataSet[i*EIGEN_NUM+3];
		}
		else
		{
			printf("dataSet[%d]=%f,性別有誤\n",i*EIGEN_NUM,dataSet[i*EIGEN_NUM]);
		}
		//printf("sum[0]=%f \n",sum[0]);
		//printf("%d行4列的資料求和完畢\n",i);
	}
#	pragma omp barrier
	sum[0]=A;
	sum[1]=B;
	sum[2]=C;
	sum[3]=D;
	sum[4]=E;
	sum[5]=F;


	//printf("maleNum=%.0f\nfemaleNum=%.0f\n",maleNum,femaleNum);


	/*for(i=0;i<6;i++)
	{
		printf("sum[%d]=%.0f\n",i,sum[i]);
	}*/

	//計算平均值
	for(i=0;i<6;i++)
	{
		if(i<3){mean[i]=sum[i]/maleNum;}
		if(i>2){mean[i]=sum[i]/femaleNum;}
		//printf("mean-%s = %.5f \n",maenInf[i],mean[i]);
	}

	//計算累加
	A=0;B=0;C=0;D=0;E=0;F=0;G=0;
	double Sigma[6]={0,0,0,0,0,0};
#	pragma omp parallel for num_threads(thread_count) default(none) \
	reduction(+:A) reduction(+:B) reduction(+:C)\
	reduction(+:D) reduction(+:E) reduction(+:F)\
	shared(dataSet,dataLen,mean) private(i)
	for(i=0;i<dataLen;i++)
	{
		if(dataSet[i*EIGEN_NUM]==1)
		{
			A+=pow(dataSet[i*EIGEN_NUM+1]-mean[0] , 2 );
			B+=pow(dataSet[i*EIGEN_NUM+2]-mean[1] , 2 );
			C+=pow(dataSet[i*EIGEN_NUM+3]-mean[2] , 2 );
		}
		else if(dataSet[i*EIGEN_NUM]==2)
		{
			D+=pow(dataSet[i*EIGEN_NUM+1]-mean[3] , 2 );
			E+=pow(dataSet[i*EIGEN_NUM+2]-mean[4] , 2 );
			F+=pow(dataSet[i*EIGEN_NUM+3]-mean[5] , 2 );
		}
		else
		{
			printf("dataSet[i*EIGEN_NUM]=%f,性別有誤",dataSet[i*EIGEN_NUM]);
		}
	}
#	pragma omp barrier
	Sigma[0]=A;
	Sigma[1]=B;
	Sigma[2]=C;
	Sigma[3]=D;
	Sigma[4]=E;
	Sigma[5]=F;


	//計算標準差
	double standardDeviation[6];	//標準差
	double sexNum;//各性別人數
	for(i=0;i<6;i++){
		if(i<3){sexNum=maleNum;}
		if(i>=3){sexNum=femaleNum;}
		standardDeviation[i]=sqrt(Sigma[i]/sexNum);
		//printf("Sigma[%d]=%f maleNum=%f",i,Sigma[i],sexNum);
		//printf("第%d個標準差=%.5f\n",i,standardDeviation[i]);
		}



	/*********** 樸素貝葉斯 & 準確率測試 ***********/
	//資料集有肺活量(VC),準確度判斷
	float preSexID;
	float Right=0;
	float Error=0;
	//宣告性別ID判斷函式
	int sexIDResult(float height,float weight,float VC,double *mean,double *standardDeviation);

#	pragma omp parallel for num_threads(thread_count)  default(none) \
	reduction(+:Right) reduction(+:Error) \
	shared(dataSet,dataLen,mean,standardDeviation) private(i,preSexID)
	for(i=0;i<dataLen;i++){
		preSexID=sexIDResult(dataSet[i*EIGEN_NUM+1],dataSet[i*EIGEN_NUM+2],dataSet[i*EIGEN_NUM+3],mean,standardDeviation);
		if(dataSet[i*EIGEN_NUM]==preSexID){
			Right=Right+1;
		}
		else{
			Error=Error+1;
			//printf("預測ID:%.0f  實際ID:%.0f \n",preSexID,receiveBuf[i*EIGEN_NUM]);
			//printf("性別:%.0f,身高:%.2f,體重:%.2f,肺活量:%.0f \n",receiveBuf[i*EIGEN_NUM],receiveBuf[i*EIGEN_NUM+1],receiveBuf[i*EIGEN_NUM+2],receiveBuf[i*EIGEN_NUM+3]);
			}
	}

	printf("Right:%.0f\nError:%.0f\n",Right,Error);
	double accuracy  = Right/(Error+Right);
	printf("Accuracy:%f\n",accuracy);

	double end = omp_get_wtime( );

	//printf("start = %.16g\nend = %.16g\ndiff = %.16g\n", start, end, end - start);
	printf("整體耗時 = %.16f\n", end - start);
	printf("讀取時長 = %.16f\n", readTime - start);
	printf("計算時長 = %.16f\n", end - readTime);

	return 0;
}






/*****************函式*****************/



/***********高斯分佈函式***********/
/*
 * Sum one column over the rows that belong to a given gender.
 *
 * data:       flat array of rows, EIGEN_NUM floats per row, gender code first
 * recDatalen: total number of floats in data (rows = recDatalen / EIGEN_NUM)
 * sex:        gender code a row's first element must equal to be included
 * column:     offset inside the row of the value to add up
 *
 * Returns the sum as a double; 0 when no row matches.
 */
double getSum(float *data,int recDatalen,int sex,int column)
{
	const int rows = recDatalen / EIGEN_NUM;
	const float *rec = data;
	double total = 0;

	for (int r = 0; r < rows; r++, rec += EIGEN_NUM) {
		if (rec[0] == sex) {
			total += rec[column];
		}
	}

	return total;
}

/*
 * Sum of squared deviations from `mean` for one column, restricted to the
 * rows that belong to a given gender.
 *
 * data:       flat array of rows, EIGEN_NUM floats per row, gender code first
 * recDatalen: total number of floats in data (rows = recDatalen / EIGEN_NUM)
 * mean:       value subtracted from every selected element before squaring
 * sex:        gender code a row's first element must equal to be included
 * column:     offset inside the row of the value to use
 *
 * Returns the accumulated pow(value - mean, 2); 0 when no row matches.
 */
double getSigma(float *data,int recDatalen,double mean,int sex,int column){
	const int rows = recDatalen / EIGEN_NUM;
	const float *rec = data;
	double acc = 0;

	for (int r = 0; r < rows; r++, rec += EIGEN_NUM) {
		if (rec[0] == sex) {
			acc += pow(rec[column] - mean, 2);
		}
	}

	return acc;
}



/***********樸素貝葉斯函式***********/

//計算概率p(特徵列column = x | 性別)
double getProbability(double x,int column,int sex,double mean,double standardDeviation)
{
	double Probability;	//計算出的概率
	double u = mean;
	double p = standardDeviation;

	//高數分佈概率密度函式 x:預測變數 u:樣本平均值 p:標準差
	p=pow(p,2);
	Probability = (1 / (2*PI*p)) * exp( -pow((x-u),2) / (2*p) );

	//printf("p(%s=%lf|性別=%s)=%.16lf\n",basicInfo[column],x,gender,Probability);

	return Probability;
}

/*
 * Predict the gender code for one sample with Gaussian naive Bayes.
 *
 * height, weight, VC: the three feature values of the sample
 * mean:               six per-class feature means; [0..2] are the male
 *                     height/weight/VC values, [3..5] the female ones
 * standardDeviation:  six standard deviations in the same layout as mean
 *
 * Both classes use a fixed prior of 0.5.
 *
 * Returns 1 when the male score is larger, 2 when the female score is
 * larger, and 0 when the scores are equal or cannot be compared (for
 * example when one of them is NaN).
 */
int sexIDResult(float height,float weight,float VC,double *mean,double *standardDeviation)
{
	const double prior = 0.5;

	const double maleP = prior
		* getProbability(height,1,1,mean[0],standardDeviation[0])
		* getProbability(weight,2,1,mean[1],standardDeviation[1])
		* getProbability(VC,3,1,mean[2],standardDeviation[2]);

	const double femaleP = prior
		* getProbability(height,1,2,mean[3],standardDeviation[3])
		* getProbability(weight,2,2,mean[4],standardDeviation[4])
		* getProbability(VC,3,2,mean[5],standardDeviation[5]);

	if (maleP > femaleP) {
		return 1;
	}
	if (maleP < femaleP) {
		return 2;
	}

	/* Tie or NaN: always return a value so control never falls off the end. */
	return 0;
}

結果

相關文章