C均值聚類 C實現 Python實現

偏執狂-發表於2020-12-05

C均值聚類

演算法步驟

  1. 在樣本集合中選擇C個點作為初始類中心;
  2. 在剩下的樣本點中選擇一個,計算其到各個中心點的距離,選取距離最短者將其歸為那個類別;
  3. 選擇下一個樣本,重複步驟2直到所有樣本都已歸類;若各類的劃分不再發生變化或已達到迭代上限則轉步驟5,否則轉步驟4;
  4. 根據當前的類劃分情況重新計算中心點,重複步驟2;
  5. 結束演算法。

C實現

/*
	@Time : 2020/12/4 0:04
	@Author : Li Canghao
	@Name : C_means.py
	@Software : C-Free
*/
#include<stdio.h>
#include<math.h>
// Upper bound used as an array capacity in this program (e.g. the sample buffer in main()).
const long long maxn = 10004;
/* A pattern (sample point) described by two real-valued features. */
typedef struct twoD {
	double x;	// first feature
	double y;	// second feature
} twoD;
/*
 * Euclidean distance between two points.
 *   a, b : the two points (passed by value)
 *   returns the non-negative straight-line distance between them
 */
double Distance(twoD a,twoD b){
	double dx = a.x - b.x;
	double dy = a.y - b.y;
	return sqrt(dx*dx + dy*dy);
}
/*
 * Seed the class centres with the first c training patterns.
 *   trains  : training patterns; at least c entries are read
 *   c       : number of classes
 *   centers : output array; entries 0..c-1 are overwritten
 */
void InitCenters(twoD trains[],int c,twoD centers[]){
	int k = 0;
	while(k < c){
		centers[k] = trains[k];		// copies both coordinates at once
		k++;
	}
}
/*
 * Assign one pattern to the class whose centre is nearest.
 *   index   : position of the pattern in the training set (slot written in belong)
 *   train   : the pattern itself
 *   c       : number of classes; expected to be at least 1
 *   centers : current class centres, c entries
 *   belong  : output, belong[index] receives the chosen class index
 *   cnt     : per-class pattern counters; the chosen class is incremented
 * Ties are resolved in favour of the lowest class index.
 */
void SearchMinDistance(int index,twoD train,int c,twoD centers[],int belong[],int cnt[]){
	double mindistance = 0.0;
	int minindex = 0;
	for(int i = 0; i < c; i++){
		double d = Distance(train,centers[i]);		// evaluate each distance only once
		// The first centre always seeds the minimum, so no magic "large" sentinel
		// is needed and arbitrarily distant data is still classified correctly.
		if(i == 0 || d < mindistance){
			mindistance = d;
			minindex = i;
		}
	}
	belong[index] = minindex;
	cnt[minindex]++;
}

/*
 * C-means (k-means) clustering of n two-dimensional patterns into c classes.
 *   n      : number of patterns in trains
 *   trains : the patterns
 *   c      : requested number of classes, must satisfy 0 < c <= n
 * The first c patterns seed the class centres. Assignment and centre update
 * are repeated until no centre moves by 1e-6 or more in either coordinate, or
 * until the iteration cap is reached. The result is printed to stdout.
 */
void C_mean(int n,twoD trains[],int c){
	const int maxIterations = 1000;				// safety cap so the loop always terminates
	const double eps = 1e-6;					// a centre moving less than this is "unchanged"
	if(c <= 0 || n < c){
		printf("參數錯誤:類別數c必須滿足 0 < c <= n\n");
		return;
	}
	twoD centers[c];							// current class centres
	twoD avg[c];								// coordinate sums, then the new centres
	int belong[n];								// class index of every pattern
	int cnt[c];									// number of patterns per class
	int counts = 0;								// classes whose centre did not move
	InitCenters(trains,c,centers);
	for(int iter = 0; iter < maxIterations && counts < c; iter++){
		for(int i = 0; i < c; i++){				// reset the per-iteration accumulators
			cnt[i] = 0;
			avg[i].x = 0;
			avg[i].y = 0;
		}
		for(int i = 0; i < n; i++){				// nearest-centre assignment
			SearchMinDistance(i,trains[i],c,centers,belong,cnt);
			avg[belong[i]].x += trains[i].x;
			avg[belong[i]].y += trains[i].y;
		}
		counts = 0;
		for(int i = 0; i < c; i++){
			if(cnt[i] == 0){					// empty class: keep its centre instead of dividing by zero
				counts++;
				continue;
			}
			avg[i].x /= cnt[i];
			avg[i].y /= cnt[i];
			// Compare absolute displacement; a signed difference would treat any
			// move towards larger coordinates as "no change".
			if(fabs(centers[i].x-avg[i].x) < eps && fabs(centers[i].y-avg[i].y) < eps){
				counts++;
			}
			centers[i] = avg[i];				// adopt the new centre
		}
	}
	printf("\n-----處理完畢,展示結果-----\n");
	for(int i = 0; i < c; i++){
		printf("當前第%d類,聚類中心為:(%lf,%lf) 共有%d個模式,其中的集合為:\n",i+1,centers[i].x,centers[i].y,cnt[i]); 
		for(int j = 0; j < n; j++){
			if(belong[j] == i)printf("\t%d:(%lf,%lf)\n",j+1,trains[j].x,trains[j].y);
		}
	}
} 
/*
 * Interactive driver: reads the class count, the pattern count and the
 * patterns from stdin, then runs the clustering.
 * Returns 0 on success and 1 when the input is malformed or out of range.
 */
int main(){
	twoD trains[maxn];
	int n,c;
	printf("請輸入需要分成多少類:");
	if(scanf("%d",&c) != 1 || c <= 0){
		printf("輸入錯誤:類別數必須為正整數\n");
		return 1;
	}
	printf("請輸入模式總個數:(n > c)");
	// n is bounded by maxn so that the reads below cannot run past trains[].
	if(scanf("%d",&n) != 1 || n < c || n > maxn){
		printf("輸入錯誤:模式總個數必須滿足 c <= n <= %d\n",(int)maxn);
		return 1;
	}
	printf("請輸入各模式的特徵值(二維):\n");
	for(int i = 0; i < n; i++){
		if(scanf("%lf%lf",&trains[i].x,&trains[i].y) != 2){
			printf("輸入錯誤:第%d個模式的特徵值無法讀取\n",i+1);
			return 1;
		}
	}
	printf("-----開始C均值聚類-----\n");
	C_mean(n,trains,c);
	printf("-----C均值聚類結束-----\n");
	return 0;
}


測試資料(少量樣本,共10個點;執行時須先輸入類別數c與模式總個數n=10,再輸入下列座標)
1 2
4 5
7 3
100 20
90 50
-5 6
50 89
2000 414
2000 808
2020 124

Python實現

# -*- coding: utf-8 -*-
# @Time : 2020/12/4 0:04
# @Author : Li Canghao
# @Name : C_means.py
# @Software : PyCharm
import math
import random
import matplotlib.pyplot as plt   #用於做圖

class twoD:
    """A point in the plane plus the index of the class it is assigned to.

    Both coordinates are converted with ``float()``, so numeric strings such
    as the fields read from a text file are accepted.
    """

    # Class-level defaults, kept so attribute access on the class keeps working.
    x = 0.0
    y = 0.0
    belong = 0      # index of the class the point belongs to

    def __init__(self, x, y):
        self.x = float(x)
        self.y = float(y)
        self.belong = 0     # every instance starts in class 0

    def __repr__(self):
        return "twoD(%r, %r)" % (self.x, self.y)

    def __add__(self, other):
        """Return a new point holding the coordinate-wise sum.

        Neither operand is modified, so ``a + b`` yields a usable value
        instead of ``None``.
        """
        return twoD(self.x + other.x, self.y + other.y)

    def __iadd__(self, other):
        """Add *other* to this point in place (``a += b``) and return it."""
        self.x += other.x
        self.y += other.y
        return self

    def toString(self):
        """Return the coordinates as ``"x y"``, the line format of the data file."""
        return str(self.x) + " " + str(self.y)

def Distance(a, b):
    """Return the Euclidean distance between two objects exposing ``x`` and ``y``."""
    dx = a.x - b.x
    dy = a.y - b.y
    return math.sqrt(dx ** 2 + dy ** 2)

def GenerateTrains():
    """Write a random training set to ``trains.txt`` in the working directory.

    Between 100 and 200 points are generated. Each line holds one point as
    ``"x y"`` with both coordinates drawn uniformly from [-2020, 2020]. The
    file is overwritten on every call and has no trailing newline. An I/O
    failure is reported on stdout rather than raised.
    """
    number = random.randint(100, 200)
    # uniform() keeps the values inside the documented range; the previous
    # random() * 4041 - 2020 could produce values up to almost 2021.
    lines = [
        "{} {}".format(random.uniform(-2020, 2020), random.uniform(-2020, 2020))
        for _ in range(number)
    ]
    try:
        # "w" rather than "a": repeated test runs should not grow the file.
        with open("trains.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(lines))
        print("-----建立訓練集成功-----")
        print("-----檔案關閉-----")
    except OSError as ex:
        print("-----出現異常", ex, "-----")

def ReadTrains():
    """Read the training set from ``trains.txt`` in the working directory.

    Every non-blank line must hold at least two whitespace-separated numbers,
    the x and y coordinates of one point. Blank lines are skipped.

    Returns:
        A list of ``twoD`` points, or ``None`` when the file cannot be opened
        or a line cannot be parsed (the problem is reported on stdout).
    """
    trains = []
    try:
        with open("trains.txt", "r") as f:
            for line in f:
                fields = line.split()
                if not fields:          # tolerate blank or trailing empty lines
                    continue
                trains.append(twoD(fields[0], fields[1]))
        # Success is announced only once the whole file has been parsed.
        print("-----讀取訓練集成功-----")
        print("-----檔案關閉-----")
        return trains
    except (OSError, ValueError, IndexError) as ex:
        print("-----出現異常", ex, "-----")
        return None

def C_mean(trains, c, max_iter=1000, tol=1e-6):
    """Cluster *trains* into *c* classes with C-means (k-means) and plot the result.

    The first *c* points seed the class centres. Points are then assigned to
    the nearest centre and the centres recomputed as class means until no
    centre moves by *tol* or more in either coordinate, or until *max_iter*
    rounds have run. Each point's ``belong`` attribute is set to its class
    index; the classes are printed and shown as a scatter plot.

    Args:
        trains: list of points exposing ``x``, ``y`` and ``belong``.
        c: number of classes, ``0 < c <= len(trains)``.
        max_iter: upper bound on the number of rounds (at least 1).
        tol: displacement below which a centre counts as unchanged.

    Raises:
        ValueError: if *c* or *max_iter* is out of range.
    """
    if not 0 < c <= len(trains):
        raise ValueError("c must satisfy 0 < c <= len(trains)")
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")
    centers, numbers = _assign_until_stable(trains, c, max_iter, tol)
    print("-----處理完畢,展示結果-----")
    _report_clusters(trains, centers, numbers)
    plt.show()


def _assign_until_stable(trains, c, max_iter, tol):
    """Run the assignment/update rounds; return (centers, points per class)."""
    centers = trains[0:c]               # the slice is a new list, trains itself is not reordered
    numbers = [0] * c
    for _ in range(max_iter):
        numbers = [0] * c               # points per class in this round
        sum_x = [0.0] * c               # coordinate sums per class
        sum_y = [0.0] * c
        for train in trains:
            # Seed the minimum with the first centre so that no "large enough"
            # sentinel distance is needed; ties go to the lowest class index.
            nearest = 0
            best = Distance(train, centers[0])
            for j in range(1, c):
                d = Distance(train, centers[j])
                if d < best:
                    best = d
                    nearest = j
            train.belong = nearest
            sum_x[nearest] += train.x
            sum_y[nearest] += train.y
            numbers[nearest] += 1
        unchanged = 0                   # restarted every round, never accumulated
        for i, old in enumerate(centers):
            if numbers[i] == 0:         # empty class: keep its centre, avoid dividing by zero
                unchanged += 1
                continue
            new = twoD(sum_x[i] / numbers[i], sum_y[i] / numbers[i])
            # Absolute displacement: a signed difference would treat any move
            # towards smaller coordinates as "no change".
            if abs(new.x - old.x) < tol and abs(new.y - old.y) < tol:
                unchanged += 1
            centers[i] = new
        if unchanged >= c:
            break
    return centers, numbers


def _report_clusters(trains, centers, numbers):
    """Print every class with its members and add one scatter series per class."""
    colors = ["red","blue","green","coral","tan","yellow","brown","gold","orange","peru"]
    marks = ["+","x","o","v","^","<",">","1","2","3"]
    for i, center in enumerate(centers):
        print("當前第%d類,類心為:(%d,%d) 共有%d個模式,它們分別是:"%(i + 1,center.x,center.y,numbers[i]))
        xs = []
        ys = []
        for j, train in enumerate(trains):
            if train.belong == i:
                print("\t%d:(%d,%d)"%(j+1,train.x,train.y))
                xs.append(train.x)
                ys.append(train.y)
        # Styles are reused cyclically so more than ten classes can be drawn.
        plt.scatter(xs, ys, marker=marks[i % len(marks)], c=colors[i % len(colors)])



# Disabled draft of a separate plotting helper, kept as an inert string literal.
'''def ShowPlot():
    for i in range(c):
        plt.scatter()'''

# Script body: create a random training set, read it back, ask for the number
# of classes on stdin and run the clustering.
print("-----準備建立訓練集-----")
GenerateTrains()
print("-----準備讀取訓練集-----")
trains = ReadTrains()
print(len(trains))      # number of points that were read
c = int(input("請輸入需要分成多少類"))
print("-----C均值聚類開始-----")
C_mean(trains,c)
print("-----C均值聚類結束-----")
#ShowPlot()


測試資料在GenerateTrains()函式中建立

結果

為了能直觀地看出聚類效果,這裡附上Python實現所產生的散佈圖截圖(c=5)。
(圖片:c=5時的聚類結果散佈圖)

後話

一天學完Python並實現上面這個演算法真是太nice了(頭暈目眩)

相關文章