# marry_data data set
from numpy import *
import operator
from os import listdir
def knn_class(inx, dataset, labels, k):
    """Classify ``inx`` by majority vote among its ``k`` nearest neighbours.

    Distances are Euclidean and computed against every row of ``dataset``.

    Args:
        inx: Feature vector of the sample to classify (array-like).
        dataset: 2-D array-like of training samples, one row per sample.
        labels: Sequence of class labels; ``labels[i]`` belongs to row ``i``
            of ``dataset``.
        k: Number of nearest neighbours that take part in the vote. Values
            larger than the number of samples use every sample.

    Returns:
        The label that received the most votes. With equal vote counts the
        label met first in nearest-first order wins (relies on dict
        insertion order, Python 3.7+).

    Raises:
        IndexError: If nobody votes (empty ``dataset`` or ``k`` < 1).
    """
    samples = asarray(dataset)
    # Broadcasting subtracts ``inx`` from every row; no explicit tiling needed.
    diff_mat = samples - asarray(inx)
    sq_distances = (diff_mat ** 2).sum(axis=1)  # sum over the feature axis
    distances = sq_distances ** 0.5
    # argsort() returns row indices ordered from nearest to farthest.
    nearest_first = distances.argsort()

    # A non-positive k must select nothing (a negative slice bound would not).
    neighbours = nearest_first[:k] if k > 0 else nearest_first[:0]

    # Count how often each label occurs among the selected neighbours.
    class_count = {}
    for row_idx in neighbours:
        vote_label = labels[row_idx]
        class_count[vote_label] = class_count.get(vote_label, 0) + 1

    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    sorted_class_count = sorted(
        class_count.items(), key=operator.itemgetter(1), reverse=True
    )
    return sorted_class_count[0][0]
def file_to_matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is expected to hold three numeric features followed
    by a label in the last field, e.g.::

        40920	8.326976	0.953952	largeDoses

    The label is either a string of digits (used as-is) or one of the names
    ``largeDoses`` / ``smallDoses`` / ``didntLike`` (mapped to 3 / 2 / 1).

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(data_matrix, class_label)``: ``data_matrix`` is an
        ``(n, 3)`` float array with one row per non-blank line, and
        ``class_label`` is a list of ``n`` integer labels. A label name that
        is not recognised is stored as ``None``.

    Raises:
        ValueError: If a line has fewer than four fields or a feature is not
            a valid number.
    """
    love_dict = {'largeDoses': 3, 'smallDoses': 2, 'didntLike': 1}
    rows = []
    class_label = []
    # The context manager closes the file even if parsing fails.
    with open(filename) as data_file:
        for line_no, line in enumerate(data_file, 1):
            line = line.strip()  # drop surrounding whitespace / newline
            if not line:
                continue  # tolerate blank lines (e.g. a trailing newline)
            msgs = line.split('\t')
            if len(msgs) < 4:
                raise ValueError(
                    'line %d of %s: expected 3 features and a label, got %d field(s)'
                    % (line_no, filename, len(msgs))
                )
            rows.append([float(value) for value in msgs[0:3]])
            label = msgs[-1]
            if label.isdigit():
                class_label.append(int(label))
            else:
                # Map the textual label to its numeric code (3 / 2 / 1).
                class_label.append(love_dict.get(label))

    data_matrix = zeros((len(rows), 3))  # n x 3 matrix, also for n == 0
    if rows:
        data_matrix[:] = rows
    return data_matrix, class_label
# Scale every feature column of the data into [0, 1].
def auto_norm(data_set):
    """Min-max normalise each column of ``data_set``.

    For a value ``z`` in a column whose minimum is ``x`` and maximum is ``y``
    the result is ``(z - x) / (y - x)``; e.g. for the range [1, 9] the value
    4 becomes ``(4 - 1) / (9 - 1)``. A column whose values are all equal has
    a zero range and is mapped to 0 instead of producing NaN.

    Prints the number of rows to standard output.

    Args:
        data_set: 2-D array-like, one sample per row.

    Returns:
        A tuple ``(norm_data, ranges, min_vals)``: the normalised data, the
        per-column ``max - min`` (zero for constant columns) and the
        per-column minimum.
    """
    data_set = asarray(data_set)
    min_vals = data_set.min(0)  # column-wise minimum
    max_vals = data_set.max(0)  # column-wise maximum
    ranges = max_vals - min_vals
    row_size = data_set.shape[0]
    print('row_size', row_size)
    # Divide constant columns by 1 rather than 0 to avoid 0/0 -> NaN.
    safe_ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column offset and scale to every row.
    norm_data = (data_set - min_vals) / safe_ranges
    return norm_data, ranges, min_vals
# Format of the data file (tab-separated; the last column is the label):
# 40920 8.326976 0.953952 largeDoses
# 14488 7.153469 1.673904 smallDoses
# 26052 1.441871 0.805124 didntLike
# 75136 13.147394 0.428964 didntLike
# 38344 1.669788 0.134296 didntLike
# 72993 10.141740 1.032955 didntLike
def date_class_test(filename='./marry_data', ratio=0.1, k=3):
    """Measure the hold-out error rate of the kNN classifier on a data file.

    The file is loaded with ``file_to_matrix`` and normalised with
    ``auto_norm`` so that every feature carries the same weight. The first
    ``int(n * ratio)`` rows are used as test samples; the remaining rows are
    the training set passed to ``knn_class``.

    Args:
        filename: Path of the data file (defaults to ``'./marry_data'``).
        ratio: Fraction of the rows held out for testing; the default 0.1
            keeps 90% for training and 10% for validation.
        k: Number of neighbours used by ``knn_class``.

    Returns:
        The percentage (0-100, float) of test samples that were misclassified.

    Raises:
        ZeroDivisionError: If the hold-out set is empty
            (``int(n * ratio) == 0``).
    """
    data_matrix, class_label = file_to_matrix(filename)
    # Only the normalised matrix is needed here; ranges/minimums are unused.
    norm_matrix, _ranges, _min_vals = auto_norm(data_matrix)
    norm_size = norm_matrix.shape[0]
    test_num = int(norm_size * ratio)

    train_matrix = norm_matrix[test_num:norm_size, :]
    train_labels = class_label[test_num:norm_size]

    error_count = 0.0
    for i in range(test_num):
        result = knn_class(norm_matrix[i, :], train_matrix, train_labels, k)
        if result != class_label[i]:
            error_count += 1.0
    return (error_count / float(test_num)) * 100
# Report the hold-out error rate as a whole-number percentage, e.g.
# "error_count: 5%". Formatting happens inside the call so this works on
# Python 3, where print() returns None and cannot be combined with % or +.
print('error_count: %d%%' % date_class_test())