import csv
import random
#------------------函数-----------------------------------
# 用于打开文件
def open_file(file):
    """Read a CSV file and return its rows in random order.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of strings as produced by
        ``csv.reader``. Blank lines in the file are skipped and the
        remaining rows are shuffled with ``random.shuffle``.
    """
    # newline='' lets the csv module do its own line-ending handling,
    # as the csv documentation requires for file objects.
    with open(file, newline='') as handle:
        # csv.reader yields [] for a blank line (for example a trailing
        # empty line); such rows cannot be indexed by column, so drop them.
        rows = [row for row in csv.reader(handle) if row]
    random.shuffle(rows)  # shuffle the rows in place
    return rows
# 数据预处理
def pretreatment(datas, num_features=4):
    """Scale each feature column by the sum of its maximum and minimum.

    Every value ``x`` in feature column ``i`` is replaced by
    ``str(x / (max_i + min_i))``, where ``max_i`` and ``min_i`` are the
    numeric maximum and minimum of that column.

    Args:
        datas: List of rows; the first ``num_features`` entries of each row
            must be convertible with ``float``. The rows are modified in
            place.
        num_features: Number of leading columns to scale (default 4).

    Returns:
        The same ``datas`` list, with the scaled values stored as strings.

    Raises:
        ZeroDivisionError: If the maximum and minimum of a column sum to 0.
    """
    if not datas:
        return datas
    for col in range(num_features):
        # Convert once and compare numerically: comparing the raw strings
        # would be lexicographic and rank "10.0" below "9.0".
        values = [float(row[col]) for row in datas]
        denominator = max(values) + min(values)
        for row, value in zip(datas, values):
            row[col] = str(value / denominator)
    return datas
# 欧式距离计算
def euclidean_distance(vec1, vec2):
    """Return the Euclidean distance between two rows.

    The last element of ``vec1`` is not treated as a feature: only the
    first ``len(vec1) - 1`` positions of ``vec1`` and ``vec2`` are compared.
    Each compared element is converted with ``float``.

    Args:
        vec1: First row; its final element is skipped.
        vec2: Second row; must have at least ``len(vec1) - 1`` elements.

    Returns:
        The distance as a float (0.0 when there is nothing to compare).

    Raises:
        IndexError: If ``vec2`` is shorter than the compared prefix.
    """
    n_features = max(len(vec1) - 1, 0)
    if len(vec2) < n_features:
        raise IndexError(
            'vec2 has %d elements, expected at least %d' % (len(vec2), n_features)
        )
    squared = sum(
        (float(a) - float(b)) ** 2
        for a, b in zip(vec1[:n_features], vec2)
    )
    return squared ** 0.5
# 找出K个近邻
def get_neighbors(train, test_sample, num_neighbors):
    """Return the training rows closest to ``test_sample``.

    Distances are computed as ``euclidean_distance(test_sample, row)`` for
    every row of ``train``.

    Args:
        train: Iterable of training rows.
        test_sample: The query row, passed as the first argument of
            ``euclidean_distance``.
        num_neighbors: How many rows to return.

    Returns:
        A list of at most ``num_neighbors`` rows from ``train``, nearest
        first. Rows at equal distance keep their order from ``train``.
        If ``train`` has fewer rows than requested, all of them are
        returned instead of raising ``IndexError``.
    """
    # sorted() is stable and evaluates the key once per row.
    ranked = sorted(
        train,
        key=lambda row: euclidean_distance(test_sample, row),
    )
    # Clamp at 0 so a negative count yields [] rather than slicing from the end.
    return ranked[:max(num_neighbors, 0)]
# 通过等权投票的方式选出最佳
def vote(neighbors):
    """Pick the majority class among ``neighbors`` by equal-weight voting.

    Args:
        neighbors: Iterable of rows whose last element is the class label.

    Returns:
        The label with the most votes. Ties are resolved in favour of the
        label that comes first in the tally: the three Iris classes in the
        order setosa, versicolor, virginica, followed by any other labels
        in order of first appearance. With no neighbours the result is
        ``'Iris-setosa'``.
    """
    # Seed the tally with the three Iris classes in a fixed order so ties
    # and the empty case resolve deterministically.
    tally = {'Iris-setosa': 0, 'Iris-versicolor': 0, 'Iris-virginica': 0}
    for row in neighbors:
        label = row[-1]
        # .get() lets labels outside the seeded set be counted too.
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first maximal key in dict (insertion) order.
    return max(tally, key=tally.get)
# 评价模型(正确率)
def evaluation(test_set, pred):
    """Print and return the accuracy of ``pred`` against ``test_set``.

    A prediction counts as correct when it equals the last element of the
    corresponding row. Two lines are printed: the number of correct
    predictions and the accuracy as a percentage.

    Args:
        test_set: Sequence of rows whose last element is the true label.
        pred: Sequence of predicted labels, aligned with ``test_set``.

    Returns:
        The accuracy as a fraction between 0.0 and 1.0; 0.0 when
        ``test_set`` is empty (instead of dividing by zero).

    Raises:
        IndexError: If ``pred`` has fewer entries than ``test_set``.
    """
    if len(pred) < len(test_set):
        raise IndexError('pred has fewer entries than test_set')
    # Kept as a float so the printed count has the same form as before.
    count = float(sum(1 for row, label in zip(test_set, pred) if row[-1] == label))
    total = len(test_set)
    accuracy = count / total if total else 0.0
    print('正确个数:' + str(count))
    print('正确率:' + str(accuracy * 100) + '%')
    return accuracy
# KNN模型
def knn(estimated_set, validation_set, k=None):
    """Classify every row of ``validation_set`` with KNN and report accuracy.

    For each validation row the nearest rows of ``estimated_set`` are
    collected with ``get_neighbors``, the class is chosen with ``vote``,
    and the predictions are scored with ``evaluation``.

    Args:
        estimated_set: Rows used as the reference (training) data.
        validation_set: Rows to classify; each row ends with its true label.
        k: Number of neighbours. Defaults to the module-level ``K``.

    Returns:
        The list of predicted labels, in the order of ``validation_set``.
    """
    if k is None:
        k = K  # fall back to the module-level neighbour count
    # Pass the whole row, label included: euclidean_distance skips the last
    # element of its first argument, so passing row[:4] would drop the
    # fourth feature from every distance.
    pred = [vote(get_neighbors(estimated_set, row, k)) for row in validation_set]
    evaluation(validation_set, pred)
    return pred
# ----------------------主程序-----------------------------------
# Load the dataset; open_file returns the rows already shuffled.
datas = open_file('database.csv')

# Optional preprocessing step (currently disabled).
# datas = pretreatment(datas)

# Split into test and training sets: the first third is the test set.
n = len(datas) // 3
test_set, train_set = datas[:n], datas[n:]
# Split the training rows again: the first n form the estimation set and
# the remainder the validation set.
estimated_set, validation_set = train_set[:n], train_set[n:]

# Number of neighbours; knn reads this module-level name.
K = 5

# Tuning run used to choose K (currently disabled).
# knn(estimated_set,validation_set)

# Evaluate the model on the test set.
knn(train_set, test_set)
# Stray web-page footer text ("Comments 0") was here; it is not part of the program.