import numpy as np
import matplotlib.pyplot as plt
def group_one(sample, centers):
    """Return the index of the centroid nearest to ``sample``.

    Nearness is measured by squared Euclidean distance: ``sample`` is
    broadcast against every row of ``centers`` and the squared differences
    are summed along axis 1.  On a tie, the lowest index wins (``argmin``).
    """
    offsets = sample - centers
    squared_dist = np.square(offsets).sum(axis=1)
    return squared_dist.argmin()
def group_all(data, k, centers):
    """Assign every sample in ``data`` to its nearest centroid.

    Returns a list of ``k`` lists; entry ``i`` holds the samples (each
    converted to a plain Python list via ``tolist()``) for which
    ``group_one`` picked centroid ``i``.

    Nested lists are used rather than an ndarray because the groups are
    ragged: the number of samples per group is not known in advance, and an
    ndarray of list objects would be slow and error-prone.
    """
    buckets = [[] for _ in range(k)]
    for point in data:
        buckets[group_one(point, centers)].append(point.tolist())
    return buckets
def update_centers(data, k, groups):
    """Recompute every centroid from the samples currently assigned to it.

    Args:
        data: 2-D array of samples; ``data.shape[1]`` fixes the centroid
            dimensionality.
        k: number of centroids.
        groups: sequence of ``k`` groups, ``groups[i]`` being the samples
            assigned to centroid ``i``.

    Returns:
        A ``(k, data.shape[1])`` float array.  Row ``i`` is the mean of
        ``groups[i]``; if that group is empty, row ``i`` is a randomly chosen
        sample from ``data`` instead.
    """
    centers = np.zeros((k, data.shape[1]))
    for index in range(k):
        members = groups[index]
        # len() rather than truthiness so an ndarray group also works.
        if len(members) > 0:
            centers[index] = np.mean(np.asarray(members), axis=0)
        else:
            # The mean of an empty group is NaN, which would poison every
            # later distance and loss computation.  Re-seed the centroid on
            # an existing sample so it can attract points again.
            centers[index] = data[np.random.randint(data.shape[0])]
    return centers
def iter_diff(old_centers, new_centers):
    """Return how far the centroids moved between two iterations.

    The result is the sum of absolute element-wise differences between
    ``old_centers`` and ``new_centers``.
    """
    shift = np.abs(old_centers - new_centers)
    return shift.sum()
def rand_center(data, k):
    """Generate ``k`` random initial centroids inside the data's bounding box.

    Args:
        data: 2-D array of samples; ``data.shape[1]`` is the sample
            dimensionality, which the centroids share.
        k: number of centroids to generate.

    Returns:
        A ``(k, data.shape[1])`` array whose column ``j`` is drawn uniformly
        between the minimum and maximum of ``data[:, j]``.
    """
    low = np.min(data, axis=0)
    high = np.max(data, axis=0)
    # np.random.rand yields values in [0, 1).  Shift and scale per dimension
    # to [min, max]; scaling by the maximum alone would place centroids
    # outside the data whenever a dimension does not start at zero.
    unit = np.random.rand(k, data.shape[1])
    return low + unit * (high - low)
def classify(data, k, threshold, max_iter=0):
    """Run the k-means iteration and return the centroids and sample groups.

    Args:
        data: 2-D array of samples.
        k: number of clusters.
        threshold: iteration stops once the centroid movement reported by
            ``iter_diff`` is no longer greater than this value.
        max_iter: maximum number of iterations; ``0`` means no limit, so only
            the loss is checked.

    Returns:
        ``(centers, groups)``.  ``groups`` is the assignment computed at the
        start of the last iteration, i.e. against the centroids as they were
        before that iteration's update.  If no iteration ran, it is the
        assignment against the initial random centroids.
    """
    centers = rand_center(data, k)
    groups = None
    loss = float("inf")
    iter_count = 0
    # Stop when the loss drops to the threshold or the iteration cap
    # (if one was given) is reached.
    while loss > threshold and (max_iter == 0 or iter_count < max_iter):
        groups = group_all(data, k, centers)
        old_centers = centers
        centers = update_centers(data, k, groups)
        loss = iter_diff(old_centers, centers)
        iter_count += 1
        print("iter_%d : loss=%f" % (iter_count, loss))
    if groups is None:
        # The loop body never ran (e.g. negative max_iter or an infinite
        # threshold); still return a valid grouping instead of failing on an
        # unbound local.
        groups = group_all(data, k, centers)
    return centers, groups
def paint_result(data, centers, k, groups, debug=False):
    """Scatter-plot the grouped samples and their centroids, then show it.

    The first two coordinates of every sample in ``groups`` are plotted.
    Unless ``debug`` is true, each sample is coloured by the index of the
    group it belongs to.  Centroids are drawn in red on top.  ``data`` is
    accepted but not used here.
    """
    # Flatten the ragged groups once, remembering each sample's group index.
    labelled = [(index, item) for index in range(k) for item in groups[index]]
    labels = [index for index, _ in labelled]
    points = np.array([item for _, item in labelled])

    xs, ys = points[:, 0], points[:, 1]
    if debug:
        plt.scatter(xs, ys)
    else:
        plt.scatter(xs, ys, c=labels)
    plt.scatter(centers[:, 0], centers[:, 1], color="red")
    plt.show()
def main():
    """Load samples from the CSV file, cluster them into 3 groups and plot."""
    cluster_count = 3
    samples = np.loadtxt("d:/data.csv", delimiter=",")
    # Force the loaded array, in place, into a 500 x 2 layout.
    samples.resize((500, 2))
    # threshold=0 and max_iter=0: iterate until the centroids stop moving.
    centroids, clusters = classify(samples, cluster_count, 0, 0)
    paint_result(samples, centroids, cluster_count, clusters)


if __name__ == '__main__':
    main()
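# --- Download-page residue (not part of the program) ---
# Attachment: "k-means python实现及数据.zip" (k-means Python implementation and data), 13KB ZIP
# Points required: 25 | Views: 34 | Comments: 1
# Uploaded: 2018-03-19 22:41:35
# Uploader: mottled233 (followers: 512, resources: 4)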