import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets._samples_generator import make_blobs
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns # 可视化库
# Centres of the three synthetic clusters.
centers = [[1, 1], [-1, -1], [1, -1]]
# Generate the sample data set around those centres; random_state is fixed
# so every run draws the same points.
X, labels_true = make_blobs(n_samples=750, centers=centers, cluster_std=0.4, random_state=0)
# Canvas set-up: one figure laid out as a 1x2 grid of panels.
fig = plt.figure(figsize=(12, 5))
fig.subplots_adjust(left=0.02, right=0.98, bottom=0.05, top=0.9)
ax = fig.add_subplot(1, 2, 1)
# First panel: the raw, unclustered points. All samples are drawn with a
# single call ('.' = point markers, no connecting line) instead of one
# ax.plot call per sample.
ax.plot(X[:, 0], X[:, 1], '.', color='#4EACC5')
# Standardise the data with StandardScaler. The scaling is applied per
# feature dimension (column), not per sample.
scaler = StandardScaler()
X = scaler.fit_transform(X)
# Run density-based clustering (DBSCAN) on the standardised data.
db = DBSCAN(eps=0.3, min_samples=10)
db.fit(X)
# db.labels_ holds the cluster index of every sample (-1 = not assigned to
# any cluster); db.core_sample_indices_ holds the indices of the core samples.
labels = db.labels_
# Boolean vector with one entry per sample: start all-False, then switch on
# the positions of the core samples.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
# Number of clusters found. Label -1 marks noise points, so it is not
# counted as a cluster.
n_clusters_ = len(set(labels))
if -1 in labels:
    n_clusters_ -= 1
# Model evaluation: report the cluster count, then the clustering scores.
print('估计的聚类个数为: %d' % n_clusters_)
# Scores that compare the predicted labels with the ground-truth labels.
supervised_scores = [
    # Homogeneity: each cluster contains only members of a single class.
    ("同质性: %0.3f", metrics.homogeneity_score),
    # Completeness: all members of a given class land in the same cluster.
    ("完整性: %0.3f", metrics.completeness_score),
    # V-measure: harmonic mean of homogeneity and completeness.
    ("V-measure: %0.3f", metrics.v_measure_score),
    # Adjusted Rand index.
    ("调整兰德指数: %0.3f", metrics.adjusted_rand_score),
    # Adjusted mutual information.
    ("调整互信息: %0.3f", metrics.adjusted_mutual_info_score),
]
for template, scorer in supervised_scores:
    print(template % scorer(labels_true, labels))
# Silhouette coefficient: computed from the data and the predicted labels
# only, so no ground truth is passed.
print("轮廓系数: %0.3f" % metrics.silhouette_score(X, labels))
# Apply the seaborn theme before the second panel is created.
sns.set(font='SimHei', style='ticks')
# One colour per distinct label, spread evenly over the Spectral colormap.
unique_labels = set(labels)
colors = [plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))]
# Second panel: the clustering result.
ax = fig.add_subplot(1, 2, 2)
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Label -1 marks noise points; draw them in black.
        col = [0, 0, 0, 1]
    # True at the positions of all samples assigned to label k.
    class_member_mask = (labels == k)
    # Core samples of this label: large markers.
    xy = X[class_member_mask & core_samples_mask]
    ax.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=14)
    # Non-core samples of this label: small markers.
    xy = X[class_member_mask & ~core_samples_mask]
    ax.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col), markeredgecolor='k', markersize=6)
# Title the clustering panel explicitly rather than via the implicit
# "current axes".
ax.set_title('Estimated number of clusters: %d' % n_clusters_)
sns.despine()
plt.show()