import numpy as np
import matplotlib.pyplot as plt
import itertools
import random
import datetime
class Hiercluster:
    """Agglomerative hierarchical clustering over the Frogs MFCCs data,
    with min-max normalization, PCA reduction, and purity / Rand-index
    evaluation helpers."""

    def __init__(self):
        # All state (train_X / train_Y) is attached later by loadcsv().
        pass
def loadcsv(self,trainfile = 'C:\\Users\\DELL\\Desktop\\Frogs_MFCCs.csv'):
self.train_X = np.loadtxt(trainfile,delimiter=',',dtype = float,skiprows=1,usecols=range(0,22))
self.train_Y = np.loadtxt(trainfile,delimiter=',',dtype = str,skiprows=1,usecols=(22))
def Normalize(self):
max_data = np.max(self.train_X,axis=0) #获取每个特征的最大值,为下面规范数据用
min_data = np.min(self.train_X,axis=0) #获取每个特征的最小值,为下面规范数据用
max_set = np.zeros_like(self.train_X); max_set[:] = max_data #以每个特征的最大值,构建一个与训练集结构一样的数据集
min_set = np.zeros_like(self.train_X); min_set[:] = min_data #以每个特征的最小值,构建一个与训练集结构一样的数据集
self.train_X = (self.train_X - min_set)/(max_set - min_set) #规范训练集
def Get1000data(self, train_X, train_Y, n=1000):
    """Draw `n` distinct random samples (default 1000, preserving the
    original behavior) from the parallel sequences train_X / train_Y.

    Returns (dataset, Label): two parallel lists of the sampled points
    and their labels. Raises ValueError (from random.sample) when
    n > len(train_X).
    """
    # random.sample accepts a range directly; no need to build a list.
    indices = random.sample(range(len(train_X)), n)
    dataset = [train_X[i] for i in indices]
    Label = [train_Y[i] for i in indices]
    return dataset, Label
def Generate_N_data(self, train_X):
    """Return the column-wise mean (centroid) of train_X."""
    return np.mean(train_X, axis=0)
def PCA(self,train_X,threshold):
MEAN_X_0 = self.Generate_N_data(train_X)
mean_X = np.zeros_like(train_X)
mean_X[:] = MEAN_X_0
train_X = train_X - mean_X
Cov = np.cov(train_X.T)
FeatureValue, FeatureVector = np.linalg.eig(Cov)
Featuresum = FeatureValue.sum()
argarray = FeatureValue.argsort(axis = 0)
Msum = 0
FeatureMvec = []
bound = threshold * Featuresum
Dim = 0
for i in reversed(argarray):
Msum += FeatureValue[i]
FeatureMvec.append([vector[i] for vector in FeatureVector])
if Msum >= bound:
Dim = i
break
FeatureMvec = np.matrix(FeatureMvec)
Destrain_X = train_X * FeatureMvec.T
return Destrain_X
def preprocess(self, X):
    """Build the initial singleton partition: cluster i contains only X[i].

    Returns (classList, IndexList): cluster id -> list of points, and
    cluster id -> list of original sample indices.
    """
    classList = {i: [point] for i, point in enumerate(X)}
    IndexList = {i: [i] for i in range(len(X))}
    return classList, IndexList
def getAllDistances(self, A):
    """Return the full (n, n) matrix of pairwise squared Euclidean distances.

    Broadcasts a (1, n, d) view against its (n, 1, d) transpose so one
    subtraction produces every row-pair difference.
    """
    row_view = np.expand_dims(A, axis=2)
    row_view = row_view.reshape(1, row_view.shape[0], row_view.shape[1])
    col_view = row_view.swapaxes(0, 1)
    return np.sum(np.square(row_view - col_view), axis=2)
def getDistances(self, point, train_X):
    """Squared Euclidean distance from `point` to every row of train_X.

    Tiles the point with zeros_like + fill rather than broadcasting, so
    the array subtype of train_X (e.g. np.matrix) is preserved exactly.
    """
    tiled = np.zeros_like(train_X)
    tiled[:] = point
    return np.sum(np.square(train_X - tiled), axis=1)
def Get_argmin(self, distances):
    """Return (i, j) with i < j of the smallest off-diagonal entry of the
    square distance matrix.

    The diagonal (self-distances) is masked in place with +inf. The
    previous sentinel value of 10 was a bug: squared distances between
    min-max-normalized 22-feature points can reach 22, so a diagonal
    cell could undercut every genuine pair and be returned.
    """
    np.fill_diagonal(distances, np.inf)
    flat = int(distances.argmin())
    row, col = divmod(flat, distances.shape[0])  # matrix is square
    return (row, col) if row < col else (col, row)
def MajorityCnt(self, Test_Y, Labels):
    """Cross-tabulate cluster assignments against ground-truth classes.

    Returns {cluster label: {true class: count}}.
    """
    table = {}
    for i in range(len(Test_Y)):
        per_cluster = table.setdefault(Labels[i], {})
        per_cluster[Test_Y[i]] = per_cluster.get(Test_Y[i], 0) + 1
    return table
def Purity(self, A, N):
    """Purity of a clustering: sum over clusters of the majority-class
    count, divided by the total sample count N.

    A is the {cluster: {class: count}} table from MajorityCnt.
    """
    majority_total = sum(max(counts.values()) for counts in A.values())
    return majority_total / N
def landam(self, test_Y, Labels):
    """Rand index: the fraction of sample pairs on which ground truth and
    clustering agree (both in the same group, or both in different ones)."""
    agree = disagree = 0
    for p, q in itertools.combinations(range(len(Labels)), 2):
        same_truth = test_Y[p] == test_Y[q]
        same_cluster = Labels[p] == Labels[q]
        if same_truth == same_cluster:
            agree += 1       # covers both the a and d cases
        else:
            disagree += 1    # covers both the b and c cases
    return agree / (agree + disagree)
def Evaluate(self,train_Y,Label):
#print(train_Y)
A = self.MajorityCnt(train_Y,Label)
Purity = self.Purity(A,len(train_Y))
Lamda = self.landam(train_Y,Label)
return Purity,Lamda
def HierCluster(self, train_X, Y, k, Labels):
    """Agglomerative clustering by repeatedly merging the closest pair.

    Parameters
    ----------
    train_X : 2-D array of points; the [Point] unpacking below implies
        rows behave like (1, d) np.matrix rows — TODO confirm callers
        pass an np.matrix (e.g. the PCA output).
    Y : NOTE(review): never read inside this method.
    k : target number of clusters; merging stops when the distance
        matrix has shrunk to k rows.
    Labels : mutable list mapping current row index -> cluster id;
        consumed destructively (entries deleted as clusters merge).

    Returns a length-1000 array assigning each original sample a 0-based
    cluster number — NOTE(review): 1000 is hard-coded; breaks for any
    other sample count.
    """
    classList, IndexList = self.preprocess(train_X)
    Num = 0
    X = train_X.copy()
    # Full pairwise squared-distance matrix; shrinks by one row/col per merge.
    distances = self.getAllDistances(train_X)
    while distances.shape[0] != k:
        # Closest pair of current clusters, returned with i < j.
        i, j = self.Get_argmin(distances)
        if i == 1:
            Num += 1  # NOTE(review): counted but never used afterwards
        L_i = Labels[i]
        L_j = Labels[j]
        # Merge cluster j into cluster i (points and original indices).
        classList[L_i].extend(classList[L_j])
        IndexList[L_i].extend(IndexList[L_j])
        del classList[L_j]
        del IndexList[L_j]
        s = 0  # NOTE(review): unused
        # Drop row/column j from the distance matrix and the bookkeeping.
        distances = np.delete(distances, j, axis=0)
        distances = np.delete(distances, j, axis=1)
        del Labels[j]
        X = np.delete(X, j, axis=0)
        # Representative of the merged cluster = centroid of its points;
        # the [Point] unpacking extracts the single row of a (1, d) matrix.
        [Point] = np.mean(classList[L_i], axis=0)
        X[i] = Point
        # Refresh row i and (via the transpose) column i with distances
        # to the new centroid.
        distance = self.getDistances(Point, X)
        # Flatten the nested-list row sum of a matrix — TODO confirm this
        # expects np.matrix input; a plain ndarray would make this fail.
        distance = sum(distance.tolist(), [])
        distances[i] = distance
        distances = distances.T
        distances[i] = distance
    # Renumber surviving clusters 0..k-1 in dict iteration order and
    # write each original sample's cluster id into the output array.
    i = 0
    test = np.zeros(1000)  # NOTE(review): hard-coded sample count
    for val in IndexList:
        for j in IndexList[val]:
            test[j] = i
        i += 1
    return test
def PlotGraph(self,X,Label):
fig = plt.figure()
plt.figure(figsize=(8, 5), dpi=80)
ax = plt.subplot(111)
plt.xticks(np.arange(-1,2*np.pi,0.1))
plt.yticks(np.arange(-1,2*np.pi,0.1))
idx_0 = np.where(Label==0)
p1 = ax.scatter(X[idx_0,0].tolist(),X[idx_0,1].tolist(),color = 'black',label= 0,s =10)
idx_1 = np.where(Label==1)
p1 = ax.scatter(X[idx_1,0].tolist(),X[idx_1,1].tolist(),color = 'red',label= 1,s = 10)
idx_2 = np.where(Label==2)
p2 = ax.scatter(X[idx_2,0].tolist(),X[idx_2,1].tolist(),color ='green',label= 2,s = 10)
idx_3 = np.where(Label==3)
p3 = ax.scatter(X[idx_3,0].tolist(),X[idx_3,1].tolist(),color ='blue',label = 3,s = 10)
idx_4 = np.where(Label==4)
p3 = ax.scatter(X[idx_4,0].tolist(),X[idx_4,1].tolist(),color ='yellow',label=4,s = 10)
idx_5 = np.where(Label==5)
p3 = ax.scatter(X[idx_5,0].tolist(),X[idx_5,1].tolist(),color ='darkorange',label=5,s = 10)
idx_6 = np.where(Label==6)
p3 = ax.scatter(X[idx_6,0].tolist(),X[idx_6,1].tolist(),color ='pink',label=6,s = 10)
idx_7 = np.where(Label==7)
p3 = ax.scatter(X[idx_7,0].tolist(),X[idx_7,1].tolist(),color ='navy',label=7,s = 10)
idx_8 = np.where(Label==8)
p3 = ax.scatter(X[idx_8,0].tolist(),X[idx_8,1].tolist(),color ='slategray',label=8,s = 10)
idx_9 = np.where(Label==9)
p3 = ax.scatter(X[idx_9,0].tolist(),X[idx_9,1].tolist(),color ='cyan',label=9,s = 10)
plt.show()
def writecsv1(self, Y, k, trainfile='.\\Kmeans.csv'):
    """Write the label sequence Y to `trainfile`, one integer per line.

    The k parameter is accepted but not used (kept for interface
    compatibility with existing callers).
    """
    np.savetxt(trainfile, Y, fmt='%d')
USV = Hiercluster()
USV.loadcsv()
USV.Normalize()
A = USV.PC