# -*- coding: utf-8 -*-
import numpy as np
import os
import random
import cv2
import codecs
import copy
from PIL import Image
import datetime
from scipy.spatial.distance import pdist
def plus(dataSet,k=2):
    """Choose ``k`` initial cluster centres with a k-means++ style seeding.

    The first centre is a random vector drawn inside the bounding box of
    ``dataSet``. Every further centre is a row of ``dataSet`` drawn with
    probability proportional to its distance to the nearest centre chosen
    so far. The centres are also written, one per line, to the file
    ``初始聚类中心`` in the current working directory.

    Args:
        dataSet: 2-D numpy array, one sample per row.
        k: number of centres to produce.

    Returns:
        A 2-D numpy array with one centre per row. At least one centre is
        always produced, even when ``k`` is smaller than 1.
    """
    lenth, dim = dataSet.shape
    _max = np.max(dataSet, axis=0)
    _min = np.min(dataSet, axis=0)
    # Built as a 2-D array so that one row always means one centre.
    centers = np.array([_min + (_max - _min) * np.random.rand(dim)])
    begin = datetime.datetime.now()
    # Distance of every sample to its nearest centre so far; updated
    # incrementally instead of being recomputed against all centres.
    nearest = np.linalg.norm(dataSet - centers[0], axis=1)
    for i in range(1, k):
        print('随机中心:', i)
        print(datetime.datetime.now())
        # Roulette-wheel selection: lay the distances end to end and see
        # which segment a uniformly drawn point falls into.
        cumulative = np.cumsum(nearest)
        temp = cumulative[-1] * np.random.rand()
        j = int(np.searchsorted(cumulative, temp, side='right'))
        if j >= lenth:
            # Every distance is zero (or rounding pushed the draw past the
            # end): fall back to a uniform pick so that a centre is still
            # added and the result keeps k rows.
            j = int(np.random.randint(lenth))
        centers = np.append(centers, [dataSet[j]], axis=0)
        nearest = np.minimum(
            nearest, np.linalg.norm(dataSet - dataSet[j], axis=1))
    # Persist the chosen centres; the context manager closes the file
    # even if a write fails.
    with codecs.open('初始聚类中心', 'w', 'utf-8') as f:
        for center in centers:
            f.write(str(center.tolist()) + '\n')
    print(datetime.datetime.now() - begin)
    return centers
def kmeansplus(dataSet,*,k,maxIter=300):
    """Run k-means on ``dataSet`` with centres seeded by ``plus``.

    Args:
        dataSet: 2-D numpy array, one sample per row.
        k: number of clusters (keyword-only).
        maxIter: maximum number of assignment/update rounds.

    Returns:
        ``(labels, centers)``: the index of the nearest centre for every
        sample, and the array of centres.
    """
    # Initialise with the ++ seeding.
    centers = plus(dataSet, k)

    def assign_all():
        # Index of the nearest centre for every sample; ties go to the
        # lowest index.
        return np.array([
            np.argmin(np.linalg.norm(row - centers, axis=1))
            for row in dataSet
        ])

    # None, not an array of ones: a sentinel made of real label values
    # could equal a genuine first assignment and stop the loop before any
    # centre has been updated.
    labels = None
    label_new = None
    begin = datetime.datetime.now()
    print(begin)
    for j in range(maxIter):
        print('kmeans迭代:', j)
        print(datetime.datetime.now())
        label_new = assign_all()
        if labels is not None and np.array_equal(labels, label_new):
            break  # assignment is stable
        labels = label_new
        for i in range(k):
            members = dataSet[labels == i]
            # An empty cluster keeps its previous centre.
            if len(members) != 0:
                centers[i] = np.mean(members, axis=0)
    if label_new is None:
        # maxIter <= 0: still return a valid assignment.
        labels = label_new = assign_all()
    print(datetime.datetime.now() - begin)
    # Sum of squared errors of every sample to its cluster centre.
    SSE = 0.0
    for i in range(min(k, len(centers))):
        diff = dataSet[labels == i] - centers[i]
        SSE += float(np.sum(diff * diff))
    print("SSE: ", SSE)
    return label_new, centers
def get_class(name):
    """Return the class id encoded in an image path.

    The class id is the first six characters of the final path component,
    e.g. ``...\\001000\\001000b07256.jpg`` gives ``'001000'``. Both ``\\``
    and ``/`` are accepted as separators. A name with no separator is
    treated as a bare file name.
    """
    # rfind returns -1 when the separator is absent, which makes the slice
    # start at 0 for bare file names instead of indexing off the string.
    cut = max(name.rfind('\\'), name.rfind('/'))
    return name[cut + 1:cut + 7]
def kmeans(dataSet,*,k,maxIter=300):
    """Run k-means on ``dataSet`` with randomly chosen samples as seeds.

    Args:
        dataSet: 2-D numpy array, one sample per row.
        k: number of clusters (keyword-only). If ``k`` exceeds the number
            of samples, every sample becomes a centre.
        maxIter: maximum number of assignment/update rounds.

    Returns:
        ``(labels, centers)``: the index of the nearest centre for every
        sample, and the array of centres.
    """
    # Random initialisation: sample distinct row indices. Shuffling the
    # ndarray with random.shuffle is unsafe because its row swap operates
    # on views and can duplicate rows.
    n = len(dataSet)
    picked = np.asarray(random.sample(range(n), max(0, min(k, n))), dtype=int)
    # Centres must be floating point, otherwise the means assigned below
    # would be truncated for integer data.
    if np.issubdtype(dataSet.dtype, np.floating):
        center_dtype = dataSet.dtype
    else:
        center_dtype = float
    centers = np.array(dataSet[picked], dtype=center_dtype)

    def assign_all():
        # Index of the nearest centre for every sample; ties go to the
        # lowest index.
        return np.array([
            np.argmin(np.linalg.norm(row - centers, axis=1))
            for row in dataSet
        ])

    # None, not an array of ones: a sentinel made of real label values
    # could equal a genuine first assignment and stop the loop before any
    # centre has been updated.
    labels = None
    label_new = None
    begin = datetime.datetime.now()
    print(begin)
    for j in range(maxIter):
        print('kmeans迭代:', j)
        print(datetime.datetime.now())
        label_new = assign_all()
        if labels is not None and np.array_equal(labels, label_new):
            break  # assignment is stable
        labels = label_new
        for i in range(k):
            members = dataSet[labels == i]
            # An empty cluster keeps its previous centre.
            if len(members) != 0:
                centers[i] = np.mean(members, axis=0)
    if label_new is None:
        # maxIter <= 0: still return a valid assignment.
        labels = label_new = assign_all()
    print(datetime.datetime.now() - begin)
    # Sum of squared errors of every sample to its cluster centre.
    SSE = 0.0
    for i in range(min(k, len(centers))):
        diff = dataSet[labels == i] - centers[i]
        SSE += float(np.sum(diff * diff))
    print("SSE: ", SSE)
    return label_new, centers
def get_rate(name,rank):
    """Return the fraction of entries in ``rank`` sharing ``name``'s class.

    Two names are considered to be of the same class when their characters
    at positions 10..13 (``[10:14]``) are equal.

    Args:
        name: the query name.
        rank: sequence of retrieved names.

    Returns:
        ``matches / len(rank)``, or ``0.0`` when ``rank`` is empty.
    """
    if len(rank) == 0:
        # Nothing retrieved: report 0 rather than dividing by zero.
        return 0.0
    key = name[10:14]
    right = sum(1 for item in rank if item[10:14] == key)
    return right / len(rank)
# Re-save the images under ``folder`` with numeric (ASCII) file names.
def changename(folder):
    """Re-save every image found under ``folder`` as ``<index>.jpg``.

    The new path is the old path with its last 12 characters replaced by
    the image's index in the file list plus ``.jpg``. The original file is
    removed after the copy has been written.

    NOTE(review): ``get_filename`` is defined elsewhere in this file; it is
    presumably returning the list of image paths under ``folder`` - confirm.
    """
    for i, old_path in enumerate(get_filename(folder)):
        new_path = old_path[0:len(old_path) - 12] + str(i) + '.jpg'
        # Close the source image before deleting it; an open handle leaks
        # and can block the removal on some platforms.
        with Image.open(old_path) as img:
            img.save(new_path)
        # Never delete the file that was just written.
        if os.path.abspath(new_path) != os.path.abspath(old_path):
            os.remove(old_path)
# Extract the RootSIFT descriptors of a single image.
def getSift(name):
    """Compute RootSIFT descriptors for the image at path ``name``.

    Each SIFT descriptor is L1-normalised and then square-rooted
    element-wise (RootSIFT).

    Returns:
        A 2-D array with one descriptor per row, or ``None`` when the image
        cannot be read or no descriptors are found.
    """
    img = cv2.imread(name)
    if img is None:
        # imread signals an unreadable image by returning None; report that
        # to the caller instead of crashing inside cvtColor.
        return None
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Newer OpenCV builds expose SIFT in the main module; older contrib
    # builds only provide it under xfeatures2d.
    if hasattr(cv2, 'SIFT_create'):
        sift = cv2.SIFT_create()
    else:
        sift = cv2.xfeatures2d.SIFT_create()
    # Keypoints are not needed, only the descriptors.
    _, des = sift.detectAndCompute(gray, None)
    if des is None:
        return des
    # RootSIFT: L1-normalise each row, then take the square root.
    # eps guards against division by zero for an all-zero descriptor.
    eps = 1e-7
    des /= (des.sum(axis=1, keepdims=True) + eps)
    des = np.sqrt(des)
    return des
# Extract and save the features of every image under a folder.
def get_sift_folder(path,siftname):
    """Write the descriptors of all images under ``path`` to ``siftname``.

    Images are processed in random order. Every descriptor is written as
    one line holding its list representation. Images for which ``getSift``
    returns ``None`` are printed and counted as errors.

    NOTE(review): ``get_filename`` is defined elsewhere in this file; it is
    presumably returning the list of image paths under ``path`` - confirm.
    """
    filename = get_filename(path)
    random.shuffle(filename)  # process the images in random order
    num_error = 0
    # The context manager closes the output file even if feature
    # extraction raises part-way through.
    with codecs.open(siftname, 'w', 'utf-8') as f:
        for i, image_path in enumerate(filename):
            sift = getSift(image_path)
            if sift is None:  # image failed to load / has no descriptors
                print(image_path)
                num_error += 1
                continue
            for descriptor in sift:
                f.write(str(descriptor.tolist()) + '\n')
            if i % 100 == 0:
                print(i)  # progress indicator
    if num_error > 0:
        print('num_error', num_error)
# Load data.
def load_data(filename,loadtype=1):
    """Load vectors stored one per line as ``[v1, v2, ...]``.

    Blank lines are skipped.

    Args:
        filename: path of a UTF-8 text file.
        loadtype: ``1`` - every line is a bare vector; ``2`` - every line
            is a name followed by a vector.

    Returns:
        For ``loadtype == 1`` a numpy array built from all vectors. For
        ``loadtype == 2`` a list of ``[name, vector]`` pairs. For any other
        ``loadtype`` an empty list.
    """
    def parse_vector(text):
        # "[1.0, 2.0]" -> array([1.0, 2.0])
        return np.array([float(v) for v in text.strip('[] ').split(',')])

    result = []
    with open(filename, encoding='utf-8') as f:
        if loadtype == 1:  # data only
            for line in f:
                line = line.strip('\n')
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                result.append(parse_vector(line))
            result = np.array(result)
        elif loadtype == 2:  # file name followed by data
            for line in f:
                line = line.strip('\n')
                if not line.strip():
                    continue  # tolerate blank / trailing empty lines
                # Split the name from the data at the first '['.
                temp = line.split('[', 1)
                result.append([temp[0].strip(' '), parse_vector(temp[1])])
    return result
def is_folder_button(path):
    """Tell whether ``path`` is a bottom-level folder.

    Returns ``1`` when the folder is non-empty and none of its entries is a
    directory, otherwise ``0`` (including for an empty folder).
    """
    entries = os.listdir(path)  # every file and directory directly inside
    if not entries:
        return 0
    # A single sub-directory is enough to disqualify the folder.
    has_subdir = any(
        os.path.isdir(os.path.join(path, entry)) for entry in entries
    )
    return 0 if has_subdir else 1
def folder_bottom(path):#递归找到所有最底层文件夹
res=[]
full=os.listdir(path)
for i in full:
评论0