function [label, center, bCon, sumD, D] = litekmeans(X, k, varargin)
%LITEKMEANS K-means clustering, accelerated by MATLAB matrix operations.
%
% label = LITEKMEANS(X, K) partitions the points in the N-by-P data matrix
% X into K clusters. This partition minimizes the sum, over all
% clusters, of the within-cluster sums of point-to-cluster-centroid
% distances. Rows of X correspond to points, columns correspond to
% variables. LITEKMEANS returns an N-by-1 vector label containing the
% cluster index of each point.
%
% [label, center] = LITEKMEANS(X, K) returns the K cluster centroid
% locations in the K-by-P matrix center.
%
% [label, center, bCon] = LITEKMEANS(X, K) returns the logical value bCon,
% which indicates whether the iteration converged.
%
% [label, center, bCon, SUMD] = LITEKMEANS(X, K) returns the
% within-cluster sums of point-to-centroid distances in the 1-by-K vector
% sumD.
%
% [label, center, bCon, SUMD, D] = LITEKMEANS(X, K) returns
% distances from each point to every centroid in the N-by-K matrix D.
%
% [ ... ] = LITEKMEANS(..., 'PARAM1',val1, 'PARAM2',val2, ...) specifies
% optional parameter name/value pairs to control the iterative algorithm
% used by LITEKMEANS. Parameters are:
%
% 'Distance' - Distance measure, in P-dimensional space, that LITEKMEANS
% should minimize. Choices are:
% {'sqEuclidean'} - Squared Euclidean distance (the default)
% 'cosine' - One minus the cosine of the included angle
% between points (treated as vectors). Each
% row of X SHOULD be normalized to unit length. If
% the initial center matrix is provided, it
% SHOULD also be normalized.
%
% 'Start' - Method used to choose initial cluster centroid positions,
% sometimes known as "seeds". Choices are:
% {'sample'} - Select K observations from X at random (the default)
% 'cluster' - Perform preliminary clustering phase on random 10%
% subsample of X. This preliminary phase is itself
% initialized using 'sample'. An additional parameter
% clusterMaxIter can be used to control the maximum
% number of iterations in each preliminary clustering
% problem.
% matrix - A K-by-P matrix of starting locations; or a K-by-1
% (or 1-by-K) index vector indicating which K points in X
% should be used as the initial centers. In this case,
% you can pass in [] for K, and LITEKMEANS infers K from
% the first dimension of the matrix.
%
% 'MaxIter' - Maximum number of iterations allowed. Default is 100.
%
% 'Replicates' - Number of times to repeat the clustering, each with a
% new set of initial centroids. Default is 1. If the
% initial centroids are provided, the number of replicates
% is automatically set to 1.
%
% 'clusterMaxIter' - Only useful when 'Start' is 'cluster'. Maximum number
% of iterations of the preliminary clustering phase.
% Default is 10.
%
%
% Examples:
%
% fea = rand(500,10);
% [label, center] = litekmeans(fea, 5, 'MaxIter', 50);
%
% fea = rand(500,10);
% [label, center] = litekmeans(fea, 5, 'MaxIter', 50, 'Replicates', 10);
%
% fea = rand(500,10);
% [label, center, bCon, sumD, D] = litekmeans(fea, 5, 'MaxIter', 50);
% TSD = sum(sumD);
%
% fea = rand(500,10);
% initcenter = rand(5,10);
% [label, center] = litekmeans(fea, 5, 'MaxIter', 50, 'Start', initcenter);
%
% fea = rand(500,10);
% idx=randperm(500);
% [label, center] = litekmeans(fea, 5, 'MaxIter', 50, 'Start', idx(1:5));
%
%
% See also KMEANS
%
% [Cite] Deng Cai, "Litekmeans: the fastest matlab implementation of
% kmeans," Available at:
% http://www.zjucadcg.cn/dengcai/Data/Clustering.html, 2011.
%
% version 2.0 --December/2011
% version 1.0 --November/2011
%
% Written by Deng Cai (dengcai AT gmail.com)
if nargin < 2
error('litekmeans:TooFewInputs','At least two input arguments required.');
end
[n, p] = size(X);
pnames = { 'distance' 'start' 'maxiter' 'replicates' 'onlinephase' 'clustermaxiter'};
dflts = {'sqeuclidean' 'sample' [] [] 'off' [] };
[eid,errmsg,distance,start,maxit,reps,~,clustermaxit] = getargs(pnames, dflts, varargin{:});
if ~isempty(eid)
error(sprintf('litekmeans:%s',eid),errmsg);
end
if ischar(distance)
distNames = {'sqeuclidean','cosine'};
j = strcmpi(distance, distNames);
j = find(j);
if length(j) > 1
error('litekmeans:AmbiguousDistance', ...
'Ambiguous ''Distance'' parameter value: %s.', distance);
elseif isempty(j)
error('litekmeans:UnknownDistance', ...
'Unknown ''Distance'' parameter value: %s.', distance);
end
distance = distNames{j};
else
error('litekmeans:InvalidDistance', ...
'The ''Distance'' parameter value must be a string.');
end
center = [];
if ischar(start)
startNames = {'sample','cluster'};
j = find(strncmpi(start,startNames,length(start)));
if length(j) > 1
error(message('litekmeans:AmbiguousStart', start));
elseif isempty(j)
error(message('litekmeans:UnknownStart', start));
elseif isempty(k)
error('litekmeans:MissingK', ...
'You must specify the number of clusters, K.');
end
if j == 2
if floor(.1*n) < 5*k
j = 1;
end
end
start = startNames{j};
elseif isnumeric(start)
if size(start,2) == p
center = start;
elseif (size(start,2) == 1 || size(start,1) == 1)
center = X(start,:);
else
error('litekmeans:MisshapedStart', ...
'The ''Start'' matrix must have the same number of columns as X.');
end
if isempty(k)
k = size(center,1);
elseif (k ~= size(center,1))
error('litekmeans:MisshapedStart', ...
'The ''Start'' matrix must have K rows.');
end
start = 'numeric';
else
error('litekmeans:InvalidStart', ...
'The ''Start'' parameter value must be a string or a numeric matrix or array.');
end
% The maximum number of iterations defaults to 100
if isempty(maxit)
maxit = 100;
end
% The maximum number of iterations for the preliminary clustering phase on
% a random 10% subsample defaults to 10
if isempty(clustermaxit)
clustermaxit = 10;
end
% Assume one replicate
if isempty(reps) || ~isempty(center)
reps = 1;
end
if ~(isscalar(k) && isnumeric(k) && isreal(k) && k > 0 && (round(k)==k))
error('litekmeans:InvalidK', ...
'X must be a positive integer value.');
elseif n < k
error('litekmeans:TooManyClusters', ...
'X must have more rows than the number of clusters.');
end
bestlabel = [];
sumD = zeros(1,k);
bCon = false;
for t=1:reps
switch start
case 'sample'
center = X(randsample(n,k),:);
case 'cluster'
Xsubset = X(randsample(n,floor(.1*n)),:);
[~, center] = litekmeans(Xsubset, k, varargin{:}, 'start','sample', 'replicates',1 ,'MaxIter',clustermaxit);
case 'numeric'
end
last = 0;label=1;
it=0;
switch distance
case 'sqeuclidean'
while any(label ~= last) && it<maxit
last = label;
bb = full(sum(center.*center,2)');
ab = full(X*center');
D = bb(ones(1,n),:) - 2*ab;
[val,label]
Matlab-使用Matlab实现的聚类算法-Clustering.zip
4 浏览量
2024-02-22
20:27:55
上传
评论
收藏 12KB ZIP 举报
__AtYou__
- 粉丝: 1716
- 资源: 566