#include<stdio.h>
#include<math.h>
#include<string.h>
#include<stdlib.h>
#include<vector>
#include<queue>
using namespace std;
#define MAX 1000
#define W 2 //模式向量的维数
#define K 3 //预期聚类中心个数
#define THETA_N 2 //每一个聚类域中最少的样本数量
#define THETA_S 1 //一个聚类域中样本距离分布的标准差
int THETA_C = 4; //两个聚类中心之间的最小距离
#define L 0 //再一次迭代运算中可以合并的聚类中心的最多对数
#define I 6 //迭代运算的最多次数
#define DIVIDE_FACTOR 0.5
typedef struct{
double featureVector[W+1];
int classification;
}sampleNode;
sampleNode *sn;
int iterNum; //记录当前迭代的次数
int sampleAmount; //样本的数量
class ClusterNode
{
public:
int samples[MAX];//存储属于本类的样本的序号
double insideAverageDistance; //模式样本与聚类中心间的平均距离
double maxStdDiffVector; //样本距离的标准差向量的最大分量值
sampleNode center; //该类的聚类中心
int amount; //样本数量
int number; // 该聚类是第number个聚类
ClusterNode(){
number = 0;
center = sn[number];
amount = 0;
int i;
memset(samples, 0, sizeof(samples));
}
double getInsideAverageDistance(){
int i, j;
double temp1;
double temp2;
temp1 = 0;
for(i=1; i<=amount; i++){
temp2 = 0;
for(j=1; j<=W; j++){
temp2 += pow(sn[samples[i]].featureVector[j]-center.featureVector[j], 2);
}
temp1 += sqrt(temp2);
}
this->insideAverageDistance = temp1/amount;
return this->insideAverageDistance;
}
void calSTDDiffVecor(){ //计算每个聚类中样本距离的标准差向量以及其最大分量
int i, j;
double temp;
double max;
double stdDiffVector[W+1];
for(i=1; i<=W; i++){
temp = 0;
for(j=1; j<=amount; j++){
temp += pow(sn[samples[j]].featureVector[i]-center.featureVector[i], 2);
}
stdDiffVector[i] = sqrt(temp/amount);
}
max = 0;
for(i=1; i<=W; i++){
if(stdDiffVector[i] > max){
max = stdDiffVector[i];
}
}
this->maxStdDiffVector = max;
}
};
vector<ClusterNode> cn; //聚类向量
class DistanceOfCenters
{
public:
int i;
int j;
double distance;
DistanceOfCenters(){
i = 0;
j = 0;
distance = 0.0;
}
DistanceOfCenters(int i, int j){
this->i = i;
this->j = j;
distance = getDistance(cn[i].center, cn[j].center);
}
double getDistance(sampleNode a, sampleNode b)
{
int i;
double dis;
dis = 0.0;
for(i=1; i<=W; i++){
dis += pow(a.featureVector[i]-b.featureVector[i],2);
}
return sqrt(dis);
}
friend bool operator<(const DistanceOfCenters &a, const DistanceOfCenters &b){
return a.distance > b.distance;
}
};
double totalAverageDistance; //全部模式样本对其相应聚类中心的总平均距离
int NC = 1; //当前聚类个数
int divideFlag = 0; //标志是否分裂
priority_queue<DistanceOfCenters> que;
void inputSample()
{ //样本输入函数
int i, j;
printf("请输入样本的数量:\n");
scanf("%d", &sampleAmount);
sn = (sampleNode*)malloc(sizeof(sampleNode)*(sampleAmount+1));
printf("\n请依次输入每个样本的特征值,特征值用空格隔开,样本用回车符隔开:\n");
for(i=1; i<=sampleAmount; i++){
printf("第%2d个样本:", i);
for(j=1; j<=W; j++){
scanf("%lf", &sn[i].featureVector[j]);
}
}
}
double getDistance(sampleNode a, sampleNode b)
{
int i;
double dis;
dis = 0.0;
for(i=1; i<=W; i++){
dis += pow(a.featureVector[i]-b.featureVector[i],2);
}
return sqrt(dis);
}
void initClassify()
{
int i, j;
double temp[MAX];
double min;
for(i=1; i<=NC; i++){
ClusterNode node;
node.center = sn[i];
node.amount = 0;
node.number = i;
memset(node.samples, 0, sizeof(node.samples));
cn.push_back(node);
}
for(i=1; i<=sampleAmount; i++){
for(j=1; j<=cn.size(); j++){
temp[j] = getDistance(sn[i], cn[j-1].center);
}
min = MAX;
for(j=1; j<=cn.size(); j++){
if(temp[j] < min){
min = temp[j];
cn[j-1].samples[++cn[j-1].amount] = i;
}
}
}
vector<ClusterNode>::iterator p;
for(p=cn.begin(); p!=cn.end(); p++){
if((*p).amount < THETA_N){
NC = NC-1;
cn.erase(p, p+1);
}
}
for(i=1; i<=NC; i++){
cn[i-1].number = i;
}
}
void reClassify()
{
int i, j, t;
double temp[MAX];
double min;
for(i=1; i<=NC; i++){
cn[i-1].amount = 0;
cn[i-1].number = i;
memset(cn[i-1].samples, 0, sizeof(cn[i-1].samples));
}
for(i=1; i<=sampleAmount; i++){
for(j=1; j<=NC; j++){
temp[j] = getDistance(sn[i], cn[j-1].center);
}
min = MAX;
for(j=1; j<=NC; j++){
if(temp[j] < min){
min = temp[j];
t = j;
}
}
cn[t-1].samples[++cn[t-1].amount] = i;
}
vector<ClusterNode>::iterator p;
for(p=cn.begin(); p!=cn.end(); p++){
if((*p).amount < THETA_N){
NC = NC-1;
cn.erase(p, p+1);
}
}
for(i=1; i<=NC; i++){
cn[i-1].number = i;
}
divideFlag = 0;
}
void reCalCenter()
{
int i, j, k;
double value[W+1];
for(i=1; i<=cn.size(); i++){
j = 1;
for(k=0; k<=W; k++){
value[k] = 0.0;
}
while(cn[i-1].samples[j] != 0){
for(k=1; k<=W; k++){
value[k] += sn[cn[i-1].samples[j]].featureVector[k];
}
j++;
}
for(k=1; k<=W; k++){
cn[i-1].center.featureVector[k] = value[k]/cn[i-1].amount;
}
}
}
void calTotalAverageDistance()
{ //计算全部模式样本对其相应聚类中心的总平均距离
int i;
double temp;
temp = 0.0;
for(i=1; i<=NC; i++){
temp += cn[i-1].getInsideAverageDistance();
}
totalAverageDistance = temp / NC;
}
void divideCluster()
{
int i, k;
for(i=1; i<=NC; i++){
if(cn[i-1].maxStdDiffVector>THETA_S){
if((NC<=K/2) || ((cn[i-1].insideAverageDistance>totalAverageDistance)
&&cn[i-1].amount>2*(THETA_N+1))){
divideFlag = 1;
ClusterNode newCluster;
newCluster.amount = 0;
newCluster.center = cn[i-1].center;
for(k=1; k<=K; k++){
newCluster.center.featureVector[k] += DIVIDE_FACTOR*cn[i-1].maxStdDiffVector;
}
for(k=1; k<=K; k++){
cn[i-1].center.featureVector[k] -= DIVIDE_FACTOR*cn[i-1].maxStdDiffVector;
}
cn.push_back(newCluster);
}
}
}
NC = cn.size(); //更新聚类数量
}
void mergeCluster()
{
int i, j, k, l;
double temp;
for(l=1; l<=L; l++){
for(i=1; i<NC; i++){
for(j=i+1; j<=NC; j++){
DistanceOfCenters newNode(i, j);
que.push(newNode);
}
}
if(!que.empty() && (que.top().distance<THETA_C)){
DistanceOfCenters cur = que.top();
que.pop();
for(k=1; k<=K; k++){
i = cur.i;
j = cur.j;
temp = cn[i-1].amount*cn[i-1].center.featureVector[k]+cn[j-1].amount*cn[j-1].center.featureVector[k];
cn[i-1].center.featureVector[k] = temp / (cn[i-1].amount+cn[j-1].amount);
}
vector<ClusterNode>::iterator p;
p = cn.begin();
p += j-1;
cn.erase(p, p+1); //删除第聚类j
for(i=1; i<=cn.size(); i++){ //为聚类重新编号
cn[i-1].number = i;
}
NC--;
while(!que.empty()){ //队列清空
que.pop();
}
}
}
}
void outputResult()
{
int i, j;
printf("\n聚类编号 样本编号\n");
for(i=1; i<=NC; i++){
printf("%d ", i);
for(j=1; j<=cn[i-1].amount; j++){
printf("%d ", cn[i-1].samples[j]);
}
printf("\n");
}
printf("\n迭代次数:%d", I);
printf("\n预期聚类中心个数:%d\n", K);
}
void process(){ //总的执行过程
int i, k;
iterNum = 1; //记录第一次迭代
inputSample(); //第一步:输入N个模式样本
initClassify(); //第二步和第三步:将N个模式样本分给最近的聚类Sj
reCalCenter(); //第四步:修正各聚类中心
for(i=1; i<=NC; i++){ //第五步:计算各聚类域Sj中诸聚类中心间的平均距离
cn[i-1].getInsideAverageDistance();
}
calTotalAverageDistance(); //第六步:计算全部模式样本对其相应聚类中心的总平均距离
while(1){
while(NC <= K/2){
for(i=1; i<=NC; i++){
cn[i-1].calSTDDiffVecor(); //第八步和第九步:计算每聚类中样本距离的标准差向量及最大分量
}
divideCluster(); //第十步:分裂聚类
if(divideFlag == 1){ //如果进行了分裂计算
reClassify(); //跳回第二步