/*
* cluster.c -- cluster main program
*
* $Log: cluster.c,v $
* Revision 1.23 1993/02/03 07:43:07 stolcke
* added code vector output for cluster
*
* Revision 1.22 1993/01/20 10:28:02 stolcke
* faster O(n^2) clustering
*
* Revision 1.21 1993/01/16 01:44:34 stolcke
* use triangular distance matrix (save 1/2 the memory)
*
* Revision 1.20 1993/01/16 00:27:07 stolcke
* make dflag printing more efficient
*
* Revision 1.19 1993/01/16 00:03:30 stolcke
* avoid unnecessary reallocation of patterns
* skip non-root distance entries more efficiently
*
* Revision 1.18 1992/02/12 07:13:13 stolcke
* -E option added
*
* Revision 1.17 91/07/15 12:56:00 stolcke
* width option added
*
* Revision 1.16 91/07/14 01:10:54 stolcke
* curses support added
* graphing routines moved to separate file
*
* Revision 1.15 91/07/10 21:26:54 stolcke
* saber-cleaned and bad bug in allocation macro fixed
*
* Revision 1.14 91/04/20 16:17:59 stolcke
* second release (beta)
*
* Revision 1.13 91/04/18 19:03:39 stolcke
* some more error checks
*
* Revision 1.12 91/04/18 18:28:21 stolcke
* general cleanup and partial rewrite
*
* Revision 1.11 91/04/18 13:29:29 stolcke
* merged pca into cluster
*
* Revision 1.10 91/04/12 02:09:48 stolcke
* name quoting added
*
* Revision 1.9 90/12/06 21:47:46 stolcke
* fixed averaging with D/C values
*
* Revision 1.8 90/12/06 14:32:52 stolcke
* support for don't care values added
* fixed bug when total_distance = 0
*
* Revision 1.7 90/11/12 20:31:22 stolcke
* two initialization fixed
*
* Revision 1.6 90/11/12 18:43:26 stolcke
* workaround to avoid old Sun compiler bug
*
* Revision 1.5 90/11/12 16:40:34 stolcke
* added metric selection and scaling
*
* Revision 1.4 90/11/11 18:14:27 stolcke
* -n flag no longer needed
*
* Revision 1.3 90/11/11 02:30:14 stolcke
* no more seek needed, can use in pipe now
*
* Revision 1.2 90/11/10 23:44:02 stolcke
* added new features (see manpage)
*
*/
#if !defined(lint) && !defined(SABER)
static char rcsid[] = "$Header: /usr/src/local/conn/cluster/RCS/cluster.c,v 1.23 1993/02/03 07:43:07 stolcke Exp $";
#endif /* not lint */
#include <stdio.h>
#include <string.h>
#include <math.h>
#include "alloc.h"
#include "error.h"
#include "cluster-no-noise.h"
#define NONE (-2)
#define BUFSIZE 256
#ifndef SCALE
#define SCALE "_SCALE_"
#endif
#ifndef DONTCARE
#define DONTCARE "D/C"
#endif
#ifndef MAXFLOAT
#define MAXFLOAT ((float)3.40282346638528860e+38)
#endif
static FLOAT distance();
static FLOAT root();
static FLOAT cure_distance();
static FLOAT cure_one_distance();
static void merge();
static void repInTree();
static void print_rep_points();
static char buffer[BUFSIZE]; /* temporary string read buffer */
#define PCA "pca"
char *program; /* program name */
int vflag = 0; /* print explanatory messages */
static int pflag = 0; /* do PCA instead of HCA */
static int dflag = 0; /* print distances */
static int tflag = 0; /* print tree in ASCII */
static int Tflag = 0; /* display tree using curses */
static int gflag = 0; /* print tree in graph(1) format */
static int bflag = 0; /* print tree in graph(1) format, with breaks */
static int Bflag = 0; /* print patterns as bit vectors */
static int sflag = 0; /* suppress scaling */
int Eflag = 0; /* print eigenvalues */
static int width = 0; /* width of ASCII tree representation */
static int norm = 2; /* which l-norm to use for distance metric */
static int csize = 0; /* number of clusters */
static int rsize = 10; /* number of rep points in a cluster */
static float alpha = 0.3; /* shrinking factor alpha */
static
usage()
{
if (!pflag)
fprintf(stderr,
"usage: %s [-dtTgbBsv] [-w width] [-n norm] [-k no-cluster] [-r rep-size] [-a alpha] [vectorfile [namesfile]]\n", program);
else
fprintf(stderr,
"usage: %s [-Esv] [-e eigenbase] [-c pcs] [vectorfile [namesfile]]\n", PCA);
exit(2);
}
main(argc, argv)
int argc;
char *argv[];
{
FILE *fp, *pfile;
char *efile = NULL;
char *comps = NULL;
FLOAT **pattern = NULL;
int lpat, npat;
char **name = NULL;
int opt;
extern char *optarg;
extern int optind;
int *pinfo, i;
char fname[80];
/* who are we ? */
if ((program = strrchr(argv[0], '/')) == NULL)
program = argv[0];
else
program += 1;
if (strcmp(program, PCA) == 0)
pflag = 1;
while ((opt = getopt(argc, argv, "pdtTgbBvsn:a:r:k:e:c:w:E")) != -1)
switch (opt) {
case 'p':
pflag = 1;
break;
case 'd':
if (pflag)
usage();
dflag = 1;
break;
case 't':
if (pflag)
usage();
tflag = 1;
break;
case 'T':
if (pflag)
usage();
Tflag = 1;
break;
case 'g':
if (pflag)
usage();
gflag = 1;
break;
case 'b':
if (pflag)
usage();
bflag = 1;
gflag = 1;
break;
case 'B':
if (pflag)
usage();
Bflag = 1;
break;
case 'v':
vflag = 1;
break;
case 'n':
if (pflag)
usage();
if (sscanf(optarg, "%d", &norm) != 1 || norm < 0)
usage();
break;
case 'k':
if (pflag)
usage();
if (sscanf(optarg, "%d", &csize) != 1 || csize < 0)
usage();
break;
case 'a':
if (pflag)
usage();
if (sscanf(optarg, "%f", &alpha) != 1 || alpha < 0 || alpha > 1.0)
usage();
break;
case 'r':
if (pflag)
usage();
if (sscanf(optarg, "%d", &rsize) != 1 || rsize < 0)
usage();
break;
case 'e':
if (!pflag)
usage();
efile = optarg;
break;
case 'c':
if (!pflag)
usage();
comps = optarg;
break;
case 's':
sflag = 1;
break;
case 'w':
if (pflag)
usage();
if (sscanf(optarg, "%d", &width) != 1 || width <= 0)
usage();
break;
case 'E':
if (!pflag)
usage();
Eflag = 1;
break;
case '?':
usage();
break;
}
if (!(pflag || dflag || tflag || Tflag || gflag || Bflag))
dflag = tflag = vflag = 1; /* default behavior */
if (optind + 2 < argc)
usage();
if (!(optind < argc) || !strcmp(argv[optind], "-"))
fp = stdin;
else
IfErr(fp = fopen(argv[optind], "r")) {
fprintf(stderr, "%s: cannot open file %s\n", argv[0], argv[optind]);
exit(1);
}
IfErr(read_pattern(fp, &pattern, &lpat, &npat, &name)) {
fprintf(stderr, "%s: %s: cannot read pattern\n", program, ERR_MSG);
exit(1);
}
if (vflag)
fprintf(stderr, "read %d patterns: size = %d\n", npat, lpat);
if (optind + 1 < argc) {
IfErr (name = new_array_of(npat, char *)) {;
fprintf(stderr, "%s: not enough core for name array\n", program);
exit(1);
}
if (!strcmp(argv[optind + 1], "-"))
fp = stdin;
else
IfErr(fp = fopen(argv[optind + 1], "r")) {
fprintf(stderr, "%s: cannot open file %s\n",
program, argv[optind + 1]);
exit(1);
}
IfErr(read_names(fp, name, npat)) {
fprintf(stderr, "%s: %s: cannot read names\n", program, ERR_MSG);
exit(1);
}
}
IfErr(pinfo = new_array_of(npat, int)) {
fprintf(stderr, "%s: not enough core\n", program);
exit(1);
}
sprintf(fname, "%s-partition", argv[optind]);
IfErr(pfile = fopen(fname, "w")) {
fprintf(stderr, "%s: cannot open file %s\n", argv[0], argv[optind]);
exit(1);
}
cluster(pattern, name, lpat, npat, pinfo);
for (i=0; i<npat; ++i)
fprintf(pfile, "%d\n", pinfo[i]);
free(pinfo);
fclose(pfile);
exit(0);
}
/* skip blanks and next end of line */
skip_blanks(fp)
FILE *fp;
{
char c;
while ((c = getc(fp)) == ' ' || c == '\t');
if (c != '\n')
ungetc(c, fp);
return c;
}
read_names(fp, name, npat)
FILE *fp;
char **name;
int npat;
{
register int i;
for (i = 0; i < npat; i++) {
char c = skip_blanks(fp);
if (c == '\"') {
getc(fp);
fgets(buffer, sizeo