/*----------------------------------------------------------------------
File : istree.c
Contents: item set tree management
Author : Christian Borgelt
History : 22.01.1996 file created
07.02.1996 _child, _count, ist_addlvl, and ist_count
09.02.1996 ist_rule programmed and debugged
10.02.1996 empty rule bodies made optional
28.03.1996 support made relative to number of item sets
25.06.1996 function _count optimized
23.11.1996 rule extraction redesigned
24.11.1996 rule selection criteria added
18.08.1997 normalized chi^2 measure added
parameter minlen added to function ist_init()
15.01.1998 confidence comparison changed to >=
23.01.1998 integer support computation changed (ceil)
26.01.1998 condition added to set extension in _child
10.02.1998 bug in computation of EM_INFO fixed
11.02.1998 parameter 'minval' added to function ist_init()
14.05.1998 item set tree navigation functions added
08.08.1998 item appearances considered for rule selection
20.08.1998 deferred child node vector allocation added
02.09.1998 several assertions added
05.09.1998 bug concerning node id fixed
07.09.1998 function ist_hedge added
22.09.1998 bug in rule extraction (item appearances) fixed
23.09.1998 computation of chi^2 measure simplified
05.02.1999 long int changed to int
25.08.1999 rule extraction simplified
05.11.1999 rule evaluation measure EM_AIMP added
08.11.1999 parameter 'aval' added to function ist_rule
11.11.1999 rule consequents moved to first field
01.12.1999 bug in node reallocation fixed
01.04.2001 functions ist_set and ist_getcntx added,
functions _count and _getsupp improved
28.12.2001 sort function moved to module tract
07.02.2002 tree clearing removed, counting improved
08.02.2002 child creation improved (check of body support)
10.02.2002 IST_IGNORE bugs fixed (ist_set and ist_hedge)
11.02.2002 memory usage minimization option added
12.02.2002 ist_first and ist_last replaced by ist_next
19.02.2002 transaction tree functions added
09.10.2002 bug in function ist_hedge fixed (conf. comp.)
12.03.2003 parameter lift added to function ist_rule
17.07.2003 check of item usage added (function ist_check)
18.07.2003 maximally frequent item set filter added
11.08.2003 item set filtering generalized (ist_filter)
15.08.2003 renamed new to cur in ist_addlvl (C++ compat.)
14.11.2003 definition of F_HDONLY changed to INT_MIN
02.12.2003 skipping unnecessary subtrees added (_stskip)
03.12.2003 bug in ist_check for rule mining fixed
12.12.2003 padding for 64 bit architecture added
09.05.2004 additional selection measure for sets added
----------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <float.h>
#include <math.h>
#include <assert.h>
#include "istree.h"
#ifdef STORAGE
#include "storage.h"
#endif
/*----------------------------------------------------------------------
Preprocessor Definitions
----------------------------------------------------------------------*/
#define LN_2 0.69314718055994530942 /* ln(2) */
#define EPSILON 1e-12 /* to cope with roundoff errors */
#define BLKSIZE 32 /* block size for level vector */
#define F_HDONLY INT_MIN /* flag for head only item in path */
#define F_SKIP INT_MIN /* flag for subtree skipping */
#define ID(n) ((int)((n)->id & ~F_HDONLY))
#define HDONLY(n) ((int)((n)->id & F_HDONLY))
/*----------------------------------------------------------------------
Type Definitions
----------------------------------------------------------------------*/
typedef double EVALFN (double head, double body, double post);
/* function to compute an additional evaluation measure */
/*----------------------------------------------------------------------
Auxiliary Functions
----------------------------------------------------------------------*/
static int _bsearch (int *vec, int n, int id)
{ /* --- binary search for an item */
int i, k; /* left and middle index */
assert(vec && (n > 0)); /* check the function arguments */
for (i = 0; i < n; ) { /* while the range is not empty */
k = (i + n) >> 1; /* get index of middle element */
if (vec[k] > id) n = k;
else if (vec[k] < id) i = k+1;
else return k; /* adapt range boundaries or return */
} /* the index the id. was found at */
return -1; /* return 'not found' */
} /* _bsearch() */
/*--------------------------------------------------------------------*/
static void _count (ISNODE *node, int *set, int cnt, int min)
{ /* --- count transaction recursively */
int i; /* vector index */
int *map, n; /* identifier map and its size */
ISNODE **vec; /* child node vector */
assert(node /* check the function arguments */
&& (cnt >= 0) && (set || (cnt <= 0)));
if (node->offset >= 0) { /* if a pure vector is used */
if (node->chcnt == 0) { /* if this is a new node */
n = node->offset; /* get the index offset */
while ((cnt > 0) && (*set < n)) {
cnt--; set++; } /* skip items before first counter */
while (--cnt >= 0) { /* traverse the transaction's items */
i = *set++ -n; /* compute counter vector index */
if (i >= node->size) return;
node->cnts[i]++; /* if the counter exists, */
} } /* count the transaction */
else if (node->chcnt > 0) { /* if there are child nodes */
vec = (ISNODE**)(node->cnts +node->size);
n = ID(vec[0]); /* get the child node vector */
min--; /* one item less to the deepest nodes */
while ((cnt > min) && (*set < n)) {
cnt--; set++; } /* skip items before first child */
while (--cnt >= min) { /* traverse the transaction's items */
i = *set++ -n; /* compute child vector index */
if (i >= node->chcnt) return;
if (vec[i]) _count(vec[i], set, cnt, min);
} /* if the child exists, */
} } /* count the transaction recursively */
else { /* if an identifer map is used */
map = node->cnts +(n = node->size);
if (node->chcnt == 0) { /* if this is a new node */
while (--cnt >= 0) { /* traverse the transaction's items */
if (*set > map[n-1]) return; /* if beyond last item, abort */
i = _bsearch(map, n, *set++);
if (i >= 0) node->cnts[i]++;
} } /* find index and count transaction */
else if (node->chcnt > 0) { /* if there are child nodes */
vec = (ISNODE**)(map +n); /* get id. map and child vector */
if (node->chcnt < n) /* if a secondary id. map exists */
map = (int*)(vec +(n = node->chcnt));
min--; /* one item less to the deepest nodes */
while (--cnt >= min) { /* traverse the transaction's items */
if (*set > map[n-1]) return; /* if beyond last item, abort */
i = _bsearch(map, n, *set++);
if ((i >= 0) && vec[i]) _count(vec[i], set, cnt, min);
} /* search fo