/* fpt.c (release mode)
*
* Use threshold for finding large itemsets with supports >= the threshold.
* This is the implementation using the FP-tree structure according to the paper:
* Jiawei Han, Yiwen Yin: Mining Frequent Patterns without Candidate Generation,
* ACM SIGMOD 2000, pages 1-12.
*
* Program Input:
* A configuration file consisting of 6 parameters
* 1. User specified maximum size of itemset to be mined
* If this value is not larger than zero or
* is greater than the greatest transaction size in the DB,
* then it will be set to the greatest transaction size.
* 2. Normalized support threshold, range: (0, 1]
* 3. Total number of different items in the DB
* 4. Total number of transactions in the DB
* 5. Data file name
* 6. Result file name for storing the large itemsets
*
* Program Output:
* The frequent itemsets are displayed from small to large sizes.
* For each size, the itemsets are sorted in descending order of their supports.
* The support of each large itemset will also be displayed (enclosed by a bracket).
* It will output to both screen (standard output) and the result file.
*
*/
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include <sys/resource.h>
#include <sys/times.h>
#include <unistd.h>
/***** Data Structure *****/
/* Description:
* Each node of an FP-tree is represented by a 'FPnode' structure.
* Each node contains an item ID, count value of the item, and
* node-link as stated in the paper.
*
*/
typedef struct FPnode *FPTreeNode; /* Pointer to a FP-tree node */
typedef struct Childnode *childLink; /* Pointer to children of a FP-tree node */
/*
* Children of a FP-tree node
*/
typedef struct Childnode {
FPTreeNode node; /* A child node of an item */
childLink next; /* Next child */
} ChildNode;
/*
* A FP-tree node
*/
typedef struct FPnode {
int item; /* ID of the item.
Value of ID is within the range [0, m-1]
where m is the total number of different items in the database. */
int count; /* Value of count of the item.
This is the number of transactions containing items in the portion
of the path reaching this node. */
int numPath; /* Number of leaf nodes in the subtree
rooted at this node. It is used to
check whether there is only a single path
in the FPgrowth function. */
FPTreeNode parent; /* Pointer to parent node */
childLink children; /* Pointer to children */
FPTreeNode hlink; /* Horizontal link to next node with same item */
} FPNode;
/*
* A list to store large itemsets in descending order of their supports.
* It stores all the itemsets of supports >= threshold.
*/
typedef struct Itemsetnode *LargeItemPtr;
typedef struct Itemsetnode {
int support;
int *itemset;
LargeItemPtr next;
} ItemsetNode;
void FPgrowth(FPTreeNode T, FPTreeNode *headerTableLink, int headerSize, int *baseItems, int baseSize);
/***** Global Variables *****/
LargeItemPtr *largeItemset; /* largeItemset[k-1] = array of large k-itemsets */
int *numLarge; /* numLarge[k-1] = no. of large k-itemsets found. */
int *support1; /* Support of 1-itemsets */
int *largeItem1; /* 1-itemsets */
FPTreeNode root=NULL; /* Initial FP-tree */
FPTreeNode *headerTableLink; /* Corresponding header table */
int expectedK; /* User input upper limit of itemset size to be mined */
int realK; /* Actual upper limit of itemset size can be mined */
int threshold; /* User input support threshold */
int numItem; /* Number of items in the database */
int numTrans; /* Number of transactions in the database */
char dataFile[100]; /* File name of the database */
char outFile[100]; /* File name to store the result of mining */
/******************************************************************************************
* Function: destroyTree
*
* Description:
* Destroy the FPtree rooted by a node and free the allocated memory.
* For each tree node being visited, all the child nodes
* are destroyed in a recursive manner before the destroy of the node.
*
* Invoked from:
* destroy()
*
* Input Parameter:
* node -> Root of the tree/subtree to be destroyed.
*/
void destroyTree(FPTreeNode node)
{
childLink temp1, temp2;
if (node == NULL) return;
temp1 = node->children;
while(temp1 != NULL) {
temp2 = temp1->next;
destroyTree(temp1->node);
free(temp1);
temp1 = temp2;
}
free(node);
return;
}
/******************************************************************************************
* Function: destroy
*
* Description:
* Free memory of following variables.
* - largeItemset
* - numLarge
* - headerTableLink
* - root
*
* Invoked from:
* main()
*
* Functions to be invoked:
* destroyTree() -> Free memory from the FP-tree, root.
*
* Global variables (read only):
* - realK
*/
void destroy()
{
LargeItemPtr aLargeItemset;
int i;
for (i=0; i < realK; i++) {
aLargeItemset = largeItemset[i];
while (aLargeItemset != NULL) {
largeItemset[i] = largeItemset[i]->next;
free(aLargeItemset->itemset);
free(aLargeItemset);
aLargeItemset = largeItemset[i];
}
}
free(largeItemset);
free(numLarge);
free(headerTableLink);
destroyTree(root);
return;
}
/******************************************************************************************
* Function: swap
*
* Description:
* Swap x-th element and i-th element of each of the
* two arrays, support[] and itemset[].
*
* Invoked from:
* q_sortD()
* q_sortA()
*
* Functions to be invoked: None
*
* Input Parameters:
* support -> Corresponding supports of the items in itemset.
* itemset -> Array of items.
* x, i -> The two indexes for swapping.
*
* Global variable: None
*/
void swap(int *support, int *itemset, int x, int i)
{
int temp;
temp = support[x];
support[x] = support[i];
support[i] = temp;
temp = itemset[x];
itemset[x] = itemset[i];
itemset[i] = temp;
return;
}
/******************************************************************************************
* Function: q_sortD
*
* Description:
* Quick sort two arrays, support[] and the corresponding itemset[],
* in descending order of support[].
*
* Invoked from:
* pass1()
* genConditionalPatternTree()
* q_sortD()
*
* Functions to be invoked:
* swap()
* q_sortD()
*
* Input Parameters:
* low -> lower bound index of the array to be sorted
* high -> upper bound index of the array to be sorted
* size -> size of the array
* length -> length of an itemset
*
* In/Out Parameters:
* support[] -> array to be sorted
* itemset[] -> array to be sorted
*/
void q_sortD(int *support, int *itemset, int low,int high, int size)
{
int pass;
int highptr=high++; /* highptr records the last element */
/* the first element in list is always served as the pivot */
int pivot=low;
if(low>=highptr) return;
do {
/* Find out, from the head of support[],
* the 1st element value not larger than the pivot's
*/
pass=1;
while(pass==1) {
if(++low<size) {
if(support[low] > support[pivot])
pass=1;
else pass=0;
} else pass=0;
}
/* Find out, from the tail of support[],
* the 1st element value not smaller than the pivot's
*/
pass=1;
while(pass==1) {
if(high-->0) {
if(support[high] < support[pivot])
pass=1;
else pass=0;
} else pass=0;
}
/* swap elements pointed by low pointer & high pointer */
if(low<high)
swap(support, itemset, low, high);
} while(low<=high);
swap(support, itemset, pivot, high);
/* divide list into two for further sorting */
q_sortD(support, itemset, pivot, high-1, size);
q_sortD(support, itemset, high+1, highptr, size);
return;
}
/******************************************************************************************
* Function: q_sortA
*
* Description:
* Quick sort two arrays, indexList[] and the corresponding freqItemP[],
* in ascending order of indexList[].
*
* Invoked from:
* buildTre