/*
* This file is part of JBIRCH.
*
* JBIRCH is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* JBIRCH is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with JBIRCH. If not, see <http://www.gnu.org/licenses/>.
*
*/
/*
* CFTree.java
* Copyright (C) 2009 Roberto Perdisci (roberto.perdisci@gmail.com)
*/
package edu.gatech.gtisc.jbirch.cftree;
import java.util.*;
import java.io.*;
import net.sourceforge.sizeof.*;
/**
* This is an implementation of the BIRCH clustering algorithm described in:
*
* T. Zhang, R. Ramakrishnan, and M. Livny.
* "BIRCH: A New Data Clustering Algorithm and Its Applications"
* Data Mining and Knowledge Discovery, 1997.
*
* @author Roberto Perdisci (roberto.perdisci@gmail.com)
* @version 0.1
*
*/
public class CFTree {
/**
* Used when computing if the tree is reaching memory limit
*/
private static final double MEM_LIM_FRAC = 10;
/**
* Centroid Distance D0
*/
public static final int D0_DIST = 0;
/**
* Centroid distance D1
*/
public static final int D1_DIST = 1;
/**
* Cluster Distance D2
*/
public static final int D2_DIST = 2;
/**
* Cluster Distance D3
*/
public static final int D3_DIST = 3;
/**
* Cluster Distance D4
*/
public static final int D4_DIST = 4;
/**
* The root node of the CFTree
*/
private CFNode root;
/**
* dummy node that points to the list of leaves. used for fast retrieval of final subclusters
*/
private CFNode leafListStart = null;
/**
* keeps count of the instances inserted into the tree
*/
private int instanceIndex = 0;
/**
* if true, the tree is automatically rebuilt every time the memory limit is reached
*/
private boolean automaticRebuild = false;
/**
* the memory limit used when automatic rebuilding is active
*/
private long memLimit = (long)Math.pow(1024, 3); // default = 1GB
/**
* used when automatic rebuilding is active
*/
private long periodicMemLimitCheck = 100000; // checks if memeory limit is exceeded every 100,000 insertions
/**
*
* @param maxNodeEntries parameter B
* @param distThreshold parameter T
* @param distFunction must be one of CFTree.D0_DIST,...,CFTree.D4_DIST, otherwise it will default to D0_DIST
* @param applyMergingRefinement if true, activates merging refinement after each node split
*/
public CFTree(int maxNodeEntries, double distThreshold, int distFunction, boolean applyMergingRefinement) {
if(distFunction < D0_DIST || distFunction > D4_DIST)
distFunction = D0_DIST;
root = new CFNode(maxNodeEntries,distThreshold,distFunction,applyMergingRefinement,true);
leafListStart = new CFNode(0,0,distFunction,applyMergingRefinement,true); // this is a dummy node that points to the fist leaf
leafListStart.setNextLeaf(root); // at this point root is the only node and therefore also the only leaf
}
/**
*
* @return the current memory limit used to trigger automatic rebuilding
*/
public long getMemoryLimit() {
return memLimit;
}
/**
* Gets the start of the list of leaf nodes (remember: the first node is a dummy node)
*
* @return
*/
public CFNode getLeafListStart() {
return this.leafListStart;
}
/**
*
* @param limit memory limit in bytes
*/
public void setMemoryLimit(long limit) {
this.memLimit = limit;
}
/**
*
* @param limit memory limit in Mbytes
*/
public void setMemoryLimitMB(long limit) {
this.memLimit = limit*1024*1024;
}
/**
*
* @param auto if true, and memory limit is reached, the tree is automatically rebuilt with larger threshold
*/
public void setAutomaticRebuild(boolean auto) {
this.automaticRebuild = auto;
}
/**
*
* @param period the number of insert operations after which we check whether the tree has reached the memory limit
*/
public void setPeriodicMemLimitCheck(long period) {
this.periodicMemLimitCheck = period;
}
/**
* Inserts a single pattern vector into the CFTree
*
* @param x the pattern vector to be inserted in the tree
* @return true if insertion was successful
*/
public boolean insertEntry(double[] x) {
instanceIndex++;
if(automaticRebuild && (instanceIndex % periodicMemLimitCheck)==0) {
// rebuilds the tree if we reached or exceeded memory limits
rebuildIfAboveMemLimit();
}
return insertEntry(x,instanceIndex);
}
/**
* Insert a pattern vector with a specific associated pattern vector index.
* This method does not use periodic memory limit checks.
*
* @param x the pattern vector to be inserted in the tree
* @param index a specific index associated to the pattern vector x
* @return true if insertion was successful
*/
public boolean insertEntry(double[] x, int index) {
CFEntry e = new CFEntry(x,index);
// System.out.println("Inserting " + e);
return insertEntry(e);
}
/**
* Inserts an entire CFEntry into the tree. Used for tree rebuilding.
*
* @param e the CFEntry to insert
* @return true if insertion happened without problems
*/
private boolean insertEntry(CFEntry e) {
boolean dontSplit = root.insertEntry(e);
if(!dontSplit) {
// if dontSplit is false, it means there was not enough space to insert the new entry in the tree,
// therefore wee need to split the root to make more room
splitRoot();
if(automaticRebuild) {
// rebuilds the tree if we reached or exceeded memory limits
rebuildIfAboveMemLimit();
}
}
return true; // after root is split, we are sure x was inserted correctly in the tree, and we return true
}
/**
* Every time we split the root, we check whether the memory limit imposed on the tree
* has been reached. In this case, we automatically increase the distance threshold and
* rebuild the tree.
*
* It is worth noting that since we only check memory consumption only during root split,
* and not for all node splits (for performance reasons), we cannot guarantee that
* the memory limit will not be exceeded. The tree may grow significantly between a
* root split and the next.
* Furthermore, the computation of memory consumption using the SizeOf class is only approximate.
*
* Notice also that if the threshold grows to the point that all the entries fall into one entry
* of the root (i.e., the root is the only node in the tree, and has only one sub-cluster)
* the automatic rebuild cannot decrease the memory consumption (because increasing the threshold
* has not effect on reducing the size of the tree), and if Java runs out of memory
* the program will terminate.
*
* @return true if rebuilt
*/
private boolean rebuildIfAboveMemLimit() {
if(hasReachedMemoryLimit(this,memLimit)) {
// System.out.println("############## Size of Tree is reaching or has exceeded the memory limit");
// System.out.println("############## Rebuilding the Tree...");
// System.out.println("############## Current Threshold = " + root.getDistThreshold());
double newThreshold = computeNewThreshold(leafListStart, root.getDistFunction(), root.getDistThreshold());
// System.out.println("############## New Threshold = " + newThreshold);
CFTree newTree = this.rebuildTree(root.getMaxNodeEntries(), newThreshold, root.getDistFunction(), root.applyMergingRefinement(), false);
copyTree(newTree);
return true;
}
return false;
}
/**
* Splits the root to accommodate a new entry. The height of the tree grows by one.
*/
private void splitRoot() {
// the split happens by finding the two entries in this node that