/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*
* Evaluation.java
* Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
*
*/
package weka.classifiers;
import weka.classifiers.evaluation.NominalPrediction;
import weka.classifiers.evaluation.ThresholdCurve;
import weka.classifiers.pmml.consumer.PMMLClassifier;
import weka.classifiers.xml.XMLClassifier;
import weka.core.Drawable;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.RevisionHandler;
import weka.core.RevisionUtils;
import weka.core.Summarizable;
import weka.core.Utils;
import weka.core.Version;
import weka.core.converters.ConverterUtils.DataSink;
import weka.core.converters.ConverterUtils.DataSource;
import weka.core.pmml.PMMLFactory;
import weka.core.pmml.PMMLModel;
import weka.core.xml.KOML;
import weka.core.xml.XMLOptions;
import weka.core.xml.XMLSerialization;
import weka.estimators.Estimator;
import weka.estimators.KernelEstimator;
import java.beans.BeanInfo;
import java.beans.Introspector;
import java.beans.MethodDescriptor;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.lang.reflect.Method;
import java.util.Date;
import java.util.Enumeration;
import java.util.Random;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
/**
* Class for evaluating machine learning models. <p/>
*
* ------------------------------------------------------------------- <p/>
*
* General options when evaluating a learning scheme from the command-line: <p/>
*
* -t filename <br/>
* Name of the file with the training data. (required) <p/>
*
* -T filename <br/>
* Name of the file with the test data. If missing a cross-validation
* is performed. <p/>
*
* -c index <br/>
* Index of the class attribute (1, 2, ...; default: last). <p/>
*
* -x number <br/>
* The number of folds for the cross-validation (default: 10). <p/>
*
* -no-cv <br/>
* No cross validation. If no test file is provided, no evaluation
* is done. <p/>
*
* -split-percentage percentage <br/>
* Sets the percentage for the train/test set split, e.g., 66. <p/>
*
* -preserve-order <br/>
* Preserves the order in the percentage split instead of randomizing
* the data first with the seed value ('-s'). <p/>
*
* -s seed <br/>
* Random number seed for the cross-validation and percentage split
* (default: 1). <p/>
*
* -m filename <br/>
* The name of a file containing a cost matrix. <p/>
*
* -l filename <br/>
* Loads classifier from the given file. In case the filename ends with ".xml",
* a PMML file is loaded or, if that fails, options are loaded from XML. <p/>
*
* -d filename <br/>
* Saves classifier built from the training data into the given file. In case
* the filename ends with ".xml" the options are saved as XML, not the model. <p/>
*
* -v <br/>
* Outputs no statistics for the training data. <p/>
*
* -o <br/>
* Outputs statistics only, not the classifier. <p/>
*
* -i <br/>
* Outputs information-retrieval statistics per class. <p/>
*
* -k <br/>
* Outputs information-theoretic statistics. <p/>
*
* -p range <br/>
* Outputs predictions for test instances (or the train instances if no test
* instances are provided and -no-cv is used), along with the attributes in the
* specified range (and nothing else). Use '-p 0' if no attributes are desired. <p/>
*
* -distribution <br/>
* Outputs the distribution instead of only the prediction
* in conjunction with the '-p' option (only nominal classes). <p/>
*
* -r <br/>
* Outputs cumulative margin distribution (and nothing else). <p/>
*
* -g <br/>
* Only for classifiers that implement "Graphable." Outputs
* the graph representation of the classifier (and nothing
* else). <p/>
*
* -xml filename | xml-string <br/>
* Retrieves the options from the XML-data instead of the command line. <p/>
*
* -threshold-file file <br/>
* The file to save the threshold data to.
* The format is determined by the extensions, e.g., '.arff' for ARFF
* format or '.csv' for CSV. <p/>
*
* -threshold-label label <br/>
* The class label to determine the threshold data for
* (default is the first label). <p/>
*
* ------------------------------------------------------------------- <p/>
*
* Example usage as the main of a classifier (called FunkyClassifier):
* <code> <pre>
* public static void main(String [] args) {
* runClassifier(new FunkyClassifier(), args);
* }
* </pre> </code>
* <p/>
*
* ------------------------------------------------------------------ <p/>
*
* Example usage from within an application:
* <code> <pre>
* Instances trainInstances = ... instances got from somewhere
* Instances testInstances = ... instances got from somewhere
* Classifier scheme = ... scheme got from somewhere
*
* Evaluation evaluation = new Evaluation(trainInstances);
* evaluation.evaluateModel(scheme, testInstances);
* System.out.println(evaluation.toSummaryString());
* </pre> </code>
*
*
* @author Eibe Frank (eibe@cs.waikato.ac.nz)
* @author Len Trigg (trigg@cs.waikato.ac.nz)
* @version $Revision: 6346 $
*/
public class Evaluation
implements Summarizable, RevisionHandler {
/** The number of classes. */
protected int m_NumClasses;
/** The number of folds for a cross-validation. */
protected int m_NumFolds;
/** The weight of all incorrectly classified instances. */
protected double m_Incorrect;
/** The weight of all correctly classified instances. */
protected double m_Correct;
/** The weight of all unclassified instances. */
protected double m_Unclassified;
/** The weight of all instances that had no class assigned to them. */
protected double m_MissingClass;
/** The weight of all instances that had a class assigned to them. */
protected double m_WithClass;
/** Array for storing the confusion matrix. */
protected double [][] m_ConfusionMatrix;
/** The names of the classes. */
protected String [] m_ClassNames;
/** Whether the class is nominal (as opposed to numeric). */
protected boolean m_ClassIsNominal;
/** The prior probabilities of the classes. */
protected double [] m_ClassPriors;
/** The sum of counts for priors. */
protected double m_ClassPriorsSum;
/** The cost matrix (if given). */
protected CostMatrix m_CostMatrix;
/** The total cost of predictions (includes instance weights). */
protected double m_TotalCost;
/** Sum of errors. */
protected double m_SumErr;
/** Sum of absolute errors. */
protected double m_SumAbsErr;
/** Sum of squared errors. */
protected double m_SumSqrErr;
/** Sum of class values. */
protected double m_SumClass;
/** Sum of squared class values. */
protected double m_SumSqrClass;
/** Sum of predicted values. */
protected double m_SumPredicted;
/** Sum of squared predicted values. */
protected double m_SumSqrPredicted;
/** Sum of predicted * class values