/* Heritrix
*
* $Id: Heritrix.java,v 1.142.2.1 2006/09/18 20:42:55 stack-sf Exp $
*
* Created on May 15, 2003
*
* Copyright (C) 2003 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.crawler;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.InetAddress;
import java.net.URL;
import java.net.URLConnection;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.TimeZone;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.LogManager;
import java.util.logging.Logger;
import javax.management.Attribute;
import javax.management.AttributeList;
import javax.management.AttributeNotFoundException;
import javax.management.DynamicMBean;
import javax.management.InstanceAlreadyExistsException;
import javax.management.InstanceNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanInfo;
import javax.management.MBeanNotificationInfo;
import javax.management.MBeanOperationInfo;
import javax.management.MBeanRegistration;
import javax.management.MBeanRegistrationException;
import javax.management.MBeanServer;
import javax.management.MBeanServerFactory;
import javax.management.MalformedObjectNameException;
import javax.management.NotCompliantMBeanException;
import javax.management.ObjectName;
import javax.management.ReflectionException;
import javax.management.RuntimeOperationsException;
import javax.management.openmbean.CompositeData;
import javax.management.openmbean.CompositeDataSupport;
import javax.management.openmbean.CompositeType;
import javax.management.openmbean.OpenDataException;
import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
import javax.management.openmbean.OpenMBeanInfoSupport;
import javax.management.openmbean.OpenMBeanOperationInfoSupport;
import javax.management.openmbean.OpenMBeanParameterInfo;
import javax.management.openmbean.OpenMBeanParameterInfoSupport;
import javax.management.openmbean.OpenType;
import javax.management.openmbean.SimpleType;
import javax.management.openmbean.TabularData;
import javax.management.openmbean.TabularDataSupport;
import javax.management.openmbean.TabularType;
import javax.naming.CompoundName;
import javax.naming.Context;
import javax.naming.NameNotFoundException;
import javax.naming.NamingException;
import javax.naming.NoInitialContextException;
import org.apache.commons.cli.Option;
import org.archive.crawler.admin.CrawlJob;
import org.archive.crawler.admin.CrawlJobErrorHandler;
import org.archive.crawler.admin.CrawlJobHandler;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.AlertManager;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.selftest.SelfTestCrawlJobHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.io.SinkHandler;
import org.archive.io.SinkHandlerLogRecord;
import org.archive.net.UURI;
import org.archive.util.FileUtils;
import org.archive.util.IoUtils;
import org.archive.util.JmxUtils;
import org.archive.util.JndiUtils;
import org.archive.util.PropertyUtils;
import org.archive.util.TextUtils;
import sun.net.www.protocol.file.FileURLConnection;
/**
* Main class for Heritrix crawler.
*
* Heritrix is usually launched by a shell script that backgrounds heritrix
* and redirects all stdout and stderr emitted by heritrix to a log file. So
* that startup messages emitted subsequent to the redirection of stdout and
* stderr show on the console, this class prints usage or startup output
* such as where the web UI can be found, etc., to a STARTLOG that the shell
* script is waiting on. As soon as the shell script sees output in this file,
* it prints its content and breaks out of its wait.
* See ${HERITRIX_HOME}/bin/heritrix.
*
* <p>Heritrix can also be embedded or launched by webapp initialization or
* by JMX bootstrapping. So far I count 4 methods of instantiation:
* <ol>
* <li>From this class's main -- the method usually used;</li>
* <li>From the Heritrix UI (The local-instances.jsp) page;</li>
* <li>A creation by a JMX agent at the behest of a remote JMX client; and</li>
* <li>A container such as tomcat or jboss.</li>
* </ol>
*
* @author gojomo
* @author Kristinn Sigurdsson
* @author Stack
*/
public class Heritrix implements DynamicMBean, MBeanRegistration {
/**
* Heritrix logging instance.
*
* The logger is named after the fully-qualified name of this class.
*/
private static final Logger logger =
Logger.getLogger(Heritrix.class.getName());
/**
* Temporary directory.
*
* Taken from the <code>java.io.tmpdir</code> system property, falling back
* to <code>/tmp</code> if that property is not set.
*/
private static final File TMPDIR =
new File(System.getProperty("java.io.tmpdir", "/tmp"));
/**
* Name of the heritrix properties file.
*
* This same string is reused as the value of {@link #PROPERTIES_KEY}.
*/
private static final String PROPERTIES = "heritrix.properties";
/**
* Name of the key to use when specifying an alternate heritrix properties
* file on the command line.
*
* Has the same value as {@link #PROPERTIES}.
*/
private static final String PROPERTIES_KEY = PROPERTIES;
/**
* Prefix carried by the properties we'll add to the System properties list.
*
* The value is <code>heritrix.</code>, trailing dot included.
*/
private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix.";
/**
* Instance of the web server, if one was started.
*
* Starts out as <code>null</code>. The field is static, so the reference is
* shared by all Heritrix instances of this class.
*/
private static SimpleHttpServer httpServer = null;
/**
* CrawlJob handler. Manages multiple crawl jobs at runtime.
*
* This is an instance field, so each Heritrix instance holds its own
* handler reference. Starts out as <code>null</code>.
*/
private CrawlJobHandler jobHandler = null;
/**
* Heritrix start log file.
*
* This file contains the standard out produced by this main class during
* startup only. It is used by the heritrix shell script. The name here MUST
* match that in the <code>bin/heritrix</code> shell script. This is a
* DEPENDENCY the shell wrapper has on this Java class.
*/
private static final String STARTLOG = "heritrix_dmesg.log";
/**
* Default character encoding (<code>ISO-8859-1</code>).
*
* Used for content when fetching if no encoding is specified.
*/
public static final String DEFAULT_ENCODING = "ISO-8859-1";
/**
* Heritrix stderr/stdout log file.
*
* This file should have nothing in it except messages over which we have
* no control (JVM stacktraces, 3rd-party lib emissions). The wrapper
* startup script directs stderr/stdout here. This is an INTERDEPENDENCY
* this program has with the wrapper shell script. The shell can actually
* pass us an alternate to use for this file.
*
* NOTE(review): named like a constant but not declared <code>final</code>;
* confirm whether it is reassigned elsewhere before tightening it.
*/
private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log";
/**
* Where to w