/* Heritrix
*
* $Id: Heritrix.java,v 1.142.2.1 2006/09/18 20:42:55 stack-sf Exp $
*
* Created on May 15, 2003
*
* Copyright (C) 2003 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.crawler;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.InetAddress;
import java.net.URL;
import java.net.URLConnection;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.TimeZone;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.LogManager;
import java.util.logging.Logger;
import javax.management.Attribute;
import javax.management.AttributeList;
import javax.management.AttributeNotFoundException;
import javax.management.DynamicMBean;
import javax.management.InstanceAlreadyExistsException;
import javax.management.InstanceNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanInfo;
import javax.management.MBeanNotificationInfo;
import javax.management.MBeanOperationInfo;
import javax.management.MBeanRegistration;
import javax.management.MBeanRegistrationException;
import javax.management.MBeanServer;
import javax.management.MBeanServerFactory;
import javax.management.MalformedObjectNameException;
import javax.management.NotCompliantMBeanException;
import javax.management.ObjectName;
import javax.management.ReflectionException;
import javax.management.RuntimeOperationsException;
import javax.management.openmbean.CompositeData;
import javax.management.openmbean.CompositeDataSupport;
import javax.management.openmbean.CompositeType;
import javax.management.openmbean.OpenDataException;
import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
import javax.management.openmbean.OpenMBeanInfoSupport;
import javax.management.openmbean.OpenMBeanOperationInfoSupport;
import javax.management.openmbean.OpenMBeanParameterInfo;
import javax.management.openmbean.OpenMBeanParameterInfoSupport;
import javax.management.openmbean.OpenType;
import javax.management.openmbean.SimpleType;
import javax.management.openmbean.TabularData;
import javax.management.openmbean.TabularDataSupport;
import javax.management.openmbean.TabularType;
import javax.naming.CompoundName;
import javax.naming.Context;
import javax.naming.NameNotFoundException;
import javax.naming.NamingException;
import javax.naming.NoInitialContextException;
import org.apache.commons.cli.Option;
import org.archive.crawler.admin.CrawlJob;
import org.archive.crawler.admin.CrawlJobErrorHandler;
import org.archive.crawler.admin.CrawlJobHandler;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.AlertManager;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.selftest.SelfTestCrawlJobHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.io.SinkHandler;
import org.archive.io.SinkHandlerLogRecord;
import org.archive.net.UURI;
import org.archive.util.FileUtils;
import org.archive.util.IoUtils;
import org.archive.util.JmxUtils;
import org.archive.util.JndiUtils;
import org.archive.util.PropertyUtils;
import org.archive.util.TextUtils;
import sun.net.www.protocol.file.FileURLConnection;
/**
* Main class for Heritrix crawler.
*
* Heritrix is usually launched by a shell script that backgrounds heritrix
* and redirects all stdout and stderr emitted by heritrix to a log file. So
* that startup messages emitted subsequent to the redirection of stdout and
* stderr show on the console, this class prints usage or startup output
* such as where the web UI can be found, etc., to a STARTLOG that the shell
* script is waiting on. As soon as the shell script sees output in this file,
* it prints its content and breaks out of its wait.
* See ${HERITRIX_HOME}/bin/heritrix.
*
* <p>Heritrix can also be embedded or launched by webapp initialization or
* by JMX bootstrapping. So far I count 4 methods of instantiation:
* <ol>
* <li>From this class's main -- the method usually used;</li>
* <li>From the Heritrix UI (The local-instances.jsp) page;</li>
* <li>A creation by a JMX agent at the behest of a remote JMX client; and</li>
* <li>A container such as tomcat or jboss.</li>
* </ol>
*
* @author gojomo
* @author Kristinn Sigurdsson
* @author Stack
*/
public class Heritrix implements DynamicMBean, MBeanRegistration {
/**
* Heritrix logging instance, named after this class.
*/
private static final Logger logger =
Logger.getLogger(Heritrix.class.getName());
/**
* Temporary directory.
*
* Taken from the <code>java.io.tmpdir</code> system property; falls back to
* <code>/tmp</code> when that property is not set.
*/
private static final File TMPDIR =
new File(System.getProperty("java.io.tmpdir", "/tmp"));
/**
* Name of the heritrix properties file.
*/
private static final String PROPERTIES = "heritrix.properties";
/**
* Name of the key to use when specifying an alternate heritrix properties
* file on the command line. Has the same value as {@link #PROPERTIES}.
*/
private static final String PROPERTIES_KEY = PROPERTIES;
/**
* Prefix used on properties we'll add to the System.properties list.
*/
private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix.";
/**
* Instance of web server if one was started. Initially <code>null</code>.
*/
private static SimpleHttpServer httpServer = null;
/**
* CrawlJob handler. Manages multiple crawl jobs at runtime.
* Initially <code>null</code>.
*/
private CrawlJobHandler jobHandler = null;
/**
* Heritrix start log file.
*
* This file contains the standard out produced by this main class during
* startup only. It is used by the heritrix shell script. The name here MUST
* match that in the <code>bin/heritrix</code> shell script. This is a
* DEPENDENCY the shell wrapper has on this Java class.
*/
private static final String STARTLOG = "heritrix_dmesg.log";
/**
* Default encoding.
*
* Used for content when fetching if none specified.
*/
public static final String DEFAULT_ENCODING = "ISO-8859-1";
/**
* Heritrix stderr/stdout log file.
*
* This file should have nothing in it except messages over which we have
* no control (JVM stacktrace, 3rd-party lib emissions). The wrapper
* startup script directs stderr/stdout here. This is an INTERDEPENDENCY
* this program has with the wrapper shell script. The shell can actually
* pass us an alternate to use for this file.
*
* NOTE(review): named like a constant but not declared <code>final</code>;
* confirm whether it is reassigned elsewhere before tightening it.
*/
private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log";
/**
* Where to w
没有合适的资源?快使用搜索试试~ 我知道了~
资源详情
资源评论
资源推荐
收起资源包目录
Luncene2.0+Heritrix开发自己的搜索引擎 (1319个子文件)
Heritrix.class 50KB
CrawlJob.class 46KB
FetchHTTP.class 40KB
CrawlController.class 38KB
HttpMethodBase.class 33KB
WorkQueueFrontier.class 28KB
AdaptiveRevisitFrontier.class 26KB
CrawlJobHandler.class 25KB
AbstractFrontier.class 24KB
StatisticsTracker.class 21KB
CrawlURI.class 21KB
UURIFactoryTest.class 20KB
HttpConnection.class 18KB
ComplexType.class 18KB
ExtractorHTML.class 18KB
MirrorWriterProcessor.class 17KB
AdaptiveRevisitHostQueue.class 17KB
ARCReader.class 15KB
JEMBeanHelper.class 15KB
GenericObjectPool.class 14KB
XMLSettingsHandler.class 14KB
StatisticsSummary.class 14KB
JobConfigureUtils.class 14KB
ToeThread.class 13KB
WriterPoolProcessor.class 13KB
ARCWriterTest.class 13KB
DecideRuleSequenceTest.class 13KB
FetchFTP.class 13KB
CrawlOrder.class 13KB
CookieSpecBase.class 13KB
UURIFactory.class 12KB
ExperimentalWARCWriterTest.class 12KB
SettingsHandler.class 12KB
CachedBdbMap.class 12KB
CrawlSettingsSAXSource.class 11KB
RecoveryJournal.class 11KB
CandidateURI.class 11KB
HashtableAList.class 11KB
CrawlSettingsSAXHandler.class 10KB
BdbMultipleWorkQueues.class 10KB
JmxUtils.class 10KB
ExtractorTool.class 10KB
ExperimentalWARCWriter.class 10KB
BdbFrontier.class 10KB
RegexpHTMLLinkExtractor.class 10KB
PreconditionEnforcer.class 10KB
FetchDNS.class 10KB
ARCWriter.class 10KB
ARCWriterProcessor.class 10KB
LogReader.class 10KB
ExperimentalWARCWriterProcessor.class 10KB
AdaptiveRevisitQueueList.class 10KB
FileUtils.class 9KB
ReplayCharSequenceFactory$MultiByteReplayCharSequence.class 9KB
SimpleHttpServer.class 9KB
HttpState.class 9KB
ExtractorHTMLTest.class 9KB
DomainSensitiveFrontier.class 9KB
WriterPoolMember.class 9KB
ArchiveReader.class 9KB
ExtractorUniversal.class 9KB
ArchiveUtils.class 9KB
LinksScoper.class 8KB
WorkQueue.class 8KB
CrawlServer.class 8KB
SelfTestCase.class 8KB
QuotaEnforcer.class 8KB
FPGenerator.class 8KB
XMLSettingsHandlerTest.class 8KB
CrawlMapper.class 8KB
MapTypeTest.class 8KB
ArchiveUtilsTest.class 8KB
CrawlScope.class 8KB
WARCReader.class 8KB
WaitEvaluator.class 7KB
AdaptiveRevisitHostQueueTest.class 7KB
HtmlFormCredential.class 7KB
Warc2Arc.class 7KB
RecordingInputStream.class 7KB
ReplayCharSequenceFactoryTest.class 7KB
Arc2Warc.class 7KB
Cookie.class 7KB
SurtPrefixedDecideRule.class 7KB
FPMergeUriUniqFilter.class 7KB
Checkpointer.class 7KB
HTTPContentDigest.class 7KB
BdbUriUniqFilter.class 7KB
CrawlerSettings.class 7KB
RecordingOutputStream.class 7KB
CredentialStore.class 7KB
BdbUriUniqFilterTest.class 6KB
testwrapper_jsp.class 6KB
UURI.class 6KB
SurtPrefixSet.class 6KB
X.class 6KB
Processor.class 6KB
LaxURI.class 6KB
ArchiveRecord.class 6KB
DataContainer.class 6KB
ANVLRecord.class 6KB
共 1319 条
- 1
- 2
- 3
- 4
- 5
- 6
- 14
seanliuyang
- 粉丝: 3
- 资源: 22
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功
评论1