/* Heritrix
*
* $Id: Heritrix.java 6007 2008-10-20 20:17:14Z nlevitt $
*
* Created on May 15, 2003
*
* Copyright (C) 2003 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.crawler;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.InetAddress;
import java.net.URL;
import java.net.URLConnection;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.StringTokenizer;
import java.util.TimeZone;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.LogManager;
import java.util.logging.Logger;
import javax.management.Attribute;
import javax.management.AttributeList;
import javax.management.AttributeNotFoundException;
import javax.management.DynamicMBean;
import javax.management.InstanceAlreadyExistsException;
import javax.management.InstanceNotFoundException;
import javax.management.InvalidAttributeValueException;
import javax.management.MBeanInfo;
import javax.management.MBeanNotificationInfo;
import javax.management.MBeanOperationInfo;
import javax.management.MBeanRegistration;
import javax.management.MBeanRegistrationException;
import javax.management.MBeanServer;
import javax.management.MBeanServerFactory;
import javax.management.MalformedObjectNameException;
import javax.management.NotCompliantMBeanException;
import javax.management.ObjectName;
import javax.management.ReflectionException;
import javax.management.RuntimeOperationsException;
import javax.management.openmbean.CompositeData;
import javax.management.openmbean.CompositeDataSupport;
import javax.management.openmbean.CompositeType;
import javax.management.openmbean.OpenDataException;
import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
import javax.management.openmbean.OpenMBeanInfoSupport;
import javax.management.openmbean.OpenMBeanOperationInfoSupport;
import javax.management.openmbean.OpenMBeanParameterInfo;
import javax.management.openmbean.OpenMBeanParameterInfoSupport;
import javax.management.openmbean.OpenType;
import javax.management.openmbean.SimpleType;
import javax.management.openmbean.TabularData;
import javax.management.openmbean.TabularDataSupport;
import javax.management.openmbean.TabularType;
import javax.naming.CompoundName;
import javax.naming.Context;
import javax.naming.NameNotFoundException;
import javax.naming.NamingException;
import javax.naming.NoInitialContextException;
import org.apache.commons.cli.Option;
import org.archive.crawler.admin.CrawlJob;
import org.archive.crawler.admin.CrawlJobErrorHandler;
import org.archive.crawler.admin.CrawlJobHandler;
import org.archive.crawler.datamodel.CredentialStore;
import org.archive.crawler.datamodel.credential.Credential;
import org.archive.crawler.event.CrawlStatusListener;
import org.archive.crawler.framework.AlertManager;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.exceptions.FatalConfigurationException;
import org.archive.crawler.framework.exceptions.InitializationException;
import org.archive.crawler.selftest.SelfTestCrawlJobHandler;
import org.archive.crawler.settings.XMLSettingsHandler;
import org.archive.io.SinkHandler;
import org.archive.io.SinkHandlerLogRecord;
import org.archive.net.UURI;
import org.archive.util.FileUtils;
import org.archive.util.IoUtils;
import org.archive.util.JmxUtils;
import org.archive.util.JndiUtils;
import org.archive.util.PropertyUtils;
import org.archive.util.TextUtils;
import sun.net.www.protocol.file.FileURLConnection;
/**
* Main class for Heritrix crawler.
*
* Heritrix is usually launched by a shell script that backgrounds heritrix
* and redirects all stdout and stderr emitted by heritrix to a log file. So
* that startup messages emitted subsequent to the redirection of stdout and
* stderr show on the console, this class prints usage or startup output
* such as where the web UI can be found, etc., to a STARTLOG that the shell
* script is waiting on. As soon as the shell script sees output in this file,
* it prints its content and breaks out of its wait.
* See ${HERITRIX_HOME}/bin/heritrix.
*
* <p>Heritrix can also be embedded or launched by webapp initialization or
* by JMX bootstrapping. So far I count 4 methods of instantiation:
* <ol>
* <li>From this class's main -- the method usually used;</li>
* <li>From the Heritrix UI (The local-instances.jsp) page;</li>
* <li>A creation by a JMX agent at the behest of a remote JMX client; and</li>
* <li>A container such as tomcat or jboss.</li>
* </ol>
*
* @author gojomo
* @author Kristinn Sigurdsson
* @author Stack
*/
public class Heritrix implements DynamicMBean, MBeanRegistration {
/**
* Heritrix logging instance.
*/
private static final Logger logger =
Logger.getLogger(Heritrix.class.getName());
private static final File TMPDIR =
new File(System.getProperty("java.io.tmpdir", "/tmp"));
/**
* Name of the heritrix properties file.
*/
private static final String PROPERTIES = "heritrix.properties";
/**
* Name of the key to use when specifying alternate heritrix properties on
* the command line.
*/
private static final String PROPERTIES_KEY = PROPERTIES;
/**
* Prefix used on our properties we'll add to the System.properties list.
*/
private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix.";
/**
* Prefix used on other properties we'll add to the System.properties
* list (after stripping this prefix).
*/
private static final String SYSTEM_PREFIX = "system.";
/**
* Instance of web server if one was started.
*/
private static SimpleHttpServer httpServer = null;
/**
* CrawlJob handler. Manages multiple crawl jobs at runtime.
*/
private CrawlJobHandler jobHandler = null;
/**
* Heritrix start log file.
*
* This file contains standard out produced by this main class for startup
* only. Used by heritrix shell script. Name here MUST match that in the
* <code>bin/heritrix</code> shell script. This is a DEPENDENCY the shell
* wrapper has on this Java class.
*/
private static final String STARTLOG = "heritrix_dmesg.log";
/**
* Default encoding.
*
* Used for content when fetching if none specified.
*/
public static final String DEFAULT_ENCODING = "ISO-8859-1";
/**
* Heritrix stderr/stdout log file.
*
* This file should have nothing in it except messages over which we have
* no control (JVM stacktrace, 3rd-party lib emissions). The wrapper
* startup script directs stderr/stdout here. This is an INTERDEPENDENCY
* this program has with the wrapper
没有合适的资源?快使用搜索试试~ 我知道了~
1.14.2版本的eclipse 工程
共1337个文件
class:695个
java:557个
html:28个
需积分: 4 2 下载量 32 浏览量
2009-02-22
11:37:48
上传
评论
收藏 12.01MB RAR 举报
温馨提示
1.14.2版本的eclipse 工程已配好可以运行了
资源推荐
资源详情
资源评论
收起资源包目录
1.14.2版本的eclipse 工程 (1337个子文件)
heritrix.cacerts 21KB
Heritrix.class 52KB
CrawlJob.class 49KB
FetchHTTP.class 44KB
CrawlController.class 38KB
HttpMethodBase.class 33KB
WorkQueueFrontier.class 29KB
AdaptiveRevisitFrontier.class 29KB
AbstractFrontier.class 28KB
CrawlJobHandler.class 25KB
StatisticsTracker.class 25KB
UURIFactoryTest.class 24KB
CrawlURI.class 22KB
ExtractorHTML.class 19KB
WriterPoolProcessor.class 19KB
ComplexType.class 18KB
HttpConnection.class 18KB
MirrorWriterProcessor.class 17KB
WARCWriterProcessor.class 17KB
AdaptiveRevisitHostQueue.class 17KB
ARCReader.class 16KB
DecideRuleSequenceTest.class 16KB
ArchiveUtils.class 15KB
JEMBeanHelper.class 15KB
StatisticsSummary.class 15KB
XMLSettingsHandler.class 15KB
ARCWriterTest.class 15KB
GenericObjectPool.class 14KB
JerichoExtractorHTML.class 14KB
CachedBdbMap.class 14KB
CrawlOrder.class 13KB
JobConfigureUtils.class 13KB
ToeThread.class 13KB
BdbFrontier.class 13KB
CrawlSettingsSAXHandler.class 13KB
SettingsHandler.class 13KB
FetchFTP.class 13KB
UURIFactory.class 13KB
Kw3WriterProcessor.class 13KB
CookieSpecBase.class 13KB
WARCWriterTest.class 12KB
HashtableAList.class 12KB
PreconditionEnforcer.class 11KB
CrawlSettingsSAXSource.class 11KB
CandidateURI.class 11KB
BdbMultipleWorkQueues.class 11KB
FileUtils.class 11KB
ExtractorHTMLTest.class 11KB
WARCWriter.class 11KB
FetchDNS.class 10KB
ArchiveReader.class 10KB
JmxUtils.class 10KB
RegexpHTMLLinkExtractor.class 10KB
ExtractorTool.class 10KB
LogReader.class 10KB
AdaptiveRevisitQueueList.class 10KB
WorkQueue.class 9KB
WriterPoolMember.class 9KB
SimpleHttpServer.class 9KB
ARCWriter.class 9KB
RecordingOutputStream.class 9KB
DomainSensitiveFrontier.class 9KB
ArchiveUtilsTest.class 9KB
HttpState.class 9KB
QuotaEnforcer.class 9KB
LinksScoper.class 9KB
ExtractorUniversal.class 9KB
RecoveryJournal.class 8KB
JerichoExtractorHTMLTest.class 8KB
ReplayCharSequenceTest.class 8KB
SelfTestCase.class 8KB
FPGenerator.class 8KB
CrawlMapper.class 8KB
MapTypeTest.class 8KB
XMLSettingsHandlerTest.class 8KB
CrawlServer.class 8KB
CrawlScope.class 8KB
PersistProcessor.class 8KB
HtmlFormCredential.class 7KB
WaitEvaluator.class 7KB
AdaptiveRevisitHostQueueTest.class 7KB
Warc2Arc.class 7KB
ExtractorJS.class 7KB
BdbUriUniqFilter.class 7KB
RecordingInputStream.class 7KB
WARCReader.class 7KB
FPMergeUriUniqFilter.class 7KB
Arc2Warc.class 7KB
IoUtils.class 7KB
CrawlerSettings.class 7KB
Cookie.class 7KB
DataContainer.class 7KB
Checkpointer.class 7KB
CredentialStore.class 7KB
PublicSuffixes.class 7KB
HTTPContentDigest.class 7KB
SurtPrefixedDecideRule.class 7KB
BdbUriUniqFilterTest.class 7KB
UURI.class 6KB
X.class 6KB
共 1337 条
- 1
- 2
- 3
- 4
- 5
- 6
- 14
资源评论
sweetheart1986
- 粉丝: 0
- 资源: 5
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功