/*
* FileName BuildUrl.java
* Create Date 2006-5-17
* Author shiwei
* Description: builds the various kinds of URL
* Version
*/
package com.snoics.reptile.link.createUrl;
import java.net.URI;
import java.util.regex.Pattern;
import com.snoics.base.net.SnoicsUrl;
import com.snoics.base.util.RandomSeed;
import com.snoics.base.util.StringClass;
import com.snoics.base.util.regex.Regex;
import com.snoics.reptile.link.TempLink;
import com.snoics.reptile.regex.filter.IRegexFilter;
import com.snoics.reptile.regex.filter.IndexTypeFilter;
import com.snoics.reptile.regex.filter.RemoteUrlFilter;
import com.snoics.reptile.regex.filter.ResolvedUrlFilter;
import com.snoics.reptile.system.common.Common;
import com.snoics.reptile.system.common.CommonObject;
public class BuildUrl implements IBuildUrl{
// Regex helper; used in this class via getReplaceFirst(...) when removing the website prefix from a URL.
private Regex regex=new Regex();
// Lookup of previously generated local file names (getLink / addLink), keyed by the website-relative URL.
private TempLink tempLink=new TempLink();
// Gives access to configuration values through getConfigInfo(...).
private CommonObject commonObject=new CommonObject();
// URL helper providing getResolved(...) and getRelativize(...); callers in this class null-check the returned URI.
private SnoicsUrl snoicsUrl=new SnoicsUrl();
/**
 * Returns the root directory under which files are stored.
 *
 * @return the configuration value stored under
 *         {@code Common.CONFIGFILE_NODE_FILEROOTPATH}
 */
public String getFileRootPath(){
    final String fileRootPath=commonObject.getConfigInfo(Common.CONFIGFILE_NODE_FILEROOTPATH);
    return fileRootPath;
}
/**
 * Builds a URL that is relative to the configured website.
 *
 * <p>The URL is returned untouched when it already starts with {@code "/"} or when
 * {@link RemoteUrlFilter} accepts it. When {@link ResolvedUrlFilter} accepts it, the
 * website prefix pattern is removed from the URL and a leading {@code "/"} is put in
 * front. In every other case the URL is resolved against the parent page's path
 * (the parent URL with the same website prefix pattern removed).
 *
 * @param url       the current URL
 * @param parentUrl the absolute URL of the parent page
 * @return the website-relative URL; the formatted input URL when resolving yields {@code null}
 */
public String buildRelativizeUrl(String url,String parentUrl){
    // Already rooted at the website: nothing to do.
    if(url.startsWith("/")){
        return url;
    }

    IRegexFilter remoteFilter=new RemoteUrlFilter();
    remoteFilter.setUrl(url);
    if(remoteFilter.filter()){
        return url;
    }

    IRegexFilter resolvedFilter=new ResolvedUrlFilter();
    resolvedFilter.setUrl(url);

    // Pattern for the website prefix: anchored configured website, an optional
    // ":port" and the following slash.
    // NOTE(review): getPreString presumably cuts the value at its last "/" - confirm.
    String anchoredSite="^"+commonObject.getConfigInfo(Common.CONFIGFILE_NODE_WEBSITE);
    String sitePrefix=StringClass.getPreString(anchoredSite,"/");
    String sitePattern=sitePrefix+":?[0-9]*"+"/";

    if(resolvedFilter.filter()){
        String withoutSite=regex.getReplaceFirst(url,"",sitePattern,Pattern.CASE_INSENSITIVE);
        return StringClass.getFormatPath("/"+withoutSite);
    }

    String formattedUrl=StringClass.getFormatPath(url);
    String parentWithoutSite=regex.getReplaceFirst(parentUrl,"",sitePattern,Pattern.CASE_INSENSITIVE);
    String parentPath=StringClass.getFormatPath("/"+parentWithoutSite);

    URI resolved=snoicsUrl.getResolved(parentPath,formattedUrl);
    return resolved==null?formattedUrl:resolved.toString();
}
/**
 * Builds a URL that is relative to the parent page's URL.
 *
 * <p>URLs accepted by {@link ResolvedUrlFilter} and URLs starting with {@code "/"}
 * are returned unchanged. Anything else is formatted and relativized against
 * {@code parentUrl}.
 *
 * @param url       the current URL
 * @param parentUrl the absolute URL of the parent page
 * @return the URL relative to {@code parentUrl}; the formatted input URL when
 *         relativizing yields {@code null}
 */
public String buildRelativizeWithParentUrl(String url,String parentUrl){
    IRegexFilter resolvedFilter=new ResolvedUrlFilter();
    resolvedFilter.setUrl(url);

    // The filter is consulted first; the "/" test only runs when the filter rejects the URL.
    if(resolvedFilter.filter()||url.startsWith("/")){
        return url;
    }

    String formattedUrl=StringClass.getFormatPath(url);
    URI relative=snoicsUrl.getRelativize(parentUrl,formattedUrl);
    return relative!=null?relative.toString():formattedUrl;
}
/**
 * Builds an absolute URL.
 *
 * <p>URLs accepted by {@link RemoteUrlFilter} or {@link ResolvedUrlFilter} are
 * returned unchanged. A formatted URL starting with {@code "/"} is appended to the
 * configured website (with a trailing {@code "/"} of the website removed). Any other
 * URL is resolved against {@code parentUrl}, or against the configured website when
 * {@code parentUrl} is {@code null}.
 *
 * @param url       the current URL
 * @param parentUrl the absolute URL of the parent page; may be {@code null}
 * @return the absolute URL; the formatted input URL when resolving yields {@code null}
 */
public String buildResolvedUrl(String url,String parentUrl){
    IRegexFilter remoteFilter=new RemoteUrlFilter();
    remoteFilter.setUrl(url);
    if(remoteFilter.filter()){
        return url;
    }

    IRegexFilter resolvedFilter=new ResolvedUrlFilter();
    resolvedFilter.setUrl(url);
    if(resolvedFilter.filter()){
        return url;
    }

    String path=StringClass.getFormatPath(url);
    String site=commonObject.getConfigInfo(Common.CONFIGFILE_NODE_WEBSITE);

    if(path.startsWith("/")){
        // Avoid a double slash between the website and the rooted path.
        String siteWithoutSlash=site.endsWith("/")?site.substring(0,site.length()-1):site;
        return siteWithoutSlash+path;
    }

    // Without a parent page the configured website serves as the base.
    String base=(parentUrl==null)?site:parentUrl;
    URI absolute=snoicsUrl.getResolved(base,path);
    if(absolute==null){
        return path;
    }
    return absolute.toString();
}
/**
 * Returns the regenerated local file name, relative to the website, for a URL.
 *
 * <p>The part after {@code "#"} is split off first and re-appended to the result.
 * If nothing remains of the URL, the last path segment of {@code parentUrl} is used
 * instead. The website-relative form of the URL is then looked up in the link table:
 * a known entry is reused; otherwise a new name is generated in the same directory,
 * recorded in the link table and returned. The generated name is {@code "index"}
 * when {@link IndexTypeFilter} accepts the URL and a random seed otherwise, followed
 * by {@code Common.DEFAULT_STATICFILETYPE}.
 *
 * @param url       the current URL, optionally carrying a {@code "#"} part
 * @param parentUrl the absolute URL of the parent page
 * @return the local, website-relative file name, with the {@code "#"} part
 *         re-appended when one was present
 */
public String buildLocalHtmlRelativizeFileName(String url,String parentUrl){
    String pageFlagString=StringClass.getLastString(url,"#");
    url=StringClass.getPreString(url,"#");
    if(url.equals("")){
        // The link consisted only of a "#" part: it points at the parent page itself.
        url=StringClass.getLastString(parentUrl,"/");
        url=StringClass.getPreString(url,"#");
    }

    // Computed once; it serves both as lookup key and as registration key.
    String relativizeUrl=buildRelativizeUrl(url,parentUrl);

    String localUrl=tempLink.getLink(relativizeUrl);
    if(localUrl==null){
        IRegexFilter indexTypeFilter=new IndexTypeFilter();
        indexTypeFilter.setUrl(relativizeUrl);
        boolean indexFlag=indexTypeFilter.filter();

        // Directory part of the URL; empty when getPreString hands back the URL unchanged.
        String directory=StringClass.getPreString(relativizeUrl,"/");
        if(directory.equalsIgnoreCase(relativizeUrl)){
            directory="";
        }else{
            directory=StringClass.getFormatPath(directory+"/");
        }

        String filename;
        if(indexFlag){
            filename="index";
        }else{
            int filenamelength=32;
            try{
                filenamelength=Integer.parseInt(commonObject.getConfigInfo(Common.CONFIGFILE_NODE_FILENAMELENGTH));
            }catch(Exception ignored){
                // Deliberate best effort: a missing or non-numeric setting keeps the default of 32.
            }
            filename=RandomSeed.getSeed(filenamelength);
        }

        localUrl=StringClass.getFormatPath(directory+filename+Common.DEFAULT_STATICFILETYPE);
        tempLink.addLink(relativizeUrl,localUrl);
    }

    if(!pageFlagString.equals("")){
        localUrl=localUrl+"#"+pageFlagString;
    }
    return localUrl;
}
/**
* 返回重新生成后的相对于parentUrl的URL
* @param url
* @param parentUrl
* @return String
*/
public String buildLocalHtmlRelativizeWithParentFileName(String url,String parentUrl){
String pageFlagString=StringClass.getLastString(url,"#");
url=StringClass.getPreString(url,"#");
if(url.equals("")){
url=StringClass.getLastString(parentUrl,"/");
url=StringClass.getPreString(url,"#");
}
String relativizeUrl=buildRelativizeWithParentUrl(url,parentUrl);
String relativizeWithWebsiteUrl=buildRelativizeUrl(url,parentUrl);
String newRelativizeUrl="";
String temprelativizeUrl=tempLink.getLink(relativizeWithWebsiteUrl);
if(temprelativizeUrl!=null){
temprelativizeUrl=StringClass.getLastString(temprelativizeUrl,"/");
String newurl=StringClass.getPreString(relativizeUrl,"/");
if(newurl.equals(relativizeUrl)){
newurl=temprelativizeUrl;
}else{
newurl=newurl+"/"+temprelativizeUrl;
}
relativizeUrl = StringClass.getFormatPath(newurl);
if(!pageFlagString.equals("")){
relativizeUrl=relativizeUrl+"#"+pageFlagString;
}
return relativizeUrl;
}else{
String filename="";
IRegexFilter indexTypeFilter=new IndexTypeFilter();
indexTypeFilter.setUrl(relativizeUrl);
boolean indexFlag=indexTypeFilter.filter();
newRelativizeUrl=StringClass.getPreString(relativizeUrl,"/");
if(newRelativizeUrl.equalsIgnoreCase(relativizeUrl)){
newRelativizeUrl="";
}else{
newRelativizeUrl=StringClass.getFormatPath(newRelativizeUrl+"/");
}
if(indexFlag){
filename="index";
}else{
int filenamelength=32;
try{
filenamelength=Integer.parseInt(commonObject.getConfigInfo(Common.
没有合适的资源?快使用搜索试试~ 我知道了~
java写的搜索引擎网络爬虫源码
共436个文件
html:217个
class:126个
java:63个
4星 · 超过85%的资源 需积分: 17 122 下载量 36 浏览量
2008-12-26
10:20:22
上传
评论 4
收藏 4.66MB RAR 举报
温馨提示
java写的搜索引擎网络爬虫 可以用来爬镜像网站 可以根据配置抓取JAVASCRIPT连接 等等。
资源推荐
资源详情
资源评论
收起资源包目录
java写的搜索引擎网络爬虫源码 (436个子文件)
run.bat 158B
run.bat 158B
run.bat 158B
BuildUrl.class 8KB
InitSystemImpl.class 6KB
ConfigInfo.class 6KB
ParseHtml.class 5KB
MakeUpUrl.class 5KB
InitSystemImpl.class 5KB
Reptile.class 5KB
BuildUrl.class 4KB
ReplaceAllUrl.class 4KB
ConfigInfo.class 4KB
ParseHtml.class 4KB
FilterAllUrl.class 4KB
CommonObject.class 4KB
MakeUpUrl.class 4KB
Reptile.class 4KB
Link.class 4KB
Common.class 3KB
ReplaceUrl.class 3KB
CreateDownloadUrl.class 3KB
CreateRangeUrl.class 3KB
CreateHTMLFile.class 3KB
Common.class 3KB
CreateBinFile.class 3KB
Cache.class 3KB
IndexTypeFilter.class 3KB
CreateUnDownloadUrl.class 3KB
CreateForbidUrl.class 3KB
ReplaceAllUrl.class 3KB
SrcGetRegexUrl.class 3KB
AhrefGetRegexUrl.class 3KB
FilterAllUrl.class 3KB
CommonObject.class 3KB
Link.class 2KB
UrlUtil.class 2KB
CreateRemoteUrl.class 2KB
SingleRegexlUrl.class 2KB
CreateHTMLFile.class 2KB
CreateDownloadUrl.class 2KB
CreateRangeUrl.class 2KB
ReplaceUrl.class 2KB
CreateBinFile.class 2KB
RemoteUrlFilter.class 2KB
UnDownloadUrlFilter.class 2KB
DownloadUrlFilter.class 2KB
CreateUnDownloadUrl.class 2KB
CreateForbidUrl.class 2KB
IndexTypeFilter.class 2KB
TempLink.class 2KB
Cache.class 2KB
ResolvedUrlFilter.class 2KB
SrcGetRegexUrl.class 2KB
AhrefGetRegexUrl.class 2KB
TempCache.class 2KB
ForbidUrlFilter.class 2KB
CreateRemoteUrl.class 2KB
RangeUrlFilter.class 2KB
UrlUtil.class 2KB
FileOperator.class 2KB
SingleRegexlUrl.class 2KB
DownloadUrlFilter.class 2KB
UnDownloadUrlFilter.class 2KB
RemoteUrlFilter.class 2KB
ResolvedUrlFilter.class 1KB
TempLink.class 1KB
StartReptile.class 1KB
CacheFile.class 1KB
TempCache.class 1KB
ForbidUrlFilter.class 1KB
RangeUrlFilter.class 1KB
UrlRegex.class 1KB
CacheFile.class 1KB
FileOperator.class 973B
StartReptile.class 928B
ReptileSystemException.class 904B
ParseHtmlInfo.class 897B
ReptileRuntimeException.class 867B
Test.class 817B
UrlRegex.class 790B
ConfigParameterImpl.class 779B
SystemConfig.class 684B
UrlRegexMap.class 637B
ICreateUrl.class 631B
IBuildUrl.class 607B
IMakeUpUrl.class 598B
ICreateUrl.class 592B
ParseHtmlInfo.class 576B
NormalMain.class 572B
IBuildUrl.class 569B
IMakeUpUrl.class 559B
Test.class 538B
ReptileSystemException.class 496B
ConfigParameterImpl.class 489B
ReptileRuntimeException.class 464B
ICreateFile.class 457B
IUrlRegex.class 424B
IReplaceUrl.class 418B
ICreateFile.class 417B
共 436 条
- 1
- 2
- 3
- 4
- 5
季枫3518
- 粉丝: 12
- 资源: 88
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功
- 1
- 2
- 3
前往页