package com.luceneheritrixbook.extractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Parser;
import com.luceneheritrixbook.extractor.com163.Extract163Moblie;
import com.luceneheritrixbook.extractor.pconline.mobile.ExtractPconlineMoblie;
/**
 * Base class for page extractors. It holds the configuration shared by all
 * extractors (output, mirror and image directories), wraps an HTMLParser
 * {@link Parser} for the page currently being processed, and offers small
 * helpers for regular-expression matching and for copying product images.
 * Concrete subclasses implement {@link #extract()}.
 *
 * This class also contains a command-line driver ({@link #main(String[])})
 * that walks a directory tree and runs an extractor on each matching page.
 */
public abstract class Extractor {

    /** Line terminator (CR LF) made available to subclasses. */
    protected static final String NEWLINE = "\r\n";

    /**
     * Name of the algorithm meant for hashing image paths (MD5). It is not
     * used inside this class.
     */
    protected static final String HASH_ALGORITHM = "md5";

    /** Separator line. */
    public static final String SEPARATOR = "======================";

    /** Placeholder image copied when the requested image is missing from the mirror. */
    private static final String FALLBACK_IMAGE = "e:\\noimage.jpg";

    /**
     * Number of leading characters dropped from an image URL to obtain its
     * path below the mirror directory. Presumably the length of "http://";
     * confirm against the callers.
     */
    private static final int URL_PREFIX_LENGTH = 7;

    /** Size in bytes of the buffer used while copying an image. */
    private static final int COPY_BUFFER_SIZE = 1024;

    /** Number of files handed to an extractor by {@link #traverse(Extractor, File)}. */
    static int count = 0;

    /** Output path for all extraction results. */
    private String outputPath = "";

    /** Path of the file currently being processed. */
    private String inuputFilePath;

    /** Root directory of the crawled page mirror (the Heritrix "mirror" directory). */
    private String mirrorDir = "";

    /** Directory in which the images of processed products are stored. */
    private String imageDir = "";

    /** HTMLParser instance for the currently loaded page. */
    private Parser parser;

    /**
     * Loads the page to be processed: creates a new parser for {@code path},
     * remembers the path and sets the parser encoding to GBK.
     *
     * Failures are only printed to standard error. If creating the parser
     * fails, the previously loaded parser and input path are left unchanged.
     *
     * @param path location of the page handed to the parser
     */
    public void loadFile(String path) {
        try {
            parser = new Parser(path);
            inuputFilePath = path;
            parser.setEncoding("GBK");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns the output path.
     *
     * @return the configured output path
     */
    public String getOutputPath() {
        return outputPath;
    }

    /**
     * Sets the output path; usually done when the extractor is initialised.
     *
     * @param outputPath the new output path
     */
    public void setOutputPath(String outputPath) {
        this.outputPath = outputPath;
    }

    /**
     * Returns the parser created by the last successful {@link #loadFile(String)}.
     *
     * @return the current parser, or {@code null} if none has been created
     */
    public Parser getParser() {
        return parser;
    }

    /**
     * Applies a regular expression to a string and returns one capturing
     * group of the first match.
     *
     * @param pattern regular expression to compile
     * @param match   text to search
     * @param index   capturing group to return (0 is the whole match)
     * @return the requested group of the first match, or {@code null} when
     *         there is no match or either string argument is {@code null}
     */
    protected String getProp(String pattern, String match, int index) {
        if (pattern == null || match == null) {
            return null;
        }
        Matcher matcher = Pattern.compile(pattern).matcher(match);
        // Only the first occurrence is of interest.
        if (matcher.find()) {
            return matcher.group(index);
        }
        return null;
    }

    /**
     * Implemented by subclasses. According to the original comment, its job
     * is to parse the loaded page and save the product information.
     */
    public abstract void extract();

    /**
     * Returns the path of the file currently being processed.
     *
     * @return the current input file path
     */
    public String getInuputFilePath() {
        return inuputFilePath;
    }

    /**
     * Copies an image from the mirror directory into the image directory.
     * The source is located by dropping the first seven characters of
     * {@code image_url} and resolving the rest against the mirror directory.
     * If that file does not exist, a fixed placeholder image is copied instead.
     *
     * @param image_url      URL of the image as it appears in the page
     * @param new_image_file file name to create inside the image directory
     * @return {@code true} if the copy succeeded; {@code false} if the URL is
     *         {@code null} or too short, or if any error occurred while copying
     */
    protected boolean copyImage(String image_url, String new_image_file) {
        if (image_url == null || image_url.length() < URL_PREFIX_LENGTH) {
            return false;
        }
        String relativePath = image_url.substring(URL_PREFIX_LENGTH);

        FileInputStream in = null;
        FileOutputStream out = null;
        try {
            File source = new File(new File(mirrorDir), relativePath);
            if (!source.exists()) {
                source = new File(FALLBACK_IMAGE);
            }
            File target = new File(new File(imageDir), new_image_file);

            in = new FileInputStream(source);
            out = new FileOutputStream(target);
            byte[] buffer = new byte[COPY_BUFFER_SIZE];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
            // Close the output here so that a failing close is reported as 'false'.
            out.close();
            out = null;
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        } finally {
            // Release both streams even when the copy failed half-way.
            if (in != null) {
                try {
                    in.close();
                } catch (Exception ignored) {
                    // nothing more can be done for a stream that was only read
                }
            }
            if (out != null) {
                try {
                    out.close();
                } catch (Exception ignored) {
                    // the copy has already been reported as failed
                }
            }
        }
    }

    /**
     * Returns the image directory.
     *
     * @return the directory images are copied to
     */
    public String getImageDir() {
        return imageDir;
    }

    /**
     * Sets the image directory.
     *
     * @param imageDir the directory images are copied to
     */
    public void setImageDir(String imageDir) {
        this.imageDir = imageDir;
    }

    /**
     * Returns the mirror directory.
     *
     * @return the root directory of the page mirror
     */
    public String getMirrorDir() {
        return mirrorDir;
    }

    /**
     * Sets the mirror directory.
     *
     * @param mirrorDir the root directory of the page mirror
     */
    public void setMirrorDir(String mirrorDir) {
        this.mirrorDir = mirrorDir;
    }

    /**
     * Sets the path of the file currently being processed.
     *
     * @param inuputFilePath the input file path
     */
    public void setInuputFilePath(String inuputFilePath) {
        this.inuputFilePath = inuputFilePath;
    }

    // Alternative driver configuration kept from the original source
    // (processes the 163 mobile mirror instead of the pconline one):
    //
    //   Extractor extractor = new Extract163Moblie();
    //   extractor.setOutputPath("c:\\product\\test\\mobile\\");
    //   extractor.setImageDir("c:\\product\\test\\image\\");
    //   extractor.setMirrorDir("F:\\data\\163手机\\mirror\\");
    //   traverse(extractor, new File("F:\\data\\163手机\\mirror\\mobile.163.com\\0011\\product\\0011000B\\product"));
    //   System.out.println(count);

    /**
     * Command-line driver: configures an {@link ExtractPconlineMoblie} with
     * fixed paths, traverses the pconline product mirror and finally prints
     * the number of files handed to the extractor.
     *
     * @param args ignored
     * @throws Exception if the traversal or the extraction fails
     */
    public static void main(String[] args) throws Exception {
        Extractor extractor = new ExtractPconlineMoblie();
        extractor.setOutputPath("c:\\product\\mobile\\");
        extractor.setImageDir("c:\\product\\image\\");
        extractor.setMirrorDir("F:\\");
        traverse(extractor, new File("F:\\data\\product.pconline.com.cn\\product"));
        System.out.println(count);
    }

    /**
     * Recursively walks {@code path}. Every regular file whose absolute path
     * ends with ".html" and contains no underscore anywhere (directory names
     * included) is printed, counted, loaded into the extractor and extracted.
     *
     * @param extractor extractor to run on each matching file
     * @param path      file or directory to visit; {@code null} is ignored
     * @throws Exception if the extractor throws
     */
    public static void traverse(Extractor extractor, File path)
            throws Exception {
        if (path == null) {
            return;
        }
        if (path.isDirectory()) {
            String[] names = path.list();
            if (names == null) {
                // File.list() returns null when the directory cannot be read.
                System.err.println("Cannot list directory: " + path);
                return;
            }
            for (int i = 0; i < names.length; i++) {
                traverse(extractor, new File(path, names[i]));
            }
            return;
        }

        String absolutePath = path.getAbsolutePath();
        if (absolutePath.endsWith(".html") && absolutePath.indexOf("_") == -1) {
            System.out.println(path);
            count++;
            extractor.loadFile(absolutePath);
            extractor.extract();
        }
    }
}
没有合适的资源?快使用搜索试试~ 我知道了~
开发自己的搜索引擎
共101个文件
class:43个
java:26个
jar:18个
4星 · 超过85%的资源 需积分: 31 183 下载量 38 浏览量
2008-08-02
20:35:54
上传
评论 8
收藏 5.35MB RAR 举报
温馨提示
这是在没有tomcat插件下的搜索引擎实例源码
资源推荐
资源详情
资源评论
收起资源包目录
开发自己的搜索引擎 (101个子文件)
ExtractPconlineMoblie.class 5KB
ProductTextFileProcessor.class 5KB
Extract163Moblie.class 5KB
ProductTextFileProcessor.class 5KB
Extractor.class 5KB
ExtractPconlineMoblie.class 5KB
Extractor.class 5KB
SearchServiceImpl.class 4KB
ExtractPconlineMoblie.class 4KB
Extract163Moblie.class 4KB
Extract163Notebook.class 3KB
Extract163Notebook.class 3KB
Extract163Mp3.class 3KB
Extract163Mp3.class 3KB
Extract163Dv.class 3KB
Extract163Dc.class 3KB
Extract163Dc.class 3KB
Extract163Dv.class 3KB
ProductJDBC.class 3KB
ProductJDBC.class 3KB
StringUtils.class 3KB
StringUtils.class 3KB
BuildWordVacabulary.class 3KB
BuildWordVacabulary.class 3KB
SearchResultDaoImpl$2.class 2KB
ProductDocument.class 2KB
ProductDocument.class 2KB
SearchResult.class 2KB
Product.class 2KB
Product.class 2KB
ProductIndexer.class 2KB
SearchResultDaoImpl.class 2KB
PropertyConfiguration.class 2KB
ProductIndexer.class 2KB
ShowPicServlet.class 2KB
SearchResults.class 1KB
Test1.class 1KB
SetResponseFilter.class 1KB
SearchRequest.class 1KB
SearchResultDaoImpl$1.class 1KB
KeywordProcessor.class 473B
SearchService.class 380B
SearchResultDao.class 248B
.classpath 2KB
spring.jar 1.81MB
xerces.jar 1.73MB
je-analysis-1.4.0.jar 878KB
mysql-connector-java-3.1.12-bin.jar 436KB
lucene-core-2.0.0.jar 394KB
standard.jar 384KB
htmlparser.jar 281KB
jaxen-1.1-beta-6.jar 239KB
commons-lang.jar 190KB
dwr.jar 181KB
commons-collections.jar 171KB
commons-dbcp-1.2.1.jar 105KB
htmllexer.jar 68KB
spring-mock.jar 42KB
commons-pool-1.2.jar 41KB
commons-logging-1.0.4.jar 37KB
commons-fileupload.jar 22KB
jstl.jar 20KB
Extractor.java 5KB
ExtractPconlineMoblie.java 5KB
ProductTextFileProcessor.java 4KB
Extract163Moblie.java 4KB
SearchServiceImpl.java 4KB
StringUtils.java 3KB
Extract163Mp3.java 2KB
Extract163Dv.java 2KB
Extract163Dc.java 2KB
Extract163Notebook.java 2KB
ProductJDBC.java 2KB
BuildWordVacabulary.java 2KB
SearchResultDaoImpl.java 2KB
ProductDocument.java 2KB
SearchResult.java 2KB
Product.java 1KB
PropertyConfiguration.java 1KB
ProductIndexer.java 1KB
SearchResults.java 930B
ShowPicServlet.java 886B
SearchRequest.java 699B
SetResponseFilter.java 673B
Test1.java 555B
SearchService.java 441B
SearchResultDao.java 222B
KeywordProcessor.java 209B
footer.jpg 17KB
logo.jpg 14KB
main.jsp 5KB
detail.jsp 2KB
MANIFEST.MF 39B
.mymetadata 285B
.project 1KB
app.properties 604B
app.properties 604B
web.xml 2KB
applicationContext.xml 1KB
applicationContext.xml 1KB
共 101 条
- 1
- 2
caoxu1987728
- 粉丝: 177
- 资源: 36
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功
- 1
- 2
- 3
- 4
前往页