package sample.dw.paper.lucene.index;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
//import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import sample.dw.paper.lucene.util.HTMLDocParser;
/**
 * Builds a Lucene index over the HTML files found directly inside a data
 * directory.
 *
 * <p>The no-argument constructor uses the built-in default locations; the
 * two-argument constructor lets callers supply their own directories.
 */
public class IndexManager {

    // Default directory that stores the html files to be indexed.
    private static final String DEFAULT_DATA_DIR = "F:\\MyJ2EE\\workspace\\luceneweb\\jsp";

    // Default directory that is used to store the lucene index.
    private static final String DEFAULT_INDEX_DIR = "E:\\nutch-0.7.2\\mobs\\index";

    // The directory that stores html files.
    private final String dataDir;

    // The directory that is used to store the lucene index.
    private final String indexDir;

    /**
     * Creates a manager that uses the default data and index directories.
     */
    public IndexManager() {
        this(DEFAULT_DATA_DIR, DEFAULT_INDEX_DIR);
    }

    /**
     * Creates a manager for the given directories.
     *
     * @param dataDir  directory containing the html files to index
     * @param indexDir directory in which the lucene index is stored
     * @throws IllegalArgumentException if either argument is {@code null}
     */
    public IndexManager(String dataDir, String indexDir) {
        if (dataDir == null) {
            throw new IllegalArgumentException("dataDir must not be null");
        }
        if (indexDir == null) {
            throw new IllegalArgumentException("indexDir must not be null");
        }
        this.dataDir = dataDir;
        this.indexDir = indexDir;
    }

    /**
     * Creates the index unless one already exists.
     *
     * <p>Every regular file directly inside the data directory whose path ends
     * with {@code .html} or {@code .htm} is passed to
     * {@link #addDocument(String, IndexWriter)}.
     *
     * @return {@code true} if an index already existed or was created;
     *         {@code false} if the data directory does not exist, is not a
     *         directory, or cannot be listed
     * @throws IOException if the index cannot be opened, optimized or closed
     */
    public boolean createIndex() throws IOException {
        if (ifIndexExist()) {
            return true;
        }

        // listFiles() returns null when the path is missing, is not a
        // directory, or cannot be read; treat all of these as "no data".
        File[] htmls = new File(dataDir).listFiles();
        if (htmls == null) {
            return false;
        }

        Directory fsDirectory = FSDirectory.getDirectory(indexDir, true);
        Analyzer analyzer = new MMAnalyzer(2);
        IndexWriter indexWriter = new IndexWriter(fsDirectory, analyzer, true);
        try {
            for (int i = 0; i < htmls.length; i++) {
                if (!htmls[i].isFile()) {
                    continue;
                }
                String htmlUrl = htmls[i].getAbsolutePath();
                if (htmlUrl.endsWith(".html") || htmlUrl.endsWith(".htm")) {
                    addDocument(htmlUrl, indexWriter);
                }
            }
            indexWriter.optimize();
        } finally {
            // Always release the writer, even when indexing fails part-way.
            indexWriter.close();
        }
        return true;
    }

    /**
     * Adds one html document to the lucene index.
     *
     * <p>The document gets three fields: {@code url} (stored, not indexed),
     * {@code title} (stored, tokenized) and {@code content} (built from the
     * parser's reader). An {@link IOException} raised while adding the
     * document is reported on standard error and otherwise ignored, so a
     * single failing file does not stop the caller.
     *
     * @param htmlUrl     path of the html file to parse
     * @param indexWriter open writer that receives the document
     */
    public void addDocument(String htmlUrl, IndexWriter indexWriter) {
        HTMLDocParser htmlParser = new HTMLDocParser(htmlUrl);
        // NOTE(review): assumes the parser returns non-null url/title/content;
        // confirm against HTMLDocParser.
        String url = htmlParser.getUrl();
        String title = htmlParser.getTitle();
        Reader content = htmlParser.getContent();

        Document document = new Document();
        document.add(new Field("url", url, Field.Store.YES, Field.Index.NO));
        document.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("content", content));

        try {
            indexWriter.addDocument(document);
        } catch (IOException e) {
            // Best-effort by design: report which file failed and carry on.
            System.err.println("Failed to index " + htmlUrl + ": " + e);
            e.printStackTrace();
        }
    }

    /**
     * Judges whether the index already exists.
     *
     * <p>The index is considered present when the index directory contains at
     * least one entry; the entries themselves are not inspected.
     *
     * @return {@code true} if the index directory exists and is non-empty;
     *         {@code false} otherwise, including when the directory is
     *         missing or cannot be listed
     */
    public boolean ifIndexExist() {
        File[] entries = new File(indexDir).listFiles();
        return entries != null && entries.length > 0;
    }

    /**
     * @return the directory that stores the html files
     */
    public String getDataDir() {
        return this.dataDir;
    }

    /**
     * @return the directory that stores the lucene index
     */
    public String getIndexDir() {
        return this.indexDir;
    }
}
没有合适的资源?快使用搜索试试~ 我知道了~
词源码,做搜索引擎需要用到的好东西哦.rar
共27个文件
java:7个
jar:7个
class:7个
需积分: 0 6 下载量 183 浏览量
2008-10-11
15:18:30
上传
评论
收藏 1.49MB RAR 举报
温馨提示
关于做搜索引擎时需要用到的源代码,可以明确如何把用户所输入的关键字作为一个搜索的源。
资源详情
资源评论
资源推荐
收起资源包目录
,做搜索引擎需要用到的好东西哦.rar (27个子文件)
.project 1KB
.mymetadata 291B
WebRoot
WEB-INF
web.xml 994B
lib
lucene-snowball-2.0.jar 83KB
je-analysis-1.4.0.jar 878KB
lucene-highlighter-2.0.jar 23KB
servlet-api.jar 95KB
lucene-analyzers-2.0.jar 57KB
lucene-core-2.0.0.jar 394KB
luceneHtmlParser.jar 39KB
classes
com
lucene
FLucene.class 507B
sample
dw
paper
lucene
search
Segment.class 4KB
SearchManager.class 3KB
SearchResultBean.class 789B
servlet
SearchController.class 2KB
index
IndexManager.class 3KB
util
HTMLDocParser.class 2KB
META-INF
MANIFEST.MF 39B
search.jsp 3KB
.myeclipse
src
com
lucene
FLucene.java 269B
sample
dw
paper
lucene
search
Segment.java 3KB
SearchManager.java 2KB
SearchResultBean.java 416B
servlet
SearchController.java 1KB
index
IndexManager.java 3KB
util
HTMLDocParser.java 1KB
.classpath 903B
共 27 条
- 1
weiliu850708
- 粉丝: 0
- 资源: 1
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功
评论0