package com.wind.model;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.demo.html.HTMLParser;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
public class DBdemo {

    /** Filesystem location of the Lucene index, shared by every operation below. */
    private static final String INDEX_PATH = "D:\\lucene\\index";

    /**
     * Builds (and overwrites) the full-text index from every News record in the
     * database. Errors are reported to stdout; the method never throws.
     */
    public void indexFiles() {
        File indexDir = new File(INDEX_PATH);
        IndexWriter indexWriter = null;
        try {
            Date start = new Date();
            // StandardAnalyzer extracts the indexable terms from the text:
            // it tokenizes, lower-cases, and drops common stop words ("a", "the", ...).
            StandardAnalyzer standardAnalyzer = new StandardAnalyzer();
            // The third argument (true) re-creates the index, replacing any existing one.
            indexWriter = new IndexWriter(indexDir, standardAnalyzer, true);
            indexWriter.setMergeFactor(100);
            indexWriter.setMaxBufferedDocs(100);
            // Index only the first 5000 terms of each field (Lucene's default is 10000).
            indexWriter.setMaxFieldLength(5000);
            // Fetch every record from the database and index it one Document per row.
            List<News> newsList = NewsManager.getNewss();
            for (int i = 0; i < newsList.size(); i++) {
                News news = newsList.get(i); // list is generic: no cast needed
                indexWriter.addDocument(Document(news));
            }
            // optimize() merges the segments down to as few files as possible,
            // which speeds up subsequent searches.
            indexWriter.optimize();
            Date end = new Date();
            System.out
                    .println("create index: "
                            + (end.getTime() - start.getTime())
                            + " total milliseconds");
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass()
                    + "\n with message: " + e.getMessage());
        } finally {
            // Always release the writer, even when indexing fails part-way,
            // so Lucene's index write lock is not left behind on disk.
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    System.out.println(" caught a " + e.getClass()
                            + "\n with message: " + e.getMessage());
                }
            }
        }
    }

    /**
     * Converts one News row into a Lucene Document with two fields:
     * "uid" (the primary key, stored and untokenized so hits can be mapped back
     * to database rows) and "content" (the HTML body stripped to plain text,
     * compressed-stored and tokenized for full-text search).
     *
     * @param news the database record to index; its content may be null
     * @return the populated Lucene Document
     * @throws java.io.IOException if reading the parsed HTML text fails
     */
    public static Document Document(News news) throws java.io.IOException {
        Document doc = new Document();
        // Index the primary key; TermVector.YES keeps per-document term info.
        Field fieldId = new Field("uid", news.getNewsId().toString(),
                Field.Store.YES, Field.Index.UN_TOKENIZED, Field.TermVector.YES);
        // The content column is a CLOB in the DB holding HTML text.
        String contentHtml = news.getContent();
        if (contentHtml == null) {
            contentHtml = "";
        }
        // NOTE(review): removing every space assumes the content is CJK text with
        // no space-delimited words — confirm before reusing for Latin-script data.
        contentHtml = contentHtml.replaceAll(" ", "");
        // Crude pre-strip of simple tags; HTMLParser below handles the rest.
        contentHtml = contentHtml.replaceAll("<[^<>]*>", "");
        Reader read = new StringReader(contentHtml);
        // HTMLParser (shipped with the Lucene demo jar) turns the remaining
        // HTML into plain text for indexing.
        HTMLParser htmlParser = new HTMLParser(read);
        BufferedReader breader = new BufferedReader(htmlParser.getReader());
        StringBuilder htmlContent = new StringBuilder();
        try {
            // Read the whole parsed text. Do NOT stop at the first blank line:
            // the previous version did, silently dropping everything after it.
            String line;
            while ((line = breader.readLine()) != null) {
                htmlContent.append(line);
            }
        } finally {
            breader.close();
        }
        Field fieldContents = new Field("content", htmlContent.toString(),
                Field.Store.COMPRESS, Field.Index.TOKENIZED,
                Field.TermVector.YES);
        // One DB row maps to one Document; each column maps to one Field.
        doc.add(fieldId);
        doc.add(fieldContents);
        return doc;
    }

    /**
     * Searches the "content" field for the given keyword(s) and loads the
     * matching News rows from the database.
     *
     * @param keyword user-entered query; whitespace-separated terms are combined
     *                by the QueryParser into a multi-term query
     * @return the matching News records (rows deleted from the DB since indexing
     *         are skipped); empty on I/O failure
     * @throws ParseException if the keyword cannot be parsed into a query
     */
    public List<News> searchFiles(String keyword) throws ParseException {
        // Holds the DB records resolved from the search hits.
        List<News> hitsList = new ArrayList<News>();
        IndexReader reader = null;
        Searcher searcher = null;
        try {
            Date start = new Date();
            reader = IndexReader.open(INDEX_PATH);
            searcher = new IndexSearcher(reader);
            Analyzer analyzer = new StandardAnalyzer();
            QueryParser parser = new QueryParser("content", analyzer);
            Query query = parser.parse(keyword);
            // Hits is the result set — roughly the SQL ResultSet of a search.
            Hits hits = searcher.search(query);
            for (int i = 0; i < hits.length(); i++) {
                Document doc = hits.doc(i);
                // Recover the News primary key stored in the "uid" field.
                String id = doc.get("uid");
                News news = NewsManager.getNews(id);
                System.out.println("ID-------------" + id);
                System.out.println("content------------"
                        + doc.get("content"));
                System.out.println("score------------" + hits.score(i));
                // null means the row was removed from the DB after indexing: skip it.
                if (news != null) {
                    hitsList.add(news);
                }
            }
            System.out.println("--------------" + hits.length());
            Date end = new Date();
            System.out
                    .println("search files: "
                            + (end.getTime() - start.getTime())
                            + " total milliseconds");
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass()
                    + "\n with message: " + e.getMessage());
        } finally {
            // Close in reverse-open order; tolerate failures so both get a chance.
            if (searcher != null) {
                try {
                    searcher.close();
                } catch (IOException e) {
                    System.out.println(" caught a " + e.getClass()
                            + "\n with message: " + e.getMessage());
                }
            }
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    System.out.println(" caught a " + e.getClass()
                            + "\n with message: " + e.getMessage());
                }
            }
        }
        return hitsList;
    }

    /**
     * Marks every document in the index as deleted. Lucene only flags the
     * documents (producing a .del file); the data is reclaimed on the next
     * optimize/merge and can be restored via {@link #unDeleteIndex()} until then.
     */
    public void deleteIndex() {
        IndexReader reader = null;
        try {
            Date start = new Date();
            reader = IndexReader.open(INDEX_PATH);
            int numFiles = reader.numDocs();
            for (int i = 0; i < numFiles; i++) {
                reader.deleteDocument(i);
            }
            Date end = new Date();
            System.out
                    .println("delete index: "
                            + (end.getTime() - start.getTime())
                            + " total milliseconds");
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass()
                    + "\n with message: " + e.getMessage());
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    System.out.println(" caught a " + e.getClass()
                            + "\n with message: " + e.getMessage());
                }
            }
        }
    }

    /**
     * Restores every document previously flagged as deleted (undoes
     * {@link #deleteIndex()} as long as no merge has reclaimed the space).
     */
    public void unDeleteIndex() {
        IndexReader reader = null;
        try {
            reader = IndexReader.open(INDEX_PATH);
            reader.undeleteAll();
        } catch (IOException e) {
            System.out.println(" caught a " + e.getClass()
                    + "\n with message: " + e.getMessage());
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    System.out.println(" caught a " + e.getClass()
                            + "\n with message: " + e.getMessage());
                }
            }
        }
    }

    /** Ad-hoc entry point: runs a sample search and prints the hit count. */
    public static void main(String[] args) {
        DBdemo demo = new DBdemo();
        List<News> aa = new ArrayList<News>(); // parameterized: no raw type
        try {
            aa = demo.searchFiles("高");
            System.err.println(aa.size());
        } catch (ParseException e) {
            e.printStackTrace();
        }
    }
}