package org.welkin.lucene;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spell.*;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
/**
* Spelling correction for search queries.
*
* @author Hyoutei
* @version 1.0
* @date 2019/1/11 15:54
*/
public class CnSpellChecker implements Closeable {
// Directory of the spell-check index; read by clearIndex().
// NOTE(review): never assigned in the visible code - presumably set by swapSearcher(); confirm.
private Directory spellIndex;
// Boost given to the "start<n>" clause built from a word's first n-gram (2.0 after construction).
private float bStart;
// Boost given to the "end<n>" clause built from a word's last n-gram (1.0 after construction).
private float bEnd;
// NOTE(review): not referenced directly in the visible code; presumably the searcher
// handed out by obtainSearcher() - confirm.
private IndexSearcher searcher;
// NOTE(review): presumably guards access to `searcher`; its use is outside the visible code - confirm.
private final Object searcherLock;
// Monitor held while the spell index is being replaced (setSpellIndex) or cleared (clearIndex).
private final Object modifyCurrentIndexLock;
// Initialised to false by the constructor.
// NOTE(review): presumably set by close() and checked by ensureOpen() - confirm.
private volatile boolean closed;
// Default minimum similarity score a suggestion must reach; 0.5 unless changed via setAccuracy.
private float accuracy;
// Similarity measure used to score every candidate against the queried word.
private StringDistance sd;
// Ordering passed to the SuggestWordQueue that collects suggestions.
private Comparator<SuggestWord> comparator;
/**
 * Creates a checker over {@code spellIndex} that scores candidates with {@code sd}
 * and orders them with {@code SuggestWordQueue.DEFAULT_COMPARATOR}.
 *
 * @param spellIndex directory holding the spell-check index
 * @param sd string-similarity measure used to score suggestions
 * @throws IOException propagated from the delegated constructor
 */
private CnSpellChecker(Directory spellIndex, StringDistance sd) throws IOException {
this(spellIndex, sd, SuggestWordQueue.DEFAULT_COMPARATOR);
}
/**
 * Creates a checker over {@code spellIndex} that scores candidates with a new
 * {@link LevensteinDistance}.
 *
 * @param spellIndex directory holding the spell-check index
 * @throws IOException propagated from the delegated constructor
 */
CnSpellChecker(Directory spellIndex) throws IOException {
this(spellIndex, new LevensteinDistance());
}
/**
 * Builds a fully configured checker.
 *
 * <p>Defaults applied here: start-gram boost 2.0, end-gram boost 1.0, accuracy 0.5,
 * and the checker starts in the "not closed" state. The index directory is attached
 * through {@code setSpellIndex}, which creates an empty index when none exists yet.
 *
 * @param spellIndex directory holding the spell-check index
 * @param sd string-similarity measure used to score suggestions
 * @param comparator ordering used when collecting suggestions
 * @throws IOException if attaching the spell index fails
 */
private CnSpellChecker(Directory spellIndex, StringDistance sd, Comparator<SuggestWord> comparator) throws IOException {
    // Locks and the closed flag must exist before setSpellIndex runs, since it uses them.
    searcherLock = new Object();
    modifyCurrentIndexLock = new Object();
    closed = false;

    // Scoring defaults.
    bStart = 2.0F;
    bEnd = 1.0F;
    accuracy = 0.5F;

    setSpellIndex(spellIndex);
    setStringDistance(sd);
    this.comparator = comparator;
}
/**
 * Attaches {@code spellIndexDir} as the spell-check index.
 *
 * <p>Runs under {@code modifyCurrentIndexLock}. If the directory does not contain an
 * index yet, an empty one is created by opening and immediately closing an
 * {@link IndexWriter}; the directory is then handed to {@code swapSearcher}.
 *
 * @param spellIndexDir directory to use as the spell-check index
 * @throws IOException if the index cannot be checked, created or opened
 */
private void setSpellIndex(Directory spellIndexDir) throws IOException {
    synchronized (modifyCurrentIndexLock) {
        ensureOpen();
        final boolean indexMissing = !DirectoryReader.indexExists(spellIndexDir);
        if (indexMissing) {
            // Opening and closing a writer is enough to lay down an empty index.
            new IndexWriter(spellIndexDir, new IndexWriterConfig(null)).close();
        }
        swapSearcher(spellIndexDir);
    }
}
/**
 * Replaces the comparator handed to the suggestion queue in later
 * {@code suggestSimilar} calls.
 *
 * @param comparator ordering to apply to collected suggestions
 */
public void setComparator(Comparator<SuggestWord> comparator) {
this.comparator = comparator;
}
/**
 * Returns the comparator currently used to order collected suggestions.
 *
 * @return the configured suggestion comparator
 */
public Comparator<SuggestWord> getComparator() {
    return comparator;
}
/**
 * Replaces the similarity measure used to score candidates against the queried word.
 *
 * @param sd the string-distance implementation to use from now on
 */
public void setStringDistance(StringDistance sd) {
this.sd = sd;
}
/**
 * Returns the similarity measure currently used to score candidates.
 *
 * @return the configured string-distance implementation
 */
public StringDistance getStringDistance() {
    return sd;
}
/**
 * Sets the default minimum score a candidate must reach to be suggested.
 * The value is stored as given; no range check is applied.
 *
 * @param acc new default accuracy threshold
 */
public void setAccuracy(float acc) {
    accuracy = acc;
}
/**
 * Returns the default minimum score a candidate must reach to be suggested.
 *
 * @return the current default accuracy threshold
 */
public float getAccuracy() {
    return accuracy;
}
/**
 * Suggests up to {@code numSug} words similar to {@code word}, without consulting
 * a user index and using the default accuracy.
 *
 * <p>Delegates to the reader/field overload with no reader, no field and
 * {@code SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX}.
 *
 * @param word the word to find suggestions for
 * @param numSug maximum number of suggestions
 * @return the suggested words
 * @throws IOException if the spell index cannot be read
 */
public String[] suggestSimilar(String word, int numSug) throws IOException {
    // Typed locals keep overload resolution identical to passing cast nulls.
    final IndexReader noReader = null;
    final String noField = null;
    return suggestSimilar(word, numSug, noReader, noField, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX);
}
/**
 * Suggests up to {@code numSug} words similar to {@code word}, without consulting
 * a user index, keeping only candidates whose score reaches {@code accuracy}.
 *
 * <p>Delegates to the full overload with no reader, no field and
 * {@code SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX}.
 *
 * @param word the word to find suggestions for
 * @param numSug maximum number of suggestions
 * @param accuracy minimum score a candidate must reach
 * @return the suggested words
 * @throws IOException if the spell index cannot be read
 */
public String[] suggestSimilar(String word, int numSug, float accuracy) throws IOException {
    // Typed locals keep overload resolution identical to passing cast nulls.
    final IndexReader noReader = null;
    final String noField = null;
    return suggestSimilar(word, numSug, noReader, noField, SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX, accuracy);
}
/**
 * Suggests up to {@code numSug} words similar to {@code word}, using this checker's
 * default accuracy as the minimum score.
 *
 * @param word the word to find suggestions for
 * @param numSug maximum number of suggestions
 * @param ir optional reader of the user index used for frequency checks
 * @param field optional field of the user index the word belongs to
 * @param suggestMode controls when and which suggestions are produced
 * @return the suggested words
 * @throws IOException if an index cannot be read
 */
public String[] suggestSimilar(String word, int numSug, IndexReader ir, String field, SuggestMode suggestMode) throws IOException {
    final float defaultAccuracy = this.accuracy;
    return suggestSimilar(word, numSug, ir, field, suggestMode, defaultAccuracy);
}
/**
 * Suggests up to {@code numSug} indexed words that resemble {@code word}.
 *
 * <p>Candidates are fetched from the spell index with an n-gram query in which the
 * word's first and last gram receive the start/end boosts. Each candidate is then
 * re-scored with the configured {@link StringDistance} and kept only if its score
 * reaches the threshold. Once the queue holds {@code numSug} entries the threshold
 * is raised to the score of the queue's top element.
 *
 * <p>If {@code ir} or {@code field} is {@code null}, the mode is forced to
 * {@code SUGGEST_ALWAYS} and no frequency checks are made. Otherwise candidates that
 * do not occur in {@code field} are dropped, and under {@code SUGGEST_MORE_POPULAR}
 * so are candidates occurring less often than {@code word} itself. Under
 * {@code SUGGEST_WHEN_NOT_IN_INDEX}, a word that already occurs in {@code field} is
 * returned unchanged as the only element.
 *
 * @param word the word to find suggestions for
 * @param numSug maximum number of suggestions
 * @param ir optional reader of the user index used for frequency checks
 * @param field optional field of the user index the word belongs to
 * @param suggestMode controls when and which suggestions are produced
 * @param accuracy minimum score a candidate must reach
 * @return the suggestions; the array is filled from its last slot to its first in
 *     the order the queue pops its elements
 * @throws IOException if an index cannot be read
 */
public String[] suggestSimilar(String word, int numSug, IndexReader ir, String field, SuggestMode suggestMode, float accuracy) throws IOException {
    final IndexSearcher activeSearcher = this.obtainSearcher();
    try {
        // Without both a reader and a field there is nothing to compare frequencies
        // against, so the request degrades to unconditional suggestion.
        if (ir == null || field == null || suggestMode == SuggestMode.SUGGEST_ALWAYS) {
            suggestMode = SuggestMode.SUGGEST_ALWAYS;
            ir = null;
            field = null;
        }

        final int wordLength = word.length();
        int wordFreq = 0;
        if (ir != null) {
            wordFreq = ir.docFreq(new Term(field, word));
        }
        final int goalFreq;
        if (suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) {
            goalFreq = wordFreq;
        } else {
            goalFreq = 0;
        }

        // The word is already known to the user index: nothing to correct.
        if (suggestMode == SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX && wordFreq > 0) {
            return new String[]{word};
        }

        // Build one SHOULD clause per gram, for every gram size allowed for this length.
        final BooleanQuery.Builder gramQuery = new BooleanQuery.Builder();
        for (int size = getMin(wordLength); size <= getMax(wordLength); ++size) {
            final String[] grams = formGrams(word, size);
            if (grams.length == 0) {
                continue;
            }
            if (this.bStart > 0.0F) {
                add(gramQuery, "start" + size, grams[0], this.bStart);
            }
            if (this.bEnd > 0.0F) {
                add(gramQuery, "end" + size, grams[grams.length - 1], this.bEnd);
            }
            final String gramField = "gram" + size;
            for (int g = 0; g < grams.length; ++g) {
                add(gramQuery, gramField, grams[g]);
            }
        }

        // Over-fetch by a factor of ten; the string distance does the final ranking.
        final int maxHits = 10 * numSug;
        final ScoreDoc[] hits = activeSearcher.search(gramQuery.build(), maxHits).scoreDocs;
        final SuggestWordQueue ranked = new SuggestWordQueue(numSug, this.comparator);
        final int limit = Math.min(hits.length, maxHits);

        float threshold = accuracy;
        // The same instance is reused until it is actually inserted into the queue.
        SuggestWord candidate = new SuggestWord();
        for (int h = 0; h < limit; ++h) {
            candidate.string = activeSearcher.doc(hits[h].doc).get("word");
            candidate.score = this.sd.getDistance(word, candidate.string);
            // Written as a negated >= so that NaN scores are skipped as well.
            if (!(candidate.score >= threshold)) {
                continue;
            }
            if (ir != null) {
                candidate.freq = ir.docFreq(new Term(field, candidate.string));
                final boolean lessPopular = suggestMode == SuggestMode.SUGGEST_MORE_POPULAR && goalFreq > candidate.freq;
                if (lessPopular || candidate.freq < 1) {
                    continue;
                }
            }
            ranked.insertWithOverflow(candidate);
            if (ranked.size() == numSug) {
                // Queue is full: later candidates must reach the current top's score.
                threshold = ranked.top().score;
            }
            candidate = new SuggestWord();
        }

        final String[] suggestions = new String[ranked.size()];
        int slot = suggestions.length;
        while (slot > 0) {
            suggestions[--slot] = ranked.pop().string;
        }
        return suggestions;
    } finally {
        this.releaseSearcher(activeSearcher);
    }
}
/**
 * Adds an optional (SHOULD) boosted term clause to {@code q}.
 *
 * @param q query builder to extend
 * @param name field name of the term
 * @param value term text
 * @param boost boost applied to the term query
 */
private static void add(BooleanQuery.Builder q, String name, String value, float boost) {
    final Term term = new Term(name, value);
    final Query boosted = new BoostQuery(new TermQuery(term), boost);
    final BooleanClause clause = new BooleanClause(boosted, BooleanClause.Occur.SHOULD);
    q.add(clause);
}
/**
 * Adds an optional (SHOULD) unboosted term clause to {@code q}.
 *
 * @param q query builder to extend
 * @param name field name of the term
 * @param value term text
 */
private static void add(BooleanQuery.Builder q, String name, String value) {
    final Term term = new Term(name, value);
    final BooleanClause clause = new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD);
    q.add(clause);
}
/**
 * Splits {@code text} into every contiguous substring of length {@code ng}
 * (its n-grams), in left-to-right order. Lengths are counted in {@code char}s,
 * as with {@link String#substring(int, int)}.
 *
 * <p>When the text is too short to hold a single gram an empty array is returned.
 * This also covers {@code ng > text.length() + 1} (for example an empty word with
 * bigrams), which would otherwise request a negative array size and throw
 * {@link NegativeArraySizeException}.
 *
 * @param text the string to split; must not be {@code null}
 * @param ng gram length
 * @return the grams of {@code text}, or an empty array if {@code text} is shorter than {@code ng}
 */
private static String[] formGrams(String text, int ng) {
    final int gramCount = text.length() - ng + 1;
    if (gramCount <= 0) {
        // Text shorter than one gram: nothing to emit.
        return new String[0];
    }
    final String[] grams = new String[gramCount];
    for (int start = 0; start < gramCount; ++start) {
        grams[start] = text.substring(start, start + ng);
    }
    return grams;
}
public void clearIndex() throws IOException {
synchronized (this.modifyCurrentIndexLock) {
this.ensureOpen();
Directory dir = this.spellIndex;
IndexWriter writer = new IndexWriter(d
昨夜听雨
- 粉丝: 2
- 资源: 22
最新资源
- 快速定制中国传统节日头像(源码)
- hcia 复习内容的实验
- 准Z源光伏并网系统MATLAB仿真模型,采用了三次谐波注入法SPWM调制,具有更高的电压利用效率 并网部分采用了电压外环电流内环 电池部分采用了扰动观察法,PO Z源并网和逆变器研究方向的同学可
- 海面目标检测跟踪数据集.zip
- 欧美风格, 节日主题模板
- 西门子1200和三菱FXU通讯程序
- 11种概率分布的拟合与ks检验,可用于概率分析,可靠度计算等领域 案例中提供11种概率分布,具体包括:gev、logistic、gaussian、tLocationScale、Rayleigh、Log
- 机械手自动排列控制PLC与触摸屏程序设计
- uDDS源程序publisher
- 中国风格, 节日 主题, PPT模板
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈