/**
*
*/
package ksMethod;
import java.io.IOException;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.Vector;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexReader;
/**
* @author Xia Ruijun
*
*/
public class CreateIndexWord {
/**
 * Extracts domain index terms: given a set of sample documents, outputs
 * the index words obtained after document preprocessing.
 * NOTE(review): this Javadoc describes {@code FiledIndexWord} below, not
 * the field it is attached to — consider moving it onto the method.
 */
// Lucene analyzer — not referenced anywhere in the visible code
// (tokenization is done by fenci() via StringTokenizer instead).
// TODO(review): confirm whether this field is still needed.
StandardAnalyzer analyzer = new StandardAnalyzer();
/**
 * Extracts the distinct index words from a set of sample documents.
 * Each document is tokenized via {@link #fenci(String)}; only tokens whose
 * length is between 4 and 8 characters (inclusive) are kept, lower-cased,
 * and de-duplicated.
 *
 * @param SampleDocument the sample documents to preprocess
 * @return the distinct index words, in unspecified order
 */
public String[] FiledIndexWord(String[] SampleDocument) {
	HashSet<String> indexWords = new HashSet<String>();
	for (int i = 0; i < SampleDocument.length; i++) {
		String[] tokens = fenci(SampleDocument[i]);
		for (int j = 0; j < tokens.length; j++) {
			// Keep only mid-length tokens (4..8 chars) to drop noise words.
			if (tokens[j].length() > 3 && tokens[j].length() < 9) {
				indexWords.add(tokens[j].toLowerCase());
			}
		}
	}
	// Collection.toArray replaces the original raw-typed Iterator and
	// manual index-by-index copy loop (which caused an unchecked warning).
	return indexWords.toArray(new String[indexWords.size()]);
}
// 分词程序
public String[] fenci(String source) {
/* 分隔符的集合 */
String delimiters = " \t\n\r\f~!@#$%^&*()_ |`1234567890-=\\{}[]:\";'<>?,./'";
/* 根据分隔符分词 */
StringTokenizer stringTokenizer = new StringTokenizer(source,
delimiters);
Vector<String> vector = new Vector<String>();
/* 根据大写首字母分词 */
- 1
- 2
前往页