Lucene + 中文分词

lucene

需积分: 10 · 2 下载量 · 30 浏览量 · 2014-02-24 20:16:44 上传 · 评论 · 收藏 · 6KB · TXT · 举报

温馨提示

试读

7页

Lucene 与中文分词的结合

资源推荐

资源详情

资源评论

package com.xh.analyzer;

import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;

import ICTCLAS.I3S.AC.ICTCLAS50;

public class ICTCLASAnalyzer extends Analyzer {
private ICTCLAS50 icta;
private volatile boolean initialized = false;
/**
 * Creates the analyzer and initializes the underlying ICTCLAS50 segmenter.
 *
 * <p>Initialization is attempted with the init path {@code "."}. If it fails,
 * {@code "Init Fail!"} is printed to standard output and the constructor returns
 * with {@code initialized} left {@code false}. Otherwise the POS tag set is
 * selected, the user dictionary {@code userdict.txt} is imported, and
 * {@code initialized} is set to {@code true}.
 *
 * @throws UnsupportedEncodingException if the GB2312 charset is not supported by this JVM
 */
public ICTCLASAnalyzer() throws UnsupportedEncodingException {
    icta = new ICTCLAS50();

    // Initialize the segmenter.
    String initPath = ".";
    if (!icta.ICTCLAS_Init(initPath.getBytes("GB2312"))) {
        System.out.println("Init Fail!");
        return;
    }

    // Select the POS tag set: 0 = ICT level-2, 1 = ICT level-1,
    // 2 = PKU level-2, 3 = PKU level-1.
    icta.ICTCLAS_SetPOSmap(2);

    // Import the user dictionary. The first argument is the dictionary path, the
    // second is the encoding type of the dictionary; the call returns the number
    // of user words imported, which is not needed here.
    // The path is encoded with the same explicit charset as the init path rather
    // than the platform default, so the bytes handed to the native library do not
    // depend on the JVM's default encoding.
    String userDictPath = "userdict.txt";
    icta.ICTCLAS_ImportUserDictFile(userDictPath.getBytes("GB2312"), 0);

    initialized = true;
}
public List<String> tokenizeReader(Reader reader) {
List<String> result = new ArrayList<String>(1000);
try {
StringBuffer contentbuffer = new StringBuffer();
char[] temp = new char[1024];
int size = 0;
while ((size = reader.read(temp, 0, 1024)) != -1) {
String tempstr = new String(temp, 0, size);
contentbuffer.append(tempstr); }
byte nativeBytes[] = icta.ICTCLAS_ParagraphProcess(contentbuffer.toString().getBytes("GB2312"), 2, 1);
String nativeStr = new String(nativeBytes, 0, nativeBytes.length, "GB2312");
System.out.println("分词结果： " + nativeStr);
//进行词用词过滤
String[] terms=nativeStr.split("\\s+");
int pos;
String term,type;
for (String string : terms) {
pos=string.lastIndexOf('/');
if(pos==-1)continue;
term=string.substring(0,pos);
type=string.substring(pos+1, string.length());

剩余6页未读，继续阅读

评论收藏

内容反馈

资源评论