/**
 * IK Chinese word segmentation, version 5.0
 * IK Analyzer release 5.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
 * Source code provided by Lin Liangyi ([email protected])
 * Copyright 2012, Oolong Studio
*
*/
package org.wltea.analyzer.core;
import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.dic.Dictionary;
/**
*
 * Segmenter context state
*
*/
class AnalyzeContext {
	//default buffer size
	private static final int BUFF_SIZE = 3072;
	//threshold at which the buffer is considered nearly exhausted
	private static final int BUFF_EXHAUST_CRITICAL = 48;
	//character read buffer
	private char[] segmentBuff;
	//character type array
	private int[] charTypes;
	//total length of the Reader content that has already been analyzed;
	//when the input is analyzed in several passes, this accumulates the offset of the current segmentBuff relative to the start of the reader
	private int buffOffset;
	//current position pointer within the buffer
	private int cursor;
	//length of the most recently read, processable text
	private int available;
	//sub-segmenter locks
	//a non-empty set means some sub-segmenter is still occupying segmentBuff
	private Set<String> buffLocker;
	//raw segmentation results, before ambiguity resolution
	private QuickSortSet orgLexemes;
	//index table of LexemePath objects keyed by position
	private Map<Integer , LexemePath> pathMap;
	//final segmentation result list
	private LinkedList<Lexeme> results;
	//analyzer configuration
	private Configuration cfg;
public AnalyzeContext(Configuration cfg){
this.cfg = cfg;
this.segmentBuff = new char[BUFF_SIZE];
this.charTypes = new int[BUFF_SIZE];
this.buffLocker = new HashSet<String>();
this.orgLexemes = new QuickSortSet();
this.pathMap = new HashMap<Integer , LexemePath>();
this.results = new LinkedList<Lexeme>();
}
int getCursor(){
return this.cursor;
}
//
// void setCursor(int cursor){
// this.cursor = cursor;
// }
char[] getSegmentBuff(){
return this.segmentBuff;
}
char getCurrentChar(){
return this.segmentBuff[this.cursor];
}
int getCurrentCharType(){
return this.charTypes[this.cursor];
}
int getBufferOffset(){
return this.buffOffset;
}
/**
	 * Fill segmentBuff according to the current context state
	 * @param reader
	 * @return the length of the valid (yet-to-be-analyzed) text
	 * @throws IOException
*/
int fillBuffer(Reader reader) throws IOException{
int readCount = 0;
if(this.buffOffset == 0){
			//first read from the reader
readCount = reader.read(segmentBuff);
}else{
int offset = this.available - this.cursor;
if(offset > 0){
				//more was read last time than has been processed; copy the unprocessed tail to the head of segmentBuff
System.arraycopy(this.segmentBuff , this.cursor , this.segmentBuff , 0 , offset);
readCount = offset;
}
			//continue reading from the reader, starting at onceReadIn - onceAnalyzed, to fill the rest of segmentBuff
readCount += reader.read(this.segmentBuff , offset , BUFF_SIZE - offset);
}
		//record the number of usable characters obtained from the last read of the Reader
this.available = readCount;
		//reset the current cursor
this.cursor = 0;
return readCount;
}
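	/*
	 * Worked example of the refill arithmetic above (the numbers are illustrative, not from
	 * the original source): suppose the previous buffer was fully loaded (available == 3072)
	 * and analysis was interrupted at cursor == 3040. Then offset == 3072 - 3040 == 32, the
	 * 32 characters at positions 3040..3071 are copied to positions 0..31, and up to
	 * BUFF_SIZE - 32 == 3040 new characters are read from the reader into positions 32..3071.
	 */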
/**
	 * Initialize the buffer cursor and process the first character
*/
void initCursor(){
this.cursor = 0;
this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
}
/**
	 * Advance the cursor by one position and process the character there.
	 * Returns true on success; returns false if the cursor is already at the
	 * end of the buffer and cannot advance.
*/
boolean moveCursor(){
if(this.cursor < this.available - 1){
this.cursor++;
this.segmentBuff[this.cursor] = CharacterUtil.regularize(this.segmentBuff[this.cursor]);
this.charTypes[this.cursor] = CharacterUtil.identifyCharType(this.segmentBuff[this.cursor]);
return true;
}else{
return false;
}
}
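	/*
	 * Illustrative sketch (not part of this class): scanning every processable character in
	 * the current buffer with the cursor API above, assuming fillBuffer() has just returned
	 * a positive count. The loop body is a placeholder for whatever the sub-segmenters do
	 * with each character.
	 *
	 *   context.initCursor();
	 *   do{
	 *       char c = context.getCurrentChar();
	 *       int type = context.getCurrentCharType();
	 *       //hand c / type to the sub-segmenters here
	 *   }while(context.moveCursor());
	 */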
/**
	 * Mark the current segmentBuff as locked.
	 * Adds the name of the sub-segmenter occupying segmentBuff.
* @param segmenterName
*/
void lockBuffer(String segmenterName){
this.buffLocker.add(segmenterName);
}
/**
	 * Remove the given sub-segmenter name, releasing its hold on segmentBuff
* @param segmenterName
*/
void unlockBuffer(String segmenterName){
this.buffLocker.remove(segmenterName);
}
/**
	 * The buffer is locked as long as buffLocker contains any segmenter name.
	 * @return boolean whether the buffer is locked
*/
boolean isBufferLocked(){
return this.buffLocker.size() > 0;
}
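	/*
	 * Illustrative use of the buffer lock (the segmenter name is hypothetical): a sub-segmenter
	 * holding a partial match that may extend past the current cursor locks the buffer so it is
	 * not shifted and refilled underneath it, and unlocks once the match is resolved.
	 *
	 *   context.lockBuffer("SOME_SEGMENTER");    //partial match pending
	 *   //... keep matching on the following characters ...
	 *   context.unlockBuffer("SOME_SEGMENTER");  //match resolved
	 *   //isBufferLocked() is now false unless another segmenter still holds a lock
	 */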
/**
	 * Check whether the current segmentBuff has been fully consumed,
	 * i.e. the cursor has moved to the end of segmentBuff at this.available - 1
* @return
*/
boolean isBufferConsumed(){
return this.cursor == this.available - 1;
}
/**
	 * Check whether segmentBuff needs to be refilled with new data.
	 *
	 * The current loop should be interrupted (so the buffer can be shifted and refilled)
	 * when all of the following hold:
	 * 1. available == BUFF_SIZE : the buffer was fully loaded
	 * 2. cursor < available - 1 && cursor > available - BUFF_EXHAUST_CRITICAL : the cursor is inside the exhaustion-critical zone
	 * 3. !isBufferLocked() : no sub-segmenter is occupying the buffer
* @return
*/
boolean needRefillBuffer(){
return this.available == BUFF_SIZE
&& this.cursor < this.available - 1
&& this.cursor > this.available - BUFF_EXHAUST_CRITICAL
&& !this.isBufferLocked();
}
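	/*
	 * With the defaults above (BUFF_SIZE = 3072, BUFF_EXHAUST_CRITICAL = 48) this means a
	 * refill is requested only when the buffer was fully loaded (available == 3072), the
	 * cursor lies in the critical zone 3024 < cursor < 3071, and no sub-segmenter holds a lock.
	 */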
/**
	 * Accumulate the offset of the current segmentBuff relative to the start of the reader
*/
void markBufferOffset(){
this.buffOffset += this.cursor;
}
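	/*
	 * Illustrative numbers: if the first buffer pass is interrupted at cursor == 3040,
	 * markBufferOffset() advances buffOffset from 0 to 3040; after the next fillBuffer()
	 * call, the absolute reader position of segmentBuff[i] is buffOffset + i.
	 */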
/**
	 * Add a lexeme to the raw segmentation result set
* @param lexeme
*/
void addLexeme(Lexeme lexeme){
this.orgLexemes.addLexeme(lexeme);
}
/**
	 * Add a segmentation result path to the
	 * path begin position ---> path mapping table
* @param path
*/
void addLexemePath(LexemePath path){
if(path != null){
this.pathMap.put(path.getPathBegin(), path);
}
}
/**
	 * Return the raw segmentation results
* @return
*/
QuickSortSet getOrgLexemes(){
return this.orgLexemes;
}
/**
	 * Process CJK characters of unknown type
*/
void processUnkownCJKChar(){
int index = 0;
for( ; index < this.available ;){
			//skip punctuation and other useless characters
if(CharacterUtil.CHAR_USELESS == this.charTypes[index]){
index++;
continue;
}
			//look up the LexemePath for this index in pathMap
LexemePath path = this.pathMap.get(index);
if(path != null){
				//emit the lexemes in this LexemePath into the results list
Lexeme l = path.pollFirst();
while(l != null){
this.results.add(l);
					//move index past the end of this lexeme
					index = l.getBegin() + l.getLength();
					//take the next lexeme from the path
					l = path.pollFirst();
				}
			}else{
				//no LexemePath starts at this index; advance past the single character
				index++;
			}
		}
		//clear the path map for the next analysis round
		this.pathMap.clear();
	}
}