/******************************************
http://www.sqlet.com
mail:199909@gmail.com
中文分词测试版
author:linfj
词典文件:sqlet.dict
*******************************************/
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
// Home-grown boolean: BOOL (and its alias bool) is a plain char that holds TRUE or FALSE.
#define BOOL char
#define bool BOOL
#define TRUE 1
#define FALSE 0
// Longest word, in bytes; the strWord buffers of WORD_NODE and SEG_NODE are MAX_CWORD_LEN+1 chars.
#define MAX_CWORD_LEN 10 // longest word
#define MAX_SWORD_LEN 256 // longest sentence
// Size of each of the two dimensions of the CH_DICT and SEG_LIST bucket tables.
#define MAX_CDIM 90
// Words that are not indexed: single letters, single digits and common English words.
// The order of the entries is unchanged; only the layout differs.
char *arrayEnglishStop[] = {
    /* single letters */
    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
    "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",

    /* single digits */
    "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",

    /* common words */
    "about",  "above",  "after", "again",     "all",    "also",  "am",    "an",
    "and",    "any",    "are",   "as",        "at",     "back",  "be",    "been",
    "before", "behind", "being", "below",     "but",    "by",    "can",   "click",
    "do",     "does",   "done",  "each",      "else",   "etc",   "ever",  "every",
    "few",    "for",    "from",  "generally", "get",    "go",    "gone",  "has",
    "have",   "hello",  "here",  "how",       "if",     "in",    "into",  "is",
    "just",   "keep",   "later", "let",       "like",   "lot",   "lots",  "made",
    "make",   "makes",  "many",  "may",       "me",     "more",  "most",  "much",
    "must",   "my",     "need",  "no",        "not",    "now",   "of",    "often",
    "on",     "only",   "or",    "other",     "others", "our",   "out",   "over",
    "please", "put",    "so",    "some",      "such",   "than",  "that",  "the",
    "their",  "them",   "then",  "there",     "these",  "they",  "this",  "try",
    "to",     "up",     "us",    "very",      "want",   "was",   "we",    "well",
    "what",   "when",   "where", "which",     "why",    "will",  "with",  "within",
    "you",    "your",   "yourself"
};
// Chinese characters or words that do not need to be indexed when building the dictionary index.
// The order of the entries is unchanged; only the layout differs.
char *arrayChineseStop[] = {
    "的", "吗", "么", "啊", "说", "对", "在",
    "和", "是", "被", "最", "所", "那", "这",
    "有", "将", "会", "与", "於", "于", "他",
    "她", "它", "您", "为",
    "欢迎"
};
// Full-width ASCII: all of it must be converted to half-width English characters;
// other symbols such as , . ; / | are to be added later.
// Everything whose zone code is 163 must be converted to ASCII.
//163 ! " # ¥ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | }  ̄
// ! # $ % & ' ( ) % + , - . / 0 1 2
//
// The elements are `unsigned char *`, but a string literal is an array of plain
// `char`, so every entry is cast explicitly; without the cast the initializer is
// an incompatible-pointer-type conversion that compilers diagnose.
// NOTE(review): the entries appear here as half-width characters although the
// comment above talks about full-width forms — confirm the file's encoding.
#define WIDE_ASCII_ENTRY(s) ((unsigned char *)(s))
unsigned char *arrayWideAscii[] = {
    /* lower case, then '.' */
    WIDE_ASCII_ENTRY("a"), WIDE_ASCII_ENTRY("b"), WIDE_ASCII_ENTRY("c"), WIDE_ASCII_ENTRY("d"),
    WIDE_ASCII_ENTRY("e"), WIDE_ASCII_ENTRY("f"), WIDE_ASCII_ENTRY("g"), WIDE_ASCII_ENTRY("h"),
    WIDE_ASCII_ENTRY("i"), WIDE_ASCII_ENTRY("j"), WIDE_ASCII_ENTRY("k"), WIDE_ASCII_ENTRY("l"),
    WIDE_ASCII_ENTRY("m"), WIDE_ASCII_ENTRY("n"), WIDE_ASCII_ENTRY("o"), WIDE_ASCII_ENTRY("p"),
    WIDE_ASCII_ENTRY("q"), WIDE_ASCII_ENTRY("r"), WIDE_ASCII_ENTRY("s"), WIDE_ASCII_ENTRY("t"),
    WIDE_ASCII_ENTRY("u"), WIDE_ASCII_ENTRY("v"), WIDE_ASCII_ENTRY("w"), WIDE_ASCII_ENTRY("x"),
    WIDE_ASCII_ENTRY("y"), WIDE_ASCII_ENTRY("z"), WIDE_ASCII_ENTRY("."),
    /* upper case, then '-' */
    WIDE_ASCII_ENTRY("A"), WIDE_ASCII_ENTRY("B"), WIDE_ASCII_ENTRY("C"), WIDE_ASCII_ENTRY("D"),
    WIDE_ASCII_ENTRY("E"), WIDE_ASCII_ENTRY("F"), WIDE_ASCII_ENTRY("G"), WIDE_ASCII_ENTRY("H"),
    WIDE_ASCII_ENTRY("I"), WIDE_ASCII_ENTRY("J"), WIDE_ASCII_ENTRY("K"), WIDE_ASCII_ENTRY("L"),
    WIDE_ASCII_ENTRY("M"), WIDE_ASCII_ENTRY("N"), WIDE_ASCII_ENTRY("O"), WIDE_ASCII_ENTRY("P"),
    WIDE_ASCII_ENTRY("Q"), WIDE_ASCII_ENTRY("R"), WIDE_ASCII_ENTRY("S"), WIDE_ASCII_ENTRY("T"),
    WIDE_ASCII_ENTRY("U"), WIDE_ASCII_ENTRY("V"), WIDE_ASCII_ENTRY("W"), WIDE_ASCII_ENTRY("X"),
    WIDE_ASCII_ENTRY("Y"), WIDE_ASCII_ENTRY("Z"), WIDE_ASCII_ENTRY("-")
};
#undef WIDE_ASCII_ENTRY
// Punctuation marks (and Chinese punctuation). Pay attention to the three
// symbols + - " : searching relies on them for XOR-style condition checks.
// The array holds 30 single characters and is NOT NUL-terminated.
char arrayAsciiSymbol[] = {
    '!',  '\\', '*', '(', ')',
    '-',  '_',  '+', '=', '{',
    '}',  '[',  ']', ':', ';',
    '\'', '"',  ',', '<', '>',
    '.',  '?',  '/', '|', '@',
    '#',  '$',  '%', '^', '&'
};
// BIG5-to-GB mapping: convert all BIG5 text to GB before processing it
//$arrayBig5ToG = array ();
// UTF-8 conversion
// Chinese-character dictionary.
// One dictionary entry: a node of a singly linked list of words.
typedef struct _WORD_NODE
{
// The word as a NUL-terminated string of at most MAX_CWORD_LEN bytes.
char strWord[MAX_CWORD_LEN+1];
// todo: separate arrays for two-, three-, four- and five-character words could be added to make lookups faster
// Next entry in the same list, or NULL at the end of the list.
struct _WORD_NODE *nextWord;
}WORD_NODE;
// Segmentation result.
// One segmented word: a node of a singly linked list of words.
typedef struct _SEG_NODE
{
// The word as a NUL-terminated string of at most MAX_CWORD_LEN bytes.
char strWord[MAX_CWORD_LEN+1];
// Next word in the same list, or NULL at the end of the list.
struct _SEG_NODE *nextWord;
}SEG_NODE;
// Dictionary bucket table. addDictWord and searchWord select the bucket as
// CH_DICT[first byte of the word - 161][last byte of the word - 161].
// Being a file-scope object, every lstWord starts out as NULL.
struct _CH_DICT {
// Head of the list of dictionary words stored in this bucket (NULL when empty).
WORD_NODE *lstWord;
}CH_DICT[MAX_CDIM][MAX_CDIM];
// Segmentation-result bucket table. addSegWord selects the bucket as
// SEG_LIST[first byte of the word % MAX_CDIM][last byte of the word % MAX_CDIM].
// Being a file-scope object, every lstWord starts out as NULL.
struct _SEG_LIST {
// Head of the list of segmented words stored in this bucket (NULL when empty).
SEG_NODE *lstWord;
}SEG_LIST[MAX_CDIM][MAX_CDIM];
/* Synonym dictionary (example synonym pairs):
咖啡馆,咖啡屋
神六,神舟六号
synonymous
*/
/* Returns non-zero when c is one of the characters strTrim removes. */
static int strTrimIsBlank(char c)
{
    return c == ' ' || c == '\r' || c == '\n' || c == '\t';
}

/*
 * Removes leading and trailing blanks (space, '\t', '\r', '\n') from str,
 * in place. Blanks between non-blank characters are kept.
 *
 * str:     NUL-terminated, writable string; may be NULL.
 * returns: str itself (NULL when str is NULL).
 *
 * The function never writes past the original terminator. In particular an
 * empty string is left untouched (the previous implementation stored a NUL
 * at str[1], one byte beyond a 1-byte buffer).
 */
char *strTrim(char str[])
{
    size_t begin = 0;
    size_t end;
    size_t kept;

    if (str == NULL)
        return NULL;

    /* Skip leading blanks; the loop stops at the terminator at the latest. */
    while (strTrimIsBlank(str[begin]))
        begin++;

    /* Walk back from the terminator over trailing blanks. */
    end = begin + strlen(str + begin);
    while (end > begin && strTrimIsBlank(str[end - 1]))
        end--;

    kept = end - begin;
    if (begin > 0)
        memmove(str, str + begin, kept); /* source and destination overlap */
    str[kept] = '\0';
    return str;
}
int addDictWord(char *strWord , int len)
{
unsigned char firstChar,lastChar;
WORD_NODE* curLst;
WORD_NODE* newWord ,*curTmp ;
firstChar = strWord[0] ;
lastChar = strWord[len-1];
if (firstChar < 161 || lastChar < 161 ) //非汉字或是汉字的全角符号
return -1;
newWord = (WORD_NODE*)malloc(sizeof(WORD_NODE));
if ( newWord == NULL)
return -1;
strcpy(newWord->strWord,strWord);
newWord->nextWord = NULL;
firstChar -= 161 ;
lastChar -= 161 ;
curLst = CH_DICT[firstChar][lastChar].lstWord;
if( curLst == NULL) //reinit list;
{
CH_DICT[firstChar][lastChar].lstWord = newWord ;
return 0;
}
curTmp = curLst ;
while(curTmp -> nextWord != NULL )
{
curTmp = curTmp->nextWord;
}
curTmp -> nextWord = newWord ;
return 0;
}
int addSegWord(unsigned char *strWord , int len)
{
unsigned char firstChar,lastChar;
SEG_NODE* curLst;
SEG_NODE* newWord ,*curTmp ;
firstChar = strWord[0] ;
lastChar = strWord[len-1];
//查看是否已经存在
firstChar %= MAX_CDIM ;
lastChar %= MAX_CDIM ;
curLst = SEG_LIST[firstChar][lastChar].lstWord;
curTmp = curLst ;
while(curTmp != NULL )
{
if ( strcasecmp(curTmp->strWord,(char *)strWord) == 0)
return 0; //已经存在
curTmp = curTmp->nextWord;
}
newWord = (SEG_NODE*)malloc(sizeof(SEG_NODE));
if ( newWord == NULL)
return -1;
strcpy(newWord->strWord,(char *)strWord);
newWord->nextWord = NULL;
if( curLst == NULL) //reinit list;
{
SEG_LIST[firstChar][lastChar].lstWord = newWord ;
return 0;
}
curTmp = curLst ;
while(curTmp -> nextWord != NULL )
{
curTmp = curTmp->nextWord;
}
curTmp -> nextWord = newWord ;
return 0;
}
/*
 * Releases every node stored in CH_DICT and resets all bucket heads to NULL,
 * so the table can be filled again afterwards.
 *
 * returns: always 0 (the function is declared int but previously fell off the
 *          end without returning a value).
 */
int freeDict()
{
    int row, col;
    WORD_NODE *node, *next;

    for (row = 0; row < MAX_CDIM; row++) {
        for (col = 0; col < MAX_CDIM; col++) {
            node = CH_DICT[row][col].lstWord;
            while (node != NULL) {
                next = node->nextWord; /* read the link before the node is freed */
                free(node);
                node = next;
            }
            CH_DICT[row][col].lstWord = NULL;
        }
    }
    return 0;
}
/*
 * Prints and releases the segmentation result held in SEG_LIST.
 *
 * Buckets are visited in row-major order. Every word is written to stdout
 * followed by '|', and a newline is written after the last word of each
 * non-empty bucket. Each node is freed after it has been printed and every
 * bucket head is reset to NULL.
 *
 * returns: always 0 (the function is declared int but previously fell off the
 *          end without returning a value).
 */
int freeSeg()
{
    int row, col;
    SEG_NODE *node, *next;

    for (row = 0; row < MAX_CDIM; row++) {
        for (col = 0; col < MAX_CDIM; col++) {
            node = SEG_LIST[row][col].lstWord;
            while (node != NULL) {
                next = node->nextWord; /* read the link before the node is freed */
                printf("%s|", node->strWord);
                if (next == NULL)
                    printf("\n");
                free(node);
                node = next;
            }
            SEG_LIST[row][col].lstWord = NULL;
        }
    }
    return 0;
}
BOOL searchWord( unsigned char *strWord,int len )
{
WORD_NODE *curLst,*curTmp;
unsigned char firstChar,lastChar;
firstChar = strWord[0] ;
lastChar = strWord[len-1];
firstChar -= 161 ;
lastChar -= 161 ;
curLst = CH_DICT[firstChar][lastChar].lstWord;
curTmp = curLst;
while ( curTmp != NULL ) {
if ( strcmp((char *)strWord,curTmp->strWord) == 0)
return TRUE;
curTmp = curTmp -> nextWord ;
}
return FALSE;
}
int segWord ( unsigned char *strText , int iWordLen , BOOL bChinese )
{
int i = 0 ,j = 0 , k = 0 , l = 0;
unsigned char strChar[MAX_CWORD_LEN+1],strChar1[5],strChar2[5],strChar3[7];
BOOL bFound = FALSE;
i = iWordLen ;
if ( FALSE == bChinese ) { //英文
//检查 是否在stop数组里
addSegWord(strText,iWordLen);
return 0;
}
while ( i > 1 ) {
for ( j = MAX_CWORD_LEN ; j >= 2 ; j -=2 ) { //最长xxx个汉字
if ( i < j )
continue;
l = 0 ;
for ( k = i - j ,l = 0 ; k < i ; k ++,l ++ )
strChar[l] = strText[k];
strChar[l] = '\0';
if ( 8 == j ) {//4个字时,无论哪种情况下都要比较前两个及后两个,防止"后三字是一个词,但第一个字跟再前一个字是一个词",同时可以捕获成词由两个词组成的
//如,让“我看看怒火凤凰",应该是"怒火"+凤凰,而不是"怒"+"火凤凰"
strChar1[0] = strChar[0];strChar1[1] = strChar[1];
strChar1[2] = strChar[2];strChar1[3] = strChar[3];
strChar1[4] = '\0';
strChar2[0] = strChar[4];s
- 1
- 2
- 3
前往页