#include <assert.h>
#include <io.h>
#include <locale.h>
#include <malloc.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
/* One entry of the word table: a word and how many times it has been seen. */
typedef struct word_item{
wchar_t *word;   /* NUL-terminated wide string holding the entry text (heap copy made by creatdic) */
unsigned freq;   /* occurrence count; creatdic sets it to 1 on first sight and increments it afterwards */
}WordItem;
/* Per-character occurrence counters for code points 19968..40869 (20902 slots).
   LoadUnicText increments wfreqs[ch-19968] for every such character it reads. */
int wfreqs[20902]={0};
/* Same size as wfreqs. Not referenced in this part of the file.
   NOTE(review): presumably relative frequencies derived from wfreqs - confirm against its users. */
double wfreqfs[20902]={0};
/* Word table; allocated and filled by creatdic. */
WordItem *items=NULL;
/* Not referenced by any live code in this part of the file.
   NOTE(review): presumably an upper bound for the number of entries in items - confirm. */
int wordcount=2000000;
/* Punctuation marks, list-numbering symbols, digits and whitespace.
   Not referenced in this part of the file; presumably a delimiter set - verify against its users. */
wchar_t *puncs=L"⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ*-./⑴⑵⑶⑷⑸⑹⑺⑻⑼⑽㈠㈡㈢㈣㈤㈥㈦㈧㈨㈩$‰∑§αβγ¥℃∏※±×÷□◆▲●★【】『』①②③④⑤⑥⑦⑧⑨⑩⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑12345678901234567890〈〉○●△▲『』±%×~!@#$%^&*()_+|=][';,.?:\"<>{}!·#¥%……—*‘’()、——+|{}[]:“”;《》,。?\n\r\t ";
/* Whitespace only: newline, tab, carriage return and space. Not referenced in this part of the file. */
wchar_t *puncs1=L"\n\t\r ";
int freq_cmp(const void *p,const void *q)//词频排序函数
{
WordItem *wp=(WordItem*)p,
*wq=(WordItem*)q;
return (wq->freq)-(wp->freq);
}
/*
 * qsort comparator for arrays whose elements are pointers to wide strings:
 * orders the pointed-to strings with wcscmp (code-unit order).
 *
 * p, q   pointers to the two array elements (each element is a wchar_t*).
 * Returns the wcscmp result for the two strings.
 */
int wcs_cmp(const void *p,const void *q)
{
    const wchar_t *const *left = (const wchar_t *const *)p;
    const wchar_t *const *right = (const wchar_t *const *)q;

    return wcscmp(*left, *right);
}
/*
 * Read a whole wide-character text file into a newly allocated,
 * NUL-terminated buffer and count how often each character in the range
 * 19968..40869 occurs (global wfreqs).
 *
 * The file is opened in binary mode and read with fgetwc; per the original
 * author's note it must have been saved as "Unicode" text (e.g. from Word).
 *
 * text      out: set to the first character of the loaded text. A leading
 *           byte-order mark (code 65279) is skipped, so in that case `text`
 *           points one element past the start of the allocation and must not
 *           be handed to free() as is. Left untouched on failure.
 * filename  path of the file to read.
 *
 * Returns the number of characters available through `text` (the byte-order
 * mark excluded), or 0 when the file cannot be opened or memory cannot be
 * allocated.
 *
 * Fixes over the previous version:
 *  - the allocation check tested `text` instead of the new buffer;
 *  - the buffer had no room for the terminating NUL;
 *  - the read loop was not bounded by the buffer size and stored the fgetwc
 *    result in a wchar_t before comparing it with WEOF;
 *  - the start of the text was computed from the file size rather than from
 *    what was actually read;
 *  - the failure path called fcloseall(), closing every stream of the
 *    process instead of only the file opened here.
 */
int LoadUnicText(wchar_t * &text, const char*filename)
{
    enum { HANZI_FIRST = 19968, HANZI_LAST = 40869, BYTE_ORDER_MARK = 65279 };
    FILE *in;
    long byte_len;
    long capacity;
    long count = 0;
    long skip;
    wchar_t *buf;
    wint_t ch = 0;

    if (filename == NULL || (in = fopen(filename, "rb")) == NULL) {
        printf("Can't open file!");
        return 0;
    }

    byte_len = _filelength(_fileno(in));
    if (byte_len < 0) {                 /* size query failed */
        printf("Can't open file!");
        fclose(in);
        return 0;
    }
    capacity = byte_len / (long)sizeof(wchar_t);
    rewind(in);

    /* +1 leaves room for the terminator even when the file fills the buffer. */
    buf = (wchar_t*)calloc((size_t)capacity + 1, sizeof(wchar_t));
    if (buf == NULL) {
        printf("内存分配失败!\n");
        fclose(in);
        return 0;
    }

    while (count < capacity && (ch = fgetwc(in)) != WEOF) {
        buf[count++] = (wchar_t)ch;
        if (ch >= HANZI_FIRST && ch <= HANZI_LAST)   /* character frequency statistics */
            wfreqs[ch - HANZI_FIRST]++;
    }
    buf[count] = L'\0';
    fclose(in);

    /* Files saved as "Unicode" start with a byte-order mark; hide it from the caller. */
    skip = (count > 0 && buf[0] == BYTE_ORDER_MARK) ? 1 : 0;
    text = buf + skip;
    return (int)(count - skip);
}
/*
 * Build a sorted suffix index over TEXT.
 *
 * Allocates an array of char_num+1 pointers. Entry k initially points at
 * TEXT+k; the first char_num entries are then sorted with wcs_cmp, so each
 * entry is the start of a suffix of TEXT and the suffixes appear in string
 * order. The extra last slot holds the sentinel string "END!" and is not
 * part of the sort.
 *
 * TEXT      NUL-terminated text; the entries point into it, so it must stay
 *           alive as long as the index is used.
 * char_num  number of characters of TEXT to index.
 *
 * Returns the index (release it with free(); the sentinel must not be freed
 * or written to), or NULL after printing a message when TEXT is NULL/empty,
 * char_num is negative, or memory cannot be allocated.
 *
 * Changes over the previous version: a negative char_num is rejected before
 * allocating (it used to be caught only by an MSVC-specific _msize check that
 * leaked the block), the allocation size is computed in size_t so char_num+1
 * cannot overflow int, and the sentinel literal is cast explicitly.
 */
wchar_t **GetUnicStringArray(wchar_t *TEXT,int char_num)
{
    wchar_t **index;
    int k;

    if (TEXT == NULL || *TEXT == L'\0') {
        printf("GetUnicStringArray_源文本为空!\n");
        return NULL;
    }
    if (char_num < 0) {
        /* same diagnostic the old size check produced for this input */
        printf("row内存分配失败!\n");
        return NULL;
    }

    index = (wchar_t**)calloc((size_t)char_num + 1, sizeof *index);
    if (index == NULL) {
        printf("row内存分配失败!\n");
        return NULL;
    }

    for (k = 0; k < char_num; k++)
        index[k] = TEXT + k;
    index[char_num] = (wchar_t*)L"END!";   /* end marker, read-only */

    qsort(index, (size_t)char_num, sizeof *index, wcs_cmp);
    return index;
}
/*
 * Build a sorted suffix index over TEXT, without a trailing sentinel slot.
 *
 * Allocates char_num pointers, points entry k at TEXT+k and sorts the
 * entries with wcs_cmp.
 *
 * NOTE(review): the original header comment said that space, tab, newline
 * and carriage return should be left out of the index; no such filtering is
 * performed here.
 *
 * TEXT      NUL-terminated text; the entries point into it.
 * char_num  number of characters of TEXT to index.
 *
 * Returns the index, or NULL after printing a message when TEXT is NULL or
 * empty or the allocation check fails.
 */
wchar_t **GetUnicStringArray2(wchar_t *TEXT,int char_num)
{
    wchar_t **table = NULL;
    wchar_t *cursor = TEXT;
    int filled = 0;

    if (!TEXT || L'\0' == *TEXT) {
        printf("GetUnicStringArray_源文本为空!\n");
        return NULL;
    }

    table = (wchar_t**)calloc(char_num, sizeof(wchar_t*));
    if (table == NULL) {
        printf("row内存分配失败!\n");
        return NULL;
    }
    /* allocation must be able to hold char_num pointers */
    if (_msize(table) / sizeof(wchar_t*) < (unsigned)char_num) {
        printf("row内存分配失败!\n");
        return NULL;
    }

    while (filled < char_num) {
        table[filled] = cursor;
        cursor++;
        filled++;
    }

    qsort(table, char_num, sizeof(wchar_t*), wcs_cmp);
    return table;
}
/*
 * Find every occurrence of `string` through a sorted suffix index and write
 * each hit, with surrounding context, to `output` as "No.<k>::[<context>]".
 *
 * string      search key; must be non-NULL and non-empty.
 * array       sorted suffix index over the text (entries point into the
 *             text, in wcs_cmp order).
 * array_num   number of entries in `array`. It is also used as the length of
 *             the indexed text: the offset of a hit is computed as
 *             array_num - wcslen(entry).
 * ContextLen  maximum number of characters shown before and after the key;
 *             negative values are treated as 0.
 * output      stream the hits are written to.
 *
 * Returns the number of hits, or 0 when nothing was found or an argument is
 * missing (a message is printed for a missing key and for "not found").
 *
 * Matching suffixes are adjacent in the sorted index, so the scan stops at
 * the first non-matching entry after a hit.
 *
 * Changes over the previous version:
 *  - the early-exit test no longer reads array[i+1], which was out of bounds
 *    for an index without a sentinel slot;
 *  - the stream is flushed instead of calling fcloseall(): that call closed
 *    every open stream of the process, including the caller's `output`, and
 *    only on the success path. NOTE(review): callers that relied on the file
 *    being closed here must now close it themselves;
 *  - NULL `output`, negative ContextLen and a negative head length are
 *    guarded; the key length is computed once.
 */
int RetrievalUnicString(const wchar_t *string, wchar_t**array,
int array_num,int ContextLen,FILE *output)
{
    size_t keylen;
    int eg_num = 0;
    int i;

    if (string == NULL || *string == L'\0') {
        printf("没有输入要查找的字符串!\n");
        return 0;
    }
    if (array == NULL || output == NULL)
        return 0;
    if (ContextLen < 0)
        ContextLen = 0;

    keylen = wcslen(string);

    for (i = 0; i < array_num; i++) {
        const wchar_t *suffix = array[i];
        const wchar_t *start;
        size_t suffix_len;
        int headlen;
        int rearlen;
        int total;
        int j;

        if (wcsncmp(suffix, string, keylen) != 0) {
            if (eg_num > 0)
                break;          /* left the block of adjacent matches */
            continue;
        }
        eg_num++;

        suffix_len = wcslen(suffix);

        /* characters available before the hit, capped by ContextLen */
        headlen = array_num - (int)suffix_len;
        if (headlen < 0)
            headlen = 0;
        if (headlen > ContextLen)
            headlen = ContextLen;

        /* characters available after the key, capped by ContextLen */
        rearlen = (int)(suffix_len - keylen);
        if (rearlen > ContextLen)
            rearlen = ContextLen;

        start = suffix - headlen;
        total = headlen + (int)keylen + rearlen;

        fprintf(output,"No.%d::[",eg_num);
        for (j = 0; j < total; j++)
            fprintf(output,"%lc",(wint_t)start[j]);
        fprintf(output,"]\n");
    }

    if (eg_num == 0) {
        printf("文本没有找到相应的字符(串)!");
        return 0;
    }
    fflush(output);
    return eg_num;
}
/* True when the character `offset` positions after p lies inside the string
   [p, end) and is a space. */
static int cutoff_space_at(const wchar_t *p, const wchar_t *end, size_t offset)
{
    return (size_t)(end - p) > offset && p[offset] == L' ';
}

/* Advance p by `count` characters without moving past `end`. */
static const wchar_t *cutoff_advance(const wchar_t *p, const wchar_t *end, size_t count)
{
    return ((size_t)(end - p) < count) ? end : p + count;
}

/*
 * Copy `source` to `output` with part-of-speech tags and opening brackets
 * removed.
 *
 * A tag starts at '/' and is recognised by where the next space is; the tag
 * and the two characters after its end are skipped. The checks run in this
 * order, each on the current position:
 *   space at +5  -> skip 7   ("/n]ns" style)
 *   space at +6  -> skip 8   ("/nt]ns" style)
 *   space at +2  -> skip 4   ("/n" style)
 *   space at +3  -> skip 5   ("/ns" style)
 * After that a '[' is skipped (this has to come last), and the character at
 * the resulting position is written.
 *
 * source  NUL-terminated tagged text.
 * output  stream receiving the cleaned text.
 * Always returns NULL.
 *
 * Changes over the previous version: every look-ahead and skip is limited to
 * the string, where the old code read and jumped past the terminator when a
 * tag was near the end of the text; the terminator itself is no longer
 * written; a leftover debug printf("df") was removed; a NULL `output` is
 * rejected.
 */
wchar_t *cutoff(const wchar_t *source,FILE *output)
{
    const wchar_t *p = source;
    const wchar_t *end;

    if (source == NULL || output == NULL)
        return NULL;
    end = source + wcslen(source);

    while (p < end) {
        if (*p == L'/' && cutoff_space_at(p, end, 5)) p = cutoff_advance(p, end, 7);
        if (*p == L'/' && cutoff_space_at(p, end, 6)) p = cutoff_advance(p, end, 8);
        if (*p == L'/' && cutoff_space_at(p, end, 2)) p = cutoff_advance(p, end, 4);
        if (*p == L'/' && cutoff_space_at(p, end, 3)) p = cutoff_advance(p, end, 5);
        if (*p == L'[') p++;
        if (p >= end)
            break;              /* the skips consumed the rest of the text */
        fprintf(output,"%lc",(wint_t)*p);
        p++;
    }
    return NULL;
}
/*
 * Write every single-character word of a tagged text to `output`, one per
 * line, and return how many were written.
 *
 * A single-character word is recognised as the pattern
 *     (space or '[')  <one character>  '/'
 * After a hit the scan continues behind the '/'.
 *
 * The original author's note describes this as an attempt at extracting a
 * word list straight from the text and marks it as failed; only the
 * single-character case is handled.
 *
 * source  NUL-terminated tagged text; NULL yields 0.
 * output  stream receiving the characters.
 */
int Dicbulid(const wchar_t *source,FILE *output)
{
    const wchar_t *cur = source;
    int found = 0;

    if (cur == NULL)
        return 0;

    while (*cur != L'\0') {
        int opens_token = (*cur == L' ') || (*cur == L'[');

        /* cur[2] is only inspected once cur[1] is known not to be the terminator */
        if (!opens_token || cur[1] == L'\0' || cur[2] != L'/') {
            cur++;
            continue;
        }

        fprintf(output,"%lc\n",cur[1]);
        cur += 3;
        found++;                /* count the entries */
    }
    return found;
}
/*
 * Reduce retrieval output to a plain list of entries: newlines and every
 * character at or above U+4E00 (the lowest CJK ideograph, used as the
 * threshold) are copied to `output`; all other characters are dropped.
 *
 * source  NUL-terminated text to filter.
 * output  stream receiving the filtered text.
 * Always returns NULL.
 *
 * The previous version advanced its cursor only on characters >= U+4E00, so
 * the first newline, digit or Latin character made it loop forever (writing
 * newlines endlessly), and it never wrote the ideographs at all.
 * NOTE(review): copying the ideographs follows the original author's note
 * ("extract a file of pure entries"); no upper bound is applied, so anything
 * above U+4E00 (e.g. full-width punctuation) is kept as well - confirm.
 */
wchar_t *cleartag(const wchar_t *source,FILE *output)
{
    enum { FIRST_HANZI = 0x4E00 };
    const wchar_t *p;

    if (source == NULL || output == NULL)
        return NULL;

    for (p = source; *p != L'\0'; p++) {
        if (*p == L'\n' || *p >= FIRST_HANZI)
            fprintf(output,"%lc",(wint_t)*p);
    }
    return NULL;
}
int creatdic(const wchar_t *source,FILE *output)//从纯词条的文件建立一个带词频排序的词表
{
wchar_t *p=(wchar_t *)source;
wchar_t *q=(wchar_t *)source;
int Wordcount=0,i=0;
if(!p) return NULL;
while(*q){//计算词条数
if(*q==L'\n') Wordcount++;
q++;
}
printf("词条数[%d]",Wordcount);
items=(WordItem*)calloc(Wordcount,sizeof(WordItem));
if(items==NULL) return 0;
while(*p){
int len=0,j=0,tag=0;
wchar_t *words=p;
q=p;
while(*(++q)!=L'\n');
len=q-p;p=q+1;
words[len]=L'\0';
for(j=0;j<i;j++){
if (wcsncmp(words,items[j].word,len)==0) {
items[j].freq++;
tag=1;//find相同
}
}
if(tag==1) continue;
else{
items[i].word=_wcsdup(words);
items[i].freq=1;
i++;
}
}
// qsort(items,i,sizeof(WordItem),wcs_cmp);//按照音序排
qsort(items,i,sizeof(WordItem),freq_cmp);//按词频序排
// printf("%ls\t%d\n",items[i].word,items[i].freq);
// fwrite(items[i].word,sizeof(wchar_t),len+1,output);
// fwrite(&items[i].freq,sizeof(unsigned),1,output);
for(i=0;(i<Wordcount) && (items[i].word);i++)
fprintf(output,"%ls%d\n",items[i].word,items[i].freq);
// fprintf(output,"%lc",19968);
//
return Wordcount;
}
int RetrievalUnic2String(const wchar_t *string1, const wchar_t *string2,
wchar_t**array,int array_num,
int intelen,//设定距离
int ContextLen,FILE *output)
{
int eg_num=0,i=0,hlen=0,rlen=0;
wchar_t **pp=(wchar_t**)array;
wchar_t *string=(wchar_t*)string1;
if(string==NULL || *string==L'\0' || string2==NULL || *string2==L'\0') {
printf("没有输入要查找的字符串!\n");
return 0;
}
if(pp==NULL) return 0;
for(i=0;i<array_num;i++){
if(wcsncmp(pp[i],string,wcslen(string))==0){
int headlen=0,rearlen=0,j=0;
wchar_t* tmp;
eg_num++;
headlen=array_num-wcslen(pp[i]);//count