NLP中短语抽取模块的实现资源-CSDN文库

共11个文件

h：8个

cpp：3个

短语抽取

5星 · 超过95%的资源需积分: 32 38 浏览量 2011-05-02 14:27:58 上传评论 8 收藏 17KB RAR 举报

资源推荐

资源详情

资源评论

收起资源包目录

短语抽取.rar （11个子文件）

comp_prob

com_prob.cpp 33KB

definev.h 547B

pharaoh

extract_phrase.cpp 2KB

extractphrase.h 6KB

phrase.h 823B

definev.h 438B

dict

proba_phrase.h 4KB

extractphrase.h 4KB

dictionary.cpp 19KB

phrase.h 803B

definev.h 590B

//****************** 本程序用来对生成的短语计算四个概率p(e|c),p(c|e),lex(e|c),lex(c|e).******************* //输入文件1:短语抽取结果,格式为:中文短语 ||| 英文短语 ||| 短语对的对齐信息 ||| 该短语对所在的句子号 //输入文件2:中英词典,格式为:中文词英文词 p(英文词|中文词) //输入文件3:英中词典,格式为:英文词中文词 p(中文词|英文词) //输出文件:4个概率,格式为:中文短语 ||| 英文短语 ||| p(c|e) lex(c|e) p(e|c) lex(e|c) //头文件 #define _LARGEFILE64_SOURCE #define _FILE_OFFSET_BITS 64 #include <string> #include <fstream> #include <sstream> #include <iostream> #include <functional> #include <math.h> #include <algorithm> #include <vector> #include "definev.h" using namespace std; //一些定义 typedef struct phrase_sen { char *phrase; double freq; struct phrase_sen *next; }PHRASE_SEN; typedef struct phrase_chen { char *cphrase; char *ephrase; char *align; double freq; struct phrase_chen *next; }PHRASE_CHEN; typedef struct my_dict { char *wordpair; double proba; struct my_dict *next; }MY_DICT; enum {NHASH = 1377043, MULTIPLIER = 31}; //声明不同的hashmap PHRASE_SEN *WordTab_ch[NHASH]; // 哈希表存放词的英汉信息和位置信息 PHRASE_SEN *WordTab_en[NHASH]; PHRASE_CHEN *WordTab_chen[NHASH]; MY_DICT *Mydict_c2e[NHASH]; MY_DICT *Mydict_e2c[NHASH]; PHRASE_SEN* c1; PHRASE_SEN* c2; PHRASE_CHEN* c3; MY_DICT* c4; MY_DICT* c5; //函数声明 unsigned int hash(char *s); //hash函数 //构建hashmap函数 PHRASE_SEN* construct_hash_ch(char* s,PHRASE_CHEN *st); PHRASE_SEN* construct_hash_en(char* s,PHRASE_CHEN *st); PHRASE_CHEN* construct_hash_chen(char* s1,char* s2,char* s3); MY_DICT* construct_hash_mydictc2e(char* line); MY_DICT* construct_hash_mydicte2c(char* line); //查询函数 double lookup_ch(char *s,PHRASE_SEN *WordTab_ch[NHASH]); double lookup_en(char *s,PHRASE_SEN *WordTab_en[NHASH]); double lookup_chen(char *s1,char* s2,PHRASE_CHEN *WordTab_chen[NHASH]); double lookup_mydict_c2e(char *ch_en,MY_DICT *Mydict_c2e[NHASH]); double lookup_mydict_e2c(char *en_ch,MY_DICT *Mydict_e2c[NHASH]); void extract_chword(char* phrase,char* wordch[PHRASEN]); void extract_enword(char* phrase,char* worden[PHRASEN]); void extract_location(char* phrase,int aligninforch[EXTRACTLEN*20],int aligninforen[EXTRACTLEN*20]); double comput_lexe2c(char* wordch[PHRASEN],char* worden[PHRASEN],int aligninforch[EXTRACTLEN*20],int aligninforen[EXTRACTLEN*20]); double comput_lexc2e(char* wordch[PHRASEN],char* worden[PHRASEN],int aligninforch[EXTRACTLEN*20],int aligninforen[EXTRACTLEN*20]); double probalility(double x,double y){return x/y;}//求商函数 string Double2String(double a) // convert double to string { ostringstream s; s < a; return s.str(); //将double转化为string } //主函数 int main() { string filename1 = INFILE; //输入文件,格式为:中文短语|||英文短语|||短语对的对齐信息|||该短语对所在的句子号 FILE* iFile1; if((iFile1 = fopen(INFILE,"rb")) == NULL) { cout < "The system could not open the phrase File!" < endl; exit(0); } char buffer[2000]; //用于存放每行 char* linedata; //用于存放每行 int line_no = 0; //用于存放行数 while(!feof(iFile1)) //每次取一行 { fgets(buffer, 2000,iFile1); line_no++; if (line_no%50000 == 0) //每5000行就说明一次 { cout<<"输入文件已处理"<<line_no<<endl; } linedata=buffer; char *str_tag = "|||"; //标志 char *tp1 = strstr(linedata, str_tag); //第一个|||处 char *tp2 = strstr(tp1+3, str_tag); //第二个|||处 char *tp3 = tp2+4; string strbuf(tp3); if (tp1 != NULL && tp2 != NULL && tp3 != NULL) //如果三个标志都找到 { size_t i_size = 0; char *s0 = NULL; //存放中文短语 i_size = tp1 - linedata - 1; s0 = (char*)calloc(sizeof(char), i_size+1); strncat(s0, linedata, i_size); //cout<<"中文短语 "<<s0<<endl; char *s1 = NULL; //存放英文短语 i_size = tp2 - tp1 - 5; s1 = (char*)calloc(sizeof(char), i_size+1); strncat(s1, tp1+4, i_size); //cout<<"英文短语 "<<s1<<"|||"<<endl; char *s2=(char*)calloc(sizeof(char), strbuf.size()-1 ); for( unsigned int i_buf=0; i_buf<strbuf.size()-1; i_buf++) { s2[i_buf] = strbuf[i_buf]; } //cout<<"对齐信息 "<<s2<<"|||"<<endl; c3=construct_hash_chen(s0,s1,s2); //将中英短语对和所在的句子号建立hashmap //cout<<"构建中英hash！"<<endl; } } fclose(iFile1); cout<<"取短语结束"<<endl; cout<<"开始构建中文hashmap！"<<endl; int ii; line_no = 0; for(ii = 0;ii NHASH;ii++) { PHRASE_CHEN* wqtemp = WordTab_chen[ii]; while( wqtemp != NULL) { line_no++; if (line_no%50000 == 0) //每5000行就说明一次 { cout<<"中文hashmap"<<line_no<<endl; } c1 = construct_hash_ch(wqtemp->cphrase,wqtemp); wqtemp = wqtemp->next; } } cout<<"构建中文hashmap结束！"<<endl; cout<<"开始构建英文hashmap！"<<endl; for( ii = 0;ii NHASH;ii++) { PHRASE_CHEN* wqtemp = WordTab_chen[ii]; while( wqtemp != NULL) { line_no++; if (line_no%50000 == 0) //每5000行就说明一次 { cout<<"英文hashmap"<<line_no<<endl; } c2 = construct_hash_en(wqtemp->ephrase,wqtemp); wqtemp = wqtemp->next; } } cout<<"构建中文hashmap结束！"<<endl; //**************************************************** //////下面开始构建字典的hashmap////////////// //**************************************************** cout<<"开始构建词典!"<<endl; string filename2 = CH_EN; //构建词典hashmap的输入文件1,中英词典,格式为:中文词英文词 p(英文词|中文词) string filename3 = EN_CH; //构建词典hashmap的输入文件2,英中词典,格式为:英文词中文词 p(中文词|英文词) ifstream iFile2; ifstream iFile3; iFile2.open(filename2.c_str()); iFile3.open(filename3.c_str()); if (!iFile2) { cout < "打不开文件"<<CH_EN< endl; iFile2.close(); } if (!iFile3) { cout < "打不开文件"<<EN_CH< endl; iFile3.close(); } char* line; //用于存放每行 line_no = 0;

评论收藏

内容反馈