#pragma warning(disable: 4786)
#include <iostream>
#include <fstream>
#include <cstdlib>
#include <vector>
#include <string>
#include <map>
using namespace std;
// Maps each dictionary word to the integer index assigned to it in read_dict().
map <string,int>word_index;
// Maps each part-of-speech (POS) tag to the integer index assigned to it in read_dict().
map <string,int>pos_index;
vector <string> dict_word ; // all words read from the dictionary
vector <string> test_word ; // words of the test sentence (filled by tag_test())
vector <string> test_pos ; // POS tags read alongside those test words (filled by tag_test())
vector <string> dict_pos ; // all POS tags read from the dictionary
// Sizes of word_index and pos_index, recorded at the end of read_dict().
int dict_word_size,dict_pos_size;
// Smoothing constant read by pinghua().
// NOTE(review): the macro body ends with ';', so it only expands cleanly when
// it is the last thing in a statement.
#define ping_hua 0.1;
// Plain aggregate pairing an integer with a float.
// NOTE(review): not referenced in the visible part of this file; the field
// meanings below are inferred from their names only -- confirm with the users.
struct pos_node
{
    int   pos_num;  // presumably a part-of-speech index
    float pos_pro;  // presumably the probability attached to that part of speech
};
// Loads "dict.txt" and fills the global dictionary tables.
//
// Every line is read as "<word>\t<tags>", where <tags> is a list of
// part-of-speech (POS) tags separated by spaces or tabs.  For each line:
//   * the word is appended to dict_word and given the next free index in
//     word_index (a word that is already indexed keeps its first index, so
//     indexes always stay in the range [0, word_index.size()));
//   * every tag not seen before is appended to dict_pos and given the next
//     free index in pos_index.
// A tag is the first one or two characters of a whitespace-delimited token,
// which matches the two-character limit the corpus readers apply with
// substr(k+1,2).
//
// On return dict_word_size / dict_pos_size hold the sizes of the two maps and
// the "<index> <tag>" table has been printed to stdout.
// Terminates the process with exit code 100 if dict.txt cannot be opened.
void read_dict()
{
    ifstream fin("dict.txt");
    if (!fin)
    {
        cerr << "error 100 opening dict.txt";
        exit(100);
    }

    const string separators(" \t");
    string line;
    while (getline(fin, line))
    {
        // Tolerate CRLF files read on platforms that do not translate them.
        if (!line.empty() && line[line.size() - 1] == '\r')
            line.erase(line.size() - 1);

        // The word is everything before the first tab (the whole line when
        // there is no tab; substr clamps npos to the end of the string).
        const string::size_type tab = line.find('\t');
        const string s_word = line.substr(0, tab);
        if (!s_word.empty() && !word_index.count(s_word))
        {
            word_index[s_word] = static_cast<int>(dict_word.size());
            dict_word.push_back(s_word);
        }

        // No tab means no tag list on this line; never reuse the tags of an
        // earlier line.
        if (tab == string::npos)
            continue;

        // Walk the tag list token by token so that each tag is consumed
        // exactly once and whitespace never yields an empty tag.
        string::size_type begin = line.find_first_not_of(separators, tab + 1);
        while (begin != string::npos)
        {
            const string::size_type end = line.find_first_of(separators, begin);
            const string::size_type stop = (end == string::npos) ? line.size() : end;
            const string::size_type len = stop - begin;
            const string tag = line.substr(begin, len < 2 ? len : 2);

            if (!pos_index.count(tag)) // first time this tag is seen
            {
                pos_index[tag] = static_cast<int>(dict_pos.size());
                dict_pos.push_back(tag);
            }
            begin = line.find_first_not_of(separators, stop);
        }
    }

    dict_word_size = static_cast<int>(word_index.size());
    dict_pos_size = static_cast<int>(pos_index.size());

    // Dump the tag table as "<index> <tag>", one tag per line.
    for (vector<string>::size_type i = 0; i < dict_pos.size(); i++)
        cout << pos_index[dict_pos[i]] << " " << dict_pos[i] << endl;
}
// Splits a corpus token of the form "word/tag" at its first '/'.
// Only the first two characters after the '/' are kept as the tag.
// Returns false, leaving the outputs untouched, when the token has no '/'.
static bool split_word_and_pos(const string &token, string &word, string &pos)
{
    const string::size_type slash = token.find('/');
    if (slash == string::npos)
        return false;
    word = token.substr(0, slash);
    pos = token.substr(slash + 1, 2);
    return true;
}

// Accumulates raw counts from "corpus.txt" into the two model tables.
//
// pos_to_pos  : row = POS index; column 0 counts how often that POS starts a
//               line (pi), column t+1 counts transitions from the row's POS to
//               POS t (A).
// pos_to_word : row = word index, column = POS index; counts how often the
//               word was tagged with that POS (B).
//
// Each line is split into "word/tag" tokens on runs of spaces.  Transitions
// are counted between consecutive tokens of a line and also from the last
// token of one line to the first token of the next non-empty line.
//
// Words and tags are looked up without inserting: a word missing from
// word_index contributes no emission count, and a tag missing from pos_index
// contributes nothing and breaks the transition chain, so index 0 is never
// charged for unknown entries and the global maps are not modified here.
// Tokens without a '/' are treated like tokens with an unknown tag.
//
// The counts are added to whatever the tables already contain.
// Terminates the process with exit code 100 if corpus.txt cannot be opened.
void count_pi_A_B(float **pos_to_pos,float **pos_to_word)
{
    ifstream fin1("corpus.txt");
    if (!fin1)
    {
        cerr << "error 100 opening corpus.txt";
        exit(100);
    }

    const string separators(" \r");
    int last_pos_of_prev_line = -1; // POS index ending the previous non-empty line, -1 if none/unknown
    string line, token, word, pos;
    while (getline(fin1, line))
    {
        int prev_pos = -1;       // POS index of the previous token on this line, -1 if none/unknown
        bool first_token = true; // becomes false once any token has been read from this line

        string::size_type begin = line.find_first_not_of(separators);
        while (begin != string::npos)
        {
            const string::size_type end = line.find_first_of(separators, begin);
            const string::size_type stop = (end == string::npos) ? line.size() : end;
            token = line.substr(begin, stop - begin);
            begin = line.find_first_not_of(separators, stop);

            int cur_pos = -1;
            if (split_word_and_pos(token, word, pos))
            {
                const map<string,int>::const_iterator p = pos_index.find(pos);
                if (p != pos_index.end())
                    cur_pos = p->second;
            }

            if (cur_pos >= 0)
            {
                // Emission count for the (word, POS) pair.
                const map<string,int>::const_iterator w = word_index.find(word);
                if (w != word_index.end())
                    pos_to_word[w->second][cur_pos]++;

                if (first_token)
                {
                    // Column 0 holds the initial-state (pi) counts.
                    pos_to_pos[cur_pos][0]++;
                    // Transition from the previous line's last POS.
                    if (last_pos_of_prev_line >= 0)
                        pos_to_pos[last_pos_of_prev_line][cur_pos + 1]++;
                }
                else if (prev_pos >= 0)
                {
                    // Transition inside the line; columns are shifted by one
                    // because column 0 is reserved for pi.
                    pos_to_pos[prev_pos][cur_pos + 1]++;
                }
            }

            prev_pos = cur_pos;
            first_token = false;
        }

        // Lines without any token leave the remembered POS untouched.
        if (!first_token)
            last_pos_of_prev_line = prev_pos;
    }
}
// Converts the raw counts produced by count_pi_A_B() into smoothed scores,
// in place.
//
// pos_to_pos[i][0]   (pi): ph is added to every count, then the column is
//                          divided by its total.
// pos_to_pos[i][1..] (A) : each row becomes (1-ph)*count/row_total + ph.
//                          A row whose total is 0 (a POS that never precedes
//                          another one) gets ph in every cell instead of the
//                          NaN that 0/0 would produce.
// pos_to_word[w][t]  (B) : each row becomes (count+1)/(row_total + size),
//                          where size is the number of POS tags with a
//                          non-zero count for the word.  A word with no counts
//                          at all gets the uniform value 1/dict_pos_size
//                          instead of the infinity that 1/0 would produce.
//
// ph is the ping_hua constant; the table dimensions come from dict_pos_size
// and dict_word_size.
void pinghua(float **pos_to_pos,float **pos_to_word)
{
    int i, j;
    float ph;
    // The ping_hua macro body carries its own ';', so it has to be the last
    // thing in the statement (a C-style cast keeps that expansion valid).
    ph = (float)ping_hua;

    // --- pi: additive smoothing followed by normalisation ---
    float all_pi = 0;
    for (i = 0; i < dict_pos_size; i++)
    {
        pos_to_pos[i][0] = pos_to_pos[i][0] + ph;
        all_pi = all_pi + pos_to_pos[i][0];
    }
    for (i = 0; i < dict_pos_size; i++)
        pos_to_pos[i][0] = pos_to_pos[i][0] / all_pi;

    // --- A: relative frequency scaled by (1-ph), plus ph ---
    for (i = 0; i < dict_pos_size; i++)
    {
        float all_a = 0;
        for (j = 1; j <= dict_pos_size; j++)
            all_a = pos_to_pos[i][j] + all_a;

        for (j = 1; j <= dict_pos_size; j++)
        {
            const float scaled = (all_a > 0) ? ((1 - ph) * pos_to_pos[i][j] / all_a) : 0.0f;
            pos_to_pos[i][j] = scaled + ph;
        }
    }

    // --- B: add-one smoothing per word ---
    for (j = 0; j < dict_word_size; j++)
    {
        float size = 0;  // number of POS tags observed with this word
        float all_b = 0; // total occurrences of this word
        for (i = 0; i < dict_pos_size; i++)
        {
            all_b = pos_to_word[j][i] + all_b;
            if (pos_to_word[j][i] != 0)
                size++;
        }

        const float denom = all_b + size;
        for (i = 0; i < dict_pos_size; i++)
        {
            if (denom > 0)
                pos_to_word[j][i] = (float)(pos_to_word[j][i] + 1) / denom;
            else
                pos_to_word[j][i] = 1.0f / (float)dict_pos_size; // word never seen in the corpus
        }
    }
}
// Viterbi maximisation step.
//
// Over every state k (one per entry of dict_pos) computes
//     i_j_pro[i][k] * pos_to_pos[k][j+1]
// i.e. the score of being in state k at step i times the transition score
// from k to state j (column j+1, because column 0 of pos_to_pos holds pi).
//
// Returns the largest product found, or 0 when no product is greater than 0.
// rec_k receives the index k that produced the returned value; it is reset to
// 0 on entry so that it never carries over a value from an earlier call when
// no product is positive.
float max(float **i_j_pro,float **pos_to_pos,int i,int j,int &rec_k)
{
    float best = 0;
    rec_k = 0;

    const int state_count = static_cast<int>(dict_pos.size());
    for (int k = 0; k < state_count; k++)
    {
        const float candidate = i_j_pro[i][k] * pos_to_pos[k][j + 1];
        if (candidate > best)
        {
            best = candidate;
            rec_k = k;
        }
    }
    return best;
}
// Runs the Viterbi algorithm over the sentence stored in test_word and writes
// the best POS tag for every word into new_state.
//
// pos_to_pos  : column 0 = initial-state scores (pi), column t+1 = transition
//               scores into POS t.
// pos_to_word : [word index][POS index] emission scores.
// i_j_pro     : work table; i_j_pro[i][j] receives the best score of any path
//               that ends in POS j at word i.
// most_state  : work table; most_state[i][j] receives the POS tag preceding
//               POS j on that best path (row 0 is not written).
// new_state   : output; new_state[i] is the chosen tag for test_word[i].
//
// A word that is not in word_index is treated as equally likely under every
// POS, so no emission factor is applied for it.
// Does nothing when the sentence or the POS table is empty.
void viterbi(float **pos_to_pos,float **pos_to_word,float **i_j_pro,string **most_state,string *new_state)
{
    const int word_count = static_cast<int>(test_word.size());
    if (word_count == 0 || dict_pos_size <= 0 || dict_pos.empty())
        return;

    int i, j, k, rec_k = 0;
    float mmax;

    // --- Initialisation: pi times the emission score of the first word ---
    map<string,int>::const_iterator entry = word_index.find(test_word[0]);
    for (j = 0; j < dict_pos_size; j++)
    {
        if (entry != word_index.end())
            i_j_pro[0][j] = pos_to_pos[j][0] * pos_to_word[entry->second][j];
        else
            i_j_pro[0][j] = pos_to_pos[j][0];
    }

    // --- Recursion: best predecessor for every (word, POS) pair ---
    for (i = 1; i < word_count; i++)
    {
        entry = word_index.find(test_word[i]);
        for (j = 0; j < dict_pos_size; j++)
        {
            mmax = max(i_j_pro, pos_to_pos, i - 1, j, rec_k);
            if (entry != word_index.end())
                i_j_pro[i][j] = mmax * pos_to_word[entry->second][j];
            else
                i_j_pro[i][j] = mmax;
            most_state[i][j] = dict_pos[rec_k]; // back-pointer, stored as a tag
        }
    }

    // --- Termination: best final POS ---
    // rec_k must start at 0 here; otherwise it would still hold the value left
    // by the last max() call whenever POS 0 turns out to be the best one.
    const int last = word_count - 1;
    rec_k = 0;
    mmax = i_j_pro[last][0];
    for (k = 1; k < dict_pos_size; k++)
    {
        if (i_j_pro[last][k] > mmax)
        {
            mmax = i_j_pro[last][k];
            rec_k = k;
        }
    }
    new_state[last] = dict_pos[rec_k];

    // --- Backtracking: follow the stored predecessors to the first word ---
    for (k = last; k >= 1; k--)
    {
        new_state[k - 1] = most_state[k][rec_k];
        rec_k = pos_index[new_state[k - 1]];
    }
}
void tag_test(float **pos_to_pos,float **pos_to_word)
{
string s ,s2,one_sentence,fir_str,s_fir_pos,s_sce_pos,s_word,sce_str;
int end_null,fir_null,sce_null,k,i;
ifstream fin1("test.txt");
if(!fin1)
{
cerr<<"error 100 opening dict.txt";
exit(100);
}
ofstream fout("result.txt");
if(!fout)
{
cerr<<"error 100 opening result.txt";
exit(100);
}
int line=1;
float all_cor=0,all=0;
while(getline(fin1,s))
{
if(s.length()!=0) //tiqu yige juzi de ci he cixing ,bing cunru dongtai shuzu
{
end_null=s.rfind(" ");
fir_null=s.find(" ");
fir_str=s.substr(0,fir_null);
k=fir_str.find_first_of('/');
s_word=fir_str.substr(0,k); //提取一个词
s_fir_pos=fir_str.substr(k+1,2); //提取出词的词性
test_pos.push_back(s_fir_pos); //存储词性
test_word.push_back(s_word); //存储词
while(fir