/*
* $Id: maxent.cpp,v 1.28 2006/08/21 17:30:38 tsuruoka Exp $
*/
#include "maxent.h"
#include <cmath>
#include <cstdio>
using namespace std;
/*
int
ME_Model::BLMVMFunctionGradient(double *x, double *f, double *g, int n)
{
const int nf = _fb.Size();
if (_inequality_width > 0) {
assert(nf == n/2);
for (int i = 0; i < nf; i++) {
_va[i] = x[i];
_vb[i] = x[i + nf];
_vl[i] = _va[i] - _vb[i];
}
} else {
assert(nf == n);
for (int i = 0; i < n; i++) {
_vl[i] = x[i];
}
}
double score = update_model_expectation();
if (_inequality_width > 0) {
for (int i = 0; i < nf; i++) {
g[i] = -(_vee[i] - _vme[i] - _inequality_width);
g[i + nf] = -(_vme[i] - _vee[i] - _inequality_width);
}
} else {
if (_sigma == 0) {
for (int i = 0; i < n; i++) {
g[i] = -(_vee[i] - _vme[i]);
}
} else {
const double c = 1 / (_sigma * _sigma);
for (int i = 0; i < n; i++) {
g[i] = -(_vee[i] - _vme[i] - c * _vl[i]);
}
}
}
*f = -score;
return 0;
}
int
ME_Model::BLMVMLowerAndUpperBounds(double *xl,double *xu,int n)
{
if (_inequality_width > 0) {
for (int i = 0; i < n; i++){
xl[i] = 0;
xu[i] = 10000.0;
}
return 0;
}
for (int i = 0; i < n; i++){
xl[i] = -10000.0;
xu[i] = 10000.0;
}
return 0;
}
*/
// Iterative-scaling training loop with an adaptive step divisor (the log
// output calls it "AGIS").
//
// The incoming value of C is only written to the log; it is then reset to 1
// and adapted during training:
//   - if the training log-likelihood drops compared with the last accepted
//     iteration, C is incremented, the previous weights are restored and the
//     iteration is retried (it does not count towards the 200 iterations);
//   - on accepted iterations where iter is a multiple of 10, C is decremented
//     again as long as it is greater than 1.
//
// Each accepted iteration updates every weight as
//   _vl[i] += log(_vee[i] / _vme[i]) / C.
//
// Returns 0. (The original fell off the end of this int function without a
// return statement, which is undefined behaviour.)
int
ME_Model::perform_GIS(int C)
{
  cerr << "C = " << C << endl;
  C = 1;
  cerr << "performing AGIS" << endl;

  vector<double> pre_v;       // weights of the last accepted iteration
  double pre_logl = -999999;  // log-likelihood of the last accepted iteration

  for (int iter = 0; iter < 200; iter++) {
    double logl = update_model_expectation();
    fprintf(stderr, "iter = %2d C = %d f = %10.7f train_err = %7.5f", iter, C, logl, _train_error);
    if (_heldout.size() > 0) {
      double hlogl = heldout_likelihood();
      fprintf(stderr, " heldout_logl(err) = %f (%6.4f)", hlogl, _heldout_error);
    }
    cerr << endl;

    if (logl < pre_logl) {
      // The last step made things worse: damp the step, roll back the
      // weights and redo this iteration number.
      C += 1;
      _vl = pre_v;
      iter--;
      continue;
    }
    if (C > 1 && iter % 10 == 0) C--;

    pre_logl = logl;
    pre_v = _vl;
    // NOTE(review): there is no guard for _vme[i] == 0 or _vee[i] == 0 here;
    // presumably upstream feature selection rules those out -- confirm.
    for (int i = 0; i < _fb.Size(); i++) {
      double coef = _vee[i] / _vme[i];
      _vl[i] += log(coef) / C;
    }
  }
  cerr << endl;
  return 0;
}
// Entry point for LMVM-based training.
//
// The call to the external solver (BLMVMSolve) is commented out, so this
// function currently only packs the weights into a flat buffer and unpacks
// them again:
//   - with _inequality_width > 0 the buffer holds _va followed by _vb, and on
//     unpacking _vl[i] is recomputed as _va[i] - _vb[i];
//   - otherwise the buffer holds _vl and is copied straight back.
//
// The buffer is a std::vector rather than an unchecked malloc/free pair, so
// an allocation failure can no longer lead to a null-pointer dereference and
// the memory is released on every exit path.
//
// Returns 0.
int
ME_Model::perform_LMVM()
{
  cerr << "performing LMVM" << endl;

  const int nf = _fb.Size();

  if (_inequality_width > 0) {
    std::vector<double> x(2 * nf);

    // INITIAL POINT
    for (int i = 0; i < nf; i++) {
      x[i] = _va[i];
      x[i + nf] = _vb[i];
    }

    // int info = BLMVMSolve(&x[0], 2 * nf);

    for (int i = 0; i < nf; i++) {
      _va[i] = x[i];
      _vb[i] = x[i + nf];
      _vl[i] = _va[i] - _vb[i];
    }
    return 0;
  }

  std::vector<double> x(nf);

  // INITIAL POINT
  for (int i = 0; i < nf; i++) { x[i] = _vl[i]; }

  // int info = BLMVMSolve(&x[0], nf);

  for (int i = 0; i < nf; i++) { _vl[i] = x[i]; }

  return 0;
}
int
ME_Model::conditional_probability(const Sample & s,
std::vector<double> & membp) const
{
int num_classes = membp.size();
double sum = 0, maxpow = 0;
// int max_label = -1;
int max_label = 0;
double maxp = 0;
vector<double> powv(_num_classes, 0.0);
for (vector<int>::const_iterator j = s.positive_features.begin(); j != s.positive_features.end(); j++){
for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) {
powv[_fb.Feature(*k).label()] += _vl[*k];
}
}
for (vector<pair<int, double> >::const_iterator j = s.rvfeatures.begin(); j != s.rvfeatures.end(); j++) {
for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); k != _feature2mef[j->first].end(); k++) {
powv[_fb.Feature(*k).label()] += _vl[*k] * j->second;
}
}
std::vector<double>::const_iterator pmax = max_element(powv.begin(), powv.end());
double offset = max(0.0, *pmax - 700); // to avoid overflow
for (int label = 0; label < _num_classes; label++) {
double pow = powv[label] - offset;
double prod = exp(pow);
// cout << pow << " " << prod << ", ";
// if (_ref_modelp != NULL) prod *= _train_refpd[n][label];
if (_ref_modelp != NULL) prod *= s.ref_pd[label];
assert(prod != 0);
membp[label] = prod;
sum += prod;
}
for (int label = 0; label < _num_classes; label++) {
membp[label] /= sum;
if (membp[label] > membp[max_label]) max_label = label;
}
assert(max_label >= 0);
return max_label;
}
// Register the (label, feature) pairs of all training samples in _fb.
//
// cutoff : when > 0, occurrences of each (label, feature) pair are counted
//          first (over both positive_features and rvfeatures) and only pairs
//          seen MORE than `cutoff` times are registered (count <= cutoff is
//          skipped). When <= 0, every pair is registered.
//
// After filling the bag, init_feature2mef() is called.
//
// Returns the largest positive_features size seen over all training samples
// (rvfeatures are not included in that maximum).
int
ME_Model::make_feature_bag(const int cutoff)
{
  int max_num_features = 0;

  // count the occurrences of features
#ifdef USE_HASH_MAP
  typedef __gnu_cxx::hash_map<unsigned int, int> map_type;
#else
  typedef std::map<unsigned int, int> map_type;
#endif
  map_type count;
  if (cutoff > 0) {
    for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++) {
      for (std::vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++) {
        count[ME_Feature(i->label, *j).body()]++;
      }
      for (std::vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) {
        count[ME_Feature(i->label, j->first).body()]++;
      }
    }
  }

  // Second pass: put every sufficiently frequent pair into the feature bag.
  for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++) {
    max_num_features = max(max_num_features, (int)(i->positive_features.size()));
    for (std::vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++) {
      const ME_Feature feature(i->label, *j);
      if (cutoff > 0 && count[feature.body()] <= cutoff) continue;
      _fb.Put(feature);
    }
    for (std::vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) {
      const ME_Feature feature(i->label, j->first);
      if (cutoff > 0 && count[feature.body()] <= cutoff) continue;
      _fb.Put(feature);
    }
  }
  count.clear();

  init_feature2mef();

  return max_num_features;
}
double
ME_Model::heldout_likelihood()
{
double logl = 0;
int ncorrect = 0;
for (std::vector<Sample>::const_iterator i = _heldout.begin(); i != _heldout.end(); i++) {
vector<double> membp(_num_classes);
int l = classify(*i, membp);
logl += log(membp[i->label]);
if (l == i->label) ncorrect++;
}
_heldout_error = 1 - (double)ncorrect / _heldout.size();
return logl /= _heldout.size();
}
double
ME_Model::update_model_expectation()
{
double logl = 0;
int ncorrect = 0;
_vme.resize(_fb.Size());
for (int i = 0; i < _fb.Size(); i++) _vme[i] = 0;
int n = 0;
for (vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++, n++) {
vector<double> membp(_num_classes);
int max_label = conditional_probability(*i, membp);
logl += log(membp[i->label]);
// cout << membp[*i] << " " << logl << " ";
if (max_label == i->label) ncorrect++;
// model_expectation
for (vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++){
for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) {
_vme[*k] += membp[_fb.Feature(*k).label()];
}
}
for (vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) {
for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); k != _feature2mef[j->first].end(); k++) {
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
共44个文件
cpp:8个
exc:5个
dic:3个
The GENIA tagger analyzes English sentences and outputs the base forms, part-of-speech tags, chunk tags, and named entity tags. The tagger is specifically tuned for biomedical text such as MEDLINE abstracts. If you need to extract information from biomedical documents, this tagger might be a useful preprocessing tool. GENIA tagger分析英语句子,并输出单词原型、词性标记、Chunk标记,和命名实体标记。它专门针对生物医学文本,如MEDLINE摘要。如果你需要从生物医学文本中提取信息,这个tagger可能是一个有用的预处理工具。
资源推荐
资源详情
资源评论
收起资源包目录
geniatagger-3.0.1.tar.gz (44个子文件)
geniatagger-3.0.1
morph.cpp 6KB
models_named_entity
word_info 348KB
model001 18.32MB
morphdic
cousin.exc 28KB
verb.exc 81KB
adj.exc 20KB
verb.dic 158KB
WORDNETLICENSE 2KB
adj.dic 189KB
noun.dic 1.06MB
noun.exc 109KB
adv.exc 73B
common.h 529B
chunking.cpp 10KB
main.cpp 3KB
models_medline
model.bidir.4 2.36MB
model.bidir.11 2.37MB
model.bidir.9 2.26MB
model.bidir.3 2.39MB
model.bidir.6 2.55MB
model.bidir.15 2.78MB
model.bidir.5 2.34MB
model.bidir.13 2.32MB
model.bidir.14 2.6MB
model.bidir.12 2.38MB
model.bidir.1 2.28MB
model.bidir.10 2.28MB
model.bidir.7 2.67MB
model.bidir.2 2.4MB
model.bidir.0 2.43MB
model.bidir.8 2.34MB
LICENSE 2KB
namedentity.cpp 10KB
Makefile 298B
maxent.cpp 21KB
tokenize.cpp 3KB
maxent.h 10KB
models_chunking
model.bidir.4 2.6MB
model.bidir.6 1003KB
model.bidir.2 2.65MB
model.bidir.0 4.37MB
bidir.cpp 20KB
README 2KB
postag.cpp 7KB
共 44 条
- 1
资源评论
-m0nster-
- 粉丝: 0
- 资源: 1
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功