#include <cstdio>
#include "dict.h"
using namespace std;
namespace rmmseg
{
// One node in a bin's singly linked chain of the dictionary hash table.
struct Entry
{
// The word held by this node.
Word *word;
// Next node in the same bin; NULL marks the end of the chain.
Entry *next;
};
// Initial number of bins: 2^18 + 3.
const size_t init_size = 262147;
// Threshold compared against n_entries / n_bins (integer division) when
// deciding whether the table should grow.
const size_t max_density = 5;
/*
  Candidate bin counts for growing the table, each written as 2^n + a,
  19 <= n <= 30, in strictly ascending order. Every value is larger than
  init_size, so the first growth step always finds a bigger size. The
  table is only ever read, hence const.
*/
static const size_t primes[] = {
    524288 + 21,
    1048576 + 7,
    2097152 + 17,
    4194304 + 15,
    8388608 + 9,
    16777216 + 43,
    33554432 + 35,
    67108864 + 15,
    134217728 + 29,
    268435456 + 3,
    536870912 + 11,
    1073741824 + 85,
};
// Current number of bins in the table; starts at init_size and is replaced
// by rehash() when the table grows.
static size_t n_bins = init_size;
// Number of entries stored; dict::add() increments it for each new key
// (overwriting an existing key does not change it).
static size_t n_entries = 0;
// The bin array: each slot is the head of a chain of Entry nodes, or NULL
// when the bin is empty (calloc zero-fills the array).
// NOTE(review): the calloc result is not checked here; a failed allocation
// would leave bins NULL.
static Entry **bins = static_cast<Entry **>(std::calloc(init_size,
sizeof(Entry *)));
static size_t new_size()
{
for (size_t i = 0;
i < sizeof(primes)/sizeof(primes[0]);
++i)
{
if (primes[i] > n_bins)
{
return primes[i];
}
}
// TODO: raise exception here
return n_bins;
}
// Hash the first len bytes of str (which need not be nul-terminated).
// Each byte is added to the accumulator and then mixed with a shift-add and
// a shift-xor; three further mixing steps are applied once at the end.
// An empty input (len == 0) hashes to 0.
static unsigned int hash(const char *str, int len)
{
    unsigned int acc = 0;
    for (const char *p = str; len != 0; --len, ++p)
    {
        acc += *p;
        acc += acc << 10;
        acc ^= acc >> 6;
    }
    // Final avalanche.
    acc += acc << 3;
    acc ^= acc >> 11;
    acc += acc << 15;
    return acc;
}
// Grow the table to the size chosen by new_size() and move every existing
// Entry node into its bin in the new array. Nodes are relinked in place;
// nothing is allocated or freed per entry.
//
// The table is left untouched when it cannot grow: either new_size() has no
// larger size to offer, or the new bin array cannot be allocated. In both
// cases the existing table stays valid, just denser.
static void rehash()
{
    const size_t new_n_bins = new_size();
    if (new_n_bins <= n_bins)
    {
        return; // no larger size available; rebuilding would gain nothing
    }
    Entry **new_bins = static_cast<Entry **>(calloc(new_n_bins,
                                                    sizeof(Entry *)));
    if (!new_bins)
    {
        return; // out of memory: keep using the current bins
    }
    for (size_t i = 0; i < n_bins; ++i)
    {
        Entry *entry = bins[i];
        while (entry)
        {
            // Save the successor first: entry->next is overwritten below.
            Entry *next = entry->next;
            const size_t slot = hash(entry->word->text,
                                     entry->word->nbytes) % new_n_bins;
            // Push the node onto the front of its new chain.
            entry->next = new_bins[slot];
            new_bins[slot] = entry;
            entry = next;
        }
    }
    free(bins);
    n_bins = new_n_bins;
    bins = new_bins;
}
namespace dict
{
/**
 * Look up a word by its bytes.
 *
 * str: start of the key; it may point into a larger chunk of text and
 *      therefore need not be nul-terminated
 * len: length of the key in bytes
 *
 * Returns the stored Word whose nbytes equals len and whose text matches
 * the first len bytes of str, or NULL when there is no such word.
 */
Word *get(const char *str, int len)
{
    const unsigned int slot = hash(str, len) % n_bins;
    // Walk the chain of the selected bin; an empty bin skips the loop.
    for (Entry *node = bins[slot]; node != NULL; node = node->next)
    {
        Word *candidate = node->word;
        if (len == candidate->nbytes &&
            strncmp(str, candidate->text, len) == 0)
        {
            return candidate;
        }
    }
    return NULL;
}
/**
 * Insert word into the dictionary, keyed by the first word->nbytes bytes
 * of word->text.
 *
 * If an entry with the same key already exists, its Word pointer is
 * replaced by word. Otherwise a new Entry is obtained from pool_alloc and
 * pushed onto the front of the key's bin, after growing the table first
 * when n_entries / n_bins exceeds max_density.
 */
void add(Word *word)
{
    const unsigned int hash_val = hash(word->text, word->nbytes);
    unsigned int h = hash_val % n_bins;

    for (Entry *entry = bins[h]; entry; entry = entry->next)
    {
        if (word->nbytes == entry->word->nbytes &&
            strncmp(word->text, entry->word->text, word->nbytes) == 0)
        {
            /* Overwriting. WARNING: the previous Word object is no
             * longer referenced by the table after this assignment and
             * is not released here. Tuning the dictionary file so that
             * it contains no duplicate keys avoids the problem.
             */
            entry->word = word;
            return;
        }
    }

    // New key. Check the density on every insertion, not only when the
    // bin happens to be empty.
    if (n_entries / n_bins > max_density)
    {
        rehash();
        h = hash_val % n_bins; // the bin index depends on n_bins
    }

    Entry *entry = static_cast<Entry *>(pool_alloc(sizeof(Entry)));
    entry->word = word;
    // Always link in front of whatever the bin already holds: after a
    // rehash the recomputed bin may be non-empty, and overwriting its head
    // with a node whose next is NULL would drop the existing chain.
    entry->next = bins[h];
    bins[h] = entry;
    n_entries++;
}
/**
 * Load a character dictionary file into the table.
 *
 * Each line is expected to look like "<number> <text>": everything before
 * the first space is parsed with atoi, everything after it is the text.
 * Lines without a space are skipped. Each accepted line is added via
 * add(make_word(text, 1, number)).
 * NOTE(review): presumably the 1 is the character length and the number a
 * frequency -- confirm against make_word's declaration.
 *
 * Returns false if the file cannot be opened, true otherwise.
 *
 * Known limitation: a line longer than the buffer is delivered by fgets in
 * several pieces, each of which is treated as a separate line.
 */
bool load_chars(const char *filename)
{
    FILE *fp = fopen(filename, "r");
    if (!fp)
    {
        return false;
    }
    const size_t buf_len = 24;
    char buf[buf_len];
    while (fgets(buf, buf_len, fp))
    {
        // Strip the line terminator only when one is actually present, so
        // that a final line without a newline keeps its last character and
        // an empty string never causes a write before the buffer.
        size_t n = strlen(buf);
        while (n > 0 && (buf[n - 1] == '\n' || buf[n - 1] == '\r'))
        {
            buf[--n] = '\0';
        }
        char *sep = strchr(buf, ' ');
        if (!sep)
            continue; // illegal input
        *sep = '\0'; // split: buf is now the number, sep+1 the text
        add(make_word(sep + 1, 1, atoi(buf)));
    }
    fclose(fp);
    return true;
}
/**
 * Load a word dictionary file into the table.
 *
 * Each line is expected to look like "<number> <text>": everything before
 * the first space is parsed with atoi, everything after it is the text.
 * Lines without a space are skipped. Each accepted line is added via
 * add(make_word(text, number, 0)).
 * NOTE(review): presumably the number is the word length in characters
 * and the 0 a frequency -- confirm against make_word's declaration.
 *
 * Returns false if the file cannot be opened, true otherwise.
 *
 * Known limitation: a line longer than the buffer is delivered by fgets in
 * several pieces, each of which is treated as a separate line.
 */
bool load_words(const char *filename)
{
    FILE *fp = fopen(filename, "r");
    if (!fp)
    {
        return false;
    }
    const int buf_len = 48;
    char buf[buf_len];
    while (fgets(buf, buf_len, fp))
    {
        // Strip the line terminator only when one is actually present, so
        // that a final line without a newline keeps its last character and
        // an empty string never causes a write before the buffer.
        size_t n = strlen(buf);
        while (n > 0 && (buf[n - 1] == '\n' || buf[n - 1] == '\r'))
        {
            buf[--n] = '\0';
        }
        char *sep = strchr(buf, ' ');
        if (!sep)
            continue; // illegal input
        *sep = '\0'; // split: buf is now the number, sep+1 the text
        add(make_word(sep + 1, atoi(buf), 0));
    }
    fclose(fp);
    return true;
}
}
}
没有合适的资源?快使用搜索试试~ 我知道了~
64位编译完成的pymmseg
共26个文件
h:7个
py:4个
cpp:4个
4星 · 超过85%的资源 需积分: 10 22 下载量 105 浏览量
2015-07-29
17:35:28
上传
评论 1
收藏 1008KB ZIP 举报
温馨提示
libsvm 的依赖库,Win 64 位下已经编译完成的库,可以直接拷到 site-packages 里面。
资源推荐
资源详情
资源评论
收起资源包目录
pymmseg-cpp.zip (26个子文件)
pymmseg-cpp
mmseg-cpp
token.h 413B
mmseg.exp 1KB
dict.cpp 6KB
memory.obj 15KB
word.h 1KB
algor.obj 403KB
dict.obj 33KB
memory.h 1KB
mmseg.dll 130KB
dict.h 913B
memory.cpp 200B
rules.h 2KB
algor.cpp 6KB
algor.h 2KB
chunk.h 1KB
mmseg.cpp 2KB
mmseg.obj 253KB
build.py 2KB
mmseg.lib 3KB
mmseg.py 4KB
release.py 1KB
__init__.py 0B
data
chars.dic 78KB
words.dic 1.25MB
README 3KB
bin
pymmseg 837B
共 26 条
- 1
资源评论
- kou998 2019-12-24 东西是正的。。但是没用上。。
ShomyLiu
- 粉丝: 69
- 资源: 2
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功