没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
//下面是我对 word2vec.c 的注释
//详细算法可以参考论文,或者看这篇博客 http://www.cnblogs.com/downtjs/p/3784440.html
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40
const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
typedef float real; // Precision of float numbers
struct vocab_word
{
long long cn;//词频
int *point;//huffman 编码对应内节点的路径
char *word, *code, codelen;//huffman 编码
};
char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
struct vocab_word *vocab;
int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1,
min_reduce = 1;
int *vocab_hash;
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 0;
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;
int hs = 1, negative = 0;
const int table_size = 1e8;
int *table;
//每个单词的能量分布表,table 在负样本抽样中用到
void InitUnigramTable()
{
int a, i;
long long train_words_pow = 0;
real d1, power = 0.75;
table = (int *)malloc(table_size * sizeof(int));
for (a = 0; a < vocab_size; a++) //遍历词汇表,统计词的能量总值 train_words_pow,指数
power 应该是缩小值的吧。
train_words_pow += pow(vocab[a].cn, power);
i = 0;
d1 = pow(vocab[i].cn, power) / (real)train_words_pow;//表示已遍历的词的能量值占总能力值
的比例
for (a = 0; a < table_size; a++)//遍历 table。a 表示 table 的位置,i 表示词汇表的位置
{
table[a] = i;//单词 i 占用 table 的 a 位置
//table 反映的是一个单词能量的分布,一个单词能量越大,所占用的 table 的位置越多
if (a / (real)table_size > d1)
{
i++;//移动到下一个词
d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
}
if (i >= vocab_siInitNetze) i = vocab_size - 1;
}
}
// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
//从文件中读取一个词
void ReadWord(char *word, FILE *fin) {
int a = 0, ch;
while (!feof(fin)) {
ch = fgetc(fin);
if (ch == 13) continue;
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
if (a > 0) {
if (ch == '\n') ungetc(ch, fin);
break;
}
if (ch == '\n') {
strcpy(word, (char *)"</s>");
return;
} else continue;
}
word[a] = ch;
a++;
if (a >= MAX_STRING - 1) a--; // Truncate too long words
}
word[a] = 0;
}
// Returns hash value of a word 返回一个词的 hash 值,一个词跟 hash 值一一对应(可能冲
突)
int GetWordHash(char *word)
{
unsigned long long a, hash = 0;
for (a = 0; a < strlen(word); a++)
hash = hash * 257 + word[a];//采取 257 进制
hash = hash % vocab_hash_size;
return hash;
}
// Returns position of a word in the vocabulary; if the word is not found, returns -1
// 返回一个词在词汇表中的位置,如果不存在则返回-1
int SearchVocab(char *word)
{
unsigned int hash = GetWordHash(word);
while (1)
{
if (vocab_hash[hash] == -1) return -1;
if (!strcmp(word, vocab[vocab_hash[hash]].word))
return vocab_hash[hash];
hash = (hash + 1) % vocab_hash_size;
}
return -1;
}
// Reads a word and returns its index in the vocabulary
// 从文件流中读取一个词,并返回这个词在词汇表中的位置
int ReadWordIndex(FILE *fin)
{
char word[MAX_STRING];
ReadWord(word, fin);
if (feof(fin)) return -1;
return SearchVocab(word);
}
// Adds a word to the vocabulary 将一个词添加到一个词汇中
int AddWordToVocab(char *word)
{
unsigned int hash, length = strlen(word) + 1;
if (length > MAX_STRING)
length = MAX_STRING;
vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
strcpy(vocab[vocab_size].word, word);
vocab[vocab_size].cn = 0;
vocab_size++;
// Reallocate memory if needed
if (vocab_size + 2 >= vocab_max_size)
{
vocab_max_size += 1000;
vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
}
hash = GetWordHash(word);
while (vocab_hash[hash] != -1)//如果 hash 值冲突了
hash = (hash + 1) % vocab_hash_size;//使用开放地址法解决冲突
vocab_hash[hash] = vocab_size - 1;//由词的 hash 值找到她所在词汇表的排序位置
return vocab_size - 1;
}
// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b)
{
return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}
// Sorts the vocabulary by frequency using word counts
// 根据词频排序
void SortVocab()
{
int a, size;
unsigned int hash;
// Sort the vocabulary and keep </s> at the first position
qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
for (a = 0; a < vocab_hash_size; a++)
vocab_hash[a] = -1;
size = vocab_size;
train_words = 0;
for (a = 0; a < size; a++)
{
// Words occuring less than min_count times will be discarded from the vocab
//出现太少的词直接丢弃
if (vocab[a].cn < min_count)
{
vocab_size--;
free(vocab[vocab_size].word);
}
else
{
// Hash will be re-computed, as after the sorting it is not actual
// 重新计算 hash 查找。vocab_hash 是由 hash 值找到该词所在位置
hash=GetWordHash(vocab[a].word);
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
vocab_hash[hash] = a;
train_words += vocab[a].cn;
}
}
vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
// Allocate memory for the binary tree construction
for (a = 0; a < vocab_size; a++)
{
vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
}
}
// Reduces the vocabulary by removing infrequent tokens
// 再次移除词频过小的词,缩减词汇表
void ReduceVocab()
{
int a, b = 0;
unsigned int hash;
for (a = 0; a < vocab_size; a++)//我草,这很容易看错啊
剩余20页未读,继续阅读
资源评论
kxgwan
- 粉丝: 2
- 资源: 19
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功