没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
//下面是我对 word2vec.c 的注释
//详细算法可以参考论文,或者看这篇博客 http://www.cnblogs.com/downtjs/p/3784440.html
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <math.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40
const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
typedef float real; // Precision of float numbers
struct vocab_word
{
long long cn;//词频
int *point;//huffman 编码对应内节点的路径
char *word, *code, codelen;//huffman 编码
};
char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
struct vocab_word *vocab;
int binary = 0, cbow = 0, debug_mode = 2, window = 5, min_count = 5, num_threads = 1,
min_reduce = 1;
int *vocab_hash;
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 0;
real *syn0, *syn1, *syn1neg, *expTable;
clock_t start;
int hs = 1, negative = 0;
const int table_size = 1e8;
int *table;
//每个单词的能量分布表,table 在负样本抽样中用到
void InitUnigramTable()
{
int a, i;
long long train_words_pow = 0;
real d1, power = 0.75;
table = (int *)malloc(table_size * sizeof(int));
for (a = 0; a < vocab_size; a++) //遍历词汇表,统计词的能量总值 train_words_pow,指数
power 应该是缩小值的吧。
train_words_pow += pow(vocab[a].cn, power);
i = 0;
d1 = pow(vocab[i].cn, power) / (real)train_words_pow;//表示已遍历的词的能量值占总能力值
的比例
for (a = 0; a < table_size; a++)//遍历 table。a 表示 table 的位置,i 表示词汇表的位置
{
table[a] = i;//单词 i 占用 table 的 a 位置
//table 反映的是一个单词能量的分布,一个单词能量越大,所占用的 table 的位置越多
if (a / (real)table_size > d1)
{
i++;//移动到下一个词
d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
}
if (i >= vocab_siInitNetze) i = vocab_size - 1;
}
}
// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
//从文件中读取一个词
void ReadWord(char *word, FILE *fin) {
int a = 0, ch;
while (!feof(fin)) {
ch = fgetc(fin);
if (ch == 13) continue;
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
if (a > 0) {
if (ch == '\n') ungetc(ch, fin);
break;
}
if (ch == '\n') {
strcpy(word, (char *)"</s>");
return;
} else continue;
}
word[a] = ch;
a++;
if (a >= MAX_STRING - 1) a--; // Truncate too long words
}
word[a] = 0;
}
// Returns hash value of a word 返回一个词的 hash 值,一个词跟 hash 值一一对应(可能冲
突)
int GetWordHash(char *word)
{
unsigned long long a, hash = 0;
for (a = 0; a < strlen(word); a++)
hash = hash * 257 + word[a];//采取 257 进制
hash = hash % vocab_hash_size;
return hash;
}
// Returns position of a word in the vocabulary; if the word is not found, returns -1
// 返回一个词在词汇表中的位置,如果不存在则返回-1
int SearchVocab(char *word)
{
unsigned int hash = GetWordHash(word);
while (1)
{
if (vocab_hash[hash] == -1) return -1;
if (!strcmp(word, vocab[vocab_hash[hash]].word))
return vocab_hash[hash];
hash = (hash + 1) % vocab_hash_size;
}
return -1;
}
// Reads a word and returns its index in the vocabulary
// 从文件流中读取一个词,并返回这个词在词汇表中的位置
int ReadWordIndex(FILE *fin)
{
char word[MAX_STRING];
ReadWord(word, fin);
if (feof(fin)) return -1;
return SearchVocab(word);
}
// Adds a word to the vocabulary 将一个词添加到一个词汇中
int AddWordToVocab(char *word)
{
unsigned int hash, length = strlen(word) + 1;
if (length > MAX_STRING)
length = MAX_STRING;
vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
strcpy(vocab[vocab_size].word, word);
vocab[vocab_size].cn = 0;
vocab_size++;
// Reallocate memory if needed
if (vocab_size + 2 >= vocab_max_size)
{
vocab_max_size += 1000;
vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
}
hash = GetWordHash(word);
while (vocab_hash[hash] != -1)//如果 hash 值冲突了
hash = (hash + 1) % vocab_hash_size;//使用开放地址法解决冲突
vocab_hash[hash] = vocab_size - 1;//由词的 hash 值找到她所在词汇表的排序位置
return vocab_size - 1;
}
// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b)
{
return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}
// Sorts the vocabulary by frequency using word counts
// 根据词频排序
void SortVocab()
{
int a, size;
unsigned int hash;
// Sort the vocabulary and keep </s> at the first position
qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
for (a = 0; a < vocab_hash_size; a++)
vocab_hash[a] = -1;
size = vocab_size;
train_words = 0;
for (a = 0; a < size; a++)
{
// Words occuring less than min_count times will be discarded from the vocab
//出现太少的词直接丢弃
if (vocab[a].cn < min_count)
{
vocab_size--;
free(vocab[vocab_size].word);
}
else
{
// Hash will be re-computed, as after the sorting it is not actual
// 重新计算 hash 查找。vocab_hash 是由 hash 值找到该词所在位置
hash=GetWordHash(vocab[a].word);
while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
vocab_hash[hash] = a;
train_words += vocab[a].cn;
}
}
vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
// Allocate memory for the binary tree construction
for (a = 0; a < vocab_size; a++)
{
vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
}
}
// Reduces the vocabulary by removing infrequent tokens
// 再次移除词频过小的词,缩减词汇表
void ReduceVocab()
{
int a, b = 0;
unsigned int hash;
for (a = 0; a < vocab_size; a++)//我草,这很容易看错啊
剩余20页未读,继续阅读
资源评论
kxgwan
- 粉丝: 2
- 资源: 19
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功