# -*- coding: utf-8 -*-
import os
from itertools import izip
import h5py
import numpy as np
import copy
import math
from config import HERE
from utils.logger import BaseLogger
class H5EmbeddingManager(BaseLogger):
    """Serve pre-trained word embeddings stored in an HDF5 file.

    Expected HDF5 layout:
      * 'embedding'        -- (vocab_len, emb_dim) float matrix
      * 'words_flatten'    -- the vocabulary as one newline-joined string
      * attrs['vocab_len'] -- vocabulary size (used as a consistency check)
    """

    def __init__(self, h5_path, mode='disk', **kwargs):
        """Open the HDF5 file and build the word <-> id lookup tables.

        h5_path: path to the embedding HDF5 file.
        mode: 'disk' keeps the matrix as a lazy h5py dataset (rows read on
              access); 'in-memory' loads the whole matrix into a numpy array.
        """
        super(H5EmbeddingManager, self).__init__(**kwargs)
        self.mode = mode
        f = h5py.File(h5_path, 'r')
        if mode == 'disk':
            # Lazy dataset: each row is fetched from disk on access.
            self.W = f['embedding']
        elif mode == 'in-memory':
            # Materialize the full matrix: faster lookups, more RAM.
            self.W = f['embedding'][:]
        message = "load mode=%s, embedding data type=%s, shape=%s" % (self.mode, type(self.W), self.W.shape)
        self.info(message)
        words_flatten = f['words_flatten'][0]
        self.id2word = words_flatten.split('\n')
        assert len(self.id2word) == f.attrs['vocab_len'], "%s != %s" % (len(self.id2word), f.attrs['vocab_len'])
        # Replaces dict(izip(...)): enumerate works on both Python 2 and 3
        # (itertools.izip does not exist on Python 3).
        self.word2id = {word: idx for idx, word in enumerate(self.id2word)}
        del words_flatten

    def __getitem__(self, item):
        """Return the embedding row for a word.

        Raises KeyError for unknown words and RuntimeError for non-string keys.
        """
        if isinstance(item, str):
            return self.W[self.word2id[item]]
        raise RuntimeError("don't support type: %s" % type(item))

    def init_word_embedding(self, words, dim_size=300, scale=0.1, mode='google'):
        """Build an embedding matrix for `words` from the pre-trained vectors.

        Row i holds the L2-normalized pre-trained vector for words[i] (or its
        capitalized form) when known, otherwise a small random vector.
        Row 0 is left all-zero -- presumably the padding token; TODO confirm
        with callers.

        words: sequence of words; words[0] is skipped (kept zero).
        dim_size: number of dimensions copied from each pre-trained vector.
        scale, mode: unused; kept for interface compatibility.
        Returns a float32 array of shape (len(words), dim_size).
        """
        print('loading word embedding.')
        word2id = self.word2id
        W = self.W
        shape = (len(words), dim_size)
        # Seeding on len(words) keeps the random OOV rows reproducible.
        np.random.seed(len(words))
        W2V = np.zeros(shape, dtype='float32')
        for i, word in enumerate(words[1:], 1):
            if word in word2id:
                _id = word2id[word]
            elif word.capitalize() in word2id:
                _id = word2id[word.capitalize()]
            else:
                _id = None
            if _id is not None:
                # Bug fix: copy before normalizing -- the original `vec /= norm`
                # mutated the stored matrix in place in 'in-memory' mode.
                vec = np.array(W[_id], dtype='float32')
                norm = np.linalg.norm(vec)
                if norm > 0:  # guard against a zero vector producing NaNs
                    vec /= norm
            else:
                # Bug fix: the random fallback was hard-coded to 300 dims,
                # which broke any dim_size > 300.
                vec = (0.01 * np.random.normal(0, 1.0, dim_size)).astype('float32')
            W2V[i] = vec[:dim_size]
        return W2V

    def init_word_embedding1(self, words, dim_size=300, scale=0.1, mode='google'):
        """Vectorized variant of init_word_embedding.

        Rows for known words are copied from the pre-trained matrix; unknown
        words keep their small random initialization. Row 0 is zeroed
        (presumably padding -- TODO confirm). mode='random' skips the
        pre-trained lookup entirely.

        Returns a float32 array of shape (len(words), dim_size).
        """
        word2id = self.word2id
        W = self.W
        shape = (len(words), dim_size)
        np.random.seed(len(words))
        W2V = np.random.normal(0, 1.0, size=shape).astype('float32') * 0.01
        W2V[0, :] = 0
        if mode == 'random':
            return W2V
        # np.bool was removed in numpy >= 1.24; plain bool is equivalent.
        in_vocab = np.ones(shape[0], dtype=bool)
        oov_set = set()
        word_ids = []
        for i, word in enumerate(words):
            _id = word2id.get(word)
            if _id is None:
                _id = word2id.get(word.capitalize())
            if _id is None:
                in_vocab[i] = False
                # "$oov-" words are deliberate placeholders; don't report them.
                if not word.startswith("$oov-"):
                    oov_set.add(word)
            else:
                word_ids.append(_id)
        if self.mode == 'in-memory':
            # Bug fix: the original `W2V[in_vocab][:, :] = ...` wrote into a
            # temporary copy produced by boolean advanced indexing, so the
            # pre-trained vectors were never actually stored.
            W2V[in_vocab] = W[np.array(word_ids, dtype='int32')][:, :dim_size]
        else:
            nonzero_ids = in_vocab.nonzero()[0]
            # Bug fix: word_ids is dense (one entry per in-vocab word), so it
            # must be indexed by position j, not by the matrix row index.
            for j, row in enumerate(nonzero_ids):
                W2V[row] = W[word_ids[j]][:dim_size]
        # NOTE(review): oov_set is only consumed by the commented-out debug log.
        # logger.debug("%s words is not in google word2vec, and it is random "
        #              "initialized: %s" % (len(oov_set), oov_set))
        return W2V
class EmbeddingInitEnhancer(BaseLogger):
'''
For more details, read "Counter-fitting Word Vectors to Linguistic Constraints"
'''
def __init__(self, init_word_vectors, vocab, repel_path_list, attract_path_list, **kwargs):
super(EmbeddingInitEnhancer, self).__init__(**kwargs)
self.build_word_vector_map(init_word_vectors, vocab)
self.init_vocab = vocab
self.repel_path_list = repel_path_list
self.attract_path_list = attract_path_list
self.repel = set()
self.attract = set()
# and we then have true the information to collect true the linguistic constraints:
for syn_filepath in self.attract_path_list:
self.attract = self.attract | self.load_constraints(syn_filepath, self.vocab)
for ant_filepath in self.repel_path_list:
self.repel = self.repel | self.load_constraints(ant_filepath, self.vocab)
# finally, set the experiment hyperparameters:
self.set_hyperparameters()
def build_word_vector_map(self, init_word_vectors, vocab):
self.word_vectors = {}
for i in xrange(len(vocab)):
self.word_vectors[vocab[i]] = init_word_vectors[i]
self.vocab = set(vocab)
def vector_map_to_vectors(self, word_vectors):
vector_list = [word_vectors[v] for v in self.init_vocab]
return np.vstack(vector_list)
def load_constraints(self, constraints_filepath, vocab):
"""
This methods reads a collection of constraints from the specified file, and returns a set with
true constraints for which both of their constituent words are in the specified vocabulary.
"""
constraints_filepath.strip()
constraints = set()
with open(constraints_filepath, "r+") as f:
for line in f:
word_pair = line.split()
if word_pair[0] in vocab and word_pair[1] in vocab and word_pair[0] != word_pair[1]:
constraints |= {(word_pair[0], word_pair[1])}
constraints |= {(word_pair[1], word_pair[0])}
self.info("%s yielded %s constraints." % (constraints_filepath, len(constraints)))
return constraints
def set_hyperparameters(self):
"""
This method sets the hyperparameters of the procedure as specified in the paper.
"""
self.hyper_k1 = 0.1
self.hyper_k2 = 0.1
self.hyper_k3 = 0.1
self.delta = 1.0
self.gamma = 0.0
self.rho = 0.2
self.info("embedding init enhancer hyperparameters --- k_1: %s, k_2: %s, k_3: %s, delta: %s, gamma: %s, rho: %s" %
(self.hyper_k1, self.hyper_k2, self.hyper_k3, self.delta, self.gamma, self.rho))
def get_enhanced_embedding(self, from_pretrained_vector=False):
"""
This method repeatedly applies SGD steps to counter-fit word vectors to linguistic constraints.
"""
word_vectors = self.word_vectors
repel = self.repel
attract = self.attract
current_iteration = 0
if from_pretrained_vector:
vsp_pairs = {}
if self.hyper_k3 > 0.0: # if we need to compute the VSP terms.
vsp_pairs = self.compute_vsp_pairs(word_vectors, self.vocab, rho=self.rho)
# Post-processing: remove synonym pairs which are deemed to be both synonyms and antonyms:
for repel_pair in repel:
if repel_pair in attract:
attract.remove(repel_pair)
if from_pretrained_vector and repel_pair in vsp_pairs:
del vsp_pairs[repel_pair]
max_iter = 20
self.info("repel pairs: %s, attract pairs: %s" % (len(repel), len(attract)))
self.info("Running the optimisation procedure for %s SGD steps..." % max_iter)
while current_iteration < max_iter:
current_iteration += 1
vsp_pairs = vsp_pairs if from_pretrained_vector else None
word_vectors = self.one_step_SGD(word_vectors, attract, repel, vsp_pairs)
return self.vector_map_to_vectors(word_vectors)
def one_step_SGD(self, word_vectors, attract_pairs, repel_pairs, vsp_pairs=None):
"""
This method performs a step of SGD to optimise the counterfitting cost funct
没有合适的资源?快使用搜索试试~ 我知道了~
资源推荐
资源详情
资源评论
收起资源包目录
人工智能-项目实践-深度学习-基于知识图谱的问答.zip (18个子文件)
KnowledgeGraph-QA-Service-master
main.py 416B
utils
__init__.py 1KB
neo4j_api.py 11KB
nlu_api.py 1KB
solr_api.py 5KB
logger.py 2KB
web
__init__.py 0B
Test.py 2KB
service
__init__.py 1KB
retrieval_service.py 4KB
template_service.py 13KB
const.py 2KB
wordembedding
__init__.py 2KB
embedding.py 16KB
readme 159B
embedding_h5.py 4KB
wordvector.py 2KB
config.py 1KB
共 18 条
- 1
资源评论
博士僧小星
- 粉丝: 1922
- 资源: 5884
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功