import random
import time
import json
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.autograd import Variable
import torch.nn.functional as F
import gensim
import sys
# Python 2: force UTF-8 as the default string encoding so the raw data files read cleanly.
reload(sys)
sys.setdefaultencoding('utf8')
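# Data and resource paths: a SimpleQuestions-style QA corpus with its vocabularies,
# entity-type / relation side information, TransH (FB2M) KB embeddings and word2vec vectors.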
data_path = "../data/simple_question"
source_train_path = data_path + "/simple.v0.source.train"
source_test_path = data_path + "/simple.v0.source.test"
target_train_path = data_path + "/simple.v0.target.train"
target_test_path = data_path + "/simple.v0.target.test"
source_vocab_path = data_path + "/simple.v0.source.vocab"
target_vocab_path = data_path + "/simple.v0.target.vocab"
etype_train_path = data_path + "/simple.v0.source.etype.train"
etype_test_path = data_path + "/simple.v0.source.etype.test"
trs_train_path = data_path + "/simple.v0.source.trs.train"
trs_test_path = data_path + "/simple.v0.source.trs.test"
ps_train_path = data_path + "/simple.v0.source.ps.train"
ps_test_path = data_path + "/simple.v0.source.ps.test"
negrs_train_path = data_path + "/simple.v0.negrs.train"
negrs_test_path = data_path + "/simple.v0.negrs.test"
kse_t2v_path = "transh.fb2m.t2v"
kse_t2i_path = "transh.fb2m.t2i"
kse_r2v_path = "transh.fb2m.r2v"
kse_r2i_path = "transh.fb2m.r2i"
#tr2num_path = "transh.fb2m.tr2num.json"
w2v_path = "wikianswer.vectors.d200.bin"
all_type_list_path = data_path + "/../kb_data/freebase.FB2M.ts.json"
checkpoint_path = "checkpoint4"
# Model / training hyper-parameters.
source_vocab_size = None      # filled in after the vocabularies are loaded
target_vocab_size = None
encoder_dim_wf = 200          # word-feature dim (matches the 200-d word2vec vectors)
encoder_dim_pf = 10           # position-feature dim
encoder_dim_kf = 100          # knowledge-feature (KB embedding) dim
encoder_dim_mlp = 100         # hidden size of the MLP classifier
encoder_output_size = 310     # 200 + 10 + 100 (wf + pf + kf)
decoder_input_size = 200
max_position_distance = 70    # position-feature range; max_position_distance-1 doubles as the padding value (see get_batch)
classifier_samples = 5        # relation candidates per question (1 positive + 4 negative, see get_batch)
con_loss_lam = 0
# The paired lists below appear to hold one setting per training stage.
learning_rates = [0.6, 0.15]
batch_size = 128
print_everys = [500, 100]
save_everys = [1000, 100]
part_epochs = [(8000, 2000, 1000), (100, 100, 100)]
#part_epochs = [(10, 10, 10), (10, 10, 10)]
turbo_num = 100
USE_CUDA = True
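# Vocabulary and TransH knowledge-embedding lookup tables, populated once the files above are loaded.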
source_w2i = {}
source_i2w = {}
target_w2i = {}
target_i2w = {}
kse_t2v = {}
kse_t2i = {}
kse_i2t = {}
kse_r2v = {}
kse_r2i = {}
kse_i2r = {}
#tr2num = json.load(open(tr2num_path))
import model_apvaturbo
from model_apvaturbo import WFPFKFEncoderRNN, DecoderRNN, MLPClassifier
def load_vocabs(vocab_path):
    # Build word->index and index->word maps from a newline-separated vocabulary file.
    w2i = {}
    i2w = {}
    vocab = open(vocab_path).read().split("\n")
    for word in vocab:
        if word == "":
            continue
        i = len(w2i)
        w2i[word] = i
        i2w[i] = word
    return w2i, i2w
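# Minimal usage sketch (assumed wiring; the actual calls presumably appear further down in this script):
#   source_w2i, source_i2w = load_vocabs(source_vocab_path)
#   target_w2i, target_i2w = load_vocabs(target_vocab_path)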
def extend_vocabs(word_list_path, w2i, i2w):
    # Add words from a JSON word list (e.g. the Freebase type list above) to existing vocab maps.
    word_list = json.load(open(word_list_path))
    for word in word_list:
        if word == "":
            continue
        if word not in w2i:
            i = len(w2i)
            w2i[word] = i
            i2w[i] = word
    return w2i, i2w
def read_docs_to_seqs(docs, w2i):
    # Convert whitespace-tokenised documents into index sequences, appending _EOS.
    seqs = []
    for doc in docs:
        words = doc.split(" ")
        seq = [w2i[word] for word in words if word != ""]
        seq.append(w2i["_EOS"])
        seqs.append(seq)
    return seqs
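# Worked example with a hypothetical vocab {"what": 0, "is": 1, "<e>": 2, "_EOS": 3}:
#   read_docs_to_seqs(["what is <e>"], w2i) -> [[0, 1, 2, 3]]   (the _EOS id is appended)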
def replace_token_e(seqs, etypes, w2i):
    # Replace the <e> placeholder in each sequence with that question's entity-type token.
    seqs_p = []
    etypes = [unicode(etype) for etype in etypes]
    for i,seq in enumerate(seqs):
        seq_p = []
        for word in seq:
            if word == w2i["<e>"]:
                seq_p.append(w2i[etypes[i]])
            else:
                seq_p.append(word)
        seqs_p.append(seq_p)
    return seqs_p
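# Continuing the sketch above: if the (illustrative) type "music.artist" has been added to the
# vocab via extend_vocabs and is the question's entity type, the <e> id in [[0, 1, 2, 3]] is
# swapped for w2i["music.artist"]; every other id passes through unchanged.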
def load_tr_vocabs(vocab_path):
    # Load "name<TAB>index" lines (KB type / relation vocabularies) into maps in both directions.
    name2i = {}
    i2name = {}
    lines = open(vocab_path).read().split("\n")
    for line in lines:
        if line == "":
            continue
        items = line.split("\t")
        name2i[items[0]] = int(items[1])
        i2name[int(items[1])] = items[0]
    return name2i, i2name
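# Assumed file format, one mapping per line: a line such as "people.person.profession\t42"
# yields name2i["people.person.profession"] == 42 and i2name[42] == "people.person.profession"
# (the relation name here is only illustrative).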
def load_tr_vectors(vectors_path):
    # Load tab-separated embedding vectors, one vector per line, in index order.
    vectors = []
    lines = open(vectors_path).read().split("\n")
    for line in lines:
        if line == "":
            continue
        vec = [float(number) for number in line.split("\t") if number != ""]
        vectors.append(vec)
    return vectors
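# Sketch of how these loaders are presumably wired to the TransH files declared above
# (the paths are from this file; the wiring itself is an assumption):
#   kse_t2i, kse_i2t = load_tr_vocabs(kse_t2i_path)
#   kse_r2i, kse_i2r = load_tr_vocabs(kse_r2i_path)
#   kse_t2v = load_tr_vectors(kse_t2v_path)   # indexed by the ids from kse_t2i
#   kse_r2v = load_tr_vectors(kse_r2v_path)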
def read_trs_to_seqs(trs, t2i, r2i):
    # Each tab-separated line carries an entity type (field 1) and space-separated candidate
    # relations (field 2); map them to KSE indices, falling back to _None for unknown types.
    t_all = []
    rs_all = []
    for tr in trs:
        items = tr.split("\t")
        t = items[1]
        rs = items[2].split(" ")
        if t in t2i:
            t_all.append(t2i[t])
        else:
            t_all.append(t2i["_None"])
        rs_all.append([r2i[r] for r in rs if r in r2i])
    return t_all, rs_all
def read_position_seqs(docs):
    # Parse space-separated integer position features, one sequence per document.
    seqs = []
    for doc in docs:
        seq = [int(p) for p in doc.split(" ")]
        seqs.append(seq)
    return seqs
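# Example: "3 2 1 0 1 2" -> [3, 2, 1, 0, 1, 2]. These look like per-token distances to the
# topic entity (assumed interpretation); get_batch pads them with max_position_distance-1.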
def read_posrs_negrs(docs, r2i):
    # Map tab-separated relation names (positive or negative candidate lists) to KSE relation ids.
    posrs = []
    for doc in docs:
        posr = [r2i[r] for r in doc.split("\t") if r in r2i]
        posrs.append(posr)
    return posrs
def get_batch(pairs, batch_size_local, USE_NEG=False, neg_r2i=None):
    # Sample a padded training batch; with USE_NEG, additionally tile each example with
    # positive/negative relation candidates and labels for the MLP classifier.
    if batch_size_local is not None:
        pairs_batch = []
        while len(pairs_batch) < batch_size_local:
            pair = random.choice(pairs)
            pairs_batch.append(pair)
    else:
        batch_size_local = len(pairs)
        pairs_batch = pairs
    source_batch = []
    target_batch = []
    t_batch = []
    rs_batch = []
    ps_batch = []
    for pair in pairs_batch:
        source_batch.append(pair[0])
        target_batch.append(pair[1])
        t_batch.append(pair[2])
        rs_batch.append(pair[3])
        ps_batch.append(pair[4])
    rs_lengths = [len(seq) for seq in rs_batch]
    source_lengths = [len(seq) for seq in source_batch]
    target_lengths = [len(seq) for seq in target_batch]
    max_source_length = max(source_lengths)
    max_target_length = max(target_lengths)
    max_rs_length = max(rs_lengths)
    # Pad source/target/position/relation sequences to the batch maxima.
    seqs_padded = []
    for seq in source_batch:
        seqs_padded.append(seq + [source_w2i["_PAD"] for pad_num in range(max_source_length - len(seq))])
    source_batch = seqs_padded
    seqs_padded = []
    for seq in target_batch:
        seqs_padded.append(seq + [target_w2i["_PAD"] for pad_num in range(max_target_length - len(seq))])
    target_batch = seqs_padded
    seqs_padded = []
    for seq in ps_batch:
        seqs_padded.append(seq + [max_position_distance-1 for pad_num in range(max_source_length - len(seq))])
    ps_batch = seqs_padded
    seqs_padded = []
    for seq in rs_batch:
        seqs_padded.append(seq + [-1 for pad_num in range(max_rs_length - len(seq))])
    rs_batch = seqs_padded
    # Look up KSE embeddings for the type and relations (-1 padding becomes a zero vector).
    t_batch_vec = []
    rs_batch_vec = []
    for i in range(len(t_batch)):
        t = t_batch[i]
        t_vec = kse_t2v[t]
        rs_vec = []
        for r in rs_batch[i]:
            r_vec = kse_r2v[r] if r != -1 else np.zeros(encoder_dim_kf)
            rs_vec.append(r_vec)
        t_batch_vec.append(t_vec)
        rs_batch_vec.append(rs_vec)
    source_batch = Variable(torch.LongTensor(source_batch)).transpose(0, 1) # (batch_size x max_len) tensors, transposed into (max_len x batch_size)
    target_batch = Variable(torch.LongTensor(target_batch)).transpose(0, 1)
    ps_batch = Variable(torch.LongTensor(ps_batch)).transpose(0, 1)
    t_batch = Variable(torch.FloatTensor(t_batch_vec))
    rs_batch = Variable(torch.FloatTensor(rs_batch_vec)).transpose(0, 1)
    if USE_CUDA:
        source_batch = source_batch.cuda()
        target_batch = target_batch.cuda()
        t_batch = t_batch.cuda()
        rs_batch = rs_batch.cuda()
        ps_batch = ps_batch.cuda()
    if not USE_NEG:
        return source_batch, source_lengths, target_batch, target_lengths, t_batch, rs_batch, rs_lengths, ps_batch
    else:
        # Repeat every example `classifier_samples` times so each copy can be paired with a
        # different candidate relation (1 positive, classifier_samples-1 negative).
        samples = classifier_samples
        batch_size_p = batch_size_local * samples
        source_batch_p = source_batch.unsqueeze(2).expand(-1,-1,samples).contiguous().view(source_batch.size()[0], batch_size_p)
        rs_batch_p = rs_batch.unsqueeze(2).expand(-1,-1,samples,-1).contiguous().view(rs_batch.size()[0], batch_size_p, rs_batch.size()[-1])
        t_batch_p = t_batch.unsqueeze(1).expand(-1,samples,-1).contiguous().view(batch_size_p, t_batch.size()[-1])
        ps_batch_p = ps_batch.unsqueeze(2).expand(-1,-1,samples).contiguous().view(ps_batch.size()[0], batch_size_p)
        source_lengths_p = []
        rs_lengths_p = []
        class_r_batch = []
        label_batch = []
        for i in range(source_batch.size()[1]):
            source_lengths_p += [source_lengths[i]]*samples
            rs_lengths_p += [rs_lengths[i]]*samples
            samples_pos = 1
            samples_neg = samples - samples_pos
            posr_set = pairs_batch[i][5]
            negr_set = pairs_batch[i][6]
            while len(negr_set) < (samples_neg):
                negr = target_i2w[random.choice(range(len(t