import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm
def normalization(data, parameters=None):
    '''Normalize data to the [0, 1] range column-wise (min-max scaling).

    Args:
      - data: original 2-D numpy array; may contain NaN (ignored when fitting)
      - parameters: optional dict with 'min_val' / 'max_val' arrays from a
        previous call; when given, that scaling is re-applied instead of
        fitting a new one

    Returns:
      - norm_data: normalized data
      - norm_parameters: min_val, max_val for each feature for renormalization
    '''
    _, dim = data.shape
    norm_data = data.copy()

    if parameters is None:
        # Fit min-max normalization on this data.
        min_val = np.zeros(dim)
        max_val = np.zeros(dim)
        for i in range(dim):
            # Hoist nanmin/nanmax so each is computed once per column
            # (the original recomputed them inside the assignment).
            col_min = np.nanmin(norm_data[:, i])
            min_val[i] = col_min
            norm_data[:, i] = norm_data[:, i] - col_min
            col_max = np.nanmax(norm_data[:, i])
            max_val[i] = col_max
            # +1e-6 guards against division by zero for constant columns.
            norm_data[:, i] = norm_data[:, i] / (col_max + 1e-6)
        # Return norm_parameters for renormalization.
        norm_parameters = {'min_val': min_val, 'max_val': max_val}
    else:
        # Re-apply a previously fitted scaling.
        min_val = parameters['min_val']
        max_val = parameters['max_val']
        for i in range(dim):
            norm_data[:, i] = norm_data[:, i] - min_val[i]
            norm_data[:, i] = norm_data[:, i] / (max_val[i] + 1e-6)
        norm_parameters = parameters

    return norm_data, norm_parameters
def renormalization(norm_data, norm_parameters):
    '''Map data normalized to [0, 1] back to its original scale.

    Args:
      - norm_data: normalized data
      - norm_parameters: dict with 'min_val' / 'max_val' per feature, as
        produced by normalization

    Returns:
      - renorm_data: renormalized original data
    '''
    lows = norm_parameters['min_val']
    highs = norm_parameters['max_val']
    _, n_features = norm_data.shape
    renorm_data = norm_data.copy()
    for col in range(n_features):
        # Invert the forward transform: scale up first, then shift back.
        renorm_data[:, col] = renorm_data[:, col] * (highs[col] + 1e-6) + lows[col]
    return renorm_data
def rounding(imputed_data, data_x):
    '''Round imputed values for columns that look categorical.

    A column is treated as categorical when its observed (non-NaN) values
    take fewer than 20 distinct levels.

    Args:
      - imputed_data: imputed data
      - data_x: original data with missing values

    Returns:
      - rounded_data: rounded imputed data
    '''
    _, n_features = data_x.shape
    rounded_data = imputed_data.copy()
    for col in range(n_features):
        observed = data_x[~np.isnan(data_x[:, col]), col]
        # Heuristic: few distinct observed values => categorical variable.
        if np.unique(observed).size < 20:
            rounded_data[:, col] = np.round(rounded_data[:, col])
    return rounded_data
# Generators for mask vectors and hint vectors.
def sample_M(m, n, p):
    '''Sample a random binary matrix of shape [m, n].

    Args:
      - m, n: matrix dimensions
      - p: missing rate — each entry is 0 with probability p and
        1 with probability (1 - p)

    Returns:
      - float matrix of 0s and 1s
    '''
    draws = np.random.uniform(0., 1., size=[m, n])
    return (draws > p).astype(float)
def xavier_init(size):
    '''Xavier (Glorot) initialization.

    Args:
      - size: shape of the array to create; size[0] is the fan-in

    Returns:
      - random array of the given shape, drawn from
        N(0, sqrt(2 / fan_in))
    '''
    fan_in = size[0]
    # Equivalent to 1 / sqrt(fan_in / 2).
    stddev = np.sqrt(2. / fan_in)
    return np.random.normal(size=size, scale=stddev)
def sample_batch_index(total, batch_size):
    '''Sample mini-batch indices without replacement.

    Args:
      - total: total number of samples
      - batch_size: batch size

    Returns:
      - batch_idx: array of batch_size distinct indices from range(total)
    '''
    # Shuffle 0..total-1, then take the first batch_size entries.
    shuffled = np.random.permutation(total)
    return shuffled[:batch_size]
# Random noise generator for missing components.
def sample_Z(m, n):
    '''Return an [m, n] matrix of small uniform noise on [0, 0.01).'''
    # NOTE: the draw is on [0, 0.01), not [0, 1).
    return np.random.uniform(0, 0.01, size=[m, n])
def gain(data_x):
'''Impute missing values in data_x
Args:
- data_x: original data with missing values
Returns:
- imputed_data: imputed data
'''
# Define mask matrix
data_m = 1. - np.isnan(data_x) # 定义Mask矩阵(缺失数据为0,非缺失数据为1)
h_dim = int(dim) # 隐含层维度
# 归一化
norm_data, norm_parameters = normalization(data_x)
norm_data_x = np.nan_to_num(norm_data, 0) # 将NaN替换为0
## GAIN architecture
# Discriminator variables
if use_gpu is True:
D_W1 = torch.tensor(xavier_init([dim * 2, h_dim]), requires_grad=True, device="cuda") # Data + Hint as inputs
D_b1 = torch.tensor(np.zeros(shape=[h_dim]), requires_grad=True, device="cuda")
D_W2 = torch.tensor(xavier_init([h_dim, h_dim]), requires_grad=True, device="cuda")
D_b2 = torch.tensor(np.zeros(shape=[h_dim]), requires_grad=True, device="cuda")
D_W3 = torch.tensor(xavier_init([h_dim, dim]), requires_grad=True, device="cuda")
D_b3 = torch.tensor(np.zeros(shape=[dim]), requires_grad=True, device="cuda") # Output is multi-variate
else:
D_W1 = torch.tensor(xavier_init([dim * 2, h_dim]), requires_grad=True) # Data + Hint as inputs
D_b1 = torch.tensor(np.zeros(shape=[h_dim]), requires_grad=True)
D_W2 = torch.tensor(xavier_init([h_dim, h_dim]), requires_grad=True)
D_b2 = torch.tensor(np.zeros(shape=[h_dim]), requires_grad=True)
D_W3 = torch.tensor(xavier_init([h_dim, dim]), requires_grad=True)
D_b3 = torch.tensor(np.zeros(shape=[dim]), requires_grad=True) # Output is multi-variate
theta_D = [D_W1, D_W2, D_W3, D_b1, D_b2, D_b3]
# Generator variables
# Data + Mask as inputs (Random noise is in missing components)
if use_gpu is True:
G_W1 = torch.tensor(xavier_init([dim * 2, h_dim]), requires_grad=True,
device="cuda") # Data + Mask as inputs (Random Noises are in Missing Components)
G_b1 = torch.tensor(np.zeros(shape=[h_dim]), requires_grad=True, device="cuda")
G_W2 = torch.tensor(xavier_init([h_dim, h_dim]), requires_grad=True, device="cuda")
G_b2 = torch.tensor(np.zeros(shape=[h_dim]), requires_grad=True, device="cuda")
G_W3 = torch.tensor(xavier_init([h_dim, dim]), requires_grad=True, device="cuda")
G_b3 = torch.tensor(np.zeros(shape=[dim]), requires_grad=True, device="cuda")
else:
G_W1 = torch.tensor(xavier_init([dim * 2, h_dim]),
requires_grad=True) # Data + Mask as inputs (Random Noises are in Missing Components)
G_b1 = torch.tensor(np.zeros(shape=[h_dim]), requires_grad=True)
G_W2 = torch.tensor(xavier_init([h_dim, h_dim]), requires_grad=True)
G_b2 = torch.tensor(np.zeros(shape=[h_dim]), requires_grad=True)
G_W3 = torch.tensor(xavier_init([h_dim, dim]), requires_grad=True)
G_b3 = torch.tensor(np.zeros(shape=[dim]), requires_grad=True)
theta_G = [G_W1, G_W2, G_W3, G_b1, G_b2, G_b3]
## GAIN functions
# Generator
def generator(x, m):
# Concatenate Data and Mask
inputs = torch.cat(dim=1, tensors=[x, m])
G_h1 = F.relu(torch.matmul(inputs, G_W1) + G_b1)
G_h2 = F.relu(torch.matmul(G_h1, G_W2) + G_b2)
# MinMax normalized output
G_prob = torch.sigmoid(torch.matmul(G_h2, G_W3) + G_b3)
return G_prob
# Discriminator
def discriminator(x, h):
# Concatenate Data and Hint
inputs = torch.cat(dim=1, tensors=[x, h])
D_h1 = F.relu(torch.matmul(inputs, D_W1) + D_b1)
D_h2 = F.relu(torch.matmul(D_h1, D_W2) + D_b2)
D_logit = torch.matmul(D_h2, D_W3) + D_b3
D_prob = torch.sigmoid(D_logit)
return D_prob
# GAIN Loss
def discriminator_loss(M, X, H):
# Generator
G_sample = generator(X, M)
# Combine with original data
Hat_X = X * M + G_sample * (1 - M)
# Discriminator
D_prob = discriminator(Hat_X, H)
# %% Loss
D_loss = -torch.mean(M
基于GAN的Spam数据集缺失数据填补的代码实现
版权申诉
5星 · 超过95%的资源 184 浏览量
2022-06-27
16:51:39
上传
评论 4
收藏 127KB ZIP 举报
wendy_ya
- 粉丝: 3w+
- 资源: 202
最新资源
- Docker容器配置进阶
- tensorflow-gpu-2.7.4-cp37-cp37m-manylinux2010-x86-64.whl
- 多段线、 圆、弧转多段线(仅我可见)
- tensorflow-2.7.2-cp38-cp38-manylinux2010-x86-64.whl
- yeyue-p8Yi4-ve4a83792.apk
- tensorflow-gpu-2.7.3-cp38-cp38-manylinux2010-x86-64.whl
- 五相感应电机矢量控制模型MATLAB
- RGLED (1) (1).circ
- IMG_20240427_215747.jpg
- python下前端WEB学习笔记
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
评论6