#-*-coding:utf-8-*-
import pandas as pd
import numpy as np
import os
from keras.models import Model
from keras.layers import Dense,Input
from keras.models import load_model
from sklearn.model_selection import train_test_split
import pickle
import random
import warnings
warnings.filterwarnings('ignore')
random.seed(21)
def processData(inputData, outputPath=None):
    '''
    Parse the raw string-encoded arrays into a numeric table and save it as CSV.

    Each entry of the ``array`` column is a string such as ``"[1,2,3]"``: the
    first and last characters (the brackets) are dropped and the remaining
    comma-separated tokens are converted to ``int``.

    :param inputData: raw DataFrame containing an ``array`` column of strings
    :param outputPath: CSV path the parsed table is written to; when omitted
        the module-level ``filename_process`` is used
    :return: DataFrame with one row per input entry and one column per element
    :raises ValueError: if a non-empty token is not an integer literal
    '''
    if outputPath is None:
        outputPath = filename_process
    # Build concrete lists instead of ``map`` objects: on Python 3 ``map`` is a
    # lazy one-shot iterator, which is a fragile thing to hand to DataFrame.
    # Blank tokens are skipped so an empty array ("[]") yields an empty row
    # rather than crashing on int('').
    rows = [
        [int(token) for token in text[1:-1].split(',') if token.strip()]
        for text in inputData['array']
    ]
    dataPro = pd.DataFrame(rows)
    # The header row (column labels) is written on purpose; only the index is dropped.
    dataPro.to_csv(outputPath, index=False)
    return dataPro
def loadData():
    '''
    Load the dataset, preferring the most processed form available on disk.

    Paths come from the module-level ``npy_name``, ``filename_process`` and
    ``filename``. Lookup order:

    1. the ``.npy`` file, returned as a NumPy array;
    2. the processed CSV written by ``processData``, returned as a DataFrame;
    3. the raw export CSV, which is parsed through ``processData``.

    NOTE: nothing in this function writes the ``.npy`` file, so the first
    branch only triggers when that file was produced elsewhere.

    :return: the cleaned data (ndarray from the ``.npy`` file, DataFrame otherwise)
    '''
    if os.path.exists(npy_name):
        print("...loading data export.npy...")
        return np.load(npy_name)

    if os.path.exists(filename_process):
        print("...loading processed data...")
        # processData writes the CSV with a header row of column labels, so
        # that row must be consumed as the header; reading with header=None
        # would turn the labels 0,1,2,... into a bogus first sample.
        data = pd.read_csv(filename_process, header=0)
        # Restore integer column labels so the frame matches the one
        # processData returns on the fresh-processing path.
        data.columns = range(data.shape[1])
        return data

    print("...processing original data...")
    # Only the first 13443 rows of the raw export are read.
    dataTmp = pd.read_csv(filename, nrows=13443)
    return processData(dataTmp)
def loadModel(backend):
    '''
    Return the encoder model for the given Keras backend.

    For ``'theano'`` the serialized model stored at ``model_name`` is loaded.
    For any other value a new encoder network (input -> 256 -> 64 -> 16) is
    constructed, with its input width taken from the training data saved at
    ``x_train_npy``.

    NOTE(review): the non-theano branch only builds the layers; no compile/fit
    call is made here, so the returned encoder has freshly initialised
    weights. Confirm whether training is expected to happen elsewhere.

    :param backend: name of the Keras backend in use
    :return: the encoder model
    '''
    if backend == 'theano':
        # A previously serialized model is available for this backend.
        print("....loading encoder model.... ")
        return load_model(model_name)

    # Any other backend is treated as TensorFlow.
    print('...training model...')
    feature_count = np.load(x_train_npy).shape[1]
    encoding_dim = 16  # width of the final encoding
    input_data = Input(shape=(feature_count,))

    # Stack the hidden ReLU layers, then a linear projection to the code size.
    hidden = input_data
    for units in (256, 64):
        hidden = Dense(units, activation='relu')(hidden)
    encoder_output = Dense(encoding_dim)(hidden)
    return Model(inputs=input_data, outputs=encoder_output)
def pearSim(inA,inB):
    '''
    Pearson-correlation similarity between two inputs, rescaled to [0, 1].

    The correlation coefficient r is mapped to ``0.5 + 0.5 * r``, so values
    close to 1 mean the inputs are more similar.

    :param inA: first input
    :param inB: second input
    :return: similarity score; 1.0 when ``inA`` has fewer than three entries
    '''
    if len(inA) >= 3:
        # rowvar=0: observations run along rows, variables along columns.
        corr_matrix = np.corrcoef(inA, inB, rowvar=0)
        return 0.5 + 0.5 * corr_matrix[0][1]
    # Fewer than three entries: the inputs are reported as fully similar.
    return 1.0
if __name__ == '__main__':
    # --- Configuration ------------------------------------------------------
    # Several of these names are read as module-level globals by the
    # functions defined above, so they must keep these exact names.
    filename = '../data/export.csv'
    filename_process = '../data/process_export.csv'
    npy_name = '../data/export.npy'
    x_test_npy = '../data/x_test.npy'
    x_train_npy = '../data/x_train.npy'
    # npy_name = ''
    model_name = '../code/encoder.h5'
    backend = 'theano'  # Keras backend in use: theano or TensorFlow

    # Disabled recipe that produced the train/test split:
    # data = loadData()  # load the data
    # x_train,x_test = train_test_split(data,train_size=0.6,random_state=21)  # split into train and test sets
    # x_train = x_train.astype('float32')  # convert to float (same below)
    # x_test = x_test.astype('float32')
    # x_train = x_train.reshape((x_train.shape[0],-1))
    # x_test = x_test.reshape((x_test.shape[0],-1))
    # np.save('./data/x_test.npy',x_test)

    # --- Encode the test set ------------------------------------------------
    print('...loading test data...')
    x_test = np.load(x_test_npy)

    encoder = loadModel(backend=backend)
    encoded_data = encoder.predict(x_test)
    print('encode result:',encoded_data)

    # Disabled: dump of the raw test set.
    # x_test = pd.DataFrame(x_test)
    # x_test.to_csv('../result/test_function_export.csv',index=0,header=None)  # save the test set

    # Persist the encoded test set without index or header.
    encoded_frame = pd.DataFrame(encoded_data)
    result_csv = '../result/test_encoded_export_theano.csv'
    encoded_frame.to_csv(result_csv, index=False, header=None)

    # Disabled: correlation matrix of the encoded data.
    # corr = encoded_data.corr()
    # corr.to_csv('../result/encode_data_corr.csv',index=0,header=None)

    # --- Similarity ---------------------------------------------------------
    # Pearson-based similarity between columns 0 and 2 of the encoded frame;
    # values close to 1 mean more similar.
    similarity = pearSim(encoded_frame[0], encoded_frame[2])
    print('similarity value:',similarity)