# coding:utf-8
import re
import csv
import time
import json
import xlrd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# from pandas import ExcelWriter
# from pandas import options
# options.io.excel.xlsx.writer='xlsxwriter'
class Forecast():
def __init__(self):
pass
def read_train_data(self, data_file_path='D:/TFTData/训练.xlsx'):
    """Correlate every column with the last column and dump the results.

    The first sheet of the workbook is read. For each column from index 1
    onward (column 0 is skipped), the values below the first row are paired
    with the last column's values, filtered through ``remove_null_value``
    and scored with ``get_correlation_coefficent``.

    Two JSON files are written:
    ``D:/TFTData/corr_dict.txt`` (column index -> correlation) and
    ``D:/TFTData/num_dict.txt`` (column index -> number of values kept).

    Args:
        data_file_path: Workbook to open. This argument used to be ignored
            in favour of a hard-coded path identical to the default, so
            calls that omit it behave exactly as before.
    """
    corr_file_path = 'D:/TFTData/corr_dict.txt'
    num_file_path = 'D:/TFTData/num_dict.txt'

    sheet = xlrd.open_workbook(data_file_path).sheets()[0]
    n_cols = sheet.ncols

    corr_dict = {}
    num_dict = {}
    for col_i in range(1, n_cols):
        # Both lists are re-read on every pass on purpose:
        # remove_null_value pops entries from the lists it receives.
        indpt_var_i = sheet.col_values(col_i)[1:]
        dpt_var_i = sheet.col_values(n_cols - 1)[1:]
        target_list = self.remove_null_value(
            indpt_var_list=indpt_var_i,
            dpt_var_list=dpt_var_i,
        )
        corr_dict[col_i] = self.get_correlation_coefficent(
            target_list[0], target_list[1]
        )
        num_dict[col_i] = len(target_list[0])

    with open(corr_file_path, 'w') as f:
        json.dump(corr_dict, f)
    with open(num_file_path, 'w') as f:
        json.dump(num_dict, f)
def get_correlation_coefficent(self, array_one=None, array_two=None):
    """Return the correlation between two equally long sequences.

    Returns 0.00 when ``array_one`` holds fewer than two values; otherwise
    the off-diagonal entry of the pandas correlation matrix built from the
    two sequences as columns 0 and 1.
    """
    # Guard clause: a correlation needs at least two observations.
    if len(array_one) < 2:
        return 0.00

    paired_columns = np.transpose(np.array([array_one, array_two]))
    corr_matrix = pd.DataFrame(data=paired_columns).corr()
    # Row label 1 of column 0: correlation of array_two with array_one.
    return corr_matrix.loc[1, 0]
def remove_null_value(self,indpt_var_list=None,dpt_var_list=None):
# Remove empty-string entries from indpt_var_list and drop the entry at the
# same index from dpt_var_list, so the two lists stay aligned.
#
# Parameters:
#   indpt_var_list -- independent-variable values; entries are popped in place.
#   dpt_var_list   -- dependent-variable values; popped in place at the index
#                     of each removed empty string.
# Returns:
#   [indpt_var_list, dpt_var_list] -- normally the same (mutated) list objects
#   that were passed in; two new empty lists when the count_i>20 branch runs.
#
# NOTE(review): this copy of the file has lost its indentation, so the nesting
# of the `if count_i>20` / `else: break` lines below cannot be read off the
# text. The final `else` may belong to the `if`/`elif` chain or to
# `if count_i>20` -- confirm against a correctly indented copy before
# restructuring this method.
#
# count_i is compared against 20 below; it is incremented at the top of the
# loop body and again after the isinstance check.
count_i=0
# Iterate over a shallow copy so that popping from indpt_var_list does not
# disturb the iteration itself.
for data_i in indpt_var_list[:]:
count_i+=1
# (Disabled earlier approach, kept as found: float-convert and pop on failure.)
# print count_i
# try:
# data_i=float(data_i)
# indpt_var_list=self.tranform_str_to_float(target_list=indpt_var_list)
# dpt_var_list=self.tranform_str_to_float(target_list=dpt_var_list)
# except:
# index_num = indpt_var_list.index(data_i)
# indpt_var_list.pop(index_num)
# dpt_var_list.pop(index_num)
# continue
# The test is on the live list, not on data_i: while any empty string remains,
# remove its first occurrence from both lists.
if u'' in indpt_var_list:
index_num = indpt_var_list.index(u'')
indpt_var_list.pop(index_num)
dpt_var_list.pop(index_num)
# `(str)` is just `str` (parentheses without a comma do not make a tuple).
elif isinstance(data_i,(str)):
count_i += 1
# Rebinding to new empty lists discards all data; the caller's original list
# objects are not cleared by these two assignments.
if count_i>20:
indpt_var_list=[]
dpt_var_list=[]
break
else:
break
return [indpt_var_list,dpt_var_list]
def tranform_str_to_float(self, target_list=None):
    """Return a new list with every float-convertible item converted.

    Items that ``float()`` cannot convert (empty strings, arbitrary text,
    ``None`` and so on) are copied through unchanged, so the result always
    has the same length and order as the input.

    Args:
        target_list: Iterable of values. ``None`` is treated as empty.

    Returns:
        A new list; the input is not modified.
    """
    if target_list is None:
        return []

    converted = []
    for item in target_list:
        try:
            converted.append(float(item))
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are tolerated; a bare ``except``
            # would also swallow KeyboardInterrupt and SystemExit.
            converted.append(item)
    return converted
def tranform_str_to_float_1(self,target_list=None):
i_list=[]
judge_i=0
if target_list[0]==u'ID':
return target_list
i_list.append(target_list[0])
try:
float(target_list[1])
judge_i=1
except:
i_list=self.tranform_letter_to_num(target_list[1:])
i_list.insert(0,target_list[0])
if judge_i==1:
for i in target_list[1:]:
i_num=float(i)
i_list.append(i_num)
return i_list
def tranform_letter_to_num(self, target_list=None):
    """Replace each distinct item with a float code, in place.

    Codes are 1.0, 2.0, ... assigned in order of first appearance, so the
    result is reproducible from run to run. Equal items always receive the
    same code.

    Args:
        target_list: List of hashable items; it is overwritten in place.
            When omitted, a fresh copy of the built-in sample letters is
            encoded.

    Returns:
        The same list object that was passed in, now holding the codes.
    """
    if target_list is None:
        # Built per call: a shared mutable default would be overwritten
        # with floats by the first call and change every later call.
        target_list = ['a', 'd', 'v', 'b', 'r', 't', 'u', 'i', 'w', 's',
                       'h', 'j', 'l']

    codes = {}
    for item in target_list:
        if item not in codes:
            codes[item] = float(len(codes) + 1)

    # Build the encoded values first, then overwrite in one step. Replacing
    # via list.index() during iteration can hit an already-written code when
    # the input itself contains numbers.
    target_list[:] = [codes[item] for item in target_list]
    return target_list
def corr_sort(self, corr_dict=None):
    """Print correlations from highest to lowest and return the NaN keys.

    Args:
        corr_dict: Mapping of column key -> correlation value. When
            ``None`` the mapping is loaded from
            ``F:/Deepleaning/TFTData/corr_dict_good.txt``. (This argument
            used to be ignored and the file was always read.)

    Returns:
        List of keys whose value is NaN, in the mapping's own order.
        Callers can use it to skip columns with no usable correlation.
    """
    import math

    if corr_dict is None:
        file_path = 'F:/Deepleaning/TFTData/corr_dict_good.txt'
        with open(file_path, 'r') as f:
            corr_dict = json.load(f)

    def _is_nan(value):
        # json.load turns a bare NaN token into float('nan').
        return isinstance(value, float) and math.isnan(value)

    nan_list = [key for key, value in corr_dict.items() if _is_nan(value)]

    # NaN compares false with everything, which makes sorted() order
    # unpredictable; rank only the comparable values.
    ranked = sorted(
        ((key, value) for key, value in corr_dict.items()
         if not _is_nan(value)),
        key=lambda item: item[1],
        reverse=True,
    )
    for key, value in ranked:
        print(key, value)
    # NaN entries are still reported, after the ranked ones.
    for key in nan_list:
        print(key, corr_dict[key])

    return nan_list
def cleaning_data(self):
    """Write the workbook's columns to CSV, skipping excluded ones.

    Reads the first sheet of ``D:/TFTData/训练.xlsx``. A column is skipped
    when its index, formatted as a string, is contained in the result of
    ``corr_sort()``. Each remaining column (all rows, header included) is
    written as one CSV row to ``D:/TFTData/remove_nan_data.csv``, so the
    CSV is the transpose of the sheet.
    """
    source_file_path = u'D:/TFTData/训练.xlsx'
    target_file_path = u'D:/TFTData/remove_nan_data.csv'

    sheet = xlrd.open_workbook(source_file_path).sheets()[0]

    # corr_sort() does not depend on the column, so it is called once rather
    # than once per column. A None result (a corr_sort that only prints)
    # is treated as "nothing to exclude" so the membership test cannot fail.
    excluded = self.corr_sort() or ()

    kept_columns = [
        sheet.col_values(col_i)
        for col_i in range(sheet.ncols)
        if u'{}'.format(col_i) not in excluded
    ]

    # Python 3's csv.writer needs a text-mode file opened with newline='';
    # the former 'wb' mode raises TypeError on the first write.
    with open(target_file_path, 'w', newline='') as f:
        csv.writer(f).writerows(kept_columns)
def calculation_corr(self):
    """Convert the cleaned CSV into an Excel workbook.

    Reads ``F:/Deepleaning/TFTData/remove_nan_data.csv``. In every row the
    first field is kept as is and the remaining fields go through
    ``tranform_str_to_float``. The rows are stacked, transposed and written
    to sheet ``Sheet1`` of ``F:/Deepleaning/TFTData/remove_nan_data.xlsx``.
    """
    source_file_path = 'F:/Deepleaning/TFTData/remove_nan_data.csv'
    target_file_path = 'F:/Deepleaning/TFTData/remove_nan_data.xlsx'

    converted_rows = []
    # newline='' is how the csv module expects its input file to be opened.
    with open(source_file_path, 'r', newline='') as f:
        for row in csv.reader(f):
            if not row:
                # An empty row has no label field; keep it unchanged, which
                # is what the former catch-all ``except`` amounted to.
                converted_rows.append(row)
                continue
            converted_rows.append(
                [row[0]] + self.tranform_str_to_float(row[1:])
            )

    # NOTE(review): np.array on rows that mix str and float produces a
    # string-typed array, so the converted numbers appear to be stored as
    # text in the workbook -- confirm whether that is intended.
    frame = pd.DataFrame(np.array(converted_rows).T)

    # Leaving the ``with`` block saves and closes the workbook. An explicit
    # writer.save() is redundant and no longer exists in pandas 2.x.
    with pd.ExcelWriter(target_file_path) as writer:
        frame.to_excel(writer, sheet_name='Sheet1')
def calculation_corr_good_data(self):
    """Correlate each column of the cleaned workbook with its last column.

    Reads the first sheet of ``F:/Deepleaning/TFTData/remove_nan_data.xlsx``.
    For every column, the values from the third row down are converted with
    ``tranform_str_to_float``, filtered with ``remove_null_value`` and scored
    with ``get_correlation_coefficent``. Results are dumped as JSON to
    ``corr_dict_good.txt`` (column index -> correlation) and
    ``num_dict_good.txt`` (column index -> number of values kept) in
    ``F:/Deepleaning/TFTData/``.
    """
    sheet = xlrd.open_workbook(
        'F:/Deepleaning/TFTData/remove_nan_data.xlsx'
    ).sheets()[0]
    n_cols = sheet.ncols

    corr_dict = {}
    num_dict = {}
    for col_i in range(n_cols):
        # Both columns are re-read each pass; remove_null_value pops
        # entries from the lists it is handed.
        independent = self.tranform_str_to_float(sheet.col_values(col_i)[2:])
        dependent = self.tranform_str_to_float(
            sheet.col_values(n_cols - 1)[2:]
        )
        kept_indpt, kept_dpt = self.remove_null_value(
            indpt_var_list=independent,
            dpt_var_list=dependent,
        )
        corr_dict[col_i] = self.get_correlation_coefficent(kept_indpt, kept_dpt)
        num_dict[col_i] = len(kept_indpt)

    outputs = (
        ('F:/Deepleaning/TFTData/corr_dict_good.txt', corr_dict),
        ('F:/Deepleaning/TFTData/num_dict_good.txt', num_dict),
    )
    for out_path, payload in outputs:
        with open(out_path, 'w') as f:
            json.dump(payload, f)
def arrange_data(self):
source_file_path = u'D:/TFT