# -*- coding: utf-8 -*-
"""
Created on Tue Sep 4 17:15:10 2018
@author: lenovo
"""
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
import math
from datetime import timedelta
close = pd.read_csv('./close.csv') #closing prices
ev = pd.read_csv('./ev.csv') #market capitalization
pb_lf = pd.read_csv('./pb_lf.csv') #price-to-book ratio
risk_beta120 = pd.read_csv('./risk_beta120.csv') #120-day beta
tech_revs120 = pd.read_csv('./tech_revs120.csv') #6-month momentum
hs300 = pd.read_csv('./hs300.csv') #CSI 300 index
def regular(data):
    '''Basic preprocessing: move the first column into the index and parse it as dates.'''
    data.index = data.iloc[:, 0]
    data.drop('Unnamed: 0', axis=1, inplace=True)
    data.index = pd.to_datetime(data.index)
    return data
close = regular(close)
ev = regular(ev)
pb_lf = regular(pb_lf)
risk_beta120 = regular(risk_beta120)
tech_revs120 = regular(tech_revs120)
hs300 = regular(hs300)
def end_m(data):
    '''Keep only the last available row (month-end observation) of each calendar month.'''
    return data.groupby(lambda x: x.year*100 + x.month).tail(1)
'''Compute daily log returns'''
yield_day = close/close.shift(1)
yield_day = yield_day.fillna(1)
yield_rate_day = np.log(yield_day)
#Log returns are additive, so summing the daily log returns within a month gives the monthly log return
yield_rate_month = yield_rate_day.resample('BM').sum() #monthly log returns
vol_year = yield_rate_day.resample('BM').std() * math.sqrt(252) #annualized volatility per month
hs300_yield = hs300/hs300.shift(1)
hs300_day = hs300_yield.fillna(1)
hs300_day = np.log(hs300_day) #daily log returns of the CSI 300 index
hs300_month = hs300_day.resample('BM').sum() #monthly log returns of the index
#Annualize the monthly excess return over the index (x12) and divide by annualized volatility
sharpe_m = yield_rate_month.sub(hs300_month.iloc[:,0], axis=0) * 12 / vol_year
sharpe_m = sharpe_m.replace([float('inf'), float('-inf')], 0) #index-relative Sharpe ratio
sharpe_m[np.abs(sharpe_m) > 50] = 0 #clip extreme values caused by near-zero volatility
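#Optional sanity check (not part of the original workflow): after the inf replacement and
#outlier clipping above, every remaining Sharpe value should lie within [-50, 50].
print('Sharpe range:', sharpe_m.min().min(), sharpe_m.max().max())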
"""市值处理
initial_day = datetime.datetime(2010, 2, 1) #数据从2010年1月开始
def add_month(date_time):
'''让时间的月份加一'''
if date_time.month==12:
date = datetime.datetime(date_time.year+1, 1, date_time.day)
else:
date = datetime.datetime(date_time.year, date_time.month+1, date_time.day)
return date
def month(data):
'''取出月末数据'''
data_1 = pd.DataFrame()
compare_time = initial_day
for i in range(data.shape[0]):
# date_time = datetime.datetime.strptime(data.index[i],'%Y-%m-%d') #将str转换为时间格式
# if i%50 == 0:
# print(i)
if data.index[i]>= compare_time: #当这个月结束时
# print(i)
compare_time = add_month(compare_time) #更新比较日期为下个月1号
data_1 = data_1.append(data.iloc[i-1]) # 将月末的数据添入表格当中
return data_1
"""
ev_month = end_m(ev)
ev_max = max(ev_month.max())
ev_month = ev_month / ev_max #normalize market cap by its global maximum
pb_lf = 1 / pb_lf #book-to-market ratio (reciprocal of price-to-book)
pb_lf_month = end_m(pb_lf)
risk_beta120_month = end_m(risk_beta120) #120-day beta
tech_revs120_month = end_m(tech_revs120) #6-month momentum
dataset = pd.DataFrame(np.zeros(shape=[(ev_month.shape[0]-1)*300,8]),
columns=('ev', 'pb_lf', 'beta', '6month',
'vol', '1month', 'sharpe', 'next_m'))
#Assemble one row per (month, stock) pair: seven factor exposures plus next month's return as the label
for i in range(ev_month.shape[0]-1):
    for j in range(300):
        row = i*300 + j
        dataset.iloc[row, 0] = ev_month.iloc[i, j]
        dataset.iloc[row, 1] = pb_lf_month.iloc[i, j]
        dataset.iloc[row, 2] = risk_beta120_month.iloc[i, j]
        dataset.iloc[row, 3] = tech_revs120_month.iloc[i, j]
        dataset.iloc[row, 4] = vol_year.iloc[i, j]
        dataset.iloc[row, 5] = yield_rate_month.iloc[i, j]
        dataset.iloc[row, 6] = sharpe_m.iloc[i, j]
        dataset.iloc[row, 7] = yield_rate_month.iloc[i+1, j] #next month's return is the label
    if i % 50 == 0:
        print(i) #progress indicator
#dataset1 = dataset.dropna(axis=0)
dataset.to_csv('dataset.csv')
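#A possible vectorized alternative to the nested loop above (a sketch, not part of the
#original pipeline): flattening each month-end factor table row by row with numpy reproduces
#the i*300+j ordering, assuming every table has exactly 300 columns and the month-end
#frames are positionally aligned month by month, as the loop already assumes.
def flatten_months(frame, n_months):
    '''Flatten the first n_months rows of a 300-column frame into one long 1-D array.'''
    return frame.iloc[:n_months].values.ravel()

n_months = ev_month.shape[0] - 1
dataset_alt = pd.DataFrame({
    'ev': flatten_months(ev_month, n_months),
    'pb_lf': flatten_months(pb_lf_month, n_months),
    'beta': flatten_months(risk_beta120_month, n_months),
    '6month': flatten_months(tech_revs120_month, n_months),
    'vol': flatten_months(vol_year, n_months),
    '1month': flatten_months(yield_rate_month, n_months),
    'sharpe': flatten_months(sharpe_m, n_months),
    'next_m': yield_rate_month.iloc[1:n_months + 1].values.ravel(),
}, columns=('ev', 'pb_lf', 'beta', '6month', 'vol', '1month', 'sharpe', 'next_m'))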
#Training set: all but the last six months; drop rows with missing values
data_train = dataset.iloc[:(ev_month.shape[0]-7)*300].dropna(axis=0)
#Test set: the last six months (six monthly groups); fill missing values with 0
data_test = dataset.iloc[(ev_month.shape[0]-7)*300:].fillna(0)
data_train.to_csv('train.csv')
data_test.to_csv('test.csv')
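#Optional quick look at the split sizes; the training rows shrink after dropna.
print('train:', data_train.shape, 'test:', data_test.shape, 'full:', dataset.shape)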
'''
Model prediction
'''
import tensorflow as tf
import tensorflow.contrib.eager as tfe #TF 1.x contrib eager API
tf.enable_eager_execution()
def parse_csv(line):
    '''Parse one CSV line: the seven factor columns become the features x, the last column is the label y'''
    example_defaults = [[0], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]]  # sets field types
    parsed_line = tf.decode_csv(line, example_defaults)
    # The seven factor fields are combined into a single tensor; the first column (the row index) is dropped
    features = tf.reshape(parsed_line[1:-1], shape=(7,))
    # Last field is the label
    label = tf.reshape(parsed_line[-1], shape=())
    return features, label
train_dataset = tf.data.TextLineDataset('train.csv')
train_dataset = train_dataset.skip(1) # skip the first header row
train_dataset = train_dataset.map(parse_csv) # parse each row
train_dataset = train_dataset.shuffle(buffer_size=24000) # randomize
train_dataset = train_dataset.batch(128)
# View a single example entry from a batch
features, label = next(iter(train_dataset))
#print("example features:", features[0])
#print("example label:", label[0])
#Build the model
'''Model hyperparameters can be tuned here; for now, two hidden layers with 10 neurons each'''
model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="relu", input_shape=(7,)),  # input shape required
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(1)  # single output: predicted next-month log return
])
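#Optional (not in the original script): print the layer shapes and parameter counts
#to confirm the 7 -> 10 -> 10 -> 1 architecture defined above.
model.summary()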
def loss(model, x, y):
    '''Loss function: mean squared error between predicted and realized next-month returns'''
    y_ = tf.squeeze(model(x), axis=-1)  # squeeze (batch, 1) -> (batch,) so it aligns with y before subtracting
    return tf.reduce_mean(tf.square(y_ - y))
def grad(model, inputs, targets):
    '''Compute gradients of the loss with respect to the model variables'''
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets)
    return tape.gradient(loss_value, model.variables)
#Optimizer: plain gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
#Container for the per-epoch training loss
train_loss_results = []
#train_accuracy_results = []
#Number of training epochs
num_epochs = 201
for epoch in range(num_epochs):
    epoch_loss_avg = tfe.metrics.Mean()
    # epoch_accuracy = tfe.metrics.Accuracy()
    # Training loop - using batches of 128
    for x, y in train_dataset:
        # Optimize the model
        grads = grad(model, x, y)
        optimizer.apply_gradients(zip(grads, model.variables),
                                  global_step=tf.train.get_or_create_global_step())
        # Track progress
        epoch_loss_avg(loss(model, x, y))  # add current batch loss
        # compare predicted label to actual label
        # epoch_accuracy(tf.argmax(model(x), axis=1, output_type=tf.int32), y)
    # end epoch
    train_loss_results.append(epoch_loss_avg.result())
    # train_accuracy_results.append(epoch_accuracy.result())
    if epoch % 50 == 0:
        print("Epoch {:03d}: Loss: {:.3f}".format(epoch, epoch_loss_avg.result()))
        # epoch_accuracy.result()))
#Plot the training loss curve
fig, axes = plt.subplots(1, sharex=True, figsize=(8, 4))
fig.suptitle('Training Metrics')
axes.set_ylabel("Loss", fontsize=14)
axes.set_xlabel("Epoch", fontsize=14)
axes.plot(train_loss_results)
#axes[1].set_ylabel("Accuracy", fontsize=14)
#axes[1].set_xlabel("Epoch", fontsize=14)
#axes[1].plot(train_accuracy_results)
plt.show()
'''Build the test set'''
test_dataset = tf.data.TextLineDataset('test.csv')
test_dataset = test_dataset.skip(1) # skip header row
test_dataset = test_dataset.map(parse_csv)
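#A possible evaluation pass over the test set (a sketch, not part of the original script):
#batch the parsed examples and report the mean squared error between the predicted and
#realized next-month returns, reusing the loss function defined above.
test_dataset = test_dataset.batch(128)
test_loss_avg = tfe.metrics.Mean()
for x, y in test_dataset:
    test_loss_avg(loss(model, x, y))  # accumulate the per-batch MSE
print("Test set MSE: {:.6f}".format(test_loss_avg.result()))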