from numpy import *
import numpy as np
import tushare as ts
import pandas as pd
import talib
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
# Get the stock data from tushare
def get_stock_data(code, pred_days):
    """Download K-line data for ``code`` and label each row with the future price direction.

    A row is labelled ``1`` when the close ``pred_days`` rows later is
    strictly higher than the row's own close, and ``-1`` otherwise. The last
    ``pred_days`` rows have no future close to compare against and keep an
    empty-string label, which is written as an empty CSV field.

    Args:
        code: Stock code passed to ``ts.get_k_data``.
        pred_days: Non-negative number of rows to look ahead.

    Returns:
        The path of the CSV file that was written, ``'raw_stock_data.csv'``.

    Raises:
        ValueError: If ``pred_days`` is negative.
    """
    if pred_days < 0:
        raise ValueError('pred_days must be non-negative, got %r' % (pred_days,))

    df_raw = ts.get_k_data(code)

    # Compare closes by position so the labelling does not depend on the
    # index labels of the downloaded frame.
    close = np.asarray(df_raw['close'])
    n_rows = len(close)
    n_labelled = max(n_rows - pred_days, 0)

    # Classification
    label = [''] * n_rows
    if n_labelled:
        future_change = close[pred_days:] - close[:n_labelled]
        label[:n_labelled] = np.where(future_change > 0, 1, -1).tolist()

    # Save to csv file
    df_raw['LABEL'] = label
    df_raw.to_csv('raw_stock_data.csv')
    return 'raw_stock_data.csv'
def exponential_smoothing(alpha, s):
    """Apply simple exponential smoothing along the first axis of ``s``.

    The recurrence is ``out[0] = s[0]`` and
    ``out[i] = alpha * s[i] + (1 - alpha) * out[i - 1]``.

    Args:
        alpha: Weight given to the current observation.
        s: Array-like of numbers (list, ndarray, pandas Series, ...).

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``s``. Empty input
        yields an empty array.
    """
    # Converting up front lets callers pass lists or Series and makes every
    # access below positional.
    values = np.asarray(s, dtype=float)
    smoothed = np.zeros(values.shape)
    if values.size == 0:
        return smoothed

    smoothed[0] = values[0]
    for i in range(1, len(values)):
        smoothed[i] = alpha * values[i] + (1 - alpha) * smoothed[i - 1]
    return smoothed
# preprocess the stock data with exponential_smoothing
def em_stock_data(pathfile, alpha):
    """Exponentially smooth the price columns of a CSV file and save the result.

    The ``open``, ``close``, ``high`` and ``low`` columns are each replaced by
    their smoothed series; all other columns are written back unchanged.

    Args:
        pathfile: Path of the CSV file to read.
        alpha: Smoothing factor forwarded to ``exponential_smoothing``.

    Returns:
        The path of the CSV file that was written, ``'em_stock_data.csv'``.
    """
    frame = pd.read_csv(pathfile)
    for column in ('open', 'close', 'high', 'low'):
        frame[column] = exponential_smoothing(alpha, np.array(frame[column]))
    frame.to_csv('em_stock_data.csv')
    return 'em_stock_data.csv'
# preprocess the stock data with calc_technical_indicators
def calc_technical_indicators(filepath):
    """Append TA-Lib technical indicator columns to a price CSV and save it.

    The input file must provide ``date``, ``high``, ``low``, ``close`` and
    ``volume`` columns; ``date`` becomes the index of the output file.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The path of the CSV file that was written, ``'final_stock_data.csv'``.
    """
    df = pd.read_csv(filepath, index_col='date')

    # Extract the inputs once as float arrays so every TA-Lib call receives
    # the same kind of argument.
    high = np.asarray(df['high'], dtype=float)
    low = np.asarray(df['low'], dtype=float)
    close = np.asarray(df['close'], dtype=float)
    volume = np.asarray(df['volume'], dtype=float)

    # Simple Moving Average (SMA)
    for period in (5, 10, 20):
        df['SMA%d' % period] = talib.MA(close, timeperiod=period)

    # Williams %R overbought/oversold index (WR)
    for period in (14, 18, 22):
        df['WR%d' % period] = talib.WILLR(high, low, close, timeperiod=period)

    # Moving Average Convergence / Divergence (MACD). Only the third output of
    # talib.MACD is kept, doubled; the first two outputs are not used.
    for fast, slow, signal in ((12, 26, 9), (14, 28, 10), (16, 30, 11)):
        hist = talib.MACD(close, fastperiod=fast, slowperiod=slow, signalperiod=signal)[2]
        df['MACD%d' % signal] = hist * 2

    # Relative Strength Index (RSI)
    for period in (15, 20, 25, 30):
        df['RSI%d' % period] = talib.RSI(close, timeperiod=period)

    # Slow Stochastic Oscillator (the KD part of the KDJ indicator); the
    # second output of talib.STOCH is stored.
    df['STOCH'] = talib.STOCH(high, low, close, fastk_period=9, slowk_period=3, slowk_matype=0,
                              slowd_period=3, slowd_matype=0)[1]

    # On Balance Volume (OBV)
    df['OBV'] = talib.OBV(close, volume)

    # Longer simple moving averages. SMA20 is already set above; talib.MA with
    # its default matype is expected to equal talib.SMA, so it is not
    # recomputed here.
    for period in (15, 25, 30):
        df['SMA%d' % period] = talib.SMA(close, timeperiod=period)

    # Money Flow Index (MFI)
    for period in (14, 18, 22):
        df['MFI%d' % period] = talib.MFI(high, low, close, volume, timeperiod=period)

    # Ultimate Oscillator (UO)
    for p1, p2, p3 in ((7, 14, 28), (8, 16, 22), (9, 18, 26)):
        df['UO%d' % p1] = talib.ULTOSC(high, low, close, timeperiod1=p1, timeperiod2=p2, timeperiod3=p3)

    # Rate of Change Percentage (ROCP)
    df['ROCP'] = talib.ROCP(close, timeperiod=10)

    df.to_csv('final_stock_data.csv')
    return 'final_stock_data.csv'
# preprocess the stock data with normalization and split data
def normalization(filepath, features, pred_days=None, warmup_rows=33):
    """Trim, min-max scale and split the indicator data into train/valid/test CSV files.

    The first ``warmup_rows`` rows and the last ``pred_days`` rows are dropped.
    Each column named in ``features`` is then rescaled with ``MinMaxScaler``,
    and the rows are split in order into 80% / 10% / 10%.

    Args:
        filepath: Path of the CSV file to read.
        features: Names of the columns to scale.
        pred_days: Number of trailing rows to drop. When omitted, the
            module-level ``pred_days`` is used, as earlier versions of this
            function did.
        warmup_rows: Number of leading rows to drop. The default of 33 is the
            value that was previously hard-coded; presumably it covers the
            indicator warm-up period -- confirm before changing the features.

    Returns:
        The tuple ``('train.csv', 'valid.csv', 'test.csv')`` of written paths.

    Raises:
        ValueError: If ``pred_days`` is neither passed nor defined at module
            level, or if no rows remain after trimming.
    """
    if pred_days is None:
        # Backward compatibility: the script entry point defines ``pred_days``
        # at module level and calls this function without passing it.
        pred_days = globals().get('pred_days')
        if pred_days is None:
            raise ValueError('pred_days must be passed to normalization()')

    df = pd.read_csv(filepath)
    # Copy so the column assignments below write to an independent frame
    # rather than to a slice of the frame that was read.
    df = df[warmup_rows:(len(df) - pred_days)].copy()
    if df.empty:
        raise ValueError('no rows left after dropping %d leading and %d trailing rows'
                         % (warmup_rows, pred_days))

    # normalization
    # NOTE(review): the scaler is fitted on all remaining rows before the
    # split, so the validation and test rows influence the scaling.
    min_max_scaler = preprocessing.MinMaxScaler()
    for feature in features:
        column = df[feature].values.reshape(-1, 1)
        df[feature] = min_max_scaler.fit_transform(column)[:, 0]

    # split data
    df_len = len(df)
    train_end = int(df_len * 0.8)
    valid_end = int(df_len * 0.9)
    df[:train_end].to_csv('train.csv')
    df[train_end:valid_end].to_csv('valid.csv')
    df[valid_end:].to_csv('test.csv')
    return 'train.csv', 'valid.csv', 'test.csv'
def random_forest_model(train_filepath, valid_filepath, test_filepath, features,
                        min_samples_leaf=7, n_estimators=7, random_state=50):
    """Train a random forest on the training split and score it on the test split.

    Args:
        train_filepath: Path of the training CSV; must contain ``features``
            and a ``LABEL`` column.
        valid_filepath: Accepted for interface compatibility. The validation
            split is not used by this function and the file is not read.
        test_filepath: Path of the test CSV with the same columns.
        features: Names of the columns used as model inputs.
        min_samples_leaf: Forwarded to ``RandomForestClassifier``.
        n_estimators: Forwarded to ``RandomForestClassifier``.
        random_state: Forwarded to ``RandomForestClassifier``.

    Returns:
        A tuple ``(pred_accuracy, features_degree)``: the fraction of test
        rows whose predicted label equals ``LABEL``, and a list of
        ``(importance, feature_name)`` pairs sorted in descending order with
        importances rounded to 4 decimals.
    """
    df_train = pd.read_csv(train_filepath)
    df_test = pd.read_csv(test_filepath)

    alg = RandomForestClassifier(criterion='gini', bootstrap=True,
                                 min_samples_leaf=min_samples_leaf,
                                 n_estimators=n_estimators,
                                 random_state=random_state)
    alg.fit(df_train[features], df_train['LABEL'])
    predict = alg.predict(df_test[features])

    # Pair each importance with the feature name at the same position.
    rounded_importances = (round(importance, 4) for importance in alg.feature_importances_)
    features_degree = sorted(zip(rounded_importances, features), reverse=True)

    pred_accuracy = (df_test['LABEL'] == predict).mean()
    return pred_accuracy, features_degree
if __name__ == '__main__':
    # Run configuration. ``pred_days`` has to stay a module-level name:
    # normalization() reads it from module scope.
    code = '000034'
    alpha = 0.7
    pred_days = 15
    features = [
        "open", "close", "high", "low", "volume",
        "SMA5", "WR14", "MACD9", "RSI15", "MFI14", "UO7", "ROCP",
    ]

    # Each stage writes a CSV file and hands its path to the next stage.
    raw_filepath = get_stock_data(code=code, pred_days=pred_days)
    em_filepath = em_stock_data(pathfile=raw_filepath, alpha=alpha)
    final_filepath = calc_technical_indicators(filepath=em_filepath)
    split_paths = normalization(filepath=final_filepath, features=features)
    train_filepath, valid_filepath, test_filepath = split_paths

    # Train on the training split and report test accuracy and feature importances.
    pred_accuracy, features_degree = random_forest_model(
        train_filepath, valid_filepath, test_filepath, features)
    print('pred_accuracy: ', pred_accuracy)
    print('features_degree: ', features_degree)
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
主题:使用随机森林分类器预测金融市场股票未来第K天的涨跌情况
细节:原始股票数据先经过指数平滑处理,再计算常用的技术指标并进行最大最小归一化处理,最后使用随机森林分类器进行训练和预测
1、get_stock_data:通过Tushare获取原始股票数据
2、exponential_smoothing、em_stock_data:对股票数据进行指数平滑处理
3、calc_technical_indicators:计算常用的技术指标
4、normalization:进行归一化处理并分割数据集
5、random_forest_model:训练随机森林模型,返回准确率和各数据特征对数据标签的影响程度
资源推荐
资源详情
资源评论
收起资源包目录
financial-prediction-with-random-forest-master.zip (1个子文件)
financial-prediction-with-random-forest-master
random_forest.py 6KB
共 1 条
- 1
资源评论
博士僧小星
- 粉丝: 1768
- 资源: 5875
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
- #P0015. 全排列 超级简单
- pta题库答案c语言之排序4统计工龄.zip
- pta题库答案c语言之树结构7堆中的路径.zip
- pta题库答案c语言之树结构3TreeTraversalsAgain.zip
- pta题库答案c语言之树结构2ListLeaves.zip
- pta题库答案c语言之树结构1树的同构.zip
- 基于C++实现民航飞行与地图简易管理系统可执行程序+说明+详细注释.zip
- pta题库答案c语言之复杂度1最大子列和问题.zip
- 三维装箱问题(Three-Dimensional Bin Packing Problem,3D-BPP)是一个经典的组合优化问题
- 以下是一些关于Linux线程同步的基本概念和方法.txt
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功