人工智能-项目实践-数据预处理-使用随机森林模型预测股价趋势(涉及数据预处理)_融合外部指数与内部动态的A股价格预测:随机森林模型的应用资源-CSDN文库

共1个文件

py：1个

版权申诉

人工智能

随机森林

数据预处理

python

18 浏览量 2024-03-02 20:35:15 上传评论收藏 2KB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

financial-prediction-with-random-forest-master.zip （1个子文件）

financial-prediction-with-random-forest-master

random_forest.py 6KB

"""Predict the direction of a stock's closing price with a random forest.

Pipeline (each stage writes a CSV in the working directory and returns its
path): download daily bars -> exponential smoothing -> technical indicators
-> min-max scaling and chronological train/valid/test split -> model fit.
"""
# NOTE(review): the wildcard import is kept only so that names other code may
# pull from this module keep resolving; nothing below relies on it.
from numpy import *
import numpy as np
import tushare as ts
import pandas as pd
import talib
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing


def get_stock_data(code, pred_days):
    """Download daily bars for ``code`` and label each row with the price trend.

    A row is labelled ``1`` when the close ``pred_days`` rows later is higher
    than the current close and ``-1`` otherwise.  The last ``pred_days`` rows
    have no future price and keep an empty label.

    Args:
        code: Stock code passed to ``tushare.get_k_data``.
        pred_days: Number of rows to look ahead; must not be negative.

    Returns:
        Path of the CSV file that was written (``'raw_stock_data.csv'``).

    Raises:
        ValueError: If ``pred_days`` is negative.
    """
    if pred_days < 0:
        raise ValueError('pred_days must be non-negative')

    df_raw = ts.get_k_data(code)

    # Positional access: the look-ahead must not depend on the frame's index
    # labels being a 0-based range.
    closes = np.asarray(df_raw['close'])
    labels = [''] * len(closes)
    for i in range(len(closes) - pred_days):
        labels[i] = 1 if closes[i + pred_days] - closes[i] > 0 else -1

    df_raw['LABEL'] = labels
    # index=False keeps a meaningless "Unnamed: 0" column out of later stages.
    df_raw.to_csv('raw_stock_data.csv', index=False)
    return 'raw_stock_data.csv'


def exponential_smoothing(alpha, s):
    """Return the simple exponential smoothing of a 1-D sequence.

    ``out[0] = s[0]`` and ``out[i] = alpha * s[i] + (1 - alpha) * out[i - 1]``.

    Args:
        alpha: Smoothing factor applied to the current observation.
        s: 1-D array-like of numbers.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``s`` (empty when
        ``s`` is empty).
    """
    values = np.asarray(s, dtype=float)
    smoothed = np.zeros(values.shape)
    if values.size == 0:
        return smoothed

    smoothed[0] = values[0]
    for i in range(1, len(smoothed)):
        smoothed[i] = alpha * values[i] + (1 - alpha) * smoothed[i - 1]
    return smoothed


def em_stock_data(pathfile, alpha):
    """Smooth the open/close/high/low columns of a CSV file.

    Args:
        pathfile: CSV file containing ``open``, ``close``, ``high`` and
            ``low`` columns.
        alpha: Smoothing factor forwarded to ``exponential_smoothing``.

    Returns:
        Path of the CSV file that was written (``'em_stock_data.csv'``).
    """
    df = pd.read_csv(pathfile)
    for column in ('open', 'close', 'high', 'low'):
        df[column] = exponential_smoothing(alpha, np.asarray(df[column]))
    df.to_csv('em_stock_data.csv', index=False)
    return 'em_stock_data.csv'


def calc_technical_indicators(filepath):
    """Append TA-Lib technical indicator columns to the price data.

    Args:
        filepath: CSV file with ``date``, ``open``, ``close``, ``high``,
            ``low`` and ``volume`` columns.

    Returns:
        Path of the CSV file that was written (``'final_stock_data.csv'``),
        indexed by ``date``.
    """
    df = pd.read_csv(filepath, index_col='date')

    # TA-Lib works on float64 arrays; convert once and reuse everywhere so an
    # integer-typed column (e.g. volume) cannot be rejected.
    high = np.asarray(df['high'], dtype=float)
    low = np.asarray(df['low'], dtype=float)
    close = np.asarray(df['close'], dtype=float)
    volume = np.asarray(df['volume'], dtype=float)

    # Simple Moving Average (SMA)
    for period in (5, 10, 20):
        df['SMA{}'.format(period)] = talib.MA(close, timeperiod=period)

    # Williams %R overbought/oversold index (WR)
    for period in (14, 18, 22):
        df['WR{}'.format(period)] = talib.WILLR(high, low, close,
                                                timeperiod=period)

    # Moving Average Convergence/Divergence (MACD).  Only the histogram (the
    # third return value) is kept, and it is doubled.
    for fast, slow, signal in ((12, 26, 9), (14, 28, 10), (16, 30, 11)):
        _, _, histogram = talib.MACD(close, fastperiod=fast, slowperiod=slow,
                                     signalperiod=signal)
        df['MACD{}'.format(signal)] = histogram * 2

    # Relative Strength Index (RSI)
    for period in (15, 20, 25, 30):
        df['RSI{}'.format(period)] = talib.RSI(close, timeperiod=period)

    # Slow stochastic oscillator (STOCH): keep the second return value, the
    # D line of the KD indicator.
    df['STOCH'] = talib.STOCH(high, low, close, fastk_period=9,
                              slowk_period=3, slowk_matype=0,
                              slowd_period=3, slowd_matype=0)[1]

    # On Balance Volume (OBV)
    df['OBV'] = talib.OBV(close, volume)

    # More simple moving averages.  SMA20 already exists above, so it is not
    # computed a second time.
    for period in (15, 25, 30):
        df['SMA{}'.format(period)] = talib.SMA(close, timeperiod=period)

    # Money Flow Index (MFI)
    for period in (14, 18, 22):
        df['MFI{}'.format(period)] = talib.MFI(high, low, close, volume,
                                               timeperiod=period)

    # Ultimate Oscillator (UO)
    for name, periods in (('UO7', (7, 14, 28)),
                          ('UO8', (8, 16, 22)),
                          ('UO9', (9, 18, 26))):
        df[name] = talib.ULTOSC(high, low, close, timeperiod1=periods[0],
                                timeperiod2=periods[1],
                                timeperiod3=periods[2])

    # Rate of change percentage (ROCP)
    df['ROCP'] = talib.ROCP(close, timeperiod=10)

    df.to_csv('final_stock_data.csv')
    return 'final_stock_data.csv'


def normalization(filepath, features, pred_days=None, warmup_rows=33):
    """Scale the feature columns to [0, 1] and split the rows chronologically.

    The first ``warmup_rows`` rows and the unlabeled tail are dropped, then
    the remaining rows are split 80% / 10% / 10% into train / valid / test.
    The scaler is fitted on the training rows only, so the validation and
    test statistics never leak into training; their scaled values may
    therefore fall slightly outside [0, 1].

    Args:
        filepath: CSV file containing the feature columns and ``LABEL``.
        features: Names of the columns to scale.
        pred_days: Number of trailing rows without a label.  When ``None``
            the rows whose ``LABEL`` is missing are dropped instead.
        warmup_rows: Leading rows to discard.  NOTE(review): 33 presumably
            covers the indicator warm-up of the default feature set --
            confirm before changing the features.

    Returns:
        Tuple ``('train.csv', 'valid.csv', 'test.csv')`` of written files.

    Raises:
        ValueError: If no rows are left for the training split.
    """
    features = list(features)
    df = pd.read_csv(filepath)

    if pred_days is None:
        df = df.iloc[warmup_rows:]
        df = df[df['LABEL'].notna()]
    else:
        df = df.iloc[warmup_rows:len(df) - pred_days]

    # Rows that still lack a feature value (longer indicator warm-up than
    # ``warmup_rows``) cannot be used by the classifier.
    df = df.dropna(subset=features).copy()
    df[features] = df[features].astype(float)

    # Chronological split: no shuffling.
    n_rows = len(df)
    train_end = int(n_rows * 0.8)
    valid_end = int(n_rows * 0.9)
    if train_end == 0:
        raise ValueError('not enough rows to build a training split')

    df_train = df.iloc[:train_end].copy()
    df_valid = df.iloc[train_end:valid_end].copy()
    df_test = df.iloc[valid_end:].copy()

    # MinMaxScaler scales each column independently, so one call over all
    # feature columns is equivalent to scaling them one at a time.
    min_max_scaler = preprocessing.MinMaxScaler()
    df_train[features] = min_max_scaler.fit_transform(df_train[features].values)
    for part in (df_valid, df_test):
        if len(part):
            part[features] = min_max_scaler.transform(part[features].values)

    df_train.to_csv('train.csv', index=False)
    df_valid.to_csv('valid.csv', index=False)
    df_test.to_csv('test.csv', index=False)
    return 'train.csv', 'valid.csv', 'test.csv'


def random_forest_model(train_filepath, valid_filepath, test_filepath, features,
                        n_estimators=7, min_samples_leaf=7, random_state=50):
    """Fit a random forest on the training file and score it on the test file.

    Args:
        train_filepath: CSV file with the feature columns and ``LABEL``.
        valid_filepath: Accepted for interface compatibility; not used.
        test_filepath: CSV file with the feature columns and ``LABEL``.
        features: Names of the feature columns.
        n_estimators: Number of trees in the forest.
        min_samples_leaf: Minimum number of samples per leaf.
        random_state: Seed passed to the classifier.

    Returns:
        Tuple ``(pred_accuracy, features_degree)``: the fraction of test rows
        predicted correctly, and a list of ``(importance, feature)`` pairs
        sorted in descending order with importances rounded to 4 decimals.
    """
    features = list(features)
    df_train = pd.read_csv(train_filepath)
    df_test = pd.read_csv(test_filepath)

    alg = RandomForestClassifier(criterion='gini', bootstrap=True,
                                 min_samples_leaf=min_samples_leaf,
                                 n_estimators=n_estimators,
                                 random_state=random_state)
    alg.fit(df_train[features], df_train['LABEL'])
    predict = alg.predict(df_test[features])

    # np.round is named explicitly: the wildcard numpy import above may or may
    # not shadow the builtin ``round`` depending on the numpy version.
    importances = (float(np.round(weight, 4))
                   for weight in alg.feature_importances_)
    features_degree = sorted(zip(importances, features), reverse=True)

    pred_accuracy = float((df_test['LABEL'] == predict).mean())
    return pred_accuracy, features_degree


def main():
    """Run the whole pipeline for one stock and print the results."""
    code = '000034'
    alpha = 0.7
    pred_days = 15
    features = ["open", "close", "high", "low", "volume", "SMA5", "WR14",
                "MACD9", "RSI15", "MFI14", "UO7", "ROCP"]

    raw_filepath = get_stock_data(code=code, pred_days=pred_days)
    em_filepath = em_stock_data(pathfile=raw_filepath, alpha=alpha)
    final_filepath = calc_technical_indicators(filepath=em_filepath)
    train_filepath, valid_filepath, test_filepath = normalization(
        filepath=final_filepath, features=features, pred_days=pred_days)
    pred_accuracy, features_degree = random_forest_model(
        train_filepath, valid_filepath, test_filepath, features)

    print('pred_accuracy: ', pred_accuracy)
    print('features_degree: ', features_degree)


if __name__ == '__main__':
    main()

评论收藏

内容反馈

版权申诉