kaggle竞赛 Jane Street Market Prediction 实操代码.zip 资源 - CSDN文库

共66个文件

py：41个

jpg：11个

pyc：6个

版权申诉

计算机竞赛

152 浏览量 2023-12-01 16:21:23 上传评论收藏 24.98MB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

kaggle竞赛Jane Street Market Prediction实操代码.zip （66个子文件）

Graduation Design

pic

02.jpg 28KB

06.jpg 31KB

10.jpg 99KB

01.jpg 14KB

08.jpg 38KB

07.jpg 15KB

05.jpg 21KB

04.jpg 32KB

09.jpg 27KB

03.jpg 27KB

00.jpg 25KB

preprocess.py 2KB

test_pytorch.py 19KB

py_nn.py 8KB

jsmp_local.py 3KB

README 134B

nn.py 3KB

FE.py 1KB

LSTM.py 4KB

py_nn_use.py 10KB

jsmp.py 3KB

py_nn_back.py 12KB

janestreet

__init__.py 59B

competition.cpython-37m-x86_64-linux-gnu.so 441KB

__pycache__

__init__.cpython-37.pyc 186B

__init__.cpython-38.pyc 220B

myxgboost.py 6KB

test_dt.py 612B

copy_jsmp.py 8KB

hidegpu

FE.py 1KB

optuna_test.py 3KB

nohup.out 78KB

tools.py 4KB

optuna_test.py 2KB

LSTM_work.py 6KB

hello.py 202B

X.npy 32.22MB

pytorch_work.py 4KB

run.py 3KB

optuna_DP.py 6KB

works.py 3KB

nn.py 5KB

backward.py 12KB

sample_weight.pkl 178KB

learning.py 13KB

perceptron.py 1KB

run.py 3KB

__pycache__

run.cpython-37.pyc 2KB

improve.py 33KB

test_work.py 5KB

EDA.py 66KB

.gitignore 26B

tools.py 4KB

__pycache__

run.cpython-37.pyc 2KB

run.cpython-38.pyc 2KB

tools.cpython-38.pyc 4KB

MyFrame.py 11KB

DP.py 25KB

Y.npy 161KB

FE.py 1KB

run.py 3KB

optuna_DP.py 5KB

tools.py 4KB

NNDL

mnist.pkl.gz 16.26MB

minst.py 4KB

run.py 3KB

# coding:utf-8
# Kaggle Jane Street Market Prediction code
# Exploratory data analysis code
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import os
import sys
import gc
from run import *
from sklearn.preprocessing import StandardScaler as scale
from sklearn.decomposition import PCA
from sklearn.cluster import k_means
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans


# Data exploration
def data_explore_old(df):
    """Print summary statistics for ``df`` and save line/histogram plots.

    The function works on a copy of ``df`` so the caller's object is not
    modified. It prints the column names, the head, column sums, means,
    ``describe()`` output and per-column null counts, then writes four
    images under ``./output``: ``targets_line.png``, ``features_line.png``,
    ``targets_hist.png`` and ``features_hist.png``.

    Args:
        df: DataFrame-like object (the code uses ``.copy()``, ``.columns``,
            ``.iloc``, ``.describe()`` and ``.isnull()``). The line plots
            draw columns 0-6 as "targets" and columns 7-136 as "features",
            so at least 137 columns are required.

    Returns:
        None.
    """
    # Copy the data so the original is left untouched
    data = df.copy()

    # Column names
    print(data.columns)
    # First rows of the data
    print(data.head())
    # there are null values
    # Column sums
    print(data.sum())
    # Column means
    print(data.mean())
    # Descriptive statistics
    print(data.describe())
    # Null counts per column
    print(data.isnull().sum())
    # the worst column has 1734 nulls, close to 20%

    # savefig() raises if the target directory is missing, so create it.
    os.makedirs("./output", exist_ok=True)

    # Start with line plots
    # Plot the targets
    # plt.subplots() creates its own figure; a separate plt.figure() call
    # beforehand would only leave an empty figure that is never closed.
    fig, axes = plt.subplots(4, 2, sharex=True)
    # Only 7 of the 8 axes are used (columns 0-6); axes.flat walks the grid
    # row by row, which matches the original index ``2 * i + j``.
    for pos, ax in zip(range(7), axes.flat):
        ax.set_title(data.columns[pos])
        ax.plot(data.iloc[:, pos])
    fig.subplots_adjust(wspace=0.2, hspace=1)
    fig.savefig("./output/targets_line.png")
    plt.close(fig)

    # Plot the features
    fig = plt.figure(figsize=(10, 80))
    for i in range(130):
        ax = fig.add_subplot(65, 2, i + 1)
        ax.set_title(data.columns[i + 7])
        ax.plot(data.iloc[:, i + 7])
    fig.subplots_adjust(wspace=0.2, hspace=1)
    fig.savefig("./output/features_line.png")
    plt.close(fig)

    # Histograms
    # NOTE(review): the histograms slice columns 1:8 and 8:-2, while the
    # line plots above use 0:7 and 7:137 -- confirm which ranges are intended.
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases and
    # is being handed a multi-column selection here -- verify it still works
    # with the installed seaborn version.
    # Plot the targets
    fig = plt.figure()
    sns.distplot(data.iloc[:, 1:8], hist=True, bins=100, kde=True)
    # data.iloc[:, 1:8].plot.hist(subplots = True, sharex = True, layout = (4, 2), bins = 50)
    fig.savefig("./output/targets_hist.png")
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

    # Plot the features
    fig = plt.figure()
    sns.distplot(data.iloc[:, 8:-2], hist=True, bins=100, kde=True)
    # data.iloc[:, 8:-2].plot.hist(subplots = True, sharex = True, layout = (65, 2), figsize = (10, 80), bins = 50)
    fig.savefig("./output/features_hist.png")
    plt.close(fig)

    # Commented-out code kept verbatim; the fragment below is cut off here
    # because the source text continues on the following line.
    # # Density plots
    # # Plot the targets
    # fig = plt.figure()
    # data.iloc[:,
1:8].plot(subplots = True, kind = "hist", sharex = True, layout = (4, 2), bins = 50) # plt.savefig("./output/targets_hist.png") # # 画特征 # fig = plt.figure() # data.iloc[:, 8:-2].plot(subplots = True, kind = "hist", sharex = True, layout = (65, 2), figsize = (10, 80), bins = 50) # plt.savefig("./output/features_hist.png") # 数据探索 @change_dir def data_explore(): sns.set_style('darkgrid') pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', None) # 抽样，读取1%数据 # 参考https://mp.weixin.qq.com/s/2LSKnN9R-N-I2HcHePT9zA train_df = pd.read_csv("./train.csv", skiprows = lambda x: x>0 and np.random.rand() > 0.01) test_df = pd.read_csv("./example_test.csv") feature_df = pd.read_csv("./features.csv") # 复制数据 # train = train_df.copy() # test = test_df.copy() EDA1(train_df, test_df, feature_df) EDA2(train_df, test_df, feature_df) EDA3(train_df, test_df, feature_df) # 第一篇文章的EDA # 参考https://www.kaggle.com/muhammadmelsherbini/jane-street-extensive-eda def EDA1(train, test, feature): df = train.copy() # 看数据长度 org_len = len(df) print(org_len) # 查看数据概况 print(df.info()) # 按日期排序数据 df.sort_values(by = ["date", "ts_id"], inplace = True) # 增加目标数据 df["action"] = np.where(df["resp"] > 0, 1, 0) df.action = df.action.astype("category") # 下面开始分析数据 # 先分析resp fig = plt.figure(figsize = (16, 6)) ax = plt.subplot(1, 1, 1) df.groupby("date")[["resp_1", "resp_2", "resp_3", "resp_4", "resp"]].sum().cumsum().plot(ax = ax) plt.savefig("./output/01.png") # 前92天收益较高，resp_4的累积收益较高 # resp_1的累积收益较低 # 再画resp的平均值 fig = px.line(df.groupby("date")[["resp_1", "resp_2", "resp_3", "resp_4", "resp"]].mean(), x = df.groupby("date")[["resp_1", "resp_2", "resp_3", "resp_4", "resp"]].mean().index, y = ["resp_1", "resp_2", "resp_3", "resp_4", "resp"], title = "average resp per day") fig.write_image("./output/02.png") # 画组图 # 画resp数据的直方组图 def resp_hists(ax1, ax2, ax3, data, name): ax1.hist(data, bins = 150, color = "darkblue", alpha = 0.6) ax1.axvline(data.mean() + data.std(),color = 'darkorange', 
linestyle = ':',linewidth = 2) ax1.axvline(data.mean() - data.std(),color = 'darkorange', linestyle = ':',linewidth = 2) data.plot.hist(bins = 150,ax = ax2, color = 'darkblue', alpha = 0.6) ax2.axvline(data.mean() + data.std(),color = 'darkorange', linestyle = ':', linewidth = 2) ax2.axvline(data.mean() - data.std(), color = 'darkorange', linestyle = ':',linewidth = 2) ax2.set_xlim(-.08, .08) ax3.hist(data, bins=150, color='darkblue',alpha=.6) ax3.set_yscale('log') skew= round(data.skew(),4) kurt= round(data.kurtosis()) std1= round((((data.mean()-data.std()) < data ) & (data < (data.mean()+data.std()))).mean()*100,2) props = dict(boxstyle='round', facecolor='white', alpha=0.5) ax1.text(.02,.96,'μ = {}\nstd = {}\nskewness = {}\nkurtosis = {}\n% values in 1 std = {}%'.format(round(data.mean(),4),round(data.std(),4),skew,kurt,std1), transform=ax1.transAxes, verticalalignment='top',bbox=props,fontsize=10) ax1.set_title(name + ' Hist Normal scale', fontsize=14) ax2.set_title(name + ' Hist normal scale zoomed',fontsize=14) ax3.set_title(name + ' Hist with freq on a log scale',fontsize=14); ax1.set_xlabel('') ax1.set_ylabel('') ax2.set_xlabel('') ax2.set_ylabel('') ax3.set_xlabel('') ax3.set_ylabel('') fig,((ax11,ax12,ax13),(ax21,ax22,ax23),(ax31,ax32,ax33),(ax41,ax42,ax43),(ax51,ax52,ax53)) = plt.subplots(5,3,figsize=(18,24)) plt.subplots_adjust(hspace = 0.35) resp_hists(ax11, ax12, ax13, df.resp, "Resp") resp_hists(ax21, ax22, ax23, df.resp_1, "Resp_1") resp_hists(ax31, ax32, ax33, df.resp_2, "Resp_2") resp_hists(ax41, ax42, ax43, df.resp_3, "Resp_3") resp_hists(ax51, ax52, ax53, df.resp_4, "Resp_4") plt.savefig("./output/03.png") # resp变量之间配对作图 sns.pairplot(df[["resp_1", "resp_2", "resp_3", "resp_4", "resp"]], corner = False) plt.savefig("./output/04.png") # resp与resp_4，以及resp_1与resp_2之间高度相关。 # 投资时区越长，风险及收益越大，反之越小 # 下面分析date # 看独特的date值 print(df.date.unique()) # 完整数据500天，大约两年的数据 # 现在查看每天的收益总数，以及操作总数 fig = px.area(data_frame = df.groupby("date")[["resp"]].count(), 
title='Number of operation per day') fig.update_traces(showlegend = False) fig.layout.xaxis.title = 'Day' fig.layout.yaxis.title = 'Number of operations' fig.write_image("./output/05.png") # 每天收益总数 fig = px.area(data_frame = df.groupby("date")[["resp"]].sum(), title='Resp sum of operation per day') fig.update_traces(showlegend = False) fig.layout.xaxis.title = 'Day' fig.layout.yaxis.title = 'Resp sum of operations' fig.write_image("./output/06.png") # 可以看到收益有很多波动 # 下面建立平均收益的20天移动标准差 date_df = df.groupby("date")[["resp"]].mean() std20 = [] for i in range(

评论收藏

内容反馈

版权申诉