# coding:utf-8
# kaggle Jane Street Market Prediction代码
# 数据探索代码
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import os
import sys
import gc
from run import *
from sklearn.preprocessing import StandardScaler as scale
from sklearn.decomposition import PCA
from sklearn.cluster import k_means
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
# 数据探索
def data_explore_old(df):
    """Print summary statistics for ``df`` and save exploratory plots.

    Four PNG files are written under ``./output``:

    * ``targets_line.png``  - line plots of the first seven columns.
    * ``features_line.png`` - line plots of the 130 columns at positions 7..136.
    * ``targets_hist.png``  - histogram/KDE of the columns sliced as ``1:8``.
    * ``features_hist.png`` - histogram/KDE of the columns sliced as ``8:-2``.

    Args:
        df: DataFrame to explore. It is copied first so the caller's frame is
            never touched. Columns are addressed by position up to index 136,
            so the frame must have at least 137 columns.

    Returns:
        None.
    """
    n_targets = 7        # columns 0..6 are drawn as "targets"
    first_feature = 7    # positional index of the first "feature" column
    n_features = 130

    # Work on a copy so the caller's data cannot be changed.
    data = df.copy()

    # Textual overview: column names, first rows, sums, means, describe(),
    # and the number of missing values per column.
    print(data.columns)
    print(data.head())
    print(data.sum())
    print(data.mean())
    print(data.describe())
    print(data.isnull().sum())

    # savefig() raises if the target directory is missing, so create it.
    os.makedirs("./output", exist_ok=True)

    # --- Line plots of the targets -------------------------------------
    # plt.subplots() creates its own figure; creating one beforehand with
    # plt.figure() would only leave an extra, never-closed figure behind.
    fig, axes = plt.subplots(4, 2, sharex=True)
    # Row-major flattening matches the original ``pos = 2*i + j`` layout.
    # Only seven series are drawn; the eighth panel of the 4x2 grid stays empty.
    for pos, ax in zip(range(n_targets), axes.ravel()):
        ax.set_title(data.columns[pos])
        ax.plot(data.iloc[:, pos])
    plt.subplots_adjust(wspace=0.2, hspace=1)
    plt.savefig("./output/targets_line.png")
    plt.close(fig)

    # --- Line plots of the features ------------------------------------
    fig = plt.figure(figsize=(10, 80))
    for i in range(n_features):
        col = first_feature + i
        ax = fig.add_subplot(65, 2, i + 1)
        ax.set_title(data.columns[col])
        ax.plot(data.iloc[:, col])
    plt.subplots_adjust(wspace=0.2, hspace=1)
    plt.savefig("./output/features_line.png")
    plt.close(fig)

    # --- Histograms ----------------------------------------------------
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # consider sns.histplot once the intended per-column layout is confirmed.
    # An alternative per-column layout is
    # ``DataFrame.plot.hist(subplots=True, sharex=True, layout=..., bins=50)``.
    # NOTE(review): the slices 1:8 and 8:-2 do not line up with the column
    # positions used for the line plots above (0..6 and 7..136) - confirm
    # which split is intended.
    fig = plt.figure()
    sns.distplot(data.iloc[:, 1:8], hist=True, bins=100, kde=True)
    plt.savefig("./output/targets_hist.png")
    plt.close(fig)  # previously left open

    fig = plt.figure()
    sns.distplot(data.iloc[:, 8:-2], hist=True, bins=100, kde=True)
    plt.savefig("./output/features_hist.png")
    plt.close(fig)  # previously left open
# 数据探索
@change_dir
def data_explore():
    """Load a random ~1% sample of the training data and run the EDA passes.

    Reads ``./train.csv`` (sampled), ``./example_test.csv`` and
    ``./features.csv``, then hands the three frames to ``EDA1``, ``EDA2`` and
    ``EDA3`` in that order. Seaborn's style and pandas' display options are
    changed globally as a side effect.

    NOTE(review): ``change_dir`` is not defined in this file (it presumably
    comes from ``from run import *``); its effect on the working directory
    should be confirmed there.
    """
    def _skip_row(row_idx):
        # Row 0 is never skipped; every later row is skipped whenever the
        # uniform draw exceeds 0.01, i.e. roughly 1% of rows are kept.
        # Sampling idea from https://mp.weixin.qq.com/s/2LSKnN9R-N-I2HcHePT9zA
        if not row_idx > 0:
            return False
        return np.random.rand() > 0.01

    sns.set_style('darkgrid')
    # Show every column and row when frames are printed.
    for option in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option, None)

    sampled_train = pd.read_csv("./train.csv", skiprows=_skip_row)
    example_test = pd.read_csv("./example_test.csv")
    feature_meta = pd.read_csv("./features.csv")

    # Each EDA pass receives the same three frames.
    EDA1(sampled_train, example_test, feature_meta)
    EDA2(sampled_train, example_test, feature_meta)
    EDA3(sampled_train, example_test, feature_meta)
# 第一篇文章的EDA
# 参考https://www.kaggle.com/muhammadmelsherbini/jane-street-extensive-eda
def EDA1(train, test, feature):
df = train.copy()
# 看数据长度
org_len = len(df)
print(org_len)
# 查看数据概况
print(df.info())
# 按日期排序数据
df.sort_values(by = ["date", "ts_id"], inplace = True)
# 增加目标数据
df["action"] = np.where(df["resp"] > 0, 1, 0)
df.action = df.action.astype("category")
# 下面开始分析数据
# 先分析resp
fig = plt.figure(figsize = (16, 6))
ax = plt.subplot(1, 1, 1)
df.groupby("date")[["resp_1", "resp_2", "resp_3", "resp_4", "resp"]].sum().cumsum().plot(ax = ax)
plt.savefig("./output/01.png")
# 前92天收益较高,resp_4的累积收益较高
# resp_1的累积收益较低
# 再画resp的平均值
fig = px.line(df.groupby("date")[["resp_1", "resp_2", "resp_3", "resp_4", "resp"]].mean(), x = df.groupby("date")[["resp_1", "resp_2", "resp_3", "resp_4", "resp"]].mean().index, y = ["resp_1", "resp_2", "resp_3", "resp_4", "resp"], title = "average resp per day")
fig.write_image("./output/02.png")
# 画组图
# 画resp数据的直方组图
def resp_hists(ax1, ax2, ax3, data, name):
    """Draw three histogram views of ``data`` on the supplied axes.

    ``ax1`` gets the histogram plus a text box with summary statistics,
    ``ax2`` the same histogram with the x axis limited to [-0.08, 0.08], and
    ``ax3`` the histogram with a log-scaled y axis. ``ax1`` and ``ax2`` also
    get dotted vertical lines at mean + std and mean - std. ``name`` is the
    prefix used in the three titles.
    """
    # Compute the location/spread once and reuse them everywhere below.
    center = data.mean()
    spread = data.std()
    upper = center + spread
    lower = center - spread

    bar_style = dict(bins=150, color='darkblue', alpha=0.6)
    ax1.hist(data, **bar_style)
    data.plot.hist(ax=ax2, **bar_style)
    ax3.hist(data, **bar_style)

    # Mark one standard deviation either side of the mean on the first two panels.
    for panel in (ax1, ax2):
        for edge in (upper, lower):
            panel.axvline(edge, color='darkorange', linestyle=':', linewidth=2)
    ax2.set_xlim(-.08, .08)
    ax3.set_yscale('log')

    # Summary statistics shown in the box on the first panel.
    skewness = round(data.skew(), 4)
    kurt = round(data.kurtosis())
    inside = (lower < data) & (data < upper)
    pct_inside = round(inside.mean() * 100, 2)
    summary = 'μ = {}\nstd = {}\nskewness = {}\nkurtosis = {}\n% values in 1 std = {}%'.format(
        round(center, 4), round(spread, 4), skewness, kurt, pct_inside)
    box = dict(boxstyle='round', facecolor='white', alpha=0.5)
    ax1.text(.02, .96, summary, transform=ax1.transAxes,
             verticalalignment='top', bbox=box, fontsize=10)

    suffixes = (' Hist Normal scale',
                ' Hist normal scale zoomed',
                ' Hist with freq on a log scale')
    for panel, suffix in zip((ax1, ax2, ax3), suffixes):
        panel.set_title(name + suffix, fontsize=14)
        # Clear the axis labels (set to empty strings on all three panels).
        panel.set_xlabel('')
        panel.set_ylabel('')
fig,((ax11,ax12,ax13),(ax21,ax22,ax23),(ax31,ax32,ax33),(ax41,ax42,ax43),(ax51,ax52,ax53)) = plt.subplots(5,3,figsize=(18,24))
plt.subplots_adjust(hspace = 0.35)
resp_hists(ax11, ax12, ax13, df.resp, "Resp")
resp_hists(ax21, ax22, ax23, df.resp_1, "Resp_1")
resp_hists(ax31, ax32, ax33, df.resp_2, "Resp_2")
resp_hists(ax41, ax42, ax43, df.resp_3, "Resp_3")
resp_hists(ax51, ax52, ax53, df.resp_4, "Resp_4")
plt.savefig("./output/03.png")
# resp变量之间配对作图
sns.pairplot(df[["resp_1", "resp_2", "resp_3", "resp_4", "resp"]], corner = False)
plt.savefig("./output/04.png")
# resp与resp_4,以及resp_1与resp_2之间高度相关。
# 投资时区越长,风险及收益越大,反之越小
# 下面分析date
# 看独特的date值
print(df.date.unique())
# 完整数据500天,大约两年的数据
# 现在查看每天的收益总数,以及操作总数
fig = px.area(data_frame = df.groupby("date")[["resp"]].count(), title='Number of operation per day')
fig.update_traces(showlegend = False)
fig.layout.xaxis.title = 'Day'
fig.layout.yaxis.title = 'Number of operations'
fig.write_image("./output/05.png")
# 每天收益总数
fig = px.area(data_frame = df.groupby("date")[["resp"]].sum(), title='Resp sum of operation per day')
fig.update_traces(showlegend = False)
fig.layout.xaxis.title = 'Day'
fig.layout.yaxis.title = 'Resp sum of operations'
fig.write_image("./output/06.png")
# 可以看到收益有很多波动
# 下面建立平均收益的20天移动标准差
date_df = df.groupby("date")[["resp"]].mean()
std20 = []
for i in range(
kaggle竞赛Jane Street Market Prediction实操代码.zip
版权申诉
152 浏览量
2023-12-01
16:21:23
上传
评论
收藏 24.98MB ZIP 举报
学术菜鸟小晨
- 粉丝: 1w+
- 资源: 4940
最新资源
- 937712277954201实习5.word
- 2程序语言基础知识pdf1_1716337722703.jpeg
- 简单的Python示例,演示了如何使用TCP/IP协议进行基本的客户端和服务器通信
- 考试.sql
- keil2 + proteus + 8051.exe
- 1961ee27df03bd4595d28e24b00dde4e_744c805f7e4fb4d40fa3f695bfbab035_8(1).c
- mediapipe-0.9.0.1-cp37-cp37m-win-amd64.whl.zip
- windows注册表编辑工具
- mediapipe-0.9.0.1-cp37-cp37m-win-amd64.whl.zip
- 校园通行码预约管理系统20240522075502
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈