#!/usr/bin/env python
# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import r2_score
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import warnings
import datetime
# Global notebook setup: silence warnings and pick the seaborn theme.
warnings.filterwarnings('ignore')
sns.set_style('darkgrid')
# get_ipython() only exists inside an IPython/Jupyter session. Calling it
# unconditionally makes this exported script fail with NameError under plain
# `python`, so the inline backend is enabled only when a shell is available.
try:
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    pass
# plt.rcParams['font.family'] = ['sans-serif']
# Use the SimHei font for sans-serif text (presumably so that Chinese
# characters in figures render correctly -- confirm the font is installed).
plt.rcParams['font.sans-serif'] = ['SimHei']
# # Data analysis
# In[2]:
# Load the training and test tables from CSV files under ../data.
train = pd.read_csv('../data/train_data.csv')
test = pd.read_csv('../data/test_data.csv')
# ## Missing-value analysis
# In[4]:
# Bare expression: its value is only displayed when run as a notebook cell.
train['tradeNewMeanPrice']
# ### One-liner
#
# In[3]:
# Number of missing values per column, sorted ascending.
train.isnull().sum().sort_values()
# In[4]:
test.isnull().sum().sort_values()
# ### Wrapped in a function
# In[5]:
def missing_values(df):
    """Summarise the columns of ``df`` that contain missing values.

    Args:
        df: The ``pandas.DataFrame`` to inspect.

    Returns:
        A DataFrame indexed by column name (index name ``'index'``) holding
        only the columns of ``df`` with at least one missing value. Its
        columns are:

        * ``missingNum``   - number of missing entries,
        * ``existNum``     - number of non-missing entries,
        * ``sum``          - total number of rows in ``df``,
        * ``missingRatio`` - percentage of missing entries (0-100),
        * ``dtype``        - dtype of the column in ``df``.

        Rows are ordered by ``missingNum`` descending, ties broken by column
        name ascending.
    """
    n_rows = len(df)
    # Build the frame with to_frame() instead of DataFrame(..., columns={...}):
    # a set is unordered and is rejected as `columns` by recent pandas.
    alldata_na = df.isnull().sum().to_frame('missingNum')
    alldata_na['existNum'] = n_rows - alldata_na['missingNum']
    alldata_na['sum'] = n_rows
    alldata_na['missingRatio'] = alldata_na['missingNum'] / n_rows * 100
    alldata_na['dtype'] = df.dtypes
    # Name the axis explicitly so reset_index() always produces an 'index'
    # column, even when df.columns carries its own name.
    alldata_na = (
        alldata_na[alldata_na['missingNum'] > 0]
        .rename_axis('index')
        .reset_index()
        # ascending: True sorts ascending, False sorts descending.
        .sort_values(by=['missingNum', 'index'], ascending=[False, True])
        .set_index('index')
    )
    return alldata_na
# In[6]:
# Missing-value summary of the training set.
missing_values(train)
# In[7]:
# Missing-value summary of the test set.
missing_values(test)
# ## Feature value analysis
# ### Monotonic features
# In[8]:
# Check for monotonic feature columns (a monotonic column is very likely time).
def incresing(vals):
    """Count how many adjacent pairs in ``vals`` are strictly increasing.

    Args:
        vals: A sliceable sequence of mutually comparable values (for
            example a list or a 1-D numpy array).

    Returns:
        The number of positions ``i`` for which ``vals[i + 1] > vals[i]``,
        as an ``int``. Empty and single-element inputs give ``0``.
    """
    # Walk consecutive (previous, current) pairs instead of indexing by
    # position; comparisons involving NaN are False and so are not counted.
    return sum(1 for prev, cur in zip(vals, vals[1:]) if cur > prev)
# Report every training column in which at least 55% of the rows are
# strictly greater than the row before them.
fea_cols = list(train.columns)
for col in fea_cols:
    cnt = incresing(train[col].values)
    increasing_ratio = cnt / train.shape[0]
    if increasing_ratio < 0.55:
        continue
    print('单调特征:', col)
    print('单调特征值个数:', cnt)
    print('单调特征值比例:', increasing_ratio)
# ### Distribution of per-feature nunique
# In[9]:
# Number of distinct values per column, sorted ascending.
train.nunique().sort_values()
# In[10]:
test.nunique().sort_values()
# In[11]:
# Columns selected for the distinct-value bar charts below.
cat = ['rentType', 'houseType', 'houseFloor', 'houseToward', 'houseDecoration', 'city', 'region', 'plate', 'buildYear', 'tradeTime']
# In[12]:
# Bar chart of distinct-value counts for the selected training columns.
train[cat].nunique().plot(kind='bar',rot=45)
# In[13]:
# Same chart for the test set.
test[cat].nunique().plot(kind='bar',rot=45)
# In[14]:
# Collect the features whose most frequent value occurs more than 1000 times,
# printing each such column name and its three most frequent values.
fea_cols = train.columns
interesting_cols = []
for col in fea_cols:
    # value_counts() is sorted by frequency (descending), so the first entry
    # is the most frequent value. Compute it once and reuse it.
    col_value_counts = train[col].value_counts()
    # A column with no non-null values has an empty frequency table, on
    # which .iloc[0] would raise IndexError; skip it.
    if col_value_counts.empty or col_value_counts.iloc[0] <= 1000:
        continue
    print(col)
    print(col_value_counts.iloc[:3])
    interesting_cols.append(col)
# In[15]:
interesting_cols
# ## Label distribution
# In[146]:
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider histplot/displot when upgrading -- confirm against the installed version.
# Distribution of the raw label.
sns.distplot(train['tradeMoney'],hist=True)
# In[147]:
train['tradeMoney'].describe()
# In[148]:
# Distribution restricted to tradeMoney below 55000.
sns.distplot(train[train['tradeMoney']<55000]['tradeMoney'],hist=True)
# In[149]:
# log1p-transformed label, restricted to 300 < tradeMoney < 55000.
sns.distplot(np.log1p(train[(train['tradeMoney']<55000)&(train['tradeMoney']>300)]['tradeMoney']),hist=True)
plt.title('log on tradeMoney')
plt.show()
# In[159]:
# Plot the tradeMoney distribution overall and per price bucket, then print
# the number of rows in each bucket. The buckets are defined once and shared
# by the plots and the printed counts so the two can never disagree; the
# second bucket is (15000, 20000], contiguous with its neighbours.
trade_money = train['tradeMoney']
money_buckets = [
    ("money<=15000 ", trade_money <= 15000),
    ("15000<money<=20000 ", (trade_money > 15000) & (trade_money <= 20000)),
    ("20000<money<=50000 ", (trade_money > 20000) & (trade_money <= 50000)),
    ("50000<money<=100000 ", (trade_money > 50000) & (trade_money <= 100000)),
    ("100000<money ", trade_money > 100000),
]
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
# First panel: the full distribution; the remaining five panels follow the
# bucket order, filling the 2x3 grid row by row.
sns.distplot(trade_money, ax=axes[0][0])
for bucket_ax, (_, bucket_mask) in zip(axes.ravel()[1:], money_buckets):
    sns.distplot(trade_money[bucket_mask], ax=bucket_ax)
for bucket_label, bucket_mask in money_buckets:
    # The number of True entries in the mask is the bucket's row count.
    print(bucket_label, int(bucket_mask.sum()))
# ## Data cleaning
# ### Before cleaning
# In[176]:
# Work on a copy so the raw training frame stays untouched.
data = train.copy()
# Scatter + regression fit of tradeMoney against area, one panel per region,
# coloured by rentType. x and y are passed by keyword because recent seaborn
# releases no longer accept them positionally.
# NOTE(review): sharex/sharey are kept for compatibility with older seaborn;
# newer versions prefer facet_kws={'sharex': False, 'sharey': False} -- confirm.
g = sns.lmplot(
    x='area',
    y='tradeMoney',
    hue='rentType',
    col='region',
    col_wrap=3,
    data=data,
    sharex=False,
    sharey=False,
    palette='husl',
    scatter_kws={'alpha': 0.3},
)
plt.tight_layout()
plt.show()
# ### 清洗后
# In[177]:
# Clean Data
def cleanData(data):
data.drop(data[(data['tradeMoney']>16000)].index,inplace=True)
data.drop(data[(data['area']>160)].index,inplace=True)
data.drop(data[(data['tradeMoney']<100)].index,inplace=True)
data.drop(data[(data['totalFloor']==0)].index,inplace=True)
# 深度清理
data.drop(data[(data['region']=='RG00001')&(data['tradeMoney']<1000)&(data['area']>50)].index,inplace=True)
data.drop(data[(data['region']=='RG00001') & (data['tradeMoney']>25000)].index,inplace=True)
data.drop(data[(data['region']=='RG00001') & (data['area']>250)&(data['tradeMoney']<20000)].index,inplace=True)
data.drop(data[(data['region']=='RG00001') & (data['area']>400)&(data['tradeMoney']>50000)].index,inplace=True)
data.drop(data[(data['region']=='RG00001') & (data['area']>100)&(data['tradeMoney']<2000)].index,inplace=True)
data.drop(data[(data['region']=='RG00002') & (data['area']<100)&(data['tradeMoney']>60000)].index,inplace=True)
data.drop(data[(data['region']=='RG00003') & (data['area']<300)&(data['tradeMoney']>30000)].index,inplace=True)
data.drop(data[(data['region']=='RG00003')&(data['tradeMoney']<500)&(data['area']<50)].index,inplace=True)
data.drop(data[(data['region']=='RG00003')&(data['tradeMoney']<1500)&(data['area']>100)].index,inplace=True)
data.drop(data[(data['region']=='RG00003')&(data['tradeMoney']<2000)&(data['area']>300)].index,inplace=True)
data.drop(data[(data['region']=='RG00003')&(data['tradeMoney']>5000)&(data['area']<20)].index,inplace=True)
data.drop(data[(data['region']=='RG00003') & (data['area']>600)&(data['tradeMoney']>40000)].index,inplace=True)
data.drop(data[(data['region']=='RG00004')&(data['tradeMoney']<1000)&(data['area']>80)].index,inplace=True)
data.drop(data[(data['region']=='RG00006') & (data['tradeMoney']<200)].index,inplace=True)
data.drop(data[(data['region']=='RG00005')&(data['tradeMoney']<2000)&(data['area']>180)].index,inplace=True)
data.drop(data[(data['region']=='RG00005')&(data['tradeMoney']>50000)&(data['area']<200)].index,inplace=True)
data.drop(data[(data['region']=='RG00006') & (data['area']>200)&(data['tradeMoney']<2000)].index,inplace=True)
data.drop(data[(data['region']=='RG00007') & (data['area']>100)&(data['tradeMoney']<2500)].index,inplace=True)
data.drop(data[(data['region']=='RG00010') & (data['area']>200)&(data['tradeMoney']>25000)].index,inplace=True)
data.drop(data[(data['region']=='RG00010') & (data['area']>400)&(data['tradeMoney']<15000)].index,inplace=True)
data.drop(data[(data['region']=='RG00010')&(data['tradeMoney']<3000)&(data['area']>200)].index,inplace=True)
data.drop(data[(data['region']=='RG00010')&(data['tradeMoney']>7000)&(data['area']<75)].index,inplace=True)
data.drop(data[(data['region']=='RG00010')&(data['tradeMoney']>12500)&(data['area']<100)].index,inplace=True)
data.drop(data[(data['region']=='RG00004') & (data['area']>400)&(data['tradeMoney']>20000)].index,inplace=True)
data.drop(data[(data['region']=='RG00008')&(data['tradeMoney']<2000)&(data['area']>80)].index,inplace=True)
data.drop(data[(data['region']=='RG00009') & (data['tradeMoney']>40000)].index,inplace=True)
data.drop(data[(data['region']=='RG00009') & (data['area']>300)].index,inplace=True)
data.drop(data[(data['region']=='RG00009')&(data['area']>100)&(data['tradeMoney']<2000)].in
没有合适的资源?快使用搜索试试~ 我知道了~
2019未来杯高校AI挑战赛城市-房产租金预测.zip
共13个文件
ipynb:8个
csv:3个
html:1个
1.该资源内容由用户上传,如若侵权请联系客服进行举报
2.虚拟产品一经售出概不退款(资源遇到问题,请及时私信上传者)
版权申诉
0 下载量 56 浏览量
2023-11-09
01:16:27
上传
评论
收藏 3.1MB ZIP 举报
温馨提示
挑战杯大赛相关代码、设计文档、使用说明,供参考。
资源推荐
资源详情
资源评论
收起资源包目录
2019未来杯高校AI挑战赛城市-房产租金预测.zip (13个子文件)
rental-forecast-master
Data analysis and preprocessing.ipynb 1.7MB
test_a.csv 598KB
train_data.csv 9.93MB
数据集字段说明.html 101KB
baseline.ipynb 11KB
2西兰花炒猪脚.py 80KB
sub_a_913.csv 539KB
feature engineering and selection.ipynb 14KB
.ipynb_checkpoints
baseline-checkpoint.ipynb 11KB
feature engineering-checkpoint.ipynb 131KB
Data analysis and preprocessing-checkpoint.ipynb 72B
feature engineering and selection-checkpoint.ipynb 72B
feature engineering.ipynb 132KB
共 13 条
- 1
资源评论
辣椒种子
- 粉丝: 3531
- 资源: 5721
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功