kaggle房价预测_kaggle资源-CSDN文库

共4个文件

csv：3个

py：1个

kaggle比赛

4星 · 超过85%的资源需积分: 50 66 浏览量 2018-03-21 16:31:57 上传评论 1 收藏 182KB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

kaggle_coding.zip （4个子文件）

kaggle_coding

test.csv 441KB

train.csv 450KB

submission.csv 35KB

HousePrices.py 18KB

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn import metrics
from scipy.stats import norm, skew
from scipy import stats

# %matplotlib inline

# House-price exploration script: load the train/test CSVs, remove outliers,
# log-transform the target, then inspect and fill missing values.

# Load the data.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(train.shape)
print(test.shape)

# Show the first five rows.
print(train.head(5))

# Keep the Id column aside, then drop it from both frames.
train_ID = train['Id']
test_ID = test['Id']
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)
print("\nThe train data size after dropping Id feature is : {} ".format(train.shape))
print("The test data size after dropping Id feature is : {} ".format(test.shape))

# Outlier inspection: living area against sale price.
fig, ax = plt.subplots()
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()

# Drop the outliers: very large living area but low sale price.
train = train.drop(train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)].index)

# Look at the scatter plot again after the removal.
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
plt.ylabel('SalePrice', fontsize=13)
plt.xlabel('GrLivArea', fontsize=13)
plt.show()

# Distribution of SalePrice with a fitted normal curve.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider migrating to histplot/displot once the plots can be re-checked.
sns.distplot(train['SalePrice'], fit=norm)

# Fitted normal parameters, used to judge how far the price deviates
# from a normal distribution.
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Draw the legend and labels. The label is a raw string so that the LaTeX
# "\mu" / "\sigma" are not parsed as (invalid) Python escape sequences.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Q-Q plot.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

# Log-transform the target (log(1 + x)).
train["SalePrice"] = np.log1p(train["SalePrice"])
train_SalePrice = train["SalePrice"]

# Look at the distribution of the transformed target.
sns.distplot(train['SalePrice'], fit=norm)

# Fitted normal parameters of the transformed target.
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# Draw the distribution legend and labels.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

# Q-Q plot of the transformed target.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

# Number of training rows.
ntrain = train.shape[0]
# Number of test rows.
ntest = test.shape[0]
y_train = train.SalePrice.values

# Concatenate train and test so they are processed together.
all_data = pd.concat((train, test)).reset_index(drop=True)

# Drop SalePrice: the target needs no feature processing.
all_data.drop(['SalePrice'], axis=1, inplace=True)
print(all_data.head(5))
print(all_data.shape)

# Missing ratio of every column, sorted from high to low.
percent = (all_data.isnull().sum() / len(all_data)).sort_values(ascending=False)

# Columns that actually have missing data.
all_data_na = percent[percent > 0]
print(all_data_na)

# Visualize the missing data.
f, ax = plt.subplots(figsize=(15, 12))
# rotation must be a number (or 'vertical'/'horizontal'), not the string '90'.
plt.xticks(rotation=90)
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)
plt.show()

# Heat map of the correlations between the features and the sale price.
# Only numeric columns are correlated: train still contains string columns,
# and DataFrame.corr() raises on those in pandas >= 2.0.
corrmat = train.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True, ax=ax)
label_y = ax.get_yticklabels()
plt.setp(label_y, rotation=360)
label_x = ax.get_xticklabels()
plt.setp(label_x, rotation=90)
plt.show()

# Columns with missing data:
# missing_index=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
#                'LotFrontage', 'GarageQual', 'GarageYrBlt', 'GarageFinish',
#                'GarageCond', 'GarageType', 'BsmtCond', 'BsmtExposure', 'BsmtQual',
#                'BsmtFinType2', 'BsmtFinType1', 'MasVnrType', 'MasVnrArea', 'MSZoning',
#                'BsmtHalfBath', 'Utilities', 'Functional', 'BsmtFullBath', 'Electrical',
#                'Exterior2nd', 'KitchenQual', 'GarageCars', 'Exterior1st', 'GarageArea',
#                'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1', 'SaleType']

# PoolQC: Pool quality. NA means there is no pool; a large share of the
# values are NA.
all_data["PoolQC"] = all_data["PoolQC"].fillna("None")

# MiscFeature: Miscellaneous feature not covered in other categories.
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")

# Alley: Type of alley access.
all_data["Alley"] = all_data["Alley"].fillna("None")
print("train的alley")
print(train["Alley"])
print("all_data的allty", all_data["Alley"])

# Fence: Fence quality.
all_data["Fence"] = all_data["Fence"].fillna("None")

# FireplaceQu: Fireplace quality.
all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")

# LotFrontage: Linear feet of street connected to property.
# Neighborhood: Physical locations within Ames city limits.
# Fill missing LotFrontage with the median of the same neighborhood.
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

# GarageType: Garage location
# GarageFinish: Interior finish of the garage
# GarageQual: Garage quality
# GarageCond: Garage condition
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna("None")

# GarageYrBlt: Year garage was built
# GarageArea: Size of garage in square feet
# GarageCars: Size of garage in car capacity
# These columns are filled with zero.
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)

# BsmtFinSF1: Type 1 finished square feet
# BsmtFinSF2: Type 2 finished square feet
# BsmtUnfSF: Unfinished square feet of basement area
# TotalBsmtSF: Total square feet of basement area
# BsmtFullBath: Basement full bathrooms
# BsmtHalfBath: Basement half bathrooms
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)

# BsmtQual: Height of the basement
# BsmtCond: General condition of the basement
# BsmtExposure: Walkout or garden level basement walls
# BsmtFinType1: Quality of basement finished area
# BsmtFinType2: Quality of second finished area (if present)
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna("None")

# MasVnrArea: Masonry veneer area in square feet
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)

# MasVnrType: Masonry veneer type
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")

# MSZoning: The general zoning classification.
# Most values are 'RL'; fill this column with its most frequent value.
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])

# Utilities: Type of utilities available.
# Original author's note: every Utilities value in the test data is "AllPub";
# only the train data has one "NoSeWa", plus two null values, so the column
# can be dropped.
all_data = all_data.drop(['Utilities'], axis=1)

# Functional: Home functionality rating.
# Null values

评论收藏

内容反馈