import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn import linear_model
from sklearn import metrics
from scipy.stats import norm, skew
from scipy import stats
# %matplotlib inline
# Load the training and test sets from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print(train.shape, test.shape, sep="\n")
# Show the first five rows of the training data.
print(train.head(5))
# Take the Id column out of each frame and keep it aside in
# train_ID / test_ID; pop() returns the column and removes it in place.
train_ID = train.pop('Id')
test_ID = test.pop('Id')
print("\nThe train data size after dropping Id feature is : {} ".format(train.shape))
print("The test data size after dropping Id feature is : {} ".format(test.shape))
# Outlier inspection: scatter plot of living area against sale price.
fig, ax = plt.subplots()
ax.scatter(x=train['GrLivArea'], y=train['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=13)
ax.set_xlabel('GrLivArea', fontsize=13)
plt.show()
# Remove the outliers: rows whose living area exceeds 4000 while the sale
# price is below 300000.
is_outlier = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 300000)
train = train.drop(train.loc[is_outlier].index)
# Draw the same scatter plot again to check the result of the removal.
fig, ax = plt.subplots()
ax.scatter(train['GrLivArea'], train['SalePrice'])
ax.set_ylabel('SalePrice', fontsize=13)
ax.set_xlabel('GrLivArea', fontsize=13)
plt.show()
# Distribution of SalePrice with a fitted normal curve overlaid.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here because the plot relies on its fit=norm overlay.
sns.distplot(train['SalePrice'], fit=norm)
# Estimate the parameters of the normal distribution fitted to SalePrice.
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Label the plot. The legend text is a raw string so that the mathtext
# commands \mu and \sigma are not parsed as (invalid) Python escape
# sequences.
plt.legend(
    [r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best',
)
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Q-Q plot of SalePrice against the normal distribution, on a new figure.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
# Replace SalePrice by log(1 + SalePrice).
train["SalePrice"] = np.log1p(train["SalePrice"])
train_SalePrice = train["SalePrice"]
# Distribution of the transformed SalePrice with a fitted normal curve.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here because the plot relies on its fit=norm overlay.
sns.distplot(train['SalePrice'], fit=norm)
# Estimate the parameters of the normal distribution fitted to the
# transformed values.
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Label the plot. The legend text is a raw string so that the mathtext
# commands \mu and \sigma are not parsed as (invalid) Python escape
# sequences.
plt.legend(
    [r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
    loc='best',
)
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
# Q-Q plot of the transformed SalePrice, on a new figure.
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
# Number of rows in the training set and in the test set.
ntrain = len(train)
ntest = len(test)
# Target values of the training rows as an array.
y_train = train["SalePrice"].values
# Stack the training rows on top of the test rows, with a fresh 0..n-1
# index, so that both sets go through the same preprocessing.
all_data = pd.concat([train, test], ignore_index=True)
# SalePrice is the target and needs no feature preprocessing: remove it.
all_data = all_data.drop(columns=['SalePrice'])
print(all_data.head(5))
print(all_data.shape)
# Fraction of missing values in every column, sorted from high to low.
percent = (all_data.isnull().sum() / len(all_data)).sort_values(ascending=False)
# Keep only the columns that actually contain missing values.
all_data_na = percent[percent > 0]
print(all_data_na)
# Bar chart of the missing-value fraction per feature.
f, ax = plt.subplots(figsize=(15, 12))
# The rotation is given as a number: Matplotlib documents the argument as
# a float or 'vertical'/'horizontal', so the string '90' is not valid.
plt.xticks(rotation=90)
sns.barplot(x=all_data_na.index, y=all_data_na)
plt.xlabel('Features', fontsize=15)
plt.ylabel('Percent of missing values', fontsize=15)
plt.title('Percent missing data by feature', fontsize=15)
plt.show()
# Heat map of the pairwise correlations between the numeric training
# columns (SalePrice included), to see how features relate to the price.
# Only numeric columns are correlated: recent pandas versions no longer
# drop non-numeric columns silently in DataFrame.corr() and raise instead.
corrmat = train.select_dtypes(include="number").corr()
f, ax = plt.subplots(figsize=(12, 9))
# The tick labels are produced by the heatmap itself, so nothing is set on
# the still-empty axes beforehand.
sns.heatmap(corrmat, vmax=0.9, square=True, ax=ax)
# Keep the y tick labels horizontal (360 degrees is a full turn).
label_y = ax.get_yticklabels()
plt.setp(label_y, rotation=360)
# Turn the x tick labels vertical.
label_x = ax.get_xticklabels()
plt.setp(label_x, rotation=90)
plt.show()
# Columns with missing data, kept for reference:
#   'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
#   'LotFrontage', 'GarageQual', 'GarageYrBlt', 'GarageFinish',
#   'GarageCond', 'GarageType', 'BsmtCond', 'BsmtExposure', 'BsmtQual',
#   'BsmtFinType2', 'BsmtFinType1', 'MasVnrType', 'MasVnrArea', 'MSZoning',
#   'BsmtHalfBath', 'Utilities', 'Functional', 'BsmtFullBath', 'Electrical',
#   'Exterior2nd', 'KitchenQual', 'GarageCars', 'Exterior1st', 'GarageArea',
#   'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1', 'SaleType'
#
# Missing values in the following columns are replaced by the string "None":
#   PoolQC      - pool quality (original note: NA means there is no pool,
#                 and most rows are NA)
#   MiscFeature - miscellaneous feature not covered in other categories
#   Alley       - type of alley access
#   Fence       - fence quality
#   FireplaceQu - fireplace quality
for col in ("PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"):
    all_data[col] = all_data[col].fillna("None")
# Debug output: the Alley column in train, then in all_data after filling.
print("train的alley")
print(train["Alley"])
print("all_data的allty", all_data["Alley"])
# LotFrontage: linear feet of street connected to the property.
# Neighborhood: physical location within Ames city limits.
# Missing LotFrontage values are replaced by the median LotFrontage of the
# rows that share the same Neighborhood.
def _fill_with_own_median(values):
    """Return *values* with its missing entries replaced by its own median."""
    return values.fillna(values.median())


lot_frontage_by_neighborhood = all_data.groupby("Neighborhood")["LotFrontage"]
all_data["LotFrontage"] = lot_frontage_by_neighborhood.transform(_fill_with_own_median)
# Categorical garage columns, missing values become the string "None":
#   GarageType   - garage location
#   GarageFinish - interior finish of the garage
#   GarageQual   - garage quality
#   GarageCond   - garage condition
all_data = all_data.assign(**{
    name: all_data[name].fillna("None")
    for name in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond')
})
# Numeric garage columns, missing values become 0:
#   GarageYrBlt - year the garage was built
#   GarageArea  - size of the garage in square feet
#   GarageCars  - size of the garage in car capacity
all_data = all_data.assign(**{
    name: all_data[name].fillna(0)
    for name in ('GarageYrBlt', 'GarageArea', 'GarageCars')
})
# Numeric basement columns, missing values become 0:
#   BsmtFinSF1   - type 1 finished square feet
#   BsmtFinSF2   - type 2 finished square feet
#   BsmtUnfSF    - unfinished square feet of basement area
#   TotalBsmtSF  - total square feet of basement area
#   BsmtFullBath - basement full bathrooms
#   BsmtHalfBath - basement half bathrooms
all_data = all_data.assign(**{
    name: all_data[name].fillna(0)
    for name in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
                 'BsmtFullBath', 'BsmtHalfBath')
})
# Categorical basement columns, missing values become the string "None":
#   BsmtQual     - height of the basement
#   BsmtCond     - general condition of the basement
#   BsmtExposure - walkout or garden level basement walls
#   BsmtFinType1 - quality of basement finished area
#   BsmtFinType2 - quality of second finished area (if present)
all_data = all_data.assign(**{
    name: all_data[name].fillna("None")
    for name in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
                 'BsmtFinType2')
})
# Masonry veneer columns and the value used for their missing entries:
#   MasVnrArea - masonry veneer area in square feet -> 0
#   MasVnrType - masonry veneer type                -> "None"
for column, filler in (("MasVnrArea", 0), ("MasVnrType", "None")):
    all_data[column] = all_data[column].fillna(filler)
# MSZoning: the general zoning classification.
# Original note: most rows are 'RL'. Missing values are replaced by the
# most frequent value of the column.
most_common_zoning = all_data['MSZoning'].mode()[0]
all_data['MSZoning'] = all_data['MSZoning'].fillna(most_common_zoning)
# Utilities: type of utilities available.
# Original note: every Utilities value in the test data is "AllPub"; only
# the training data has one "NoSeWa" and two missing values, so the column
# can be removed.
all_data = all_data.drop(columns=['Utilities'])
# Functional: Home functionality rating 功能性评级。
# 空值