'''
@version: python3.6
@author: Administrator
@file: kaggle_house_prices.py
@time: 2019/09/25
'''
#具体题目和讲解请见于该地址:http://zh.d2l.ai/chapter_deep-learning-basics/kaggle-house-price.html
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from scipy.stats import skew
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from mlxtend.regressor import StackingRegressor
# 1. Load the data and take a first look.
# NOTE(review): machine-specific absolute Windows paths — TODO parameterize.
train_data=pd.read_csv('D:/用户目录/我的文档/keras实践/house_prices/train.csv')# read_csv returns a DataFrame
test_data=pd.read_csv('D:/用户目录/我的文档/keras实践/house_prices/test.csv')
print(train_data.shape,test_data.shape)
# data=train_data.iloc[0:4,[0,1,2,3,-1]]
print(train_data.head())# print the first 5 rows
print(test_data.head())
# 2. Outlier removal and skewness analysis.
# Either of the two approaches below finds the 10 features most correlated with SalePrice.
# Approach 1: visualize the correlations as a heatmap.
# corrmat=train_data.corr()
# plt.figure(figsize=(12,9))
# cols=corrmat.nlargest(10,'SalePrice')['SalePrice'].index
# cm=np.corrcoef(train_data[cols].values.T)
# sns.set(font_scale=1.25)
# hm=sns.heatmap(cm,cbar=True,annot=True,square=True,fmt='.2f', annot_kws={'size': 10},xticklabels=cols.values,yticklabels=cols.values)
# plt.show()
# Approach 2: skip the plot and print the features whose correlation with SalePrice exceeds 0.5.
Corr=train_data.corr()
print(Corr[Corr['SalePrice']>0.5])
# Knowing which features correlate most with SalePrice, draw scatter plots to inspect each relationship in detail.
# sns.pairplot(x_vars=['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'BsmtFullBath', 'YearBuilt'],y_vars=['SalePrice'],data=train_data,dropna=True)
# plt.show()
# Drop outliers spotted in the scatter plots: very large living area with an abnormally low sale price.
train_data.drop(train_data[(train_data['GrLivArea']>4000) & (train_data['SalePrice']<200000)].index,inplace=True)
# train_data.drop(train_data[(train_data['OverallQual']<5) & (train_data['SalePrice']>200000)].index,inplace=True)
# train_data.drop(train_data[(train_data['YearBuilt']<1900) & (train_data['SalePrice']>400000)].index,inplace=True)
# train_data.drop(train_data[(train_data['BsmtFullBath']<1) & (train_data['SalePrice']>300000)].index,inplace=True)
# train_data.drop(train_data[(train_data['TotalBsmtSF']>6000) & (train_data['SalePrice']<200000)].index,inplace=True)
train_data.reset_index(drop=True, inplace=True)# rebuild a contiguous index after dropping rows
print(train_data.shape)
# Alternative outlier detection: train Ridge and ElasticNet on the training set, predict on it,
# and treat samples that both models predict poorly as outliers.
# Top-20 features by absolute skewness — checks which features deviate from a normal
# distribution; the skewed ones are transformed later during feature engineering:
#np.abs(train_data.skew()).sort_values(ascending=False).head(20)
# 3. Missing-value handling.
'''
Common strategies for missing data:
- If a column is mostly missing, consider dropping the feature entirely.
- Replace with mean / median / quantile / mode / random values. Effect is mediocre,
  since it effectively injects man-made noise.
- Interpolate.
- Predict the missing variable from the other variables. Slightly better than simple
  replacement, but fundamentally flawed: if the other variables are unrelated to the
  missing one, the prediction is meaningless.
- Most precise: map the variable into a higher-dimensional space. E.g. gender with
  values male / female / missing becomes three indicator variables: is_male,
  is_female, is_missing. Downside is the extra computational cost.
'''
# Concatenate train and test so all preprocessing is applied to both at once;
# they are split apart again once preprocessing is finished.
# all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))# test rows appended after train rows; the first column (Id) is useless, and the test set has no label
all_features=pd.concat([train_data,test_data], ignore_index=True)
all_features.drop(['Id'],axis=1, inplace=True)
numeric=all_features.dtypes[all_features.dtypes!='object'].index# names of all non-string (numeric) columns
print(all_features.shape)
print(all_features.head())
# print(numeric)
# Missing-value census: per-column count of NaNs (descending) and their share of the total.
count=all_features.isnull().sum().sort_values(ascending=False)
ratio=count/len(all_features)
nulldata=pd.concat([count,ratio],axis=1,keys=['count','ratio'])
print(nulldata)
# For these categorical features, NaN means the house simply lacks the feature
# (no pool, no alley, ...); fill with the string "None".
cols1 = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish", "GarageYrBlt",
"GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
# NOTE: the original `all_features[col].fillna(..., inplace=True)` is chained
# assignment; under pandas Copy-on-Write (default in pandas 3.0) it is a silent
# no-op. Assign the filled column back explicitly instead.
for col in cols1:
    all_features[col] = all_features[col].fillna("None")
# For these numeric features, NaN likewise means the feature is absent; fill with 0.
cols=["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
for col in cols:
    all_features[col] = all_features[col].fillna(0)
# Fill the remaining sparsely-missing categoricals with their most frequent value (mode).
cols2 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual", "SaleType","Exterior1st", "Exterior2nd"]
for col in cols2:
    all_features[col] = all_features[col].fillna(all_features[col].mode()[0])
# LotFrontage correlates strongly with LotArea and Neighborhood, so impute it with
# the median within (LotArea decile, Neighborhood) groups.
# Use qcut to divide LotArea into 10 equal-frequency bins.
all_features["LotAreaCut"] = pd.qcut(all_features.LotArea,10)
# (A discarded exploratory groupby(...).agg(...) inspection line was removed here —
# its result was never used.)
all_features['LotFrontage']=all_features.groupby(['LotAreaCut','Neighborhood'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
# Some (LotAreaCut, Neighborhood) combinations have no observed LotFrontage at all,
# so fall back to the median within LotAreaCut alone.
all_features['LotFrontage']=all_features.groupby(['LotAreaCut'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
print(all_features.isnull().sum().sort_values(ascending=False))
# 4. Feature engineering.
# The columns below are categorical in nature even though they are stored as
# numbers (e.g. MSSubClass is a dwelling-type code, MoSold a month); cast them
# to strings so they are treated uniformly as categories downstream.
NumStr = ["MSSubClass","BsmtFullBath","BsmtHalfBath","HalfBath","BedroomAbvGr","KitchenAbvGr","MoSold","YrSold","YearBuilt","YearRemodAdd","LowQualFinSF","GarageYrBlt"]
all_features[NumStr] = all_features[NumStr].astype(str)
# Convert some numerical features into categorical features.
# Build as many features as possible and trust the model to choose the right ones:
# group SalePrice by each feature and sort the groups by mean and median.
def map_values():
all_features["oMSSubClass"] = all_features.MSSubClass.map({'180': 1,
'30': 2, '45': 2,
'190': 3, '50': 3, '90': 3,
'85': 4, '40': 4, '160': 4,
'70': 5, '20': 5, '75': 5, '80': 5, '150': 5,
'120': 6, '60': 6})
all_features["oMSZoning"] = all_features.MSZoning.map({'C (all)': 1, 'RH': 2, 'RM': 2, 'RL': 3, 'FV': 4})
all_features["oNeighborho
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
【资源说明】 1、该资源包括项目的全部源码,下载可以直接使用! 2、本项目适合作为计算机、数学、电子信息等专业的课程设计、期末大作业和毕设项目,作为参考资料学习借鉴。 3、本资源作为“参考资料”如果需要实现其他功能,需要能看懂代码,并且热爱钻研,自行调试。 基于tensorflow后台的keras框架实践源码,以神经网络,机器学习和深度学习的经典项目为主.zip
资源推荐
资源详情
资源评论
收起资源包目录
基于tensorflow后台的keras框架实践源码,以神经网络,机器学习和深度学习的经典项目为主.zip (8个子文件)
code_20105
kaggle_house_prices.py 24KB
回归问题(预测波士顿房价).py 3KB
使用预训练的卷积神经网络.py 7KB
CAPTCHA验证码识别.py 9KB
二分类问题(IMDB数据集).py 3KB
多分类问题(路透社新闻数据集的分类).py 3KB
mnist数据分类实验.py 2KB
从头开始训练一个卷积神经网络.py 8KB
共 8 条
- 1
资源评论
土豆片片
- 粉丝: 1527
- 资源: 5641
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功