'''
@version: python3.6
@author: Administrator
@file: kaggle_house_prices.py
@time: 2019/09/25
'''
#具体题目和讲解请见于该地址:http://zh.d2l.ai/chapter_deep-learning-basics/kaggle-house-price.html
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from scipy.stats import skew
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import Imputer
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import ElasticNet, SGDRegressor, BayesianRidge
from sklearn.kernel_ridge import KernelRidge
from xgboost import XGBRegressor
from mlxtend.regressor import StackingRegressor
# 1. Load the data and take a first look.
# NOTE(review): machine-specific absolute Windows paths — TODO parameterize.
train_data=pd.read_csv('D:/用户目录/我的文档/keras实践/house_prices/train.csv')# read_csv returns a DataFrame
test_data=pd.read_csv('D:/用户目录/我的文档/keras实践/house_prices/test.csv')
print(train_data.shape,test_data.shape)
# data=train_data.iloc[0:4,[0,1,2,3,-1]]
print(train_data.head())# print the first 5 rows
print(test_data.head())
# 2. Outlier removal and skewness analysis.
# Either of the two approaches below finds the 10 features most correlated with SalePrice.
# Approach 1: visualize the correlations as a heatmap.
# corrmat=train_data.corr()
# plt.figure(figsize=(12,9))
# cols=corrmat.nlargest(10,'SalePrice')['SalePrice'].index
# cm=np.corrcoef(train_data[cols].values.T)
# sns.set(font_scale=1.25)
# hm=sns.heatmap(cm,cbar=True,annot=True,square=True,fmt='.2f', annot_kws={'size': 10},xticklabels=cols.values,yticklabels=cols.values)
# plt.show()
# Approach 2: skip the plot and print the features whose correlation with SalePrice exceeds 0.5.
Corr=train_data.corr()
print(Corr[Corr['SalePrice']>0.5])
# Knowing which features correlate most with SalePrice, draw scatter plots to inspect each relationship in detail.
# sns.pairplot(x_vars=['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'BsmtFullBath', 'YearBuilt'],y_vars=['SalePrice'],data=train_data,dropna=True)
# plt.show()
# Drop outliers spotted in the scatter plots: very large living area with an abnormally low sale price.
train_data.drop(train_data[(train_data['GrLivArea']>4000) & (train_data['SalePrice']<200000)].index,inplace=True)
# train_data.drop(train_data[(train_data['OverallQual']<5) & (train_data['SalePrice']>200000)].index,inplace=True)
# train_data.drop(train_data[(train_data['YearBuilt']<1900) & (train_data['SalePrice']>400000)].index,inplace=True)
# train_data.drop(train_data[(train_data['BsmtFullBath']<1) & (train_data['SalePrice']>300000)].index,inplace=True)
# train_data.drop(train_data[(train_data['TotalBsmtSF']>6000) & (train_data['SalePrice']<200000)].index,inplace=True)
train_data.reset_index(drop=True, inplace=True)# rebuild a contiguous index after dropping rows
print(train_data.shape)
# Alternative outlier detection: train Ridge and ElasticNet on the training set, predict on it,
# and treat samples that both models predict poorly as outliers.
# Top-20 features by absolute skewness — checks which features deviate from a normal
# distribution; the skewed ones are transformed later during feature engineering:
#np.abs(train_data.skew()).sort_values(ascending=False).head(20)
# 3. Missing-value handling.
'''
Common strategies for missing data:
- If a column is mostly missing, consider dropping the feature entirely.
- Replace with mean / median / quantile / mode / random values. Effect is mediocre,
  since it effectively injects man-made noise.
- Interpolate.
- Predict the missing variable from the other variables. Slightly better than simple
  replacement, but fundamentally flawed: if the other variables are unrelated to the
  missing one, the prediction is meaningless.
- Most precise: map the variable into a higher-dimensional space. E.g. gender with
  values male / female / missing becomes three indicator variables: is_male,
  is_female, is_missing. Downside is the extra computational cost.
'''
# Concatenate train and test so all preprocessing is applied to both at once;
# they are split apart again once preprocessing is finished.
# all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))# test rows appended after train rows; the first column (Id) is useless, and the test set has no label
all_features=pd.concat([train_data,test_data], ignore_index=True)
all_features.drop(['Id'],axis=1, inplace=True)
numeric=all_features.dtypes[all_features.dtypes!='object'].index# names of all non-string (numeric) columns
print(all_features.shape)
print(all_features.head())
# print(numeric)
# Missing-value census: per-column count of NaNs (descending) and their share of the total.
count=all_features.isnull().sum().sort_values(ascending=False)
ratio=count/len(all_features)
nulldata=pd.concat([count,ratio],axis=1,keys=['count','ratio'])
print(nulldata)
# For these categorical features, NaN means the house simply lacks the feature
# (no pool, no alley, ...); fill with the string "None".
cols1 = ["PoolQC" , "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageQual", "GarageCond", "GarageFinish", "GarageYrBlt",
"GarageType", "BsmtExposure", "BsmtCond", "BsmtQual", "BsmtFinType2", "BsmtFinType1", "MasVnrType"]
# NOTE: the original `all_features[col].fillna(..., inplace=True)` is chained
# assignment; under pandas Copy-on-Write (default in pandas 3.0) it is a silent
# no-op. Assign the filled column back explicitly instead.
for col in cols1:
    all_features[col] = all_features[col].fillna("None")
# For these numeric features, NaN likewise means the feature is absent; fill with 0.
cols=["MasVnrArea", "BsmtUnfSF", "TotalBsmtSF", "GarageCars", "BsmtFinSF2", "BsmtFinSF1", "GarageArea"]
for col in cols:
    all_features[col] = all_features[col].fillna(0)
# Fill the remaining sparsely-missing categoricals with their most frequent value (mode).
cols2 = ["MSZoning", "BsmtFullBath", "BsmtHalfBath", "Utilities", "Functional", "Electrical", "KitchenQual", "SaleType","Exterior1st", "Exterior2nd"]
for col in cols2:
    all_features[col] = all_features[col].fillna(all_features[col].mode()[0])
# LotFrontage correlates strongly with LotArea and Neighborhood, so impute it with
# the median within (LotArea decile, Neighborhood) groups.
# Use qcut to divide LotArea into 10 equal-frequency bins.
all_features["LotAreaCut"] = pd.qcut(all_features.LotArea,10)
# (A discarded exploratory groupby(...).agg(...) inspection line was removed here —
# its result was never used.)
all_features['LotFrontage']=all_features.groupby(['LotAreaCut','Neighborhood'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
# Some (LotAreaCut, Neighborhood) combinations have no observed LotFrontage at all,
# so fall back to the median within LotAreaCut alone.
all_features['LotFrontage']=all_features.groupby(['LotAreaCut'])['LotFrontage'].transform(lambda x: x.fillna(x.median()))
print(all_features.isnull().sum().sort_values(ascending=False))
# 4. Feature engineering.
# The columns below are categorical in nature even though they are stored as
# numbers (e.g. MSSubClass is a dwelling-type code, MoSold a month); cast them
# to strings so they are treated uniformly as categories downstream.
NumStr = ["MSSubClass","BsmtFullBath","BsmtHalfBath","HalfBath","BedroomAbvGr","KitchenAbvGr","MoSold","YrSold","YearBuilt","YearRemodAdd","LowQualFinSF","GarageYrBlt"]
all_features[NumStr] = all_features[NumStr].astype(str)
# Convert some numerical features into categorical features.
# Build as many features as possible and trust the model to choose the right ones:
# group SalePrice by each feature and sort the groups by mean and median.
def map_values():
all_features["oMSSubClass"] = all_features.MSSubClass.map({'180': 1,
'30': 2, '45': 2,
'190': 3, '50': 3, '90': 3,
'85': 4, '40': 4, '160': 4,
'70': 5, '20': 5, '75': 5, '80': 5, '150': 5,
'120': 6, '60': 6})
all_features["oMSZoning"] = all_features.MSZoning.map({'C (all)': 1, 'RH': 2, 'RM': 2, 'RL': 3, 'FV': 4})
all_features["oNeighborho
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
【资源说明】 1、该资源包括项目的全部源码,下载可以直接使用! 2、本项目适合作为计算机、数学、电子信息等专业的课程设计、期末大作业和毕设项目,作为参考资料学习借鉴。 3、本资源作为“参考资料”如果需要实现其他功能,需要能看懂代码,并且热爱钻研,自行调试。 基于tensorflow后台的keras框架实践源码,以神经网络,机器学习和深度学习的经典项目为主.zip
资源推荐
资源详情
资源评论
收起资源包目录
基于tensorflow后台的keras框架实践源码,以神经网络,机器学习和深度学习的经典项目为主.zip (8个子文件)
code_20105
kaggle_house_prices.py 24KB
回归问题(预测波士顿房价).py 3KB
使用预训练的卷积神经网络.py 7KB
CAPTCHA验证码识别.py 9KB
二分类问题(IMDB数据集).py 3KB
多分类问题(路透社新闻数据集的分类).py 3KB
mnist数据分类实验.py 2KB
从头开始训练一个卷积神经网络.py 8KB
共 8 条
- 1
资源评论
土豆片片
- 粉丝: 1527
- 资源: 5641
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功