# 1. Data loading
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')  # silence library warnings for a cleaner demo run
data = pd.read_csv('./data/train.csv')  # Titanic training set; path is relative to the working directory
pd.set_option('display.max_columns', 20)  # show up to 20 columns when printing frames
print(data.head(4))
print(data.info())
print(data['Survived'].value_counts())  # class balance of the target label
# 2. Exploratory data analysis
# Label balance: proportion of survivors, shown as a pie chart
# f, ax = plt.subplots(1, 2, figsize=(8, 4))
# data['Survived'].value_counts().plot.pie(explode=[0, 0.1], # 偏移量
# autopct='%1.1f%%', # 百分比保留小数位数
# ax=ax[0], shadow=True)
# ax[0].set_title('Survived') # 图一标题
# sns.countplot('Survived', saturation=0.75, # 饱和度
# data=data, ax=ax[1]) # 柱状图
# plt.show()
# 男女获救比例
# print(data.groupby(['Sex', 'Survived'])['Survived'].count())
# sns.countplot('Sex', hue='Survived', data=data) # 柱状图
# plt.show()
# 船舱等级和获救之间的关系
# print(pd.crosstab(data['Pclass'], data['Survived'], margins=True))
# sns.countplot('Pclass', hue='Survived', data=data)
# plt.show()
# 不同性别及船舱等级和获救之间的关系
# print(pd.crosstab([data['Sex'], data['Survived']], data['Pclass'], margins=True))
# sns.factorplot('Pclass', 'Survived',
# hue='Sex', # 颜色
# data=data) # 降维画图
# plt.show()
# 3. Data cleaning and preprocessing
# 3.1 Extract the honorific/title from Name; rare titles are grouped later.
# Raw string: the original non-raw '([A-Za-z]+)\.' relies on '\.' passing through
# unchanged, which raises an invalid-escape-sequence warning on modern Python.
# The pattern itself is unchanged: a word immediately followed by a literal dot.
data['initial'] = data['Name'].str.extract(r'([A-Za-z]+)\.')  # e.g. "Braund, Mr. Owen" -> "Mr"
print(data['initial'].head(5))
print(pd.crosstab(data['initial'], data['Sex']).T)  # title-by-sex counts (titles as rows)
def transformOther(title):
    """Return *title* unchanged if it is one of the four common honorifics
    (Master, Miss, Mr, Mrs); otherwise collapse it to the string 'other'.

    The original parameter was named ``str``, shadowing the builtin; it is
    only ever called positionally (via ``Series.apply``), so renaming is safe.
    """
    if title in ('Master', 'Miss', 'Mr', 'Mrs'):
        return title
    return 'other'
data['re'] = data['initial'].apply(transformOther)  # collapsed titles: Master/Miss/Mr/Mrs/other
print(data['re'].unique())
print(data.groupby('re')['Sex'].count())
# 3.2 Missing-value imputation
# Fill missing Age values (intended: mean age within each title group)
print(data.groupby('re')['Age'].mean())  # mean age per title group
print(int(data.groupby('re')['Age'].mean()[0]))  # first group's mean age, truncated to int
def FillNullAge(age):
    """Return *age* unchanged, or an imputed integer age when it is NaN.

    NOTE(review): the original looped over every value in data['re'] and
    filled on the first iteration, so the fill value is always the mean age
    of the FIRST row's title group — never the current passenger's group
    (the function only receives the age, not the group). This rewrite keeps
    that exact behavior while dropping the pointless O(n) loop and the
    builtin-shadowing loop variable named ``str``. A per-group fill, e.g.
    data.groupby('re')['Age'].transform('mean'), was probably intended —
    confirm before changing.
    """
    if np.isnan(age):
        first_group = data['re'].values[0]
        age = int(data.groupby('re')['Age'].mean()[first_group])
    return age
data['Age'] = data['Age'].apply(FillNullAge)  # impute missing ages
print(data['Age'].isnull().sum())  # expect 0 after imputation
# Embarkation port
# Relationship between embarkation port and passenger class
# print(pd.crosstab(data['Embarked'], data['Pclass'], margins=True))
# sns.countplot('Embarked', # 上船的港口
# hue='Pclass', data=data)
# plt.grid()
# plt.show()
# 上船港口和获救之间的关系
# print(pd.crosstab(data['Embarked'], data['Survived'], margins=True))
# plt.grid(b=True, axis='y') # 只显示x轴网格线
# sns.countplot('Embarked', hue='Survived', data=data)
# plt.show()
# Impute the missing Embarked values with 'S' (Southampton), the most common
# port. The original filled with lowercase 's', which is not one of the
# existing codes ('S', 'C', 'Q') and silently created a spurious fourth
# category that the later LabelEncoder/OneHotEncoder steps would encode
# as a separate level.
data['Embarked'].fillna('S', inplace=True)
print(data['Embarked'].isnull().any())  # expect False
# Kernel-density estimate of Fare for each passenger class
# f, ax = plt.subplots(1, 3, figsize=(10, 8))
# sns.distplot(data[data['Pclass'] == 1].Fare, ax=ax[0])
# sns.distplot(data[data['Pclass'] == 2].Fare, ax=ax[1])
# sns.distplot(data[data['Pclass'] == 3].Fare, ax=ax[2])
# plt.show()
# Correlation heatmap
# sns.heatmap(data.corr(), annot=True, linewidths=0.2, cmap='summer_r')
# fig = plt.gcf()
# fig.set_size_inches(8, 6)
# plt.savefig('heatmap.jpg')
# 3.3 Feature engineering
# Age banding: <=16, 16-32, 32-48, 48-65, >65 (see AgeBand below)
def AgeBand(age):
    """Bucket an age into one of five ordinal bands.

    0: age <= 16;  1: 16 < age <= 32;  2: 32 < age <= 48;
    3: 48 < age <= 65;  4: age > 65.

    Returns None for NaN input (no branch matches), same as the original.
    The redundant compound conditions (e.g. ``age <= 32 and age > 16``)
    were dropped: each ``elif`` already implies the previous bound failed.
    """
    if age <= 16:
        return 0
    elif age <= 32:
        return 1
    elif age <= 48:
        return 2
    elif age <= 65:
        return 3
    elif age > 65:
        return 4
data['Age_band'] = 0  # NOTE(review): dead store — overwritten on the next line
data['Age_band'] = data['Age'].apply(AgeBand)  # ordinal age bucket 0-4
print(data.head(1))
# Numeric encoding (label-encode the Sex, Embarked and re columns)
from sklearn import preprocessing
lb = preprocessing.LabelEncoder()
data['Sex'] = lb.fit_transform(data['Sex'])
data['Embarked'] = lb.fit_transform(data['Embarked'])
data['re'] = lb.fit_transform(data['re'])
print(data.head(1))
print(data['Embarked'].unique()) # dummy variables
# One-hot encoding (convert the label-encoded Embarked column to one-hot)
# NOTE(review): `sparse=` was renamed `sparse_output=` in scikit-learn 1.2;
# this call breaks on newer versions — confirm the pinned sklearn version.
oh = preprocessing.OneHotEncoder(sparse=False)
# NOTE(review): fit_transform returns an (n_samples, n_categories) 2-D array;
# assigning that to a single DataFrame column is suspect — pandas versions
# differ on whether this raises or keeps only part of the data. Joining the
# one-hot columns as separate features (e.g. pd.get_dummies) was probably
# intended — confirm before relying on the Embarked feature downstream.
data['Embarked'] = oh.fit_transform(data[['Embarked']])
print(data['Embarked'])
print(data.head(2))
# 3.4 Feature selection
data.drop(['PassengerId', 'Name', 'Age', 'Ticket', 'Cabin', 'initial'], axis=1, inplace=True) # drop unused columns
print(data.head(2))
# 4. Modeling
# Imports
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
# Train/test split
X = data.iloc[:, data.columns != 'Survived']  # every column except the target
y = data.iloc[:, data.columns == 'Survived']  # target, kept as a one-column DataFrame
print(X.shape)
print(y.shape)
# NOTE(review): y is 2-D; sklearn estimators expect a 1-D label array and will
# warn — y.values.ravel() would silence that.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size=0.2, random_state=10) # 80/20 split, fixed seed
# 4.1 logistic
# Logistic regression
l = LogisticRegression()
l.fit(Xtrain, Ytrain)
print(l.score(Xtest, Ytest))  # mean accuracy on the held-out 20%
# Alternatively, the same accuracy computed from explicit predictions
pre_l = l.predict(Xtest)
print(metrics.accuracy_score(Ytest, pre_l))
# X_train = Xtrain.iloc[:, :].values
# 4.2 k-nearest-neighbours
# score = []
# for i in list(range(1, 11)):
# KNN = KNeighborsClassifier(n_neighbors=i)
# CVS = cross_val_score(KNN, Xtrain, Ytrain, cv=5)
# score.append(CVS.mean())
#
# # Line plot: neighbour count (1-10) vs cross-validated score
# plt.plot([*range(1, 11)], score)
# fig = plt.gcf()
# fig.set_size_inches(12, 6)
# plt.show()
KNN = KNeighborsClassifier(n_neighbors=3)  # k=3 presumably chosen from the sweep above — confirm
KNN.fit(Xtrain, Ytrain)
print(KNN.score(Xtest, Ytest))  # accuracy on the held-out split
pred_KNN = KNN.predict(Xtest)
print(metrics.accuracy_score(Ytest, pred_KNN))
# 4.3 Grid search
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameters for the decision tree
param_grid = {
'criterion': ['entropy', 'gini'],
'max_depth': range(2, 10),
'min_samples_leaf': range(1, 10),
'min_samples_split': range(2, 10)
}
# Grid setup
# Build the grid-search object with 5-fold CV. For a classifier the default
# scoring is the estimator's own .score, i.e. accuracy here.
# (The original comment claimed R-squared, which only applies to regressors.)
GR = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=5)
# Fit: exhaustively evaluates every parameter combination
GR.fit(Xtrain, Ytrain)
# Result accessors
print(GR.best_params_)
print(GR.best_estimator_)
# Scores
print(GR.best_score_)
print(GR.cv_results_)
# 4.4 Decision tree
# Hyper-parameters presumably taken from the grid search above — confirm
DTC = DecisionTreeClassifier(criterion='entropy', max_depth=6,
min_samples_leaf=6,
min_samples_split=4)
DTC.fit(Xtrain, Ytrain)
print(DTC.score(Xtest, Ytest))  # accuracy on the held-out split
print(DTC.feature_importances_)
# Plot: horizontal bar chart of feature importances, sorted ascending
f = plt.figure(figsize=(8, 4))
DTC_series = pd.Series(DTC.feature_importances_, X.columns).sort_values(ascending=True)
print(DTC_series)
DTC_series.plot.barh(width=0.8)
plt.show()
# 4.5 ROC curve
# Plot the ROC curve for the decision-tree model
# NOTE(review): roc_curve is fed hard 0/1 predictions, so the "curve" has a
# single operating point; DTC.predict_proba(Xtest)[:, 1] would give the usual
# continuous curve — confirm intent.
y_pred =DTC.predict(Xtest)
from sklearn.metrics import roc_curve, auc
# Compute false-positive and true-positive rates
fpr, tpr, threshold = roc_curve(Ytest, y_pred)
# Compute the area under the curve
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, color='darkorange',
label='ROC curve (area=%0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc='lower right')
plt.show()
# 4.6 混淆矩阵
plt.figure(figsize=(4, 4))
KNN = KNeighborsClassifier(n_neighbors=3)
KNN.fit(Xtrain, Ytrain)
y_pred = KNN.predict(