import pandas as pd
import numpy as np
import sys
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline # Pipeline将多个数据处理步骤串联起来,使得数据处理流程更加清晰和高效
from sklearn.preprocessing import StandardScaler, MinMaxScaler # 标准化,归一化
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
# Train and tune a random-forest classifier on the cervical-cancer risk-factor
# data: load the CSV, split it, then grid-search a pipeline of
# imputation -> PCA -> random forest.

pd.set_option("display.max_columns", None)  # show every column when printing frames

# Load the data. The file marks missing values with '?'; declaring that marker
# at read time lets pandas parse the affected columns as numbers with NaN,
# instead of leaving them as strings that need a separate replace step.
data = pd.read_csv('risk_factors_cervical_cancer.csv', sep=',', na_values='?')
names = data.columns  # column labels of the loaded frame

# The last four columns are the targets; everything before them is a feature.
target_names = list(names[-4:])

# A row with a missing label cannot be used for supervised training, and
# mean-filling a class label would invent one, so such rows are dropped.
data = data.dropna(subset=target_names)

X = data.iloc[:, :-4]
Y = data.iloc[:, -4:].astype('int')

# Hold out 20% of the rows for the final evaluation.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)

# Build the pipelines.
# Missing feature values are mean-imputed INSIDE each pipeline, so the fill
# values are learned from training rows only (also within every CV fold) and
# nothing from the held-out rows leaks into preprocessing.
# Standardization rescales features to zero mean / unit variance; tree models
# themselves do not need it.
# NOTE(review): PCA is scale-sensitive and the tuned pipeline applies it to
# unscaled features - confirm this is intended.
models = [
    # Demonstration pipeline only; it is not used below.
    Pipeline([
        ('imputer', SimpleImputer()),  # default strategy: column mean
        ('standarscaler', StandardScaler()),
        ('pca', PCA()),
        ('RF', RandomForestClassifier(random_state=10)),
    ]),
    # Pipeline that is actually tuned.
    Pipeline([
        ('imputer', SimpleImputer()),  # default strategy: column mean
        ('pca', PCA(n_components=0.5)),
        # Fixed seed so repeated runs select the same best parameters.
        ('RF', RandomForestClassifier(n_estimators=10, max_depth=20, random_state=10)),
    ]),
]

# Hyper-parameter grid; keys follow the '<step name>__<parameter>' convention.
params = {
    # Fraction of variance PCA must retain.
    'pca__n_components': [0.5, 0.6, 0.7, 0.8, 0.9],
    # Number of trees (scikit-learn's default is 100); more trees mainly cost
    # more compute.
    'RF__n_estimators': [40, 50, 60, 70, 80, 90, 100],
    'RF__max_depth': [1, 3, 5, 7, 10, 15],
}

# Exhaustive grid search with 5-fold cross-validation on the training split.
model = GridSearchCV(estimator=models[1], param_grid=params, cv=5)
model.fit(x_train, y_train)

print('最优参数:', model.best_params_)
print('最优模型:', model.best_estimator_)
print('最优模型的分数:', model.best_score_)
# Score of the refitted best pipeline on the training and held-out splits.
print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

# Optional: persist the fitted search object.
# import joblib
# joblib.dump(model,'./model/risk01.m')