# -*- coding: utf-8 -*-
"""
Created on Wed Apr 25 16:42:42 2018
@author: zju
"""
def compute_class_weight(class_weight, classes, y):
    """Estimate class weights for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, 'balanced', 'auto' or None
        If 'balanced', weights are inversely proportional to class
        frequencies: ``n_samples / (n_classes * bincount(y))``.
        If 'auto' (deprecated), reciprocal class frequencies normalized
        so the mean weight is 1. If a dict, maps class label -> weight
        (unlisted classes get weight 1). If None or empty, uniform
        weights are returned.
    classes : array-like
        Sorted array of the classes occurring in the data.
    y : array-like
        Original class labels per sample.

    Returns
    -------
    weight : ndarray of shape (len(classes),)
        ``weight[i]`` is the weight for ``classes[i]``.

    Raises
    ------
    ValueError
        If ``y`` contains labels not in ``classes``, if a preset string
        branch finds a class absent from ``y``, or if ``class_weight``
        is neither a dict, a preset string, nor None.
    """
    import warnings

    import numpy as np

    if set(y) - set(classes):
        raise ValueError("classes should include all valid labels that can "
                         "be in y")
    if class_weight is None or len(class_weight) == 0:
        # Uniform class weights.
        weight = np.ones(len(classes), dtype=np.float64, order='C')
    elif class_weight in ['auto', 'balanced']:
        # Encode y into indices 0..n_classes-1. np.unique with
        # return_inverse reproduces sklearn's LabelEncoder (sorted unique
        # labels + inverse mapping) without the sklearn dependency.
        y_classes, y_ind = np.unique(y, return_inverse=True)
        if not all(np.in1d(classes, y_classes)):
            raise ValueError("classes should have valid labels that are in y")
        # Inversely proportional to the number of samples in the class.
        if class_weight == 'auto':
            recip_freq = 1. / np.bincount(y_ind)
            # searchsorted on the sorted unique labels == LabelEncoder.transform
            weight = (recip_freq[np.searchsorted(y_classes, classes)] /
                      np.mean(recip_freq))
            warnings.warn("The class_weight='auto' heuristic is deprecated in"
                          " 0.17 in favor of a new heuristic "
                          "class_weight='balanced'. 'auto' will be removed in"
                          " 0.19", DeprecationWarning)
        else:
            # 'balanced': n_samples / (n_classes * bincount(y)).
            recip_freq = len(y) / (len(y_classes) *
                                   np.bincount(y_ind).astype(np.float64))
            weight = recip_freq[np.searchsorted(y_classes, classes)]
    else:
        # User-defined dictionary of per-class weights.
        weight = np.ones(len(classes), dtype=np.float64, order='C')
        if not isinstance(class_weight, dict):
            raise ValueError("class_weight must be dict, 'balanced', or None,"
                             " got: %r" % class_weight)
        for c in class_weight:
            i = np.searchsorted(classes, c)
            if i >= len(classes) or classes[i] != c:
                # %r instead of the original %d so non-integer labels raise
                # the intended ValueError rather than a TypeError while
                # formatting the message.
                raise ValueError("Class label %r not present." % c)
            else:
                weight[i] = class_weight[c]
    return weight
def compute_sample_weight(class_weight, y, indices=None):
    """Estimate per-sample weights by class for unbalanced datasets.

    Parameters
    ----------
    class_weight : dict, list of dicts, 'balanced', 'auto', or None
        Weights associated with classes. For multi-output ``y``, a list
        with one dict per output column, or a single preset string.
    y : array-like of shape (n_samples,) or (n_samples, n_outputs)
        Class labels per sample.
    indices : array-like of int, optional
        Indices of a subsample; only valid with a preset
        ('balanced'/'auto') ``class_weight``. Classes present in the
        full data but absent from the subsample get weight 0.

    Returns
    -------
    sample_weight : ndarray of shape (n_samples,)
        Per-sample weights (product over outputs for multi-output y).

    Raises
    ------
    ValueError
        On an unknown preset string, a non-string ``class_weight``
        combined with ``indices``, or a malformed multi-output spec.
    """
    import numpy as np

    y = np.atleast_1d(y)
    if y.ndim == 1:
        # Treat 1-D labels as a single-output column.
        y = np.reshape(y, (-1, 1))
    n_outputs = y.shape[1]

    # Native `str` check replaces the former `six.string_types`: this file
    # runs under Python 3 only, so the third-party `six` import is dropped.
    if isinstance(class_weight, str):
        if class_weight not in ['balanced', 'auto']:
            raise ValueError('The only valid preset for class_weight is '
                             '"balanced". Given "%s".' % class_weight)
    elif (indices is not None and
          not isinstance(class_weight, str)):
        raise ValueError('The only valid class_weight for subsampling is '
                         '"balanced". Given "%s".' % class_weight)
    elif n_outputs > 1:
        if (not hasattr(class_weight, "__iter__") or
                isinstance(class_weight, dict)):
            raise ValueError("For multi-output, class_weight should be a "
                             "list of dicts, or a valid string.")
        if len(class_weight) != n_outputs:
            raise ValueError("For multi-output, number of elements in "
                             "class_weight should match number of outputs.")

    expanded_class_weight = []
    for k in range(n_outputs):
        y_full = y[:, k]
        classes_full = np.unique(y_full)
        classes_missing = None

        if class_weight in ['balanced', 'auto'] or n_outputs == 1:
            class_weight_k = class_weight
        else:
            # One dict per output column.
            class_weight_k = class_weight[k]

        if indices is not None:
            # Get class weights for the subsample, covering all classes in
            # case some labels that were present in the original data are
            # missing from the sample.
            y_subsample = y[indices, k]
            classes_subsample = np.unique(y_subsample)
            weight_k = np.choose(np.searchsorted(classes_subsample,
                                                 classes_full),
                                 compute_class_weight(class_weight_k,
                                                      classes_subsample,
                                                      y_subsample),
                                 mode='clip')
            classes_missing = set(classes_full) - set(classes_subsample)
        else:
            weight_k = compute_class_weight(class_weight_k,
                                            classes_full,
                                            y_full)

        # Map per-class weights back onto the individual samples.
        weight_k = weight_k[np.searchsorted(classes_full, y_full)]

        if classes_missing:
            # Make missing classes' weight zero.
            weight_k[np.in1d(y_full, list(classes_missing))] = 0.

        expanded_class_weight.append(weight_k)

    expanded_class_weight = np.prod(expanded_class_weight,
                                    axis=0,
                                    dtype=np.float64)
    return expanded_class_weight
import pandas
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost.sklearn import XGBClassifier

# Load the data set: first 23 columns are features, column 23 is the label.
f = pandas.read_csv('changqiao.csv', encoding='gbk')
X = f.iloc[:, 0:23]
y = f.iloc[:, 23]

# Convert once up front; the original re-ran np.array(X)/np.array(y)
# four times inside every fold.
X_arr = np.array(X)
y_arr = np.array(y)

skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(X, y)
print(skf)

# NOTE(review): these accumulators are never updated or printed anywhere
# in this script -- the per-fold metric computation appears to be missing.
sumaccuracy = 0
sumprecision = 0
sumrecall = 0

for train_index, test_index in skf.split(X, y):
    X_train = X_arr[train_index]
    y_train = y_arr[train_index]
    X_test = X_arr[test_index]
    y_test = y_arr[test_index]

    # Hold out half of the training fold to fit the level-2 (stacking) model.
    X_train, X_train_meta, y_train, y_train_meta = train_test_split(
        X_train, y_train, test_size=0.5)

    # NOTE(review): these balanced sample weights are computed but never
    # passed to any fit() call below -- confirm whether they were meant to
    # be used as sample_weight.
    expanded_class_weight = compute_sample_weight('balanced', y_train,
                                                  indices=None)

    # Level-1 learner: AdaBoost over shallow decision trees.
    ada = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=2, min_samples_split=20,
                               min_samples_leaf=5),
        algorithm="SAMME",
        n_estimators=200, learning_rate=0.8)
    ada.fit(X_train, y_train)

    # Level-1 learner: XGBoost.
    xgb_clf = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
    xgb_clf.fit(X_train, y_train)

    # Level-1 learner: gradient boosting.
    gbdt = GradientBoostingClassifier(n_estimators=200)
    gbdt.fit(X_train, y_train)

    # Level-2 training features: rounded positive-class probabilities of
    # the three level-1 learners on the held-out half. Renamed from the
    # original `input`, which shadowed the builtin.
    meta_train = np.vstack((
        ada.predict_proba(X_train_meta)[:, 1].round(),
        xgb_clf.predict_proba(X_train_meta)[:, 1].round(),
        gbdt.predict_proba(X_train_meta)[:, 1].round())).T

    # Level-2 (stacking) model: logistic regression on level-1 outputs.
    stacker = LogisticRegression(penalty='l2')
    stacker.fit(meta_train, y_train_meta)

    # Apply the same pipeline to the test fold.
    meta_test = np.vstack((
        ada.predict_proba(X_test)[:, 1].round(),
        xgb_clf.predict_proba(X_test)[:, 1].round(),
        gbdt.predict_proba(X_test)[:, 1].round())).T

    # NOTE(review): y_pred is never compared against y_test; the fold's
    # metrics (and the sum* accumulators above) are left uncomputed.
    y_pred = stacker.predict_proba(meta_test)[:, 1]