"""Class to perform over-sampling using SMOTE."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Fernando Nogueira
# Christos Aridas
# Dzianis Dudnik
# License: MIT
from __future__ import division
import types
import warnings
from collections import Counter
import numpy as np
from scipy import sparse
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.utils import check_random_state
from sklearn.utils import safe_indexing
from sklearn.utils import check_array
from sklearn.utils import check_X_y
from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0
from sklearn.utils.sparsefuncs_fast import csc_mean_variance_axis0
from .base import BaseOverSampler
from ..exceptions import raise_isinstance_error
from ..utils import check_neighbors_object
from ..utils import check_target_type
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring
# FIXME: remove in 0.6
# NOTE(review): presumably the set of accepted values for the deprecated
# ``kind`` argument of the historical SMOTE API -- confirm against the
# code that validates ``kind`` (not visible in this chunk).
SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm')
class BaseSMOTE(BaseOverSampler):
"""Base class for the different SMOTE algorithms."""
def __init__(self, sampling_strategy='auto', random_state=None,
             k_neighbors=5, n_jobs=1, ratio=None):
    """Store the hyper-parameters shared by every SMOTE variant.

    Parameters
    ----------
    sampling_strategy : str, dict or callable, optional (default='auto')
        Strategy forwarded to the over-sampler base class.

    random_state : int, RandomState instance or None, optional (default=None)
        Seed / generator controlling the randomization.

    k_neighbors : int or estimator, optional (default=5)
        Number of nearest neighbours (or a pre-built k-NN estimator)
        used to build synthetic samples.

    n_jobs : int, optional (default=1)
        Number of threads used by the nearest-neighbours search.

    ratio : dict or str, optional (default=None)
        Deprecated alias of ``sampling_strategy`` kept for
        backward compatibility.
    """
    # Two-argument ``super`` kept on purpose: the file still targets
    # Python 2 (see ``from __future__ import division`` at the top).
    super(BaseSMOTE, self).__init__(
        sampling_strategy=sampling_strategy, ratio=ratio)
    self.random_state = random_state
    self.k_neighbors = k_neighbors
    self.n_jobs = n_jobs
def _validate_estimator(self):
    """Build the k-NN estimator shared across the SMOTE algorithms.

    Creates ``self.nn_k_`` from ``self.k_neighbors`` (one extra
    neighbour is requested because the query point itself is returned
    first by ``kneighbors``) and propagates ``self.n_jobs`` to it.
    """
    nn_k = check_neighbors_object(
        'k_neighbors', self.k_neighbors, additional_neighbor=1)
    nn_k.set_params(n_jobs=self.n_jobs)
    self.nn_k_ = nn_k
def _make_samples(self, X, y_dtype, y_type, nn_data, nn_num, n_samples,
                  step_size=1.):
    """Create synthetic samples along the segments joining neighbours.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Points from which the points will be created.

    y_dtype : dtype
        The data type of the targets.

    y_type : str or int
        The minority target value, used to label every generated sample.

    nn_data : ndarray, shape (n_samples_all, n_features)
        Data set carrying all the neighbours to be used.

    nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
        The nearest neighbours of each sample in `nn_data`.

    n_samples : int
        The number of samples to generate.

    step_size : float, optional (default=1.)
        The step size to create samples.

    Returns
    -------
    X_new : {ndarray, sparse matrix}, shape (n_samples_new, n_features)
        Synthetically generated samples.

    y_new : ndarray, shape (n_samples_new,)
        Target values for synthetic samples.
    """
    rng = check_random_state(self.random_state)
    # Draw flat indices into the (sample, neighbour) matrix, then the
    # interpolation steps -- the two draws must stay in this order to
    # keep the random stream identical for a given seed.
    flat_indices = rng.randint(low=0, high=nn_num.size, size=n_samples)
    steps = step_size * rng.uniform(size=n_samples)
    # Unravel the flat index into (base sample row, neighbour column).
    rows = flat_indices // nn_num.shape[1]
    cols = flat_indices % nn_num.shape[1]

    y_new = np.array([y_type] * n_samples, dtype=y_dtype)

    if sparse.issparse(X):
        # Accumulate COO triplets; all-zero base rows are skipped and
        # thus stay implicit zeros in the output matrix.
        out_rows, out_cols, out_vals = [], [], []
        for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
            if X[row].nnz:
                sample = self._generate_sample(
                    X, nn_data, nn_num, row, col, step)
                out_rows.extend([i] * len(sample.indices))
                out_cols.extend(sample.indices.tolist())
                out_vals.extend(sample.data.tolist())
        X_new = sparse.csr_matrix(
            (out_vals, (out_rows, out_cols)),
            shape=(n_samples, X.shape[1]), dtype=X.dtype)
        return X_new, y_new

    X_new = np.zeros((n_samples, X.shape[1]), dtype=X.dtype)
    for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
        X_new[i] = self._generate_sample(X, nn_data, nn_num, row, col, step)
    return X_new, y_new
def _generate_sample(self, X, nn_data, nn_num, row, col, step):
r"""Generate a synthetic sample.
The rule for the generation is:
.. math::
\mathbf{s_{s}} = \mathbf{s_{i}} + \mathcal{u}(0, 1) \times
(\mathbf{s_{i}} - \mathbf{s_{nn}}) \,
where \mathbf{s_{s}} is the new synthetic samples, \mathbf{s_{i}} is
the current sample, \mathbf{s_{nn}} is a randomly selected neighbors of
\mathbf{s_{i}} and \mathcal{u}(0, 1) is a random number between [0, 1).
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Points from which the points will be created.
nn_data : ndarray, shape (n_samples_all, n_features)
Data set carrying all the neighbours to be used.
nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
The nearest neighbours of each sample in `nn_data`.
row : int
Index pointing at feature vector in X which will be used
as a base for creating new sample.
col : int
Index pointing at which nearest neighbor of base feature vector
will be used when creating new sample.
step : float
Step size for new sample.
Returns
-------
X_new : {ndarray, sparse matrix}, shape (n_features,)
Single synthetically generated sample.
"""
return X[row] - step * (X[row] - nn_data[nn_num[row, col]])
def _in_danger_noise(self, nn_estimator, samples, target_class, y,
kind='danger'):
"""Estimate if a set of sample are in danger or noise.
Used by BorderlineSMOTE and SVMSMOTE.
Parameters
----------
nn_estimator : estimator
An estimator that inherits from
:class:`sklearn.neighbors.base.KNeighborsMixin` use to determine if
a sample is in danger/noise.
samples : {array-like, sparse matrix}, shape (n_samples, n_features)
The samples to check if either they are in danger or not.
target_class : int or str
The target corresponding class being over-sampled.
y : array-like, shape (n_samples,)
The true label in order to check the neighbour labels.
kind : str, optional (default='danger')
The type of classification to use. Can be either:
- If 'danger', check if samples are in danger,
- If 'noise', check if samples are noise.
Returns
-------
output : ndarray, shape (n_samples,)
A boolean array where True refer to samples in danger or noise.
"""
x = nn_estimator.kneighbors(samples, return_distance=False)[:, 1:]
nn_label = (y[x] != target_class).astype(int)
n_maj = np.sum(nn_label, axis=1)
if kind == 'danger':
# Samples are in danger for m/2 <= m' < m
return np.bitwise_and(n_maj >= (nn_estimator.n_neighbors - 1) / 2,
n_maj < nn_estim
没有合适的资源?快使用搜索试试~ 我知道了~
python imbalanced-learn库实用例子(examples)代码
共96个文件
py:96个
需积分: 44 15 下载量 47 浏览量
2018-11-30
20:51:46
上传
评论
收藏 163KB ZIP 举报
温馨提示
python imbalanced-learn库实用例子(examples)代码 ,包括安装py文件,doc介绍,和examples源码。
资源推荐
资源详情
资源评论
收起资源包目录
imblearn.zip (96个子文件)
imblearn
exceptions.py 404B
ensemble
_bagging.py 9KB
_weight_boosting.py 12KB
base.py 2KB
__init__.py 588B
_easy_ensemble.py 10KB
tests
test_bagging.py 17KB
__init__.py 0B
test_weight_boosting.py 4KB
test_balance_cascade.py 7KB
test_forest.py 4KB
test_easy_ensemble.py 11KB
_forest.py 19KB
_balance_cascade.py 8KB
utils
_validation.py 22KB
estimator_checks.py 15KB
testing.py 6KB
__init__.py 412B
_docstring.py 1KB
deprecation.py 2KB
tests
test_docstring.py 1024B
test_estimator_checks.py 2KB
test_deprecation.py 584B
__init__.py 0B
test_testing.py 2KB
test_validation.py 14KB
tensorflow
__init__.py 193B
_generator.py 6KB
tests
test_generator.py 3KB
base.py 7KB
over_sampling
_adasyn.py 7KB
base.py 2KB
_smote.py 40KB
__init__.py 398B
_random_over_sampler.py 4KB
tests
test_adasyn.py 4KB
test_smote_nc.py 7KB
test_smote.py 13KB
test_svm_smote.py 2KB
__init__.py 0B
test_random_over_sampler.py 4KB
test_borderline_smote.py 2KB
metrics
__init__.py 658B
_classification.py 34KB
tests
test_classification.py 15KB
test_score_objects.py 2KB
_version.py 628B
__init__.py 2KB
pipeline.py 22KB
datasets
_zenodo.py 12KB
__init__.py 208B
tests
test_imbalance.py 2KB
test_zenodo.py 3KB
__init__.py 0B
_imbalance.py 5KB
keras
__init__.py 267B
_generator.py 9KB
tests
test_generator.py 4KB
__init__.py 0B
combine
_smote_enn.py 4KB
__init__.py 209B
_smote_tomek.py 5KB
tests
__init__.py 0B
test_smote_tomek.py 5KB
test_smote_enn.py 4KB
under_sampling
base.py 3KB
__init__.py 957B
_prototype_generation
__init__.py 232B
tests
test_cluster_centroids.py 4KB
__init__.py 0B
_cluster_centroids.py 6KB
_prototype_selection
_neighbourhood_cleaning_rule.py 7KB
_instance_hardness_threshold.py 7KB
_nearmiss.py 11KB
__init__.py 964B
_one_sided_selection.py 7KB
_random_under_sampler.py 5KB
_tomek_links.py 6KB
_condensed_nearest_neighbour.py 8KB
tests
test_repeated_edited_nearest_neighbours.py 8KB
test_neighbourhood_cleaning_rule.py 5KB
test_edited_nearest_neighbours.py 4KB
test_random_under_sampler.py 3KB
__init__.py 0B
test_tomek_links.py 3KB
test_instance_hardness_threshold.py 5KB
test_allknn.py 8KB
test_one_sided_selection.py 4KB
test_condensed_nearest_neighbour.py 4KB
test_nearmiss.py 8KB
_edited_nearest_neighbours.py 19KB
tests
__init__.py 0B
test_base.py 2KB
test_pipeline.py 35KB
test_common.py 3KB
test_exceptions.py 375B
共 96 条
- 1
资源评论
流水不毒
- 粉丝: 50
- 资源: 117
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功