"""Class to perform over-sampling using SMOTE."""
# Authors: Guillaume Lemaitre <g.lemaitre58@gmail.com>
# Fernando Nogueira
# Christos Aridas
# Dzianis Dudnik
# License: MIT
from __future__ import division
import types
import warnings
from collections import Counter
import numpy as np
from scipy import sparse
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVC
from sklearn.utils import check_random_state
from sklearn.utils import safe_indexing
from sklearn.utils import check_array
from sklearn.utils import check_X_y
from sklearn.utils.sparsefuncs_fast import csr_mean_variance_axis0
from sklearn.utils.sparsefuncs_fast import csc_mean_variance_axis0
from .base import BaseOverSampler
from ..exceptions import raise_isinstance_error
from ..utils import check_neighbors_object
from ..utils import check_target_type
from ..utils import Substitution
from ..utils._docstring import _random_state_docstring
# FIXME: remove in 0.6
# NOTE(review): presumably the set of accepted values for the deprecated
# ``kind`` argument of the historical SMOTE API -- confirm against the
# code that validates ``kind`` (not visible in this chunk).
SMOTE_KIND = ('regular', 'borderline1', 'borderline2', 'svm')
class BaseSMOTE(BaseOverSampler):
"""Base class for the different SMOTE algorithms."""
def __init__(self, sampling_strategy='auto', random_state=None,
             k_neighbors=5, n_jobs=1, ratio=None):
    """Store the hyper-parameters shared by every SMOTE variant.

    Parameters
    ----------
    sampling_strategy : str, dict or callable, optional (default='auto')
        Strategy forwarded to the over-sampler base class.

    random_state : int, RandomState instance or None, optional (default=None)
        Seed / generator controlling the randomization.

    k_neighbors : int or estimator, optional (default=5)
        Number of nearest neighbours (or a pre-built k-NN estimator)
        used to build synthetic samples.

    n_jobs : int, optional (default=1)
        Number of threads used by the nearest-neighbours search.

    ratio : dict or str, optional (default=None)
        Deprecated alias of ``sampling_strategy`` kept for
        backward compatibility.
    """
    # Two-argument ``super`` kept on purpose: the file still targets
    # Python 2 (see ``from __future__ import division`` at the top).
    super(BaseSMOTE, self).__init__(
        sampling_strategy=sampling_strategy, ratio=ratio)
    self.random_state = random_state
    self.k_neighbors = k_neighbors
    self.n_jobs = n_jobs
def _validate_estimator(self):
    """Build the k-NN estimator shared across the SMOTE algorithms.

    Creates ``self.nn_k_`` from ``self.k_neighbors`` (one extra
    neighbour is requested because the query point itself is returned
    first by ``kneighbors``) and propagates ``self.n_jobs`` to it.
    """
    nn_k = check_neighbors_object(
        'k_neighbors', self.k_neighbors, additional_neighbor=1)
    nn_k.set_params(n_jobs=self.n_jobs)
    self.nn_k_ = nn_k
def _make_samples(self, X, y_dtype, y_type, nn_data, nn_num, n_samples,
                  step_size=1.):
    """Create synthetic samples along the segments joining neighbours.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Points from which the points will be created.

    y_dtype : dtype
        The data type of the targets.

    y_type : str or int
        The minority target value, used to label every generated sample.

    nn_data : ndarray, shape (n_samples_all, n_features)
        Data set carrying all the neighbours to be used.

    nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
        The nearest neighbours of each sample in `nn_data`.

    n_samples : int
        The number of samples to generate.

    step_size : float, optional (default=1.)
        The step size to create samples.

    Returns
    -------
    X_new : {ndarray, sparse matrix}, shape (n_samples_new, n_features)
        Synthetically generated samples.

    y_new : ndarray, shape (n_samples_new,)
        Target values for synthetic samples.
    """
    rng = check_random_state(self.random_state)
    # Draw flat indices into the (sample, neighbour) matrix, then the
    # interpolation steps -- the two draws must stay in this order to
    # keep the random stream identical for a given seed.
    flat_indices = rng.randint(low=0, high=nn_num.size, size=n_samples)
    steps = step_size * rng.uniform(size=n_samples)
    # Unravel the flat index into (base sample row, neighbour column).
    rows = flat_indices // nn_num.shape[1]
    cols = flat_indices % nn_num.shape[1]

    y_new = np.array([y_type] * n_samples, dtype=y_dtype)

    if sparse.issparse(X):
        # Accumulate COO triplets; all-zero base rows are skipped and
        # thus stay implicit zeros in the output matrix.
        out_rows, out_cols, out_vals = [], [], []
        for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
            if X[row].nnz:
                sample = self._generate_sample(
                    X, nn_data, nn_num, row, col, step)
                out_rows.extend([i] * len(sample.indices))
                out_cols.extend(sample.indices.tolist())
                out_vals.extend(sample.data.tolist())
        X_new = sparse.csr_matrix(
            (out_vals, (out_rows, out_cols)),
            shape=(n_samples, X.shape[1]), dtype=X.dtype)
        return X_new, y_new

    X_new = np.zeros((n_samples, X.shape[1]), dtype=X.dtype)
    for i, (row, col, step) in enumerate(zip(rows, cols, steps)):
        X_new[i] = self._generate_sample(X, nn_data, nn_num, row, col, step)
    return X_new, y_new
def _generate_sample(self, X, nn_data, nn_num, row, col, step):
r"""Generate a synthetic sample.
The rule for the generation is:
.. math::
\mathbf{s_{s}} = \mathbf{s_{i}} + \mathcal{u}(0, 1) \times
(\mathbf{s_{i}} - \mathbf{s_{nn}}) \,
where \mathbf{s_{s}} is the new synthetic samples, \mathbf{s_{i}} is
the current sample, \mathbf{s_{nn}} is a randomly selected neighbors of
\mathbf{s_{i}} and \mathcal{u}(0, 1) is a random number between [0, 1).
Parameters
----------
X : {array-like, sparse matrix}, shape (n_samples, n_features)
Points from which the points will be created.
nn_data : ndarray, shape (n_samples_all, n_features)
Data set carrying all the neighbours to be used.
nn_num : ndarray, shape (n_samples_all, k_nearest_neighbours)
The nearest neighbours of each sample in `nn_data`.
row : int
Index pointing at feature vector in X which will be used
as a base for creating new sample.
col : int
Index pointing at which nearest neighbor of base feature vector
will be used when creating new sample.
step : float
Step size for new sample.
Returns
-------
X_new : {ndarray, sparse matrix}, shape (n_features,)
Single synthetically generated sample.
"""
return X[row] - step * (X[row] - nn_data[nn_num[row, col]])
def _in_danger_noise(self, nn_estimator, samples, target_class, y,
kind='danger'):
"""Estimate if a set of sample are in danger or noise.
Used by BorderlineSMOTE and SVMSMOTE.
Parameters
----------
nn_estimator : estimator
An estimator that inherits from
:class:`sklearn.neighbors.base.KNeighborsMixin` use to determine if
a sample is in danger/noise.
samples : {array-like, sparse matrix}, shape (n_samples, n_features)
The samples to check if either they are in danger or not.
target_class : int or str
The target corresponding class being over-sampled.
y : array-like, shape (n_samples,)
The true label in order to check the neighbour labels.
kind : str, optional (default='danger')
The type of classification to use. Can be either:
- If 'danger', check if samples are in danger,
- If 'noise', check if samples are noise.
Returns
-------
output : ndarray, shape (n_samples,)
A boolean array where True refer to samples in danger or noise.
"""
x = nn_estimator.kneighbors(samples, return_distance=False)[:, 1:]
nn_label = (y[x] != target_class).astype(int)
n_maj = np.sum(nn_label, axis=1)
if kind == 'danger':
# Samples are in danger for m/2 <= m' < m
return np.bitwise_and(n_maj >= (nn_estimator.n_neighbors - 1) / 2,
n_maj < nn_estim
没有合适的资源?快使用搜索试试~ 我知道了~
python imbalanced-learn库实用例子(examples)代码
共96个文件
py:96个
需积分: 44 15 下载量 47 浏览量
2018-11-30
20:51:46
上传
评论
收藏 163KB ZIP 举报
温馨提示
python imbalanced-learn库实用例子(examples)代码 ,包括安装py文件,doc介绍,和examples源码。
资源推荐
资源详情
资源评论
收起资源包目录
imblearn.zip (96个子文件)
imblearn
exceptions.py 404B
ensemble
_bagging.py 9KB
_weight_boosting.py 12KB
base.py 2KB
__init__.py 588B
_easy_ensemble.py 10KB
tests
test_bagging.py 17KB
__init__.py 0B
test_weight_boosting.py 4KB
test_balance_cascade.py 7KB
test_forest.py 4KB
test_easy_ensemble.py 11KB
_forest.py 19KB
_balance_cascade.py 8KB
utils
_validation.py 22KB
estimator_checks.py 15KB
testing.py 6KB
__init__.py 412B
_docstring.py 1KB
deprecation.py 2KB
tests
test_docstring.py 1024B
test_estimator_checks.py 2KB
test_deprecation.py 584B
__init__.py 0B
test_testing.py 2KB
test_validation.py 14KB
tensorflow
__init__.py 193B
_generator.py 6KB
tests
test_generator.py 3KB
base.py 7KB
over_sampling
_adasyn.py 7KB
base.py 2KB
_smote.py 40KB
__init__.py 398B
_random_over_sampler.py 4KB
tests
test_adasyn.py 4KB
test_smote_nc.py 7KB
test_smote.py 13KB
test_svm_smote.py 2KB
__init__.py 0B
test_random_over_sampler.py 4KB
test_borderline_smote.py 2KB
metrics
__init__.py 658B
_classification.py 34KB
tests
test_classification.py 15KB
test_score_objects.py 2KB
_version.py 628B
__init__.py 2KB
pipeline.py 22KB
datasets
_zenodo.py 12KB
__init__.py 208B
tests
test_imbalance.py 2KB
test_zenodo.py 3KB
__init__.py 0B
_imbalance.py 5KB
keras
__init__.py 267B
_generator.py 9KB
tests
test_generator.py 4KB
__init__.py 0B
combine
_smote_enn.py 4KB
__init__.py 209B
_smote_tomek.py 5KB
tests
__init__.py 0B
test_smote_tomek.py 5KB
test_smote_enn.py 4KB
under_sampling
base.py 3KB
__init__.py 957B
_prototype_generation
__init__.py 232B
tests
test_cluster_centroids.py 4KB
__init__.py 0B
_cluster_centroids.py 6KB
_prototype_selection
_neighbourhood_cleaning_rule.py 7KB
_instance_hardness_threshold.py 7KB
_nearmiss.py 11KB
__init__.py 964B
_one_sided_selection.py 7KB
_random_under_sampler.py 5KB
_tomek_links.py 6KB
_condensed_nearest_neighbour.py 8KB
tests
test_repeated_edited_nearest_neighbours.py 8KB
test_neighbourhood_cleaning_rule.py 5KB
test_edited_nearest_neighbours.py 4KB
test_random_under_sampler.py 3KB
__init__.py 0B
test_tomek_links.py 3KB
test_instance_hardness_threshold.py 5KB
test_allknn.py 8KB
test_one_sided_selection.py 4KB
test_condensed_nearest_neighbour.py 4KB
test_nearmiss.py 8KB
_edited_nearest_neighbours.py 19KB
tests
__init__.py 0B
test_base.py 2KB
test_pipeline.py 35KB
test_common.py 3KB
test_exceptions.py 375B
共 96 条
- 1
资源评论
流水不毒
- 粉丝: 50
- 资源: 117
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功