import numpy as np
import pandas as pd
def SplitData(df, col, numOfSplit, special_attribute=[]):
    '''
    Pick candidate split points that cut ``col`` into roughly equal-sized groups.

    The values of ``col`` (after dropping the special values) are sorted, and the
    value found at every ``int(N / numOfSplit)``-th position is taken as a split
    point. Duplicate split points are collapsed.

    :param df: data set containing ``col``; it does not have to be pre-sorted,
               the values are sorted inside this function
    :param col: name of the variable to be binned
    :param numOfSplit: number of groups to cut the values into
    :param special_attribute: values that must be excluded before splitting.
               The list default is kept for interface compatibility; it is only
               read here, never mutated. ``None`` is treated like an empty list.
    :return: sorted list of distinct split points (each one is the first value
             to the right of a cut position in the sorted ``col``). An empty
             list is returned when no rows remain after the exclusion.
    '''
    # Only the column itself is needed, so avoid copying the whole frame.
    values = df[col]
    if special_attribute is not None and len(special_attribute) > 0:
        values = values[~values.isin(special_attribute)]

    rawValues = sorted(values)
    N = len(rawValues)
    if N == 0:
        # Nothing left to split; indexing rawValues below would raise IndexError.
        return []

    # Size of each group; may be 0 when there are fewer rows than groups, in
    # which case every cut position collapses onto the smallest value.
    n = N // numOfSplit
    splitPoint = {rawValues[i * n] for i in range(1, numOfSplit)}
    return sorted(splitPoint)
# def Chi2(df, total_col, bad_col, overallRate):
# '''
# :param df: 包含全部样本总计与坏样本总计的数据框
# :param total_col: 全部样本的个数
# :param bad_col: 坏样本的个数
# :param overallRate: 全体样本的坏样本占比
# :return: 卡方值
# '''
# df2 = df.copy()
# # 期望坏样本个数=全部样本个数*平均坏样本占比
# df2['expected'] = df[total_col].apply(lambda x: x*overallRate)
# combined = zip(df2['expected'], df2[bad_col])
# chi = [(i[0]-i[1])**2/i[0] for i in combined]
# chi2 = sum(chi)
# return chi2
def Chi2(df, total_col, bad_col):
    '''
    Compute the chi-square statistic of the bad/good counts across the rows of ``df``.

    For every row the expected bad (good) count is the row total multiplied by
    the overall bad (good) rate of ``df``; the statistic is the sum over all
    rows of ``(expected - observed) ** 2 / expected`` for both bad and good.

    :param df: data frame holding, per row, the total sample count and the bad sample count
    :param total_col: name of the column with the total number of samples
    :param bad_col: name of the column with the number of bad samples
    :return: chi-square value; 0 when the data contain only good or only bad
             samples, or no samples at all
    '''
    total = np.asarray(df[total_col], dtype=float)
    bad = np.asarray(df[bad_col], dtype=float)

    grandTotal = total.sum()
    if grandTotal == 0:
        # No samples at all: there is nothing to compare.
        return 0

    # Overall bad rate of the rows in df.
    badRate = bad.sum() / grandTotal
    # When all samples are good or all are bad, the chi-square value is 0.
    if badRate in [0, 1]:
        return 0
    goodRate = (grandTotal - bad.sum()) / grandTotal

    # Rows without samples have an expected count of 0 and would cause a
    # division by zero; they carry no information, so leave them out.
    populated = total > 0
    total = total[populated]
    bad = bad[populated]
    good = total - bad

    # Expected bad (good) count = row total * overall bad (good) rate.
    badExpected = total * badRate
    goodExpected = total * goodRate

    badChi = ((badExpected - bad) ** 2 / badExpected).sum()
    goodChi = ((goodExpected - good) ** 2 / goodExpected).sum()
    return float(badChi + goodChi)
# An alternative way to compute Chi2 (kept commented out for reference)
# def Chi2(df, total_col, bad_col):
# df2 = df.copy()
# df2['good'] = df2[total_col] - df2[bad_col]
# goodTotal = sum(df2['good'])
# badTotal = sum(df2[bad_col])
# p1 = df2.loc[0]['good']*1.0/df2.loc[0][total_col]
# p2 = df2.loc[1]['good']*1.0/df2.loc[1][total_col]
# w1 = df2.loc[0]['good']*1.0/goodTotal
# w2 = df2.loc[0][bad_col]*1.0/badTotal
# N = sum(df2[total_col])
# return N*(p1-p2)*(w1-w2)
def BinBadRate(df, col, target, grantRateIndicator=0):
    '''
    Compute the bad-sample rate of every distinct value (bin) of ``col``.

    :param df: data set for which the bad rates are computed
    :param col: feature whose distinct values define the bins
    :param target: label column; its per-bin sum is used as the bad count and
                   its per-bin non-null count as the total count
    :param grantRateIndicator: 0 to return only the per-bin results; any other
                   value additionally returns the overall bad rate
    :return: ``(dicts, regroup)`` or ``(dicts, regroup, overallRate)`` where
             ``dicts`` maps each bin to its bad rate and ``regroup`` is a data
             frame with the columns ``col``, ``'total'``, ``'bad'`` and
             ``'bad_rate'``
    '''
    # A single groupby pass yields both the sample count and the bad count per bin.
    regroup = df.groupby([col])[target].agg(['count', 'sum'])
    regroup = regroup.rename(columns={'count': 'total', 'sum': 'bad'})
    regroup = regroup.reset_index()

    # Vectorised ratio instead of a row-by-row apply.
    regroup['bad_rate'] = regroup['bad'] * 1.0 / regroup['total']

    # Mapping from each bin to its bad rate.
    dicts = dict(zip(regroup[col], regroup['bad_rate']))
    if grantRateIndicator == 0:
        return (dicts, regroup)

    N = regroup['total'].sum()
    B = regroup['bad'].sum()
    overallRate = B * 1.0 / N
    return (dicts, regroup, overallRate)
### ChiMerge_MaxInterval: bin a continuous variable by repeatedly merging the adjacent groups with the smallest chi-square value until the specified maximum number of intervals is reached
def ChiMerge(df, col, target, max_interval=5,special_attribute=[],minBinPcnt=0):
'''
:param df: 包含目标变量与分箱属性的数据框
:param col: 需要分箱的属性
:param target: 目标变量,取值0或1
:param max_interval: 最大分箱数。如果原始属性的取值个数低于该参数,不执行这段函数
:param special_attribute: 不参与分箱的属性取值
:param minBinPcnt:最小箱的占比,默认为0
:return: 分箱结果
'''
colLevels = sorted(list(set(df[col])))
N_distinct = len(colLevels)
if N_distinct <= max_interval: #如果原始属性的取值个数低于max_interval,不执行这段函数
print ("The number of original levels for {} is less than or equal to max intervals".format(col))
return colLevels[:-1]
else:
if len(special_attribute)>=1:
df1 = df.loc[df[col].isin(special_attribute)]
df2 = df.loc[~df[col].isin(special_attribute)]
else:
df2 = df.copy() # 去掉special_attribute后的df
N_distinct = len(list(set(df2[col])))
# 步骤一: 通过col对数据集进行分组,求出每组的总样本数与坏样本数
if N_distinct > 100:
split_x = SplitData(df2, col, 100)
df2['temp'] = df2[col].map(lambda x: AssignGroup(x, split_x))
# Assgingroup函数:每一行的数值和切分点做对比,返回原值在切分后的映射,
# 经过map以后,生成该特征的值对象的“分箱”后的值
else:
df2['temp'] = df2[col]
# 总体bad rate将被用来计算expected bad count
(binBadRate, regroup, overallRate) = BinBadRate(df2, 'temp', target, grantRateIndicator=1)
# 首先,每个单独的属性值将被分为单独的一组
# 对属性值进行排序,然后两两组别进行合并
colLevels = sorted(list(set(df2['temp'])))
groupIntervals = [[i] for i in colLevels] #把每个箱的值打包成[[],[]]的形式
# 步骤二:建立循环,不断合并最优的相邻两个组别,直到:
# 1,最终分裂出来的分箱数<=预设的最大分箱数
# 2,每箱的占比不低于预设值(可选)
# 3,每箱同时包含好坏样本
# 如果有特殊属性,那么最终分裂出来的分箱数=预设的最大分箱数-特殊属性的个数
split_intervals = max_interval - len(special_attribute)
while (len(groupIntervals) > split_intervals): # 终止条件: 当前分箱数=预设的分箱数
# 每次循环时, 计算合并相邻组别后的卡方值。具有最小卡方值的合并方案,是最优方案
chisqList = []
for k in range(len(groupIntervals)-1):
temp_group = groupIntervals[k] + groupIntervals[k+1]
df2b = regroup.loc[regroup['temp'].isin(temp_group)]
#chisq = Chi2(df2b, 'total', 'bad', overallRate)
chisq = Chi2(df2b, 'total', 'bad')
chisqList.append(chisq)
best_comnbined = chisqList.index(min(chisqList))
# 把groupIntervals的值改成类似的值改成类似从[[1][2],[3]]到[[1,2],[3]]
groupIntervals[best_comnbined] = groupIntervals[best_comnbined] + groupIntervals[best_comnbined+1]
groupIntervals.remove(groupIntervals[best_comnbined+1])
groupIntervals = [sorted(i) for i in groupIntervals]
cutOffPoints = [max(i) for i in groupIntervals[:-1]] #
# 检查是否有箱没有好或者坏样本。如果有,需要跟相邻的箱