# coding:utf-8
import datetime
import time
import warnings
import networkx as nx
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')
def gen_thres_new(df_train, oof_preds):
    """Search for the decision threshold that maximizes macro F1 on the OOF predictions."""
    df_train['oof_preds'] = oof_preds
    # Start from the quantile implied by the positive rate (so that roughly that share
    # of samples exceeds it), then grid-search +/- 0.2 around it in 0.01 steps.
    quantile_point = df_train['black_flag'].mean()
    thres = df_train['oof_preds'].quantile(1 - quantile_point)
    _thresh = []
    for thres_item in np.arange(thres - 0.2, thres + 0.2, 0.01):
        _thresh.append(
            [thres_item, f1_score(df_train['black_flag'], np.where(oof_preds > thres_item, 1, 0), average='macro')])
    _thresh = np.array(_thresh)
    best_id = _thresh[:, 1].argmax()
    best_thresh = _thresh[best_id][0]
    print("threshold: {}\ntrain-set f1: {}".format(best_thresh, _thresh[best_id][1]))
    return best_thresh
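# Usage sketch (hypothetical names: `oof` would hold out-of-fold probabilities from a
# CV loop, `test_preds` the averaged test probabilities):
#   best_thresh = gen_thres_new(train_label, oof)
#   test_label = np.where(test_preds > best_thresh, 1, 0)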
def lgb_f1_score(y_hat, data):
    """Custom LightGBM eval: flag the top 25% of scored rows as positive, then F1 vs. the truth."""
    y_true = data.get_label()
    # y_hat = np.round(y_hat)  # scikit's f1 doesn't like probabilities
    submit = pd.DataFrame()
    submit['bz_predict'] = y_hat
    submit['flag'] = y_true
    submit = submit.sort_values(['bz_predict'], ascending=False).reset_index(drop=True)
    submit['black_flag'] = 0
    submit.loc[submit.index < submit.shape[0] * 0.25, 'black_flag'] = 1
    # y_true first, y_pred second (binary F1 happens to be symmetric under the swap,
    # but keep scikit-learn's argument convention)
    return 'f1', f1_score(submit['flag'], submit['black_flag']), True
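# Usage sketch, assuming lightgbm is available (it is not imported above). The function
# follows LightGBM's feval signature of (preds, train_data) -> (name, value, is_higher_better):
#   import lightgbm as lgb
#   model = lgb.train(params, dtrain, valid_sets=[dval], feval=lgb_f1_score)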
def calc_max_coutinut_times(a, b=1):
    """Length of the longest consecutive run of value `b` in sequence `a` (never below 1)."""
    t = 1  # current run length (the original initialized this to 0, which undercounted a run at the start by one)
    w = 1  # best run length seen so far
    for k, v in enumerate(a):
        if k > 0:
            if v == b and a[k - 1] == b:
                t += 1
                if w < t:
                    w = t
            else:
                t = 1
    return w
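# Examples (with the run-length fix above):
#   calc_max_coutinut_times([1, 1, 0, 1, 1, 1])  -> 3  (longest run of 1s)
#   calc_max_coutinut_times([0, 0, 0], b=1)      -> 1  (floored at 1 even with no match)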
def gen_target_encoding_feats(train, test, encode_cols, target_col, n_fold=5):
    """Generate out-of-fold target-encoding features."""
    # for the training set: cross-validated encoding to avoid target leakage
    tg_feats = np.zeros((train.shape[0], len(encode_cols)))
    kfold = StratifiedKFold(n_splits=n_fold, random_state=42, shuffle=True)
    for train_index, val_index in kfold.split(train[encode_cols], train[target_col]):
        df_train, df_val = train.iloc[train_index], train.iloc[val_index].copy()
        for idx, col in enumerate(encode_cols):
            # mean target per category, computed on the in-fold rows only
            target_mean_dict = df_train.groupby(col)[target_col].mean()
            df_val[f'{col}_mean_target'] = df_val[col].map(target_mean_dict)
            tg_feats[val_index, idx] = df_val[f'{col}_mean_target'].values
    for idx, encode_col in enumerate(encode_cols):
        train[f'{encode_col}_mean_target'] = tg_feats[:, idx]
    # for the test set: encode with means from the full training set
    for col in encode_cols:
        target_mean_dict = train.groupby(col)[target_col].mean()
        test[f'{col}_mean_target'] = test[col].map(target_mean_dict)
    return train, test
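# Usage sketch ('some_cat_col' is a hypothetical categorical column; 'black_flag' is
# the label column used throughout this script):
#   train_df, test_df = gen_target_encoding_feats(train_df, test_df,
#                                                 encode_cols=['some_cat_col'],
#                                                 target_col='black_flag', n_fold=5)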
def findMaxAverage(nums, k):
    """Maximum average over all length-k windows (fixed-size sliding window)."""
    average = []  # collect each window's average, then take the max at the end
    sum_, start = 0, 0  # trailing underscore avoids shadowing the built-in sum
    for end in range(len(nums)):
        sum_ += nums[end]
        if end >= k - 1:  # the window has reached size k
            average.append(sum_ / k)  # record this window's average
            sum_ -= nums[start]  # drop the element leaving the window
            start += 1  # slide the window by one
    return max(average)
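# Example: findMaxAverage([1, 12, -5, -6, 50, 3], 4) -> 12.75  (window [12, -5, -6, 50])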
def findMinAverage(nums, k):
    """Minimum average over all length-k windows (fixed-size sliding window)."""
    average = []  # collect each window's average, then take the min at the end
    sum_, start = 0, 0  # trailing underscore avoids shadowing the built-in sum
    for end in range(len(nums)):
        sum_ += nums[end]
        if end >= k - 1:  # the window has reached size k
            average.append(sum_ / k)  # record this window's average
            sum_ -= nums[start]  # drop the element leaving the window
            start += 1  # slide the window by one
    return min(average)
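# Example: findMinAverage([1, 12, -5, -6, 50, 3], 4) -> 0.5  (window [1, 12, -5, -6])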
path = '../data/'
static_info = pd.read_csv(path + '账户静态信息.csv')    # static account information
time_info = pd.read_csv(path + '账户交易信息.csv')      # account transaction records
train_label = pd.read_csv(path + '训练集标签.csv')      # training-set labels
test_no_label = pd.read_csv(path + 'test_dataset.csv')  # test accounts, no label
# Balance-consistency feature: reconstruct the balance each transaction appears to have
# started from and compare it with the previous row's closing balance; a non-zero gap
# suggests transactions missing from the statement.
time_info['tt'] = time_info['jyrq'] + ' ' + time_info['jysj']  # transaction date + time
time_info['tt'] = pd.to_datetime(time_info['tt'])
time_info = time_info.sort_values(['zhdh', 'tt'])  # order each account's rows in time
time_info['ttt'] = time_info['jdbj'].apply(lambda x: 1 if x == 0 else -1)  # debit/credit flag -> sign
time_info['ttt_jyje'] = time_info['jyje'] * time_info['ttt']  # signed transaction amount
time_info['tttt'] = time_info['zhye'] + time_info['ttt_jyje']  # implied balance before this row
a = time_info[['zhdh', 'jyje', 'zhye', 'tttt', 'ttt_jyje', 'ttt']].copy()
a['ttttt'] = a.groupby(['zhdh'])['tttt'].shift(-1)  # next row's implied opening balance, per account
a['zhye'] = a['zhye'].astype('float')
a['ttttt'] = a['ttttt'].astype('float')
a['ttttt_tttt'] = a['zhye'] - a['ttttt']  # gap vs. this row's closing balance (~0 if consistent)
a = a.dropna()
a['ttttt_tttt'] = a['ttttt_tttt'].astype(int)
ff1 = a.groupby(['zhdh']).agg({'ttttt_tttt': ['mean', 'var']}).reset_index()
ff1.columns = ['zhdh', 'ttttt_tttt_mean', 'ttttt_tttt_var']
# drop the helper columns; only the per-account aggregates in ff1 are kept
del time_info['ttt']
del time_info['ttt_jyje']
del time_info['tt']
del time_info['tttt']
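# Worked example for the gap feature built above (a hedged reading of the sign convention):
# row i closes at zhye = 100; row i+1 spends jyje = 30 (ttt = +1) and closes at zhye = 70.
# tttt for row i+1 is 70 + 30 = 100, i.e. the balance before that transaction, so the gap
# is 100 - 100 = 0. A non-zero per-account mean/variance of this gap (ff1) points to
# transactions missing from the statement.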
# month-3 (March) sub-model: its train/test predictions are merged in as meta-features
add_train_3 = pd.read_csv('../submit/train_3_lgb.csv')
add_train_3.columns = ['zhdh', 'black_flag_3', 'predict_3']
add_test_3 = pd.read_csv('../submit/test_3_lgb.csv')
add_test_3.columns = ['zhdh', 'black_flag_3', 'predict_3']
# month-4 (April) sub-model
add_train_4 = pd.read_csv('../submit/train_4_lgb.csv')
add_train_4.columns = ['zhdh', 'black_flag_4', 'predict_4']
add_test_4 = pd.read_csv('../submit/test_4_lgb.csv')
add_test_4.columns = ['zhdh', 'black_flag_4', 'predict_4']
# month-5 (May) sub-model
add_train_5 = pd.read_csv('../submit/train_5_lgb.csv')
add_train_5.columns = ['zhdh', 'black_flag_5', 'predict_5']
add_test_5 = pd.read_csv('../submit/test_5_lgb.csv')
add_test_5.columns = ['zhdh', 'black_flag_5', 'predict_5']
add_train = pd.merge(add_train_3[['zhdh', 'predict_3']], add_train_4[['zhdh', 'predict_4']], on=['zhdh'])
add_train = pd.merge(add_train, add_train_5[['zhdh', 'predict_5']], on=['zhdh'])
add_test = pd.merge(add_test_3[['zhdh', 'predict_3']], add_test_4[['zhdh', 'predict_4']], on=['zhdh'])
add_test = pd.merge(add_test, add_test_5[['zhdh', 'predict_5']], on=['zhdh'])
# # month-3 sub-model
# add_train_xgb_3 = pd.read_csv('../submit/train_3_xgb.csv')
# add_train_xgb_3.columns = ['zhdh', 'black_flag_xgb_3', 'predict_xgb_3']
# add_test_xgb_3 = pd.read_csv('../submit/test_3_xgb.csv')
# add_test_xgb_3.columns = ['zhdh', 'black_flag_xgb_3', 'predict_xgb_3']
# # month-4 sub-model
# add_train_xgb_4 = pd.read_csv('../submit/train_4_xgb.csv')
# add_train_xgb_4.columns = ['zhdh', 'black_flag_xgb_4', 'predict_xgb_4']
# add_test_xgb_4 = pd.read_csv('../submit/test_4_xgb.csv')
# add_test_xgb_4.columns = ['zhdh', 'black_flag_xgb_4', 'predict_xgb_4']
# # month-5 sub-model
# add_train_xgb_5 = pd.read_csv('../submit/train_5_xgb.csv')
# add_train_xgb_5.columns = ['zhdh', 'black_flag_xgb_5', 'predict_xgb_5']
# add_test_xgb_5 = pd.read_csv('../submit/test_5_xgb.csv')
# add_test_xgb_5.columns = ['zhdh', 'black_flag_xgb_5', 'predict_xgb_5']
#
# add_train_xgb = pd.merge(add_train_xgb_3[['zhdh', 'predict_xgb_3']], add_train_xgb_4[['zhdh', 'predict_xgb_4']],
# on=['zhdh'])
# add_train_xgb = pd.merge(add_train_xgb, add_train_xgb_5[['zhdh', 'predict_xgb_5']], on=['zhdh'])
#
# add_test_xgb = pd.merge(add_test_xgb_3[['zhdh', 'predict_xgb_3']], add_test_xgb_4[['zhdh', 'predict_xgb_4']],
# on=['zhdh'])
# add_test_xgb = pd.merge(add_test_xgb, add_test_xgb_5[['zhdh', 'predict_xgb_5']], on=['zhdh'])
###########################
# # month-3 sub-model
# add_train_cat_3 = pd.read_csv('../submit/train_cat_3.csv')
# add_train_cat_3.columns = ['zhdh', 'black_flag_cat_3', 'predict_cat_3']
# add_test_cat_3 = pd.read_csv('../submit/test_3_xgb.csv')  # NOTE: reads the xgb file; test_cat_3.csv was presumably intended
# add_test_cat_3.columns = ['zhdh', 'black_flag_cat_3', 'predict_cat_3']
# # month-4 sub-model
# add_train_cat_4 = pd.read_csv('../submit/train_cat_4.csv')
# add_train_cat_4.columns = ['zhdh', 'black_flag_cat_4', 'predict_cat_4']
# add_test_cat_4 = pd.read_csv('../submit/test_cat_4.csv')
# add_test_cat_4.columns = ['zhdh', 'black_flag_cat_4', 'predict_cat_4']
# # month-5 sub-model
# add_tra
###########################
# Resource notes (translated from the original Chinese bundle description).
# Tech-finance application: fraud risk identification.
#
# Basic approach: the features are all aggregate statistics. The core idea is to train
# three sub-models on the March / April / May slices separately (motivation: an account
# with transactions in March that suddenly goes silent in April is very likely fraud,
# so each sub-model learns that month's behaviour) and feed their predictions in as
# features. Model 2 converts all absolute timestamps to relative time and again uses
# the three sub-models' predictions as features. Because different random seeds changed
# the results noticeably, multiple seeds are blended. Submitting 1200 positive samples
# scored about 0.881 online; submitting 1100 scored about 0.885-0.886.
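#
# Minimal multi-seed blending sketch of the idea above (a hedged illustration; `params`,
# `dtrain` and `dtest` are hypothetical stand-ins, not the author's exact objects):
#   preds = []
#   for seed in [42, 1024, 2022]:          # hypothetical seed list
#       params['seed'] = seed              # xgboost's RNG seed parameter
#       booster = xgb.train(params, dtrain, num_boost_round=500)
#       preds.append(booster.predict(dtest))
#   final_pred = np.mean(preds, axis=0)    # simple average across seeds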