# -*- coding: utf-8 -*-
"""
@author: quincyqiang
@software: PyCharm
@file: gen_feas.py
@time: 2020/9/2 23:36
@description:
"""
import time
import pickle
import multiprocessing
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
# import nltk
from sklearn.cluster import KMeans
# from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestRegressor
from gensim.models.word2vec import Word2Vec
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from tqdm import tqdm
import os
import datetime
from tqdm import tqdm
import warnings
import datetime
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings("ignore")
tqdm.pandas()
def w2v_transform(X, word2vec, length):
    """Turn each token sequence in ``X`` into one fixed-width pooled vector.

    For every row, the vectors of the tokens found in ``word2vec`` are
    pooled across the tokens twice (element-wise mean and element-wise max)
    and the two results are concatenated.

    Args:
        X: iterable of token sequences (one sequence per row).
        word2vec: mapping-like object supporting ``token in word2vec`` and
            ``word2vec[token]``.
            # assumes every vector has ``length`` entries -- TODO confirm
        length: size of the zero vector substituted for rows that have no
            known token.

    Returns:
        ``np.ndarray`` with one row per sequence in ``X`` and ``2 * length``
        columns (mean part first, max part second), provided the vectors
        have ``length`` entries. An empty ``X`` yields shape
        ``(0, 2 * length)``.
    """
    pooled_rows = []
    for words in X:
        # Look the tokens up once and reuse the result for both poolings.
        vectors = [word2vec[w] for w in words if w in word2vec]
        if not vectors:
            # No known token: fall back to a single all-zero vector.
            vectors = [np.zeros(length)]
        stacked = np.asarray(vectors)
        # Pool over the token axis (axis 0) so the output width does not
        # depend on how many tokens the row contains.
        pooled_rows.append(
            np.hstack([np.mean(stacked, axis=0), np.max(stacked, axis=0)])
        )
    if not pooled_rows:
        return np.empty((0, 2 * length))
    return np.array(pooled_rows)
def get_w2v(data_frame, feat, length):
    """Train a Word2Vec model on one column of ``data_frame``.

    Args:
        data_frame: object indexable by ``feat`` whose result exposes
            ``.values``; those values are passed to ``Word2Vec`` as the
            training sentences.
        feat: name of the column holding the sentences.
        length: dimensionality requested for the embedding vectors.

    Returns:
        The ``Word2Vec`` model built with window 20, min_count 1,
        10 workers and 10 training passes.
    """
    import inspect

    # gensim 4 renamed ``size`` -> ``vector_size`` and ``iter`` -> ``epochs``;
    # choose whichever pair the installed constructor accepts.
    accepted = inspect.signature(Word2Vec.__init__).parameters
    if "vector_size" in accepted:
        dim_key, passes_key = "vector_size", "epochs"
    else:
        dim_key, passes_key = "size", "iter"

    versioned_kwargs = {dim_key: length, passes_key: 10}
    model = Word2Vec(
        data_frame[feat].values,
        window=20,
        min_count=1,
        workers=10,
        **versioned_kwargs,
    )
    return model
def w2v_feat(data):
    """Append Word2Vec-derived columns built from the ``rid`` column.

    A model is trained on ``data['rid']`` via ``get_w2v``, the column is
    transformed with ``w2v_transform``, and each column of the resulting
    array is stored on ``data`` as ``w2vn0``, ``w2vn1``, ...

    Args:
        data: DataFrame with a ``rid`` column; it is modified in place.

    Returns:
        The same ``data`` object with the new columns attached.
    """
    embedding_dim = 50
    rid_model = get_w2v(data[['rid']], 'rid', embedding_dim)
    features = w2v_transform(data.rid.values, rid_model.wv, embedding_dim)

    n_columns = features.shape[1]
    for col_idx in range(n_columns):
        column_name = 'w2vn' + str(col_idx)
        data[column_name] = features[:, col_idx]
    return data
def load_data():
train = pd.read_csv('data/train.csv')
train_size = len(train)
test = pd.read_csv('data/testA.csv')
if not os.path.exists('data/data_v1.pkl'):
start_time = time.time()
print(f'generate features started at {time.ctime()}')
# 去除重复列
del train['n2.1']
del test['n2.1'], test['n2.2'], test['n2.3']
data = pd.concat([train, test], axis=0).reset_index(drop=True)
# 删除列
del data['policyCode']
print("data.shape:", data.shape)
# ========== 数据处理 ===================
numerical_fea = list(data.select_dtypes(exclude=['object']).columns)
numerical_fea.remove('isDefault')
numerical_fea.remove('id')
# category_fea = list(filter(lambda x: x not in numerical_fea, list(data.columns)))
# 按照中位数填充数值型特征
loss_numerical_feas = ['revolUtil', 'pubRecBankruptcies', 'dti']
data[loss_numerical_feas] = data[loss_numerical_feas].fillna(data[loss_numerical_feas].median())
# 按照众数填充类别型特征
loss_categorical_feas = ['employmentLength', 'employmentTitle', 'title', 'postCode']
for cate_fea in loss_categorical_feas:
data[cate_fea] = data[cate_fea].fillna(data[cate_fea].mode()[0])
# 匿名特征
n_feas = ['n0', 'n1', 'n2', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']
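# Fill the anonymous features with -1 so missing values can be counted later. NOTE(review): the statement below fills loss_numerical_feas (already median-filled above), not n_feas -- confirm intent.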
data[loss_numerical_feas] = data[loss_numerical_feas].fillna(-1)
# ================ 时间特征提取 ==================
# employmentLength对象类型特征转换到数值 雇佣年限
def employmentLength_to_int(s):
if s == '10+ years':
return 10
elif s == '< 1 year':
return 0
elif pd.isnull(s):
return s
else:
return np.int8(s.split()[0])
tqdm.pandas(desc="employmentLength_to_int", postfix=None)
data['employmentLength_years'] = data['employmentLength'].progress_apply(lambda x: employmentLength_to_int(x))
# issueDate:贷款发放的月份
data['issueDate'] = pd.to_datetime(data['issueDate'], format='%Y-%m-%d')
startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d') # 最小日期
# 构造时间特征
tqdm.pandas(desc="issueDate_start_lag", postfix=None)
data['issueDate_start_lag'] = data['issueDate'].progress_apply(lambda x: x - startdate).dt.days
data['issueDate_start_lag2year'] = data['issueDate_start_lag'] / 365
data['issueDate_year'] = data['issueDate'].dt.year
data['issueDate_month'] = data['issueDate'].dt.month
# data['issueDate_hour'] = data['issueDate'].dt.hour
data['issueDate_week'] = data['issueDate'].dt.dayofweek
data['issueDate_day'] = data['issueDate'].dt.day
# earliesCreditLine 借款人最早报告的信用额度开立的月份
tqdm.pandas(desc="earliesCreditLine", postfix=None)
month_maps = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4,
'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8,
'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
def ym(x):
month, year = x.split('-')
month = month_maps[month]
return year + '-' + str(month)
data['earliesCreditLine'] = data['earliesCreditLine'].progress_apply(lambda x: ym(x))
data['earliesCreditLine'] = pd.to_datetime(data['earliesCreditLine'], format='%Y-%m')
data['earliesCreditLine'].value_counts()
data['earliesCreditLine_year'] = data['earliesCreditLine'].dt.year
data['earliesCreditLine_month'] = data['earliesCreditLine'].dt.month
# 间隔特征
## 贷款日期与就业年限的差值
data['issueDate_employmentLength_years_lag'] = data['issueDate_year'] - data['employmentLength_years']
## Gap between the loan issue date and the earliest credit line date
data['issueDate_earliesCreditLine_daylag'] = (data['issueDate'] - data['earliesCreditLine']).dt.days # 天数间隔
data['issueDate_earliesCreditLine_yearlag'] = (data['issueDate'] - data[
'earliesCreditLine']).dt.days / 365 # 年间隔
# 地区编码
data['province'] = data['postCode'].apply(lambda x: str(x)[:2])
# ================ 类别特征等级编码 ==================
data['grade'] = data['grade'].map({'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7})
tqdm.pandas(desc="subGrade_value", postfix=None)
data['subGrade_value'] = data['subGrade'].progress_apply(lambda x: int(x[1]))
for fea in ['subGrade', 'employmentLength', 'issueDate', 'earliesCreditLine', 'province']:
lb = LabelEncoder()
data[fea] = lb.fit_transform(data[fea])
# =============== 长尾分布特征处理 ================
# cat_list = [i for i in train.columns if i not in ['id', 'isDefault', 'policyCode']]
cat_list = [i for i in data.columns if i not in ['id', 'isDefault', 'policyCode']]
for i in tqdm(cat_list, desc="长尾分布特征处理"):
if data[i].nunique() > 3:
data['{}_count'.format(i)] = data.groupby(['{}'.format(i)])['id'].transform('count') # 计数特征
# data[i + '_rank'] = data.groupby(i)['id'].transform('rank') # 排序特征
# ===================== amount_feas 分箱特征 ===============
amount_feas = ['loanAmnt', 'interestRate', 'installment', 'annualIncome', 'dti',
'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'revolBal', 'revolUtil', 'totalAcc']
for fea in tqdm(amount_feas, desc="分箱特征"):
# 通过除法映射到间隔均匀的分箱中,每个分箱的取值范围都是loanAmnt/1000
data['{}_bin1'.format(fea)] = np.floor_divide(data[fea], 1000)
## 通过对数函数映射到指数宽度分箱
data['{}_bin2'.format(fea)] = np.floor(np.log10(data[fea]))
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
【资源说明】 零基础入门金融风控-基于python的贷款违约预测源码+项目说明.zip零基础入门金融风控-基于python的贷款违约预测源码+项目说明.zip零基础入门金融风控-基于python的贷款违约预测源码+项目说明.zip零基础入门金融风控-基于python的贷款违约预测源码+项目说明.zip零基础入门金融风控-基于python的贷款违约预测源码+项目说明.zip零基础入门金融风控-基于python的贷款违约预测源码+项目说明.zip 零基础入门金融风控-基于python的贷款违约预测源码+项目说明.zip 零基础入门金融风控-基于python的贷款违约预测源码+项目说明.zip 零基础入门金融风控-基于python的贷款违约预测源码+项目说明.zip 【备注】 1、该资源内项目代码都经过测试运行成功,功能ok的情况下才上传的,请放心下载使用! 2、本项目适合计算机相关专业(如计科、人工智能、通信工程、自动化、电子信息等)的在校学生、老师或者企业员工下载使用,也适合小白学习进阶,当然也可作为毕设项目、课程设计、作业、项目初期立项演示等。 3、如果基础还行,也可在此代码基础上进行修改,以实现其他功能,也可直接用于毕设、课设、作业等。 欢迎下载,沟通交流,互相学习,共同进步!
资源推荐
资源详情
资源评论
收起资源包目录
零基础入门金融风控-基于python的贷款违约预测源码+项目说明.zip (9个子文件)
lgb.py 10KB
xgb.py 9KB
项目说明.md 8KB
others
gen_feas.py 4KB
features_importance.png 124KB
gen_feas.py 15KB
catboot.py 10KB
ensemble.py 3KB
EDA.ipynb 243KB
共 9 条
- 1
资源评论
- 珍爱19872024-03-08终于找到了超赞的宝藏资源,果断冲冲冲,支持!
onnx
- 粉丝: 9619
- 资源: 5597
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
最新资源
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功