import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score
# --- Data loading and preparation --------------------------------------
# Read the raw modeling table from disk and report its dimensions.
raw = pd.read_csv('data/data_model.csv')
print("读入数据的样本与变量个数:", raw.shape)  # (148077, 262)

# Features with a known high missing-value rate are dropped outright.
high_missing_features = [
    'xaccount_phonechange', 'xaccount_addrchange', 'xaccount_empchange',
    'xaccount_autopaytime', 'tran_cyclenum_STMT3roi',
    'tran_cyclenum_STMT6roi', 'tran_cyclenum_STMT12roi',
]
trimmed = raw.drop(high_missing_features, axis=1)
print("删除缺失率高的特征:", trimmed.shape)

# One-hot encode every string-typed column via get_dummies().
encoded = pd.get_dummies(trimmed)

# Separate the label column ('target') from the feature matrix.
y = encoded['target']
X = encoded.drop('target', axis=1)
print("划分数据标签后的特征与标签:", X.shape, y.shape)

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("划分训练集测试集:", X_train.shape, X_test.shape)
# Data standardization (z-score per column).
def standardize(X, mean=None, std=None):
    """Column-wise z-score standardization of ``X``.

    Parameters
    ----------
    X : ndarray or DataFrame of shape (n_samples, n_features)
        Data to standardize.
    mean, std : array-like of shape (n_features,), optional
        Precomputed per-column statistics. Passing the *training-set*
        statistics here allows a held-out set to be scaled consistently
        without leaking its own distribution. When omitted, statistics
        are computed from ``X`` itself (the original behavior).

    Returns
    -------
    Standardized data of the same type and shape as ``X``.
    """
    if mean is None:
        mean = np.mean(X, axis=0)
    if std is None:
        std = np.std(X, axis=0)
    # Guard constant columns: a zero std would divide by zero, so use 1.
    std = np.where(std == 0, 1, std)
    # NumPy broadcasting aligns the (n_features,) statistics against every
    # row, so the original's explicit reshape + repeat is unnecessary.
    return (X - mean) / std
# Standardize both splits with statistics estimated on the TRAINING set only.
# The original code standardized the test set with its own mean/std, which
# leaks test-distribution information and scales the two splits inconsistently.
_train_mean = np.mean(X_train, axis=0)
_train_std = np.std(X_train, axis=0)
_train_std = np.where(_train_std == 0, 1, _train_std)  # guard constant columns
X_train_std = (X_train - _train_mean) / _train_std
X_test_std = (X_test - _train_mean) / _train_std
# --- Visual check of the standardization --------------------------------
# Histogram of one example column before and after scaling.
hist_col = 1  # column index to display
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
axs[0].hist(X_train.iloc[:, hist_col], bins=50)
axs[0].set_xlabel(f'Feature {hist_col}')
axs[0].set_ylabel('Frequency')
axs[0].set_title('Original Data')
axs[1].hist(X_train_std.iloc[:, hist_col], bins=50)
axs[1].set_xlabel(f'Feature {hist_col}')
axs[1].set_ylabel('Frequency')
axs[1].set_title('Standardized Data')
# plt.show()

# Box plots of four example columns before and after scaling.
# NOTE(fix): the original labeled columns 10..13 as "Feature 1..4";
# the tick labels now match the plotted column indices.
box_cols = slice(10, 14)
box_labels = [f'Feature {i}' for i in range(10, 14)]
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
axs[0].boxplot(X_train.iloc[:, box_cols])
axs[0].set_xticklabels(box_labels)
axs[0].set_ylabel('Value')
axs[0].set_title('Original Data')
axs[1].boxplot(X_train_std.iloc[:, box_cols])
axs[1].set_xticklabels(box_labels)
axs[1].set_ylabel('Value')
axs[1].set_title('Standardized Data')
# plt.show()
# --- Feature selection ---------------------------------------------------
# Filter method: keep the 50 columns with the highest ANOVA F-statistic
# against the label. (Commented-out wrapper/RFE and embedded/L1 variants
# were dead code and have been removed; note the embedded variant also
# inconsistently fit on the *unscaled* data before transforming the scaled
# data, so it was not a safe reference implementation.)
selector_filter = SelectKBest(f_classif, k=50)
X_train_filter = selector_filter.fit_transform(X_train_std, y_train)
# Reuse the selector fitted on the training split for the test split.
X_test_filter = selector_filter.transform(X_test_std)

# Quick sanity check: train a logistic regression on the selected columns
# and report held-out accuracy.
clf = LogisticRegression()
clf.fit(X_train_filter, y_train)
score_filter = clf.score(X_test_filter, y_test)
print('过滤式特征选择的测试准确率:', score_filter)

# Continue downstream with the filter-selected feature set.
X_train = X_train_filter
X_test = X_test_filter
# --- Model selection and performance -------------------------------------
def _report(name, model, X_tr, y_tr, X_te, y_te):
    """Fit *model* on the training split, then print test-set
    accuracy, recall, and F1 under the label *name*."""
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    accuracy = accuracy_score(y_te, y_pred)
    recall = recall_score(y_te, y_pred)
    f1 = f1_score(y_te, y_pred)
    # Same output format as the original per-model print statements.
    print(f'{name} \naccuracy:', accuracy, '\n recall:', recall, '\n f1_score:', f1)

# Compare a k-nearest-neighbors classifier against a decision tree on the
# filter-selected features; the duplicated fit/score/print sequence is
# factored into _report above.
_report('knn', KNeighborsClassifier(), X_train, y_train, X_test, y_test)
_report('dt', DecisionTreeClassifier(), X_train, y_train, X_test, y_test)
没有合适的资源?快使用搜索试试~ 我知道了~
机器学习实验-基于信用卡数据建立行为评分模型的机器学习方法
共13个文件
xml:4个
pyc:2个
csv:2个
需积分: 10 3 下载量 153 浏览量
2023-03-31
17:01:23
上传
评论
收藏 41.61MB ZIP 举报
温馨提示
机器学习实验-基于信用卡数据建立行为评分模型的机器学习方法-python, 包括数据与源代码(.ipython为主要文件, .py可补充特征选择方法)
资源推荐
资源详情
资源评论
收起资源包目录
源代码.zip (13个子文件)
CreditForecast
__init__.py 0B
forecast.py 5KB
data
missing_rate.csv 7KB
data_model.csv 141MB
.idea
workspace.xml 6KB
misc.xml 294B
inspectionProfiles
profiles_settings.xml 174B
modules.xml 301B
CreditForecast-master.iml 566B
preprocess.ipynb 900KB
.gitignore 48B
__pycache__
data_helper.cpython-37.pyc 8KB
m1_xgb.cpython-37.pyc 5KB
共 13 条
- 1
资源评论
lagoon_lala
- 粉丝: 355
- 资源: 55
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功