import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score
# --- Data loading and preparation --------------------------------------
# Read the raw modeling table from disk and report its dimensions.
raw = pd.read_csv('data/data_model.csv')
print("读入数据的样本与变量个数:", raw.shape)  # (148077, 262)

# Features with a known high missing-value rate are dropped outright.
high_missing_features = [
    'xaccount_phonechange', 'xaccount_addrchange', 'xaccount_empchange',
    'xaccount_autopaytime', 'tran_cyclenum_STMT3roi',
    'tran_cyclenum_STMT6roi', 'tran_cyclenum_STMT12roi',
]
trimmed = raw.drop(high_missing_features, axis=1)
print("删除缺失率高的特征:", trimmed.shape)

# One-hot encode every string-typed column via get_dummies().
encoded = pd.get_dummies(trimmed)

# Separate the label column ('target') from the feature matrix.
y = encoded['target']
X = encoded.drop('target', axis=1)
print("划分数据标签后的特征与标签:", X.shape, y.shape)

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("划分训练集测试集:", X_train.shape, X_test.shape)
# Data standardization (z-score per column).
def standardize(X, mean=None, std=None):
    """Column-wise z-score standardization of ``X``.

    Parameters
    ----------
    X : ndarray or DataFrame of shape (n_samples, n_features)
        Data to standardize.
    mean, std : array-like of shape (n_features,), optional
        Precomputed per-column statistics. Passing the *training-set*
        statistics here allows a held-out set to be scaled consistently
        without leaking its own distribution. When omitted, statistics
        are computed from ``X`` itself (the original behavior).

    Returns
    -------
    Standardized data of the same type and shape as ``X``.
    """
    if mean is None:
        mean = np.mean(X, axis=0)
    if std is None:
        std = np.std(X, axis=0)
    # Guard constant columns: a zero std would divide by zero, so use 1.
    std = np.where(std == 0, 1, std)
    # NumPy broadcasting aligns the (n_features,) statistics against every
    # row, so the original's explicit reshape + repeat is unnecessary.
    return (X - mean) / std
# Standardize both splits with statistics estimated on the TRAINING set only.
# The original code standardized the test set with its own mean/std, which
# leaks test-distribution information and scales the two splits inconsistently.
_train_mean = np.mean(X_train, axis=0)
_train_std = np.std(X_train, axis=0)
_train_std = np.where(_train_std == 0, 1, _train_std)  # guard constant columns
X_train_std = (X_train - _train_mean) / _train_std
X_test_std = (X_test - _train_mean) / _train_std
# --- Visual check of the standardization --------------------------------
# Histogram of one example column before and after scaling.
hist_col = 1  # column index to display
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
axs[0].hist(X_train.iloc[:, hist_col], bins=50)
axs[0].set_xlabel(f'Feature {hist_col}')
axs[0].set_ylabel('Frequency')
axs[0].set_title('Original Data')
axs[1].hist(X_train_std.iloc[:, hist_col], bins=50)
axs[1].set_xlabel(f'Feature {hist_col}')
axs[1].set_ylabel('Frequency')
axs[1].set_title('Standardized Data')
# plt.show()

# Box plots of four example columns before and after scaling.
# NOTE(fix): the original labeled columns 10..13 as "Feature 1..4";
# the tick labels now match the plotted column indices.
box_cols = slice(10, 14)
box_labels = [f'Feature {i}' for i in range(10, 14)]
fig, axs = plt.subplots(1, 2, figsize=(12, 4))
axs[0].boxplot(X_train.iloc[:, box_cols])
axs[0].set_xticklabels(box_labels)
axs[0].set_ylabel('Value')
axs[0].set_title('Original Data')
axs[1].boxplot(X_train_std.iloc[:, box_cols])
axs[1].set_xticklabels(box_labels)
axs[1].set_ylabel('Value')
axs[1].set_title('Standardized Data')
# plt.show()
# --- Feature selection ---------------------------------------------------
# Filter method: keep the 50 columns with the highest ANOVA F-statistic
# against the label. (Commented-out wrapper/RFE and embedded/L1 variants
# were dead code and have been removed; note the embedded variant also
# inconsistently fit on the *unscaled* data before transforming the scaled
# data, so it was not a safe reference implementation.)
selector_filter = SelectKBest(f_classif, k=50)
X_train_filter = selector_filter.fit_transform(X_train_std, y_train)
# Reuse the selector fitted on the training split for the test split.
X_test_filter = selector_filter.transform(X_test_std)

# Quick sanity check: train a logistic regression on the selected columns
# and report held-out accuracy.
clf = LogisticRegression()
clf.fit(X_train_filter, y_train)
score_filter = clf.score(X_test_filter, y_test)
print('过滤式特征选择的测试准确率:', score_filter)

# Continue downstream with the filter-selected feature set.
X_train = X_train_filter
X_test = X_test_filter
# --- Model selection and performance -------------------------------------
def _report(name, model, X_tr, y_tr, X_te, y_te):
    """Fit *model* on the training split, then print test-set
    accuracy, recall, and F1 under the label *name*."""
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    accuracy = accuracy_score(y_te, y_pred)
    recall = recall_score(y_te, y_pred)
    f1 = f1_score(y_te, y_pred)
    # Same output format as the original per-model print statements.
    print(f'{name} \naccuracy:', accuracy, '\n recall:', recall, '\n f1_score:', f1)

# Compare a k-nearest-neighbors classifier against a decision tree on the
# filter-selected features; the duplicated fit/score/print sequence is
# factored into _report above.
_report('knn', KNeighborsClassifier(), X_train, y_train, X_test, y_test)
_report('dt', DecisionTreeClassifier(), X_train, y_train, X_test, y_test)
没有合适的资源?快使用搜索试试~ 我知道了~
机器学习实验-基于信用卡数据建立行为评分模型的机器学习方法
共13个文件
xml:4个
pyc:2个
csv:2个
需积分: 10 3 下载量 153 浏览量
2023-03-31
17:01:23
上传
评论
收藏 41.61MB ZIP 举报
温馨提示
机器学习实验-基于信用卡数据建立行为评分模型的机器学习方法-python, 包括数据与源代码(.ipython为主要文件, .py可补充特征选择方法)
资源推荐
资源详情
资源评论
收起资源包目录
源代码.zip (13个子文件)
CreditForecast
__init__.py 0B
forecast.py 5KB
data
missing_rate.csv 7KB
data_model.csv 141MB
.idea
workspace.xml 6KB
misc.xml 294B
inspectionProfiles
profiles_settings.xml 174B
modules.xml 301B
CreditForecast-master.iml 566B
preprocess.ipynb 900KB
.gitignore 48B
__pycache__
data_helper.cpython-37.pyc 8KB
m1_xgb.cpython-37.pyc 5KB
共 13 条
- 1
资源评论
lagoon_lala
- 粉丝: 355
- 资源: 55
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功