import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from datetime import datetime
import time
from sklearn.metrics import roc_curve, auc
import math
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from scipy import stats, integrate
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVR, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error # 评价指标
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression
# Use the SimHei font so Chinese labels render correctly in figures.
plt.rcParams['font.sans-serif'] = ['SimHei']
# Make the minus sign render correctly on the axes.
plt.rcParams['axes.unicode_minus'] = False

# Columns used as model inputs, and the column every model predicts.
FEATURE_COLUMNS = [
    '用户页面访问持续时间', '跳出率', '网页价值', '浏览器', '信息相关页面访问持续时间', '退出率',
    '产品相关页面', '产品相关页面访问持续时间', '流量类型', '信息相关页面', '节假日', '区域', '操作系统',
    '用户相关页面',
]
TARGET_COLUMN = '是否购买'

# Original author's note on this line: "1 3 7 are the prediction columns"
# (meaning unclear from this script -- TODO confirm or delete).
data = pd.read_csv("train.csv")
# The script used to call data.dropna(axis=0, how='any') here without
# assigning the result, so no row was ever dropped. The dead call is removed;
# missing values are handled only by the zero-fill below, as before.
data = data.fillna(0)
print(data.columns)

# Keep only the modelling columns (features first, target last).
data = data[FEATURE_COLUMNS + [TARGET_COLUMN]]
print(data.head())

data_y = data[TARGET_COLUMN].values
data_x = data[FEATURE_COLUMNS].values

# Hold out 20% of the rows for evaluation.
# NOTE(review): no random_state is passed, so the split (and every score
# below) changes between runs -- consider fixing a seed.
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.2)

# Spearman correlation heatmap over the features and the target.
plt.subplots(figsize=(16, 16))
sns.heatmap(data.corr(method='spearman').round(5), annot=True)
plt.show()
# Per-model scores, appended in the order the models are evaluated below;
# they are plotted as bar charts at the end of the script.
data_acc = []
data_f1 = []

# k-nearest-neighbours classifier
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
test_pred = knn.predict(x_test)  # predict on the held-out split
print("knn算法----------------------------------------- ")
print(test_pred[:10])
print(y_test[:10])
# scikit-learn metrics take (y_true, y_pred); compute each score once and
# reuse it for both printing and the comparison lists.
knn_acc = accuracy_score(y_test, test_pred)
knn_f1 = f1_score(y_test, test_pred)
print("accuracy_score:", knn_acc)
print("f1_score:", knn_f1)
data_acc.append(knn_acc)
data_f1.append(knn_f1)
# Support vector classifier
svm = SVC()
svm.fit(x_train, y_train)
test_pred = svm.predict(x_test)
print("svm算法 ")
print(test_pred[:10])
print(y_test[:10])
# scikit-learn metrics take (y_true, y_pred); compute each score once and
# reuse it for both printing and the comparison lists.
svm_acc = accuracy_score(y_test, test_pred)
svm_f1 = f1_score(y_test, test_pred)
print("accuracy_score:", svm_acc)
print("f1_score:", svm_f1)
data_acc.append(svm_acc)
data_f1.append(svm_f1)
# RandomForestClassifier: scores, ROC curve, precision/recall curve and AP
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve, average_precision_score
RandomForest = RandomForestClassifier()
RandomForest.fit(x_train, y_train)
test_pred = RandomForest.predict(x_test)
print("RandomForest算法 ")
print(test_pred[:10])
print(y_test[:10])
# scikit-learn metrics take (y_true, y_pred); compute each score once and
# reuse it for both printing and the comparison lists.
rf_acc = accuracy_score(y_test, test_pred)
rf_f1 = f1_score(y_test, test_pred)
print("accuracy_score:", rf_acc)
print("f1_score:", rf_f1)
data_acc.append(rf_acc)
data_f1.append(rf_f1)

# Score used for the curves: second column of predict_proba.
# Assumes the target is encoded 0/1 so that column 1 is the positive
# ("purchased") class -- TODO confirm against the data.
pred_probas = RandomForest.predict_proba(x_test)[:, 1]

# ROC curve
fpr, tpr, _ = roc_curve(y_test, pred_probas)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
# Save the ROC points to a CSV file.
df_tmp = pd.DataFrame({'fpr': fpr, 'tpr': tpr})
df_tmp.to_csv('plot.csv', index=False, encoding='utf_8_sig')
# Draw the diagonal reference line, then finish and save the figure.
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.title('ROC曲线')
plt.legend(loc='lower right')
plt.savefig('ROC曲线.png')
plt.show()

# Precision/recall curve.
# The original code scored with predict_proba(...)[:, 0], i.e. the
# probability of the *other* class, while pos_label is 1; that inverts the
# ranking and yields a wrong curve and AP. Use the same positive-class
# probability as the ROC curve.
y_score = pred_probas
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
plt.figure("P-R Curve")
plt.title('Precision/Recall Curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.plot(recall, precision)
plt.show()

# Average precision (area under the precision/recall curve)
AP = average_precision_score(y_test, y_score, average='macro', pos_label=1, sample_weight=None)
print('AP:', AP)
# AdaBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
# Named for what it holds (the original reused the name `svm` here).
ada_boost = AdaBoostClassifier()
ada_boost.fit(x_train, y_train)
test_pred = ada_boost.predict(x_test)
print("AdaBoost算法 ")
print(test_pred[:10])
print(y_test[:10])
# scikit-learn metrics take (y_true, y_pred); compute each score once and
# reuse it for both printing and the comparison lists.
ada_acc = accuracy_score(y_test, test_pred)
ada_f1 = f1_score(y_test, test_pred)
print("accuracy_score:", ada_acc)
print("f1_score:", ada_f1)
data_acc.append(ada_acc)
data_f1.append(ada_f1)
# XGBClassifier
from xgboost import XGBClassifier
# Named for what it holds (the original reused the name `svm` here).
xgb_model = XGBClassifier()
xgb_model.fit(x_train, y_train)
test_pred = xgb_model.predict(x_test)
# The printed label used to say "XGBRegressor", but the model is a classifier.
print("XGBClassifier算法 ")
print(test_pred[:10])
print(y_test[:10])
# scikit-learn metrics take (y_true, y_pred); compute each score once and
# reuse it for both printing and the comparison lists.
xgb_acc = accuracy_score(y_test, test_pred)
xgb_f1 = f1_score(y_test, test_pred)
print("accuracy_score:", xgb_acc)
print("f1_score:", xgb_f1)
data_acc.append(xgb_acc)
data_f1.append(xgb_f1)
# GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# Named for what it holds (the original reused the name `svm` here).
grad_boost = GradientBoostingClassifier()
grad_boost.fit(x_train, y_train)
test_pred = grad_boost.predict(x_test)
print("GradientBoosting算法 ")
print(test_pred[:10])
print(y_test[:10])
# scikit-learn metrics take (y_true, y_pred); compute each score once and
# reuse it for both printing and the comparison lists.
gb_acc = accuracy_score(y_test, test_pred)
gb_f1 = f1_score(y_test, test_pred)
print("accuracy_score:", gb_acc)
print("f1_score:", gb_f1)
data_acc.append(gb_acc)
data_f1.append(gb_f1)
# LGBMClassifier
from lightgbm import LGBMClassifier
# NOTE(review): the variable is still called `svm` although it holds a
# LightGBM model. The name is kept because this is its last binding in the
# visible part of the script and later code may refer to it -- rename once
# that has been checked.
svm = LGBMClassifier()
svm.fit(x_train, y_train)
test_pred = svm.predict(x_test)
print("LGB算法 ")
print(test_pred[:10])
print(y_test[:10])
# scikit-learn metrics take (y_true, y_pred); compute each score once and
# reuse it for both printing and the comparison lists.
lgbm_acc = accuracy_score(y_test, test_pred)
lgbm_f1 = f1_score(y_test, test_pred)
print("accuracy_score:", lgbm_acc)
print("f1_score:", lgbm_f1)
data_acc.append(lgbm_acc)
data_f1.append(lgbm_f1)
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
# Matplotlib text settings for the bar charts below: SimHei as the
# sans-serif font, and axes.unicode_minus switched off.
mpl.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})
def zhu_zhuang_tu(label_list, size, title_name, y_name, x_name, width=0.5, color="green"):
    """Draw a bar chart in a new figure and show it.

    Args:
        label_list: Bar labels, one per bar,
            e.g. ``["part one", "part two", "part three"]``.
        size: Bar heights, in the same order as ``label_list``,
            e.g. ``[55, 35, 10]``.
        title_name: Figure title.
        y_name: Y-axis label.
        x_name: X-axis label.
        width: Bar width passed to ``plt.bar`` (default 0.5, the value that
            was previously hard-coded).
        color: Bar colour passed to ``plt.bar`` (default "green", the value
            that was previously hard-coded).
    """
    # Open a fresh figure so the bars are not drawn onto an earlier plot.
    plt.figure()
    plt.bar(label_list, size, width, color=color)
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title(title_name)
    plt.show()
# Model names, in the same order in which the scores were appended to
# data_acc / data_f1 above.
_MODEL_NAMES = ("knn", "svm", "RandomForest", "AdaBoost", "xgboost ", "GradientBoosting", "LGBM")

# Accuracy comparison chart.
label_list, size = list(_MODEL_NAMES), data_acc
zhu_zhuang_tu(label_list, size, "算法对比图", "Accuracy", "算法模型")

# Inputs for the F1 comparison chart drawn by the call that follows.
label_list, size = list(_MODEL_NAMES), data_f1
zhu_zhuang_tu(label_list, size, "算法对比图", "F1_score", "算法模