import pandas as pd
import numpy as np
import seaborn
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
import xlwt as xlwt
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
# Console display: print every column of a frame and do not wrap wide
# frames across several lines.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

# Training data (CSV) and test data (Excel workbook), read from fixed
# absolute paths on the E: drive.
data_train = pd.read_csv("E:\\mltest1\\train.csv")
data_test = pd.read_excel("E:\\mltest1\\test.xlsx")

# Use the SimHei font so Chinese titles and labels can be displayed in plots.
# 'axes.unicode_minus' is switched off alongside it -- presumably because
# that font lacks the Unicode minus glyph (confirm).
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
# Disabled exploratory analysis. The triple-quoted string below is a bare
# expression statement that is never assigned or used, so none of the code
# inside it runs. If re-enabled it would:
#   * print info() and describe() for data_train and data_test;
#   * print value counts of 'fetal_health' and 'histogram_tendency';
#   * draw histograms of 8 data_train columns on a 2x4 subplot grid, then of
#     12 more columns on a 3x4 subplot grid, and call plt.show() once.
# NOTE(review): only one plt.figure() is created, so the 2x4 and 3x4 grids
# appear to target the same figure and would conflict; a second
# plt.figure() before the 3x4 section is probably needed -- confirm before
# re-enabling.
'''# 查看各列属性的数据量和缺失情况
print(data_train.info())
print(data_test.info())
# 查看各列属性的基本统计信息,
print(data_train.describe())
print(data_test.describe())
# 查看胎儿的健康分布情况
print(data_train['fetal_health'].value_counts())
#查看直方图趋势的数量
print(data_train['histogram_tendency'].value_counts())
# 绘图
fig = plt.figure()
# 基线值分布
plt.subplot2grid((2, 4), (0, 0))
data_train['baseline value'].hist()
plt.ylabel(u"人数")
plt.xlabel(u'基线值')
plt.title(u'基数值分布')
# 加速分布
plt.subplot2grid((2, 4), (0, 1))
data_train['accelerations'].hist()
plt.ylabel(u"人数")
plt.xlabel(u'加速')
plt.title(u'加速分布')
# 胎动分布
plt.subplot2grid((2, 4), (0, 2))
data_train['fetal_movement'].hist()
plt.xlabel(u'胎动')
plt.title(u'胎动分布')
# 子宫收缩分布
plt.subplot2grid((2, 4), (0, 3))
data_train['uterine_contractions'].hist()
plt.xlabel(u'子宫收缩')
plt.title(u'子宫收缩分布')
# 轻度减速分布
plt.subplot2grid((2, 4), (1, 0))
data_train['light_decelerations'].hist()
plt.xlabel(u'轻度减速')
plt.title(u'轻度减速分布')
# 重度分布情况
plt.subplot2grid((2, 4), (1, 1))
data_train['severe_decelerations'].hist()
plt.xlabel(u'重度减速')
plt.title(u'重度减速分布')
#持续减速分布情况
plt.subplot2grid((2, 4), (1, 2))
data_train['prolongued_decelerations'].hist()
plt.xlabel(u'持续减速')
plt.title(u'持续减速分布')
#异常短期变异性分布情况
plt.subplot2grid((2, 4), (1, 3))
data_train['abnormal_short_term_variability'].hist()
plt.xlabel(u'异常短期变异性')
plt.title(u'异常短期变异性情况')
# 短期变异性的平均值分布
plt.subplot2grid((3, 4), (0, 0))
data_train['mean_value_of_short_term_variability'].hist()
plt.ylabel(u"人数")
plt.xlabel(u'短期变异性的平均值')
plt.title(u'短期变异性的平均值分布')
# 异常长期变异性的时间百分比分布
plt.subplot2grid((3, 4), (0, 1))
data_train['percentage_of_time_with_abnormal_long_term_variability'].hist()
plt.ylabel(u"人数")
plt.xlabel(u'异常长期变异性的时间百分比')
plt.title(u'异常长期变异性的时间百分比分布')
# 长期变异性的平均值分布
plt.subplot2grid((3, 4), (0, 2))
data_train['mean_value_of_long_term_variability'].hist()
plt.xlabel(u'长期变异性的平均值')
plt.title(u'长期变异性的平均值分布')
# 直方图宽度分布
plt.subplot2grid((3, 4), (0, 3))
data_train['histogram_width'].hist()
plt.xlabel(u'直方图宽度')
plt.title(u'直方图宽度分布')
# 直方图最小值分布
plt.subplot2grid((3, 4), (1, 0))
data_train['histogram_min'].hist()
plt.xlabel(u'直方图最小值')
plt.title(u'直方图最小值分布')
# 直方图最大值分布情况
plt.subplot2grid((3, 4), (1, 1))
data_train['histogram_max'].hist()
plt.xlabel(u'直方图最大值')
plt.title(u'直方图最大值分布')
# 直方图峰值数量分布情况
plt.subplot2grid((3, 4), (1, 2))
data_train['histogram_number_of_peaks'].hist()
plt.xlabel(u'直方图峰值数量')
plt.title(u'直方图峰值数量分布')
# 直方图零值数量分布情况
plt.subplot2grid((3, 4), (1, 3))
data_train['histogram_number_of_zeroes'].hist()
plt.xlabel(u'直方图零值数量')
plt.title(u'直方图零值数量情况')
# 直方图模式分布情况
plt.subplot2grid((3, 4), (2, 0))
data_train['histogram_mode'].hist()
plt.xlabel(u'直方图模式')
plt.title(u'直方图模式情况')
# 直方图均值分布情况
plt.subplot2grid((3, 4), (2, 1))
data_train['histogram_mean'].hist()
plt.xlabel(u'直方图均值')
plt.title(u'直方图均值情况')
# 直方图中位数分布情况
plt.subplot2grid((3, 4), (2, 2))
data_train['histogram_median'].hist()
plt.xlabel(u'直方图中位数')
plt.title(u'直方图中位数情况')
# 直方图方差分布情况
plt.subplot2grid((3, 4), (2, 3))
data_train['histogram_variance'].hist()
plt.xlabel(u'直方图方差')
plt.title(u'直方图方差情况')
plt.show()
'''
# Working frame: the 21 named columns of the test data, in this fixed order.
df = data_test[[
    # baseline and event columns
    'baseline value',
    'accelerations',
    'fetal_movement',
    'uterine_contractions',
    'light_decelerations',
    'severe_decelerations',
    'prolongued_decelerations',
    # short-/long-term variability columns
    'abnormal_short_term_variability',
    'mean_value_of_short_term_variability',
    'percentage_of_time_with_abnormal_long_term_variability',
    'mean_value_of_long_term_variability',
    # histogram_* columns
    'histogram_width',
    'histogram_min',
    'histogram_max',
    'histogram_number_of_peaks',
    'histogram_number_of_zeroes',
    'histogram_mode',
    'histogram_mean',
    'histogram_median',
    'histogram_variance',
    'histogram_tendency',
]]
# Disabled correlation analysis. The triple-quoted string below is a bare
# expression statement, so the code inside it does not run. If re-enabled it
# would compute df.corr(), print the resulting matrix, draw it with
# seaborn.heatmap and show the figure.
"""# 属性间相关系数
cor = df.corr()
print(cor)
# 属性间相关系数热力图
seaborn.heatmap(cor)
plt.show()"""
# Disabled PCA visualisation. The triple-quoted string below is a bare
# expression statement, so the code inside it does not run. If re-enabled it
# would:
#   * select 21 named columns from df into key_data;
#   * standardise them with StandardScaler.fit_transform;
#   * reduce them to 2 components with PCA(n_components=2);
#   * scatter-plot the first component against the second and show the figure.
'''# 选择关键数据
key_data = df[['baseline value', 'accelerations', 'fetal_movement', 'uterine_contractions',
'light_decelerations', 'severe_decelerations', 'prolongued_decelerations',
'abnormal_short_term_variability', 'mean_value_of_short_term_variability',
'percentage_of_time_with_abnormal_long_term_variability','mean_value_of_long_term_variability',
'histogram_width', 'histogram_min', 'histogram_max','histogram_number_of_peaks',
'histogram_number_of_zeroes', 'histogram_mode','histogram_mean','histogram_median',
'histogram_variance','histogram_tendency']]
# 标准化数据
scaler = StandardScaler()
scaled_data = scaler.fit_transform(key_data)
# 使用PCA进行降维
pca = PCA(n_components=2) # 选择保留2个主成分
transformed_data = pca.fit_transform(scaled_data)
# 绘制降维后的数据散点图
plt.scatter(transformed_data[:, 0], transformed_data[:, 1])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
'''
'''# 训练集:
y_train = np.array(data_train)
y_train = np.asarray(y_train) # 使用np.asarray将其转换为NumPy数组
y_train1 = y_train[:1000, 22]
y_test1 = y_train[1000:, 22]
y_train1 = np.array(y_train1)
print(y_train)
x_train = np.array(data_train)
x_train1 = np.asarray(x_train) # 使用np.asarray将其转换为NumPy数组
x_train1 = x_train[:1000, 1:22]
x_test1 = x_train[1000:, 1:22]
# 测试集:
x_test = np.array(data_test)
x_test = np.asarray(x_test) # 使用np.asarray将其转换为NumPy数组
x_test = x_test[:, 1:22]
# SVM分类器参数设置
clf = svm.SVC(C=1,
kernel='linear',
decision_function_shape='ovr')
# 模型训练
def train(clf, x_train, y_train):
clf.fit(x_train, y_train.ravel()) # 训练集目标值
# 训练SVM模型
train(clf, x_train1, y_train1)
# 输出准确率
# 训练集:
print('training prediction:%.3f' % (clf.score(x_train1, y_train1)))
print('training_test prediction:%.3f' % (clf.score(x_test1, y_test1)))
test_predict = clf.predict(x_test)
print(test_predict)
print((clf.predict(x_test) == 1).sum())
print((clf.predict(x_test) == 2).sum())
print((clf.predict(x_test) == 3).sum())