#!/usr/bin/env python
# coding: utf-8
from sklearn import tree
from sklearn.datasets import load_wine
import pandas as pda
# --- Load and preprocess the training data ---
# Raw string avoids the invalid "\m" escape sequence, which raises a
# SyntaxWarning on modern Python and would silently corrupt the path if
# the letter after the backslash ever became a real escape.
fname = r"E:\model.csv"
dataf = pda.read_csv(fname, encoding="gbk")
# Fill missing values in the feature columns with each column's median.
dataf.iloc[:, 2:201] = dataf.iloc[:, 2:201].fillna(dataf.iloc[:, 2:201].median())
# Drop rows that still contain NaN (e.g. a missing label).
dataf.dropna(axis=0, inplace=True)
# DataFrame.as_matrix() was removed in pandas 1.0 -- use to_numpy().
x = dataf.iloc[:, 2:201].to_numpy()  # feature columns 2..200
y = dataf.iloc[:, 0].to_numpy()      # label: first column
xf = pda.DataFrame(x)
yf = pda.DataFrame(y)
from sklearn.model_selection import train_test_split
# Hold out 10% of rows as the test set.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(xf, yf, test_size=0.1)
# Pruning / hyper-parameter tuning: the exhaustive grid search below was
# run once to pick min_samples_leaf / min_samples_split / max_leaf_nodes
# (best combination kept in the classifier further down) and is now
# disabled by wrapping it in a module-level string literal.
'''
import matplotlib.pyplot as plt
test=[]
Max=0
I=J=K=0
for i in range(1,30):
for j in range(2,30):
for k in range(2,30):
clf=tree.DecisionTreeClassifier(
min_samples_leaf=i
,min_samples_split=j
,max_leaf_nodes=k
,criterion="entropy"
,random_state=30
,splitter="random"
)
clf=clf.fit(Xtrain,Ytrain)
score=clf.score(Xtest,Ytest)
if score>Max:
Max=score
I=i
J=j
K=k
#test.append(score)
#plt.plot(range(1,30),test,color="red",label="max_depth")
#plt.legend()
#plt.show()
'''
# --- Final (pruned) tree with the tuned hyper-parameters ---
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                  ,min_samples_leaf=1
                                  ,min_samples_split=2
                                  ,max_leaf_nodes=27
                                  ,random_state=30
                                  #,max_depth=6
                                  ,splitter="random"
                                  )
clf = clf.fit(Xtrain, Ytrain)
# Mean accuracy on the held-out test set.
score = clf.score(Xtest, Ytest)
#score=clf.score(Xtrain,Ytrain)
# A bare expression displays in a notebook but does nothing in a script --
# print it so the accuracy is actually reported.
print(score)
# --- Visualize the decision tree as a PDF ---
import graphviz
import pydotplus
# Generate the 199 feature names 'x_001'..'x_199' instead of a hand-typed
# literal list: identical values, no room for a typo or a missing entry.
feature_name = ['x_%03d' % i for i in range(1, 200)]
dot_data = tree.export_graphviz(clf
                                ,feature_names=feature_name
                                ,class_names=['0', '1']
                                ,filled=True
                                ,rounded=True)
#graph=graphviz.Source(dot_data)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("tree.pdf")
#[*zip(feature_name,clf.feature_importances_)]
# --- Evaluate predictions on the held-out split ---
anwser = clf.predict(Xtest)
# as_matrix() was removed in pandas 1.0 -- use to_numpy().
y2 = Ytest.to_numpy().astype(int)
# Count mismatches over the actual split size. The original hard-coded
# range(0, 1002) raises IndexError (or undercounts) whenever the
# train/test split produces a different number of rows.
count = 0
for i in range(len(y2)):
    if y2[i] != anwser[i]:
        count += 1
# Fraction of correct predictions (manual re-check of clf.score).
print(1 - count / len(Xtest))
#pda.DataFrame(anwser)
# --- Predict the unlabeled test file and export to Excel ---
fname1 = "test.csv"
dataf1 = pda.read_csv(fname1, encoding="gbk")
# Same preprocessing as the training data: median-fill, then drop rows
# that still contain NaN. (test.csv has no label column, hence 1:200.)
dataf1.iloc[:, 1:200] = dataf1.iloc[:, 1:200].fillna(dataf1.iloc[:, 1:200].median())
dataf1.dropna(axis=0, inplace=True)
# as_matrix() was removed in pandas 1.0 -- use to_numpy().
test = dataf1.iloc[:, 1:200].to_numpy()
anwser1 = clf.predict(test)
# One DataFrame wrap is enough (the original wrapped it twice).
data = pda.DataFrame(anwser1)
# BUG FIX: in 'E:\result.xlsx' the '\r' is a carriage-return escape,
# yielding an invalid path -- a raw string keeps the backslash literal.
# The context manager saves and closes the workbook on exit
# (ExcelWriter.save() was removed in pandas 2.0).
with pda.ExcelWriter(r'E:\result.xlsx') as writer:
    data.to_excel(writer, sheet_name='page_1', float_format='%.5f')