import pandas as pd
# 从sklearn的特征工程的文本模块导入词频统计函数
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,roc_auc_score,accuracy_score
# Unlabelled samples to classify later, kept as a raw array of rows.
test_data = pd.read_excel("未标注数据汇总.xlsx").values
# Labelled samples; the code below reads the text from the first column and
# the label from the second.
train_data = pd.read_excel("已标注数据.xlsx")
# A token is a run of ASCII letters and/or characters in U+4E00..U+9FA5.
# The original pattern had a "|" inside the brackets; within a character
# class that is a literal pipe (not alternation), so pipes were silently
# glued into tokens. It is removed here.
tfCoder = CountVectorizer(token_pattern="[a-zA-Z\u4e00-\u9fa5]+")
import pandas as pd
import re
# `train_data` has already been loaded from "已标注数据.xlsx" earlier in this
# script, so the workbook is not read from disk a second time here.
#
# Keep only rows whose label (second column) is exactly 0 or 1; any other
# value (missing, other classes, stray text) is skipped.
labelled_rows = [
    (row[0], row[1]) for row in train_data.values if row[1] in (0, 1)
]
train_x = [text for text, _ in labelled_rows]
Y = [label for _, label in labelled_rows]
print(len(train_x))
# Learn the vocabulary from the labelled texts and build the count matrix.
X = tfCoder.fit_transform(train_x)
# print(X)
# print(Y)
# print(tfCoder.get_feature_names())
# print(X.toarray())
# 建立 贝叶斯模型
# Hold out 20% of the labelled samples for evaluation. The seed is fixed so
# that every run scores the models on the same split; without it the reported
# numbers and the comparison charts change from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
# Per-model scores, appended in the order the models are evaluated below.
data_acc = []
data_f1 = []
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial naive Bayes classifier and score it on the held-out split.
nomialNB = MultinomialNB()
nomialNB.fit(x_train, y_train)
test_pred = nomialNB.predict(x_test)
print("朴素贝叶斯模型")
# scikit-learn metrics are documented as metric(y_true, y_pred); pass the
# arguments in that order and compute each score once, reusing it for both
# the console report and the chart data.
nb_accuracy = accuracy_score(y_test, test_pred)
nb_f1 = f1_score(y_test, test_pred)
print("accuracy_score:", nb_accuracy)
print("f1_score:", nb_f1)
data_acc.append(nb_accuracy)
data_f1.append(nb_f1)
# 建立 svm模型
from sklearn.svm import SVR,SVC
# Fit a support vector classifier (default settings) and score it on the
# held-out split.
svm = SVC()
svm.fit(x_train, y_train)
test_pred = svm.predict(x_test)
print("svm模型")
# scikit-learn metrics are documented as metric(y_true, y_pred); pass the
# arguments in that order and compute each score once, reusing it for both
# the console report and the chart data.
svm_accuracy = accuracy_score(y_test, test_pred)
svm_f1 = f1_score(y_test, test_pred)
print("accuracy_score:", svm_accuracy)
print("f1_score:", svm_f1)
data_acc.append(svm_accuracy)
data_f1.append(svm_f1)
# 建立 随机森林模型
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest classifier (default settings) and score it on the
# held-out split.
RandomForest = RandomForestClassifier()
RandomForest.fit(x_train, y_train)
test_pred = RandomForest.predict(x_test)
print("随机森林模型")
# scikit-learn metrics are documented as metric(y_true, y_pred); pass the
# arguments in that order and compute each score once, reusing it for both
# the console report and the chart data.
rf_accuracy = accuracy_score(y_test, test_pred)
rf_f1 = f1_score(y_test, test_pred)
print("accuracy_score:", rf_accuracy)
print("f1_score:", rf_f1)
data_acc.append(rf_accuracy)
data_f1.append(rf_f1)
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
# Global matplotlib settings: select the SimHei font for sans-serif text
# (presumably so the Chinese chart titles render) and disable the Unicode
# minus glyph for axis tick labels.
mpl.rcParams.update(
    {
        "font.sans-serif": ["SimHei"],
        "axes.unicode_minus": False,
    }
)
def zhu_zhuang_tu(label_list, size, title_name, y_name, x_name):
    """Draw a green bar chart on a new figure and display it.

    Args:
        label_list: Labels of the bars along the x axis, e.g.
            ``["part 1", "part 2", "part 3"]``.
        size: Bar heights, one per entry of ``label_list``, e.g.
            ``[55, 35, 10]``.
        title_name: Title shown above the chart.
        y_name: Label of the y axis.
        x_name: Label of the x axis.
    """
    # Open a fresh figure so repeated calls do not draw onto the same axes.
    # The handle itself is not needed, so it is not bound to a name.
    plt.figure()
    # Third positional argument is the bar width.
    plt.bar(label_list, size, 0.5, color="green")
    plt.xlabel(x_name)
    plt.ylabel(y_name)
    plt.title(title_name)
    plt.show()
# Draw one comparison chart per metric; the bars follow the order in which
# the models' scores were appended to data_acc / data_f1.
label_list = ["MultinomialNB", "svm", "RandomForest"]
for size, metric_axis_label in ((data_acc, "Accuracy"), (data_f1, "F1_score")):
    zhu_zhuang_tu(label_list, size, "算法对比图", metric_axis_label, "算法模型")
# The first column of the unlabelled sheet holds the text to classify.
test_d = [row[0] for row in test_data]

# Cells containing fewer than two Chinese characters (this also covers empty
# or non-text cells, which are stringified before matching) are replaced by a
# digit-only placeholder string so that every row still gets a prediction.
chinese_char = re.compile('[\u4e00-\u9fa5]')
test_d1 = [
    text if len(chinese_char.findall(str(text))) >= 2 else '111111111111111111'
    for text in test_d
]

# Data to predict: vectorise once with the fitted vocabulary and reuse the
# same matrix for every model instead of re-transforming it per call.
test_features = tfCoder.transform(test_d1)
y1 = nomialNB.predict(test_features)
y2 = svm.predict(test_features)
y3 = RandomForest.predict(test_features)

# Only the random forest predictions (y3) are exported, paired with the
# original (unmodified) text of each row.
name = ['数据', '预测类别']
result = [[row[0], label] for row, label in zip(test_data, y3)]
test = pd.DataFrame(columns=name, data=result)
test.to_excel('result.xlsx')