# -*- coding: utf-8 -*-
# @Time : 2017/6/20 19:18
# @Author : Zhao Zhufei
# @Site :
# @File : myproject01.py
# @Software: PyCharm
import pandas as pd
import numpy as np
import re
import jieba
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score
# Input file locations.
# Raw strings keep the Windows backslashes literal: in a normal string "\U"
# starts a unicode escape, which is a SyntaxError on Python 3.
d_path = r"C:\Users\yangeryong\Desktop\pdf\encitest\cs.csv"
n_path = r"C:\Users\yangeryong\Desktop\pdf\encitest\cs.csv"
# stop_path = "D:/pythonWork/data/newsdata/stopwords.txt"
# Load both data sets.
df_d = pd.read_csv(d_path)
df_n = pd.read_csv(n_path)
print(df_d.shape)
print(df_n.shape)
# stopwords = open(stop_path, "r", encoding="utf-8").read()
# Draw 200 random row positions per frame. The upper bound is taken from the
# frame itself so the positions are always valid for `.iloc`, whatever the
# size of the CSV. np.random.choice samples with replacement by default, so
# the same row may be drawn more than once.
index_d = np.random.choice(len(df_d), 200)
index_n = np.random.choice(len(df_n), 200)
print(index_n)
# Select the sampled rows.
data_d = df_d.iloc[index_d]
data_n = df_n.iloc[index_n]
# Stack the two samples into a single DataFrame.
data = pd.concat([data_d, data_n])
print(data)
# Matches a token that begins with a digit. Compiled once at import time
# instead of once per token.
_LEADING_DIGIT = re.compile(r"\d")


def getNewsList(list):
    """Tokenise the titles of a table and collect its integer labels.

    Each entry of ``list["title"]`` is segmented with ``jieba.lcut``. Tokens
    that start with a digit and tokens equal to a single space are dropped,
    and the remaining tokens are joined with single spaces.

    Args:
        list: A table indexable by column name (e.g. a pandas DataFrame)
            providing a ``"title"`` column of text and a ``"type"`` column of
            values convertible with ``int``. The parameter name shadows the
            builtin and is kept only for backward compatibility.

    Returns:
        A tuple ``(wordList, lableList)``: the space-joined token string for
        every title, and the ``int`` label for every row, both taken from the
        same table so they stay aligned.
    """
    frame = list  # local alias so the builtin-shadowing name is not used below
    wordList = []
    for title in frame["title"]:
        # Build a filtered copy instead of calling remove() on the list being
        # iterated, which skips the element following every removed token.
        tokens = [
            token
            for token in jieba.lcut(title)
            if token != " " and not _LEADING_DIGIT.match(token)
        ]
        wordList.append(" ".join(tokens))
    # Labels are read from the argument rather than from a module-level
    # global, so texts and labels always describe the same rows.
    lableList = [int(label) for label in frame["type"]]
    return wordList, lableList
# Tokenise the sampled rows. The combined sample `data` is passed (not the
# full `df_n` frame) so that the texts and the labels describe the same rows
# and therefore have the same length.
cutWordList, newslable = getNewsList(data)
# Create the feature extractors.
vectorizer = CountVectorizer()
transformer = TfidfTransformer()
# Term counts -> TF-IDF weights.
tfidf = transformer.fit_transform(vectorizer.fit_transform(cutWordList))
# Dense copy of the sparse TF-IDF matrix, one row per document.
weight = tfidf.toarray()
print(weight.shape)
# Labels as an integer NumPy array.
label = np.array(newslable, dtype=int)
print(label.shape)
print(type(label))
# Hold out 25% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    weight, label, test_size=0.25, random_state=5
)
# Support vector machine: fit on the training split, predict the test split.
model = SVC(gamma=0.001, C=1000).fit(X_train, y_train)
svc_predict = model.predict(X_test)
# Metrics are evaluated in the order accuracy, precision, recall, F1.
svm_asc, svm_ps, svm_rs, svm_fs = (
    score(y_test, svc_predict)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
svm_list = [svm_asc, svm_ps, svm_rs, svm_fs]
print('svc accuracy:', svm_asc)
print('svc precision:', svm_ps)
print('svc f1 score:', svm_fs)
print('svc recall:', svm_rs)
# Random forest with 10 trees: fit on the training split, predict the test split.
model2 = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
# print(model2.score(weight, label))
rf_predict = model2.predict(X_test)
# Metrics are evaluated in the order accuracy, precision, recall, F1.
rf_asc, rf_ps, rf_rs, rf_fs = (
    score(y_test, rf_predict)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
rs_list = [rf_asc, rf_ps, rf_rs, rf_fs]
print('rf accuracy:', rf_asc)
print('rf precision:', rf_ps)
print('rf f1 score:', rf_fs)
print('rf recall:', rf_rs)
# Multinomial naive Bayes: fit on the training split, predict the test split.
model3 = MultinomialNB().fit(X_train, y_train)
nb_predict = model3.predict(X_test)
# Metrics are evaluated in the order accuracy, precision, recall, F1.
nb_asc, nb_ps, nb_rs, nb_fs = (
    score(y_test, nb_predict)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
nb_list = [nb_asc, nb_ps, nb_rs, nb_fs]
print('nb accuracy:', nb_asc)
print('nb precision:', nb_ps)
print('nb f1 score:', nb_fs)
print('nb recall:', nb_rs)
# k-nearest neighbours (default settings): fit on the training split,
# predict the test split.
# print(model4.score(weight, label))
model4 = KNeighborsClassifier().fit(X_train, y_train)
knn_predict = model4.predict(X_test)
# Metrics are evaluated in the order accuracy, precision, recall, F1.
knn_asc, knn_ps, knn_rs, knn_fs = (
    score(y_test, knn_predict)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)
knn_list = [knn_asc, knn_ps, knn_rs, knn_fs]
print('knn accuracy:', knn_asc)
print('knn precision:', knn_ps)
print('knn f1 score:', knn_fs)
print('knn recall:', knn_rs)
# Dump each model's [accuracy, precision, recall, f1] list, one per line.
print(svm_list, rs_list, nb_list, knn_list, sep="\n")