# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
# Load the labelled training set and the unlabelled test set from CSV.
df_train = pd.read_csv('train.csv', index_col=0, lineterminator='\n')
df_test = pd.read_csv('20190520_test.csv', index_col=0, lineterminator='\n')

# Compiled once and shared by both datasets so they are cleaned identically.
# Each alternative matches a run of characters that falls outside one of two
# whitelists, so a character survives only if both character classes list it.
# NOTE(review): `/-_` in the first class is parsed as a character range
# ('/' through '_'), not as three literal characters -- confirm the intent.
_UNWANTED_CHARS = re.compile(
    r'[^\w\s?.!/-_,.:$%^*()\[\]\"\']+|[^\w\s+——!,。?、~@#¥%……&*()]+'
)


def _clean_reviews(reviews):
    """Return *reviews* as a list of strings with unwanted characters removed."""
    return [_UNWANTED_CHARS.sub('', sentence) for sentence in reviews]


rev_train = _clean_reviews(df_train['review'])
rev_test = _clean_reviews(df_test['review'])
# Single corpus with the training reviews first, followed by the test reviews.
rev_all = rev_train + rev_test
# Binary targets: "Negative" maps to 0, every other label value maps to 1.
labels = [0 if x == "Negative" else 1 for x in df_train['label']]
# Word-level TF-IDF over unigrams and bigrams; terms that occur in fewer than
# three documents are dropped.  The on/off options are passed as real
# booleans because newer scikit-learn releases validate parameter types and
# reject integer stand-ins such as 1.
tfv = TFIV(
    min_df=3,
    max_features=None,
    analyzer='word',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# Vocabulary and IDF weights are learned from the training and test reviews
# together; fit_transform vectorises the same corpus in a single pass.
tfidf = tfv.fit_transform(rev_all)
train_len = len(rev_train)
print(train_len)
# rev_all lists the training reviews first, so the first train_len rows are
# the training features and the remaining rows are the test features.
x_train = tfidf[:train_len]
x_test = tfidf[train_len:]
labels_array = np.array(labels)
# Hold out 20% of the labelled reviews for validation.  The sparse TF-IDF
# matrix is passed through as-is: the splitter and both classifiers accept
# sparse input, so there is no need to materialise a dense array.
# No random_state is given, so the split (and the scores below) vary per run.
X_train, X_test, y_train, y_test = train_test_split(
    x_train, labels_array, test_size=0.20
)
# Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
print('The accuracy of NB Classifier is', mnb.score(X_test, y_test))
# Logistic regression
classifier_lg = LogisticRegression()
classifier_lg.fit(X_train, y_train)
print('The accuracy of lg Classifier is', classifier_lg.score(X_test, y_test))
# Prediction output
test_len = len(rev_test)
# Submission IDs are the 1-based row positions of the test reviews.
id_lt = list(range(1, test_len + 1))


def _write_predictions(model, features, ids, path):
    """Write one model's test-set probabilities to a two-column CSV file.

    The ``Pred`` column holds column 1 of ``model.predict_proba(features)``.
    With the 0/1 labels used for training that is the probability of label 1,
    assuming both labels were present in the data the model was fitted on.
    The file is written without the DataFrame index.
    """
    pred = model.predict_proba(features)[:, 1]
    pd.DataFrame({'ID': ids, 'Pred': pred}).to_csv(path, index=False)


# The sparse test matrix is used directly; both models accept sparse input.
# Naive Bayes
_write_predictions(mnb, x_test, id_lt, "Pred_nb.csv")
# Logistic regression
_write_predictions(classifier_lg, x_test, id_lt, "Pred_lg.csv")
SentimentClassification.rar_2OO7_equipmentz67_情感分类
版权申诉
17 浏览量
2022-07-15
21:21:49
上传
评论
收藏 1KB RAR 举报
局外狗
- 粉丝: 64
- 资源: 1万+
最新资源
- Screenshot_20240509_034911_com.tencent.mtt.jpg
- 基于python实现的医学影像体脂分割+源代码+文档说明(课程设计)
- 基于python实现的医学影像(MIR, CT )图像分割源码+文档说明(高分课程设计)
- 基于python+JavaScript实现的医学影像分割+源代码+文档说明+截图演示+数据(高分毕业设计)
- 基于U-net+pytorch实现的医学影像分割python源码+文档说明+数据+界面截图+博客介绍
- 课程设计-基于Pytorch实现MNIST数据集的手写数字识别源码+数据(Gui界面)+文档说明+模型
- 软件开发国家标准.xls
- pytorch-CNN-SBATM-ubuntudemo
- matplotlibdemo
- pytorch-CNN-dht11温湿度传感器笔记
资源上传下载、课程学习等过程中有任何疑问或建议,欢迎提出宝贵意见哦~我们会及时处理!
点击此处反馈