# _*_ coding:utf-8 _*_
'''
@Author: Ruan Yang
@Date: 2018.12.16
@Purpose: 使用传统的机器学习的方法进行文本情感分析
'''
import codecs
import jieba
import numpy as np
from gensim.models.word2vec import Word2Vec
from sklearn.externals import joblib
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import neighbors
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
# Corpus location and output directory for intermediate artifacts.
datapaths = r"C:\Users\RY\Desktop\情感分析\SentimentAnalysis-master\data\\"
storedpaths = r"C:\Users\RY\Desktop\\"


def _segment(sentence):
    """Segment one sentence with jieba (accurate mode), space-joined."""
    return " ".join(jieba.lcut(sentence.strip(), cut_all=False))


print("#------------------------------------------------------#")
print("加载数据集")

# Read the three sentiment corpora, one CSV per class.
# Label ids: 0 = positive, 1 = neutral, 2 = negative.
with codecs.open(datapaths + "pos.csv", "r", "utf-8") as f1, \
     codecs.open(datapaths + "neutral.csv", "r", "utf-8") as f2, \
     codecs.open(datapaths + "neg.csv", "r", "utf-8") as f3:
    positive_data = [_segment(line) for line in f1]
    neutral_data = [_segment(line) for line in f2]
    negative_data = [_segment(line) for line in f3]

# One single-element label list per sample (kept as lists, not scalars,
# so np.save later produces an (n, 1) array).
y_positive = [[0] for _ in positive_data]
y_neutral = [[1] for _ in neutral_data]
y_negative = [[2] for _ in negative_data]

print("positive data:{}".format(len(positive_data)))
print("neutral data:{}".format(len(neutral_data)))
print("negative data:{}".format(len(negative_data)))

# Full corpus and aligned labels, ordered positive -> neutral -> negative.
x_text = positive_data + neutral_data + negative_data
y_label = y_positive + y_neutral + y_negative
print("#------------------------------------------------------#")
print("\n")
# Shuffle the corpus, split it into train/test partitions, and persist labels.
shuffle_indices = np.random.permutation(np.arange(len(y_label)))
train_test_percent = 0.2  # fraction of the data held out for testing

# Compute an explicit split boundary. The original `[:-n]` / `[-n:]` slicing
# breaks when n == 0 (`[:-0]` is an empty slice), which would silently put
# every sample into the test set for tiny corpora; `len - n` is always safe.
n_test = int(len(shuffle_indices) * train_test_percent)
split_at = len(shuffle_indices) - n_test

x_train = [x_text[i] for i in shuffle_indices[:split_at]]
y_train = [y_label[i] for i in shuffle_indices[:split_at]]
x_test = [x_text[i] for i in shuffle_indices[split_at:]]
y_test = [y_label[i] for i in shuffle_indices[split_at:]]

# Per-class tallies (label ids: 0 = positive, 1 = neutral, anything else = negative).
x_train_pos = sum(1 for label in y_train if label[0] == 0)
x_train_neu = sum(1 for label in y_train if label[0] == 1)
x_train_neg = sum(1 for label in y_train if label[0] not in (0, 1))
x_test_pos = sum(1 for label in y_test if label[0] == 0)
x_test_neu = sum(1 for label in y_test if label[0] == 1)
x_test_neg = sum(1 for label in y_test if label[0] not in (0, 1))

print("#------------------------------------------------------#")
print("保存标签数据")
# Save labels as (n, 1) integer arrays for later reuse by get_data().
np.save(storedpaths + "y_train.npy", np.array(y_train))
np.save(storedpaths + "y_test.npy", np.array(y_test))
print("训练集总数:{}".format(len(x_train)))
print("训练集正样本:{}".format(x_train_pos))
print("训练集中性样本:{}".format(x_train_neu))
print("训练集负样本:{}".format(x_train_neg))
print("测试集总数:{}".format(len(x_test)))
print("测试集正样本:{}".format(x_test_pos))
print("测试集中性样本:{}".format(x_test_neu))
print("测试集负样本:{}".format(x_test_neg))
print("#------------------------------------------------------#")
print("\n")
# Average the word vectors of a sentence into a single fixed-size vector.
def buildWordVector(text, size, word_vector_model):
    """Return the mean word vector of all in-vocabulary tokens in *text*.

    Parameters
    ----------
    text : iterable of str
        Tokens of one sentence.
    size : int
        Dimensionality of the word vectors.
    word_vector_model : mapping
        Trained word-vector lookup (e.g. a gensim Word2Vec model) that
        raises KeyError for out-of-vocabulary tokens.

    Returns
    -------
    numpy.ndarray of shape (1, size); all zeros when no token is known.
    """
    total = np.zeros(size).reshape((1, size))
    hits = 0.
    for token in text:
        try:
            total += word_vector_model[token].reshape((1, size))
            hits += 1.
        except KeyError:
            # Out-of-vocabulary token: contribute nothing to the average.
            continue
    # Avoid dividing by zero when every token was unknown.
    return total / hits if hits else total
# Train word vectors on the whole corpus and cache averaged sentence vectors.
def get_train_vecs(x_train, x_test, n_dim):
    """Train a Word2Vec model on train+test text and persist averaged
    sentence vectors for both splits, plus the model itself.

    Parameters
    ----------
    x_train : list
        Training sentences (space-joined segmented text).
    x_test : list
        Test sentences, same format.
    n_dim : int
        Dimensionality of the word vectors to train.

    Side effects: writes train_vecs.npy, test_vecs.npy and w2v_model.pkl
    under the module-level `storedpaths` directory.
    """
    # NOTE(review): entries of x_train/x_test are space-joined *strings*, so
    # both gensim and buildWordVector iterate over characters rather than
    # word tokens — confirm whether character-level vectors are intended.
    all_text = x_train + x_test
    # gensim < 4.0 API (`size=`); gensim 4 renamed this argument to `vector_size`.
    text_w2v = Word2Vec(size=n_dim, min_count=5, workers=1)
    text_w2v.build_vocab(all_text)
    text_w2v.train(all_text, total_examples=text_w2v.corpus_count, epochs=5)
    # Averaged sentence vectors per split, shape (n_samples, n_dim).
    train_vecs = np.concatenate([buildWordVector(text, n_dim, text_w2v) for text in x_train])
    np.save(storedpaths + "train_vecs.npy", train_vecs)
    print("训练集数据的词向量维度:{}".format(train_vecs.shape))
    test_vecs = np.concatenate([buildWordVector(text, n_dim, text_w2v) for text in x_test])
    np.save(storedpaths + "test_vecs.npy", test_vecs)
    print("测试集数据的词向量维度:{}".format(test_vecs.shape))
    # Persist the trained word-vector model for later reuse.
    text_w2v.save(storedpaths + "w2v_model.pkl")
# Reload the cached sentence vectors and label arrays from disk.
def get_data():
    """Load the previously saved train/test vectors and labels.

    Returns
    -------
    tuple
        (train_vecs, y_train, test_vecs, y_test) as numpy arrays.
    """
    filenames = ("train_vecs.npy", "y_train.npy", "test_vecs.npy", "y_test.npy")
    train_vecs, y_train, test_vecs, y_test = (
        np.load(storedpaths + name) for name in filenames
    )
    return train_vecs, y_train, test_vecs, y_test
# Train an RBF-kernel SVM classifier.
def svm_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an SVC on the training vectors, persist it, and return the
    mean accuracy on the test split."""
    model = SVC(kernel='rbf', verbose=True)
    model.fit(train_vecs, y_train)
    # Persist the fitted model alongside the other artifacts.
    joblib.dump(model, storedpaths + 'model.pkl')
    return model.score(test_vecs, y_test)
# Train a Gaussian naive Bayes classifier.
def NB_train(train_vecs, y_train, test_vecs, y_test):
    """Fit a GaussianNB model, persist it, and return the mean accuracy
    on the test split."""
    model = GaussianNB()
    model.fit(train_vecs, y_train)
    joblib.dump(model, storedpaths + 'model_gnb.pkl')
    return model.score(test_vecs, y_test)
# Train a decision-tree classifier.
def decision_tree(train_vecs, y_train, test_vecs, y_test):
    """Fit a depth-limited DecisionTreeClassifier, persist it, and return
    the mean accuracy on the test split."""
    model = DecisionTreeClassifier(max_depth=10, min_samples_split=2, random_state=0)
    model.fit(train_vecs, y_train)
    joblib.dump(model, storedpaths + 'model_dtree.pkl')
    return model.score(test_vecs, y_test)
# Train a random-forest classifier.
def random_forest(train_vecs, y_train, test_vecs, y_test):
    """Fit a RandomForestClassifier, persist it, and return the mean
    accuracy on the test split."""
    model = RandomForestClassifier(
        n_estimators=10, max_depth=10, min_samples_split=2, n_jobs=1, random_state=0
    )
    model.fit(train_vecs, y_train)
    joblib.dump(model, storedpaths + 'model_randomforest.pkl')
    return model.score(test_vecs, y_test)
# Train an extremely-randomized-trees classifier.
def extract_tree(train_vecs, y_train, test_vecs, y_test):
    """Fit an ExtraTreesClassifier, persist it, and return the mean
    accuracy on the test split."""
    model = ExtraTreesClassifier(
        n_estimators=10, max_depth=10, min_samples_split=2, n_jobs=1, random_state=0
    )
    model.fit(train_vecs, y_train)
    joblib.dump(model, storedpaths + 'model_extracttree.pkl')
    return model.score(test_vecs, y_test)
# Train a gradient-boosted decision-tree (GBDT) classifier.
def gbdt_classifier(train_vecs, y_train, test_vecs, y_test):
    """Fit a GradientBoostingClassifier, persist it, and return the mean
    accuracy on the test split."""
    model = GradientBoostingClassifier(
        n_estimators=100, learning_rate=1.0, max_depth=10, random_state=0
    )
    model.fit(train_vecs, y_train)
    joblib.dump(model, storedpaths + 'model_gbdt.pkl')
    return model.score(test_vecs, y_test)
# Train a k-nearest-neighbours classifier.
def nn_classifier(n_neighbors, train_vecs, y_train, test_vecs, y_test):
    """Fit a KNeighborsClassifier with uniform weights, persist it, and
    return the mean accuracy on the test split."""
    model = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform')
    model.fit(train_vecs, y_train)
    joblib.dump(model, storedpaths + 'model_nn.pkl')
    return model.score(test_vecs, y_test)
# 训练 LogisticRegression 分类算法
没有合适的资源?快使用搜索试试~ 我知道了~
温馨提示
利用CNN,LSTM,CNN_LSTM,TextCNN,Bi_LSTM和传统的机器学习算法进行情感分析: 1. keras_sentiment_analysis_v1.py: LSTM算法 2. keras_sentiment_analysis_v2.py: CNN_LSTM算法 3. keras_sentiment_analysis_v3.py: CNN算法 4. keras_sentiment_analysis_v4.py: TextCNN算法 5. keras_sentiment_analysis_v4.py: Bi_LSTM算法 6. sentiment_analysis_ml.py: 传统机器学习算法
资源推荐
资源详情
资源评论
收起资源包目录
利用CNN,LSTM,CNN_LSTM,TextCNN,Bi-LSTM和传统的机器学习算法进行情感分析.zip (9个子文件)
Sentiment_Analysis_cnn_lstm_cnnlstm_textcnn_bilstm-master
keras_sentiment_analysis_v2.py 8KB
keras_sentiment_analysis_v4.py 10KB
keras_sentiment_analysis_v3.py 8KB
data
neg.csv 1.67MB
neutral.csv 2.12MB
pos.csv 2.08MB
sentiment_analysis_ml.py 12KB
keras_sentiment_analysis_v1.py 8KB
keras_sentiment_analysis_v5.py 8KB
共 9 条
- 1
资源评论
- m0_561829842024-04-19超级好的资源,很值得参考学习,对我启发很大,支持!
博士僧小星
- 粉丝: 1760
- 资源: 5875
下载权益
C知道特权
VIP文章
课程特权
开通VIP
上传资源 快速赚钱
- 我的内容管理 展开
- 我的资源 快来上传第一个资源
- 我的收益 登录查看自己的收益
- 我的积分 登录查看自己的积分
- 我的C币 登录后查看C币余额
- 我的收藏
- 我的下载
- 下载帮助
安全验证
文档复制为VIP权益,开通VIP直接复制
信息提交成功