基于朴素贝叶斯机器学习算法实现情感文本分析与分类

共12个文件

xml：5个

csv：2个

py：2个

机器学习

machine

learning

朴素贝叶斯算法

5星 · 超过95%的资源 · 需积分: 26 · 114 浏览量 · 2022-10-06 14:46:55 上传 · 评论 20 收藏 · 173.42MB ZIP · 举报

资源详情

资源评论

资源推荐

收起资源包目录

基于朴素贝叶斯机器学习算法实现情感文本分析与分类.zip （12个子文件）

main.py 6KB

data

data1.csv 779KB

data2.csv 8KB

test.py 1KB

.idea

misc.xml 192B

modules.xml 285B

finalhomework.iml 328B

workspace.xml 6KB

.gitignore 184B

inspectionProfiles

Project_Default.xml 502B

profiles_settings.xml 174B

vectors

sgns.weibo.bigram-char.bz2 173.16MB

"""Train a bidirectional-LSTM text classifier on top of pre-trained word vectors.

The script loads word vectors with gensim, tokenizes every sample of
``data/data2.csv`` with jieba, converts the tokens to vocabulary indices,
pads/truncates the index sequences to a common length, and trains and
evaluates a Keras model with a frozen embedding layer and a 4-way softmax
output.
"""
import os
import re

import jieba
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

# Only the first NUM_WORDS vocabulary entries get a row in the embedding
# matrix; any token index at or above this value is replaced by 0.
NUM_WORDS = 60000
# Length of each pre-trained word vector (one row of the embedding matrix).
EMBEDDING_DIM = 300

# Load the pre-trained Chinese word vectors with gensim; this takes a while.
cn_model = KeyedVectors.load_word2vec_format(
    'vectors/sgns.weibo.bigram-char',
    binary=False,
    unicode_errors='ignore',
)

# Read the training texts and labels. On each line the first character is
# the integer label and the text starts at the third character.
x_train = []
y_train = []
with open("data/data2.csv", encoding="utf-8") as fr:
    for line in fr:
        temp = line.strip()
        if not temp:
            # Skip blank lines instead of failing on temp[0].
            continue
        x_train.append(temp[2:])
        y_train.append(int(temp[0]))

if not x_train:
    raise ValueError("data/data2.csv contains no training samples")

# Shuffle samples and labels. Re-seeding with the same value before each
# shuffle applies the same permutation to both lists, so they stay aligned.
np.random.seed(116)
np.random.shuffle(x_train)
np.random.seed(116)
np.random.shuffle(y_train)

# Tokenize each text with jieba and map every word to its vocabulary index.
# Words missing from the vocabulary are mapped to 0.
x_train_tokens = [
    [cn_model.key_to_index.get(word, 0) for word in jieba.cut(text)]
    for text in x_train
]

# Normalize the sequence length. Samples differ in length; padding all of
# them to the longest one would waste computation, so a compromise length is
# chosen instead.
tokens_count = np.array(
    sorted((len(tokens) for tokens in x_train_tokens), reverse=True)
)

# Plot the distribution of sequence lengths:
# plt.plot(tokens_count)
# plt.ylabel('tokens count')
# plt.xlabel('tokens length')
# plt.show()

# When the lengths are roughly normally distributed, mean + 2 * std is a
# reasonable cut-off.
tokens_length = np.mean(tokens_count) + 2 * np.std(tokens_count)
print(tokens_length)

# Fraction of samples that are shorter than the chosen length. Shorter
# sequences are padded and longer ones are truncated below.
# NOTE(review): the original notes mention a length of 244 covering about
# 95% of the samples -- confirm against the current data set.
coverage = np.sum(tokens_count < tokens_length) / len(tokens_count)


def reverse_tokens(tokens):
    """Convert a sequence of vocabulary indices back into text.

    Index 0 is skipped because it stands for padding and for words that
    were not found in the vocabulary.

    Args:
        tokens: Iterable of integer vocabulary indices.

    Returns:
        The concatenation of the words for all non-zero indices.
    """
    return ''.join(
        cn_model.index_to_key[index] for index in tokens if index != 0
    )


# Quick check: print the first sample (rebuilt from its indices) and label.
print(reverse_tokens(x_train_tokens[0]))
print(y_train[0])

# Build the (NUM_WORDS, EMBEDDING_DIM) embedding matrix: row i holds the
# pre-trained vector of the i-th vocabulary entry.
embedding_matrix = np.zeros((NUM_WORDS, EMBEDDING_DIM))
for i in range(NUM_WORDS):
    embedding_matrix[i, :] = cn_model[cn_model.index_to_key[i]]
embedding_matrix = embedding_matrix.astype('float32')

# Pad (at the front) and truncate (from the front) every index sequence to
# the chosen length.
max_tokens = int(tokens_length)
x_train_tokens_pad = tf.keras.preprocessing.sequence.pad_sequences(
    x_train_tokens,
    maxlen=max_tokens,
    padding='pre',
    truncating='pre',
)
# Indices without a row in the embedding matrix are replaced by 0.
x_train_tokens_pad[x_train_tokens_pad >= NUM_WORDS] = 0

# Use 90% of the samples for training and 10% for testing.
x_tokens_train, x_tokens_test, y_tokens_train, y_tokens_test = train_test_split(
    x_train_tokens_pad,
    y_train,
    test_size=0.1,
    random_state=12,
)
print(y_tokens_train)
print(y_tokens_test)
print(x_tokens_train.shape)
print(embedding_matrix.shape)

# Build the model: frozen pre-trained embedding -> bidirectional LSTM ->
# LSTM -> 4-way softmax.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(
        NUM_WORDS,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=max_tokens,
        trainable=False,
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(units=64, return_sequences=True)
    ),
    tf.keras.layers.LSTM(32, return_sequences=False),
    tf.keras.layers.Dense(4, activation='softmax'),
])
model.compile(
    optimizer='Adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)
# print(model.summary())

# train_test_split returns the labels as lists; convert to arrays for Keras.
x_tokens_train = np.array(x_tokens_train)
y_tokens_train = np.array(y_tokens_train)
history = model.fit(
    x_tokens_train,
    y_tokens_train,
    batch_size=128,
    epochs=40,
    validation_split=0.1,
    validation_freq=1,
)
# print(model.summary())

x_tokens_test = np.array(x_tokens_test)
y_tokens_test = np.array(y_tokens_test)
result = model.evaluate(x_tokens_test, y_tokens_test)
print(f'Accuracy : {result[1]}')

# Optional training curves (disabled):
# plt.plot(history.history['loss'], label="$Loss$")
# plt.plot(history.history['val_loss'], label='$val_loss$')
# plt.title('Loss')
# plt.xlabel('epoch')
# plt.ylabel('num')
# plt.legend()
# plt.show()
#
# plt.plot(history.history['sparse_categorical_accuracy'],
#          label="$sparse_categorical_accuracy$")
# plt.plot(history.history['val_sparse_categorical_accuracy'],
#          label='$val_sparse_categorical_accuracy$')
# plt.title('Accuracy')
# plt.xlabel('epoch')
# plt.ylabel('num')
# plt.legend()
# plt.show()