python-weibo-analyze.rar：微博情感分析训练模型

共3个文件

py：1个

keras：1个

csv：1个

python

121 浏览量 2024-06-04 18:50:41 上传评论收藏 131.21MB RAR 举报

资源推荐

资源详情

资源评论

收起资源包目录

python_weibo_analyze.rar （3个子文件）

python22

weibo_senti_100k.csv 18.9MB

weibo_emotion_analyze.py 3KB

my_model.keras 141.69MB

# -*- coding: utf-8 -*-
# Train a bidirectional-LSTM sentiment classifier on the first 50,000 rows of
# weibo_senti_100k.csv, report accuracy on a held-out 20% split, save the
# model to my_model.keras, and print predictions for a few sample sentences.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.optimizers import Adam
import jieba

# Every sequence fed to the model (training, evaluation and inference) is
# padded or truncated to this many tokens, so it is defined exactly once.
PAD_LEN = 120


def cut_text(text):
    """Segment *text* with jieba and return the tokens joined by single spaces.

    The Keras ``Tokenizer`` splits on whitespace, so Chinese text has to be
    word-segmented and space-joined before it can be tokenized.
    """
    seg_list = jieba.cut(text)
    return ' '.join(seg_list)


df = pd.read_csv('weibo_senti_100k.csv', nrows=50000)

# Word segmentation.
word_cut = df['review'].map(cut_text)

# Split into training and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    word_cut, df['label'], test_size=0.2, random_state=42
)

# Convert text to integer sequences. The vocabulary is learned from the
# training split only, so no information from the test split leaks into the
# features; test words not seen in training are dropped by texts_to_sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_train)
X_train_seq = tokenizer.texts_to_sequences(x_train)
X_test_seq = tokenizer.texts_to_sequences(x_test)

# Informational only: longest training sequence, useful for judging how much
# truncation PAD_LEN causes. It is NOT used as the model input length.
max_len = max(len(x) for x in X_train_seq)
print(max_len)

# Pad / truncate every sequence to the fixed length.
X_train_pad = pad_sequences(X_train_seq, maxlen=PAD_LEN, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=PAD_LEN, padding='post')

# Model hyper-parameters.
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved for padding
embedding_dim = 100  # word-embedding dimension
lstm_units = 128  # number of LSTM units per direction

# Build the model. `input_length` is intentionally not passed to Embedding:
# the original passed the observed max_len, which disagreed with the actual
# padded length (PAD_LEN), and the argument is deprecated in Keras 3.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(lstm_units)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model.
optimizer = Adam(learning_rate=1e-3)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE(review): the test split doubles as validation data here; that is fine
# while nothing is tuned on it, but a separate validation split would be
# needed before adding early stopping or hyper-parameter search.
model.fit(X_train_pad, y_train, epochs=10, batch_size=200,
          validation_data=(X_test_pad, y_test))

# Evaluate the model.
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'测试集上的准确率: {accuracy:.2f}')

model.save('my_model.keras')

# Try the model on a few unseen sentences.
new_sentences = [
    '这手机用起来真是太流畅了，推荐！',
    '看了这部电影，感觉浪费了两个小时。',
    '新出的餐厅环境很好，菜品也很棒，值得一试！',
    '快递太慢了，等了一个星期还没到。',
    '今天的天气真好，心情也变得很美丽。',
    '这次更新的功能非常实用，点赞！',
    '真搞不懂这些人怎么想的，这样也能接受？',
    '旅游景点人太多了，根本没法好好玩。',
    '听说这个品牌的化妆品很不错，打算试试。',
    '真是失望，服务态度这么差，还怎么做生意。',
]
new_sentences = pd.Series(new_sentences)
new_sentences_cut = new_sentences.map(cut_text)
new_sequences2 = tokenizer.texts_to_sequences(new_sentences_cut)
new_padded = pad_sequences(new_sequences2, maxlen=PAD_LEN, padding='post')

# One forward pass is enough (the original ran predict twice). The sigmoid
# output has one column per sample; flatten it so each p is a scalar.
probabilities = model.predict(new_padded)
predictions = [1 if p > 0.5 else 0 for p in probabilities.ravel()]

for sentence, sentiment in zip(new_sentences, predictions):
    print(f'句子: {sentence} -> 情感: {"积极" if sentiment == 1 else "消极"}')

评论收藏

内容反馈