# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
# 只用pheme数据集,然后用train_test_split方式划分数据集
from __future__ import absolute_import, division, print_function
import argparse
import glob
import logging
import os
import random
import torch
import csv
import copy
import numpy as np
import pandas as pd
from torch import nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, Dataset, TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from transformers import DataProcessor, InputExample, InputFeatures
from transformers.modeling_utils import PreTrainedModel, SequenceSummary
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:  # fall back to tensorboardX on older PyTorch installs
    from tensorboardX import SummaryWriter
from tqdm import tqdm, trange
from transformers import (WEIGHTS_NAME, BertConfig, BertModel, BertTokenizer,
                          RobertaConfig, RobertaModel, RobertaTokenizer,
                          XLNetConfig, XLNetModel, XLNetTokenizer,
                          DistilBertConfig, DistilBertModel, DistilBertTokenizer,
                          AlbertConfig, AlbertModel, AlbertTokenizer,
                          )
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '5678'
os.environ["RANK"] = "0"  # global process rank; local_rank is the GPU index within a process
os.environ['WORLD_SIZE'] = '1'  # total number of processes
os.environ["CUDA_VISIBLE_DEVICES"] = '1,2'
logger = logging.getLogger(__name__)
# ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig, DistilBertConfig, AlbertConfig)), ())
# ALL_MODELS = tuple(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
ALL_MODELS = tuple(ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
# ALL_MODELS = tuple(XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP)
# ALL_MODELS = tuple(DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
# ALL_MODELS = tuple(ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP)
MODEL_CLASSES = {
    'bert': (BertConfig, BertModel, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetModel, XLNetTokenizer),
    'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
    'albert': (AlbertConfig, AlbertModel, AlbertTokenizer),
}
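# Usage sketch (assumes args.model_type is one of the keys above and
# args.model_name_or_path names a matching pretrained checkpoint):
#   config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
#   config = config_class.from_pretrained(args.model_name_or_path)
#   tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
#   model = model_class.from_pretrained(args.model_name_or_path, config=config)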
def set_seed(args):
    random.seed(args.seed)  # seed Python's built-in random module
    np.random.seed(args.seed)  # seed NumPy's global generator
    torch.manual_seed(args.seed)  # seed the CPU RNG so each run yields the same random numbers; returns a torch.Generator
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)  # seed every visible GPU
def simple_accuracy(preds, labels):
    return (preds == labels).mean()  # element-wise comparison, then the fraction of matches
def acc_and_f1(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds, average='weighted')  # per-class F1 weighted by class support
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }
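# Worked example (hedged): with preds = np.array([0, 1, 1]) and labels = np.array([0, 0, 1]),
# accuracy is 2/3; each class also gets a per-class F1 of 2/3, so the support-weighted F1
# is 2/3 and acc_and_f1 returns roughly {"acc": 0.667, "f1": 0.667, "acc_and_f1": 0.667}.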
class TextDataset(Dataset):
    def __init__(self, data_list):
        self.data_list = data_list

    def __getitem__(self, idx):
        return (self.data_list[idx]["source_tweet_text"],
                self.data_list[idx]["replies_tweet_text_list"],
                self.data_list[idx]["label_b"])

    def __len__(self):
        return len(self.data_list)
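# Usage sketch (hypothetical data_list; the keys follow __getitem__ above):
#   data_list = [{"source_tweet_text": "Police confirm the incident.",
#                 "replies_tweet_text_list": ["Is this verified?", "Source?"],
#                 "label_b": "unverified"}]
#   source_text, replies, label = TextDataset(data_list)[0]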
fixed_replies_num = 16
class CnesProcessor(DataProcessor):
    """Processor for the PHEME data set (GLUE-style, task name 'cnews')."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence'].numpy().decode('utf-8'),
                            None,
                            str(tensor_dict['label'].numpy()))

    def get_post_examples(self, data_dir):
        """See base class."""
        return self.create_post_examples(
            self._read_tsv(os.path.join(data_dir, "post.csv")), "post")

    def get_comment_examples(self, data_dir):
        """See base class."""
        return self.create_comment_examples(
            self._read_tsv(os.path.join(data_dir, "comment.csv")), "comment")

    def get_labels(self):
        """See base class."""
        return ["true", "false", "unverified"]
    def create_post_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[1]
            label = line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

    def create_comment_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=None))
        return examples
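    # Illustrative example (hypothetical row): a post.csv line ["true", "Police confirm it."]
    # becomes InputExample(guid="post-0", text_a="Police confirm it.", text_b=None, label="true");
    # comment rows carry no gold label, so create_comment_examples leaves label=None.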
    def source_data_change(self, source_data_dir):
        return self._data_change(os.path.join(source_data_dir, "train.csv"))
        # self._read_tsv(os.path.join(source_data_dir, "testing_features_extraction.csv")))

    def _data_change(self, source_data_dir):
        pd.set_option('display.float_format', lambda x: '%.3f' % x)  # disable scientific notation in pandas output
        all_dataframe = pd.read_csv(source_data_dir, lineterminator="\n")
        all_dataframe.rename(columns={"label_b\r": "label_b"}, inplace=True)
        all_dataframe["label_b"] = all_dataframe["label_b"].apply(lambda x: x.replace('\r', ''))
        all_dataframe["in_reply_to_status_id"].fillna(0, inplace=True)
        all_dataframe["in_reply_to_user_id_str"].fillna(0, inplace=True)
        all_dataframe["in_reply_to_status_id"] = all_dataframe["in_reply_to_status_id"].astype(int).astype(str)
        all_dataframe["in_reply_to_user_id_str"] = all_dataframe["in_reply_to_user_id_str"].astype(int).astype(str)
        all_dataframe[["belong_to_which_source_tweet", "twitter_id", "user_id"]] = all_dataframe[
            ["belong_to_which_source_tweet", "twitter_id", "user_id"]].astype(str)  # NOTE: the source was truncated here; .astype(str) is an assumed completion