# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
# 只用pheme数据集,然后用train_test_split方式划分数据集
from __future__ import absolute_import, division, print_function
import argparse
import glob
import logging
import os
import random
import torch
import csv
import copy
import numpy as np
import pandas as pd
from torch import nn
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, Dataset, TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from transformers import DataProcessor, InputExample, InputFeatures
from transformers.modeling_utils import PreTrainedModel, SequenceSummary
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:  # fall back to tensorboardX on older PyTorch installs
    from tensorboardX import SummaryWriter
from tqdm import tqdm, trange
from transformers import (WEIGHTS_NAME, BertConfig, BertModel, BertTokenizer,
                          RobertaConfig, RobertaModel, RobertaTokenizer,
                          XLNetConfig, XLNetModel, XLNetTokenizer,
                          DistilBertConfig, DistilBertModel, DistilBertTokenizer,
                          AlbertConfig, AlbertModel, AlbertTokenizer,
                          )
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import glue_compute_metrics as compute_metrics
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
from transformers import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
os.environ['MASTER_ADDR'] = 'localhost'
os.environ['MASTER_PORT'] = '5678'
os.environ["RANK"] = "0"  # global process rank; local_rank is the GPU index within a process
os.environ['WORLD_SIZE'] = '1'  # total number of processes
os.environ["CUDA_VISIBLE_DEVICES"] = '1,2'
logger = logging.getLogger(__name__)
# ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig, DistilBertConfig, AlbertConfig)), ())
# ALL_MODELS = tuple(BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
ALL_MODELS = tuple(ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
# ALL_MODELS = tuple(XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP)
# ALL_MODELS = tuple(DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
# ALL_MODELS = tuple(ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP)
MODEL_CLASSES = {
    'bert': (BertConfig, BertModel, BertTokenizer),
    'xlnet': (XLNetConfig, XLNetModel, XLNetTokenizer),
    'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertModel, DistilBertTokenizer),
    'albert': (AlbertConfig, AlbertModel, AlbertTokenizer),
}
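# Usage sketch (assumes args.model_type is one of the keys above and
# args.model_name_or_path names a matching pretrained checkpoint):
#   config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
#   config = config_class.from_pretrained(args.model_name_or_path)
#   tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
#   model = model_class.from_pretrained(args.model_name_or_path, config=config)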
def set_seed(args):
    random.seed(args.seed)  # seed Python's built-in random module
    np.random.seed(args.seed)  # seed NumPy's global generator
    torch.manual_seed(args.seed)  # seed the CPU RNG so each run yields the same random numbers; returns a torch.Generator
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)  # seed every visible GPU
def simple_accuracy(preds, labels):
    return (preds == labels).mean()  # element-wise comparison, then the fraction of matches
def acc_and_f1(preds, labels):
    acc = simple_accuracy(preds, labels)
    f1 = f1_score(y_true=labels, y_pred=preds, average='weighted')  # per-class F1 weighted by class support
    return {
        "acc": acc,
        "f1": f1,
        "acc_and_f1": (acc + f1) / 2,
    }
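# Worked example (hedged): with preds = np.array([0, 1, 1]) and labels = np.array([0, 0, 1]),
# accuracy is 2/3; each class also gets a per-class F1 of 2/3, so the support-weighted F1
# is 2/3 and acc_and_f1 returns roughly {"acc": 0.667, "f1": 0.667, "acc_and_f1": 0.667}.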
class TextDataset(Dataset):
    def __init__(self, data_list):
        self.data_list = data_list

    def __getitem__(self, idx):
        return (self.data_list[idx]["source_tweet_text"],
                self.data_list[idx]["replies_tweet_text_list"],
                self.data_list[idx]["label_b"])

    def __len__(self):
        return len(self.data_list)
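# Usage sketch (hypothetical data_list; the keys follow __getitem__ above):
#   data_list = [{"source_tweet_text": "Police confirm the incident.",
#                 "replies_tweet_text_list": ["Is this verified?", "Source?"],
#                 "label_b": "unverified"}]
#   source_text, replies, label = TextDataset(data_list)[0]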
fixed_replies_num = 16
class CnesProcessor(DataProcessor):
    """Processor for the PHEME data set (GLUE-style, task name 'cnews')."""

    def get_example_from_tensor_dict(self, tensor_dict):
        """See base class."""
        return InputExample(tensor_dict['idx'].numpy(),
                            tensor_dict['sentence'].numpy().decode('utf-8'),
                            None,
                            str(tensor_dict['label'].numpy()))

    def get_post_examples(self, data_dir):
        """See base class."""
        return self.create_post_examples(
            self._read_tsv(os.path.join(data_dir, "post.csv")), "post")

    def get_comment_examples(self, data_dir):
        """See base class."""
        return self.create_comment_examples(
            self._read_tsv(os.path.join(data_dir, "comment.csv")), "comment")

    def get_labels(self):
        """See base class."""
        return ["true", "false", "unverified"]
    def create_post_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[1]
            label = line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

    def create_comment_examples(self, lines, set_type):
        """Creates examples for the training and dev sets."""
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
            text_a = line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=None))
        return examples
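    # Illustrative example (hypothetical row): a post.csv line ["true", "Police confirm it."]
    # becomes InputExample(guid="post-0", text_a="Police confirm it.", text_b=None, label="true");
    # comment rows carry no gold label, so create_comment_examples leaves label=None.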
    def source_data_change(self, source_data_dir):
        return self._data_change(os.path.join(source_data_dir, "train.csv"))
        # self._read_tsv(os.path.join(source_data_dir, "testing_features_extraction.csv")))

    def _data_change(self, source_data_dir):
        pd.set_option('display.float_format', lambda x: '%.3f' % x)  # disable scientific notation in pandas output
        all_dataframe = pd.read_csv(source_data_dir, lineterminator="\n")
        all_dataframe.rename(columns={"label_b\r": "label_b"}, inplace=True)
        all_dataframe["label_b"] = all_dataframe["label_b"].apply(lambda x: x.replace('\r', ''))
        all_dataframe["in_reply_to_status_id"].fillna(0, inplace=True)
        all_dataframe["in_reply_to_user_id_str"].fillna(0, inplace=True)
        all_dataframe["in_reply_to_status_id"] = all_dataframe["in_reply_to_status_id"].astype(int).astype(str)
        all_dataframe["in_reply_to_user_id_str"] = all_dataframe["in_reply_to_user_id_str"].astype(int).astype(str)
        all_dataframe[["belong_to_which_source_tweet", "twitter_id", "user_id"]] = all_dataframe[
            ["belong_to_which_source_tweet", "twitter_id", "user_id"]].astype(str)  # NOTE: the source was truncated here; .astype(str) is an assumed completion