图卷积神经网络GCN经典案例，CORA数据集，分类任务，纯PyTorch编写，注释清晰，可视化直观感受

共4个文件

readme：1个

py：1个

content：1个

数据集

72 浏览量 2023-06-03 10:59:01 上传评论 5 收藏 179KB ZIP 举报

资源推荐

资源详情

资源评论

收起资源包目录

gcn_test.zip （4个子文件）

cora

README 2KB

cora.cites 68KB

cora.content 7.46MB

gcn_test.py 9KB

# A graph neural network (GNN) is a neural network that learns on graphs; it can be used
# for classification, prediction and similar tasks on graph-structured data.
# A GCN (graph convolutional network) is one kind of GNN, much as a CNN is one kind of
# neural network: it extracts features mainly through convolution operations.
# A good way to get started with GCNs is to read tutorials and run a classic example.
# The CORA dataset is one such classic introductory example for GNNs.
# CORA contains 2708 papers. Each paper is labelled with exactly one category and is
# described by a 1433-dimensional word vector; papers cite one another, so the data
# forms a typical graph.
# The task: given a paper and its word vector, predict the category of the paper.
#
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data

# Classification labels; for the CORA dataset these are the 7 paper categories.
classes = ['Case_Based', 'Genetic_Algorithms', 'Neural_Networks', 'Probabilistic_Methods',
           'Reinforcement_Learning', 'Rule_Learning', 'Theory']


# Custom dataset type
class DATA(Data):
    """Container for the CORA graph, built on ``torch_geometric.data.Data``.

    Inheriting from ``Data`` lets the whole dataset be moved to a device with
    ``.to(device)`` (GPU or CPU) and lets ``num_features`` / ``num_nodes`` be
    derived from the stored tensors; this class only has to fill ``x``, ``y``
    and ``edge_index``.
    """

    def __init__(self):
        """Create an empty dataset; ``read`` fills in the actual data."""
        super().__init__()
        # Input samples x (node features); empty until read() loads them.
        self.x = torch.empty(size=(0, self.num_features), dtype=torch.float)
        # Output samples y (class indices). Kept as int64 because the loss
        # function used for training expects int64 targets.
        self.y = torch.empty(0, dtype=torch.int64)
        # All edges of the graph: 2 rows x n columns, n being the number of
        # edges. Stored as int64 (torch.long), the dtype PyG uses for edge_index.
        self.edge_index = torch.empty(size=(2, 0), dtype=torch.long)
        # Paper ids in file order. This attribute is our own (not part of Data);
        # read() uses it to translate paper ids into node indices.
        self.paper_ids = np.array([], dtype=int)
        # Train / validation / test membership masks; empty until read() sets
        # them. (The original author notes that omitting them causes an error.)
        self.train_mask = torch.zeros(0, dtype=torch.bool)
        self.val_mask = torch.zeros(0, dtype=torch.bool)
        self.test_mask = torch.zeros(0, dtype=torch.bool)

    def read(self, content_path, cites_path):
        """Load features, labels, edges and split masks from the CORA text files.

        Any data loaded by a previous call is replaced, not appended to.

        Args:
            content_path: path of ``cora.content``. Each non-blank line is
                tab-separated: paper id, word attributes, class label.
            cites_path: path of ``cora.cites``. Each non-blank line is
                tab-separated and holds two paper ids describing one citation.

        Raises:
            ValueError: if an id or word attribute is not an integer, if a class
                label is not in ``classes``, or if rows differ in length.
            IndexError: if a line of ``cites_path`` has fewer than two fields or
                references a paper id that is absent from ``content_path``.
        """
        # --- cora.content: paper id, word vector and class label per line ---
        # Rows are collected in Python lists and converted once at the end;
        # concatenating tensors line by line would copy all data on every row.
        paper_ids, feature_rows, labels = [], [], []
        with open(content_path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                fields = stripped.split('\t')
                paper_ids.append(int(fields[0]))
                # Everything between the first and last field is the word vector.
                feature_rows.append(np.array(fields[1:-1], dtype=int))
                labels.append(classes.index(fields[-1]))

        if feature_rows:
            feature_matrix = np.stack(feature_rows)
        else:
            feature_matrix = np.empty((0, 0), dtype=int)

        self.paper_ids = np.array(paper_ids, dtype=int)
        self.x = torch.tensor(feature_matrix, dtype=torch.float)
        self.y = torch.tensor(labels, dtype=torch.int64)

        # Map paper id -> node index. setdefault keeps the FIRST occurrence of a
        # duplicated id, which is what a first-match search over the id array
        # would return.
        index_of = {}
        for position, paper_id in enumerate(paper_ids):
            index_of.setdefault(paper_id, position)

        # --- cora.cites: one citation (pair of paper ids) per line ---
        cited_indices, citing_indices = [], []
        with open(cites_path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue
                fields = stripped.split('\t')
                first_id, second_id = int(fields[0]), int(fields[1])
                try:
                    cited_index = index_of[first_id]
                    citing_index = index_of[second_id]
                except KeyError as missing:
                    # IndexError is what the array-based lookup raised before.
                    raise IndexError(
                        f"paper id {missing.args[0]} in {cites_path!r} "
                        f"does not appear in {content_path!r}"
                    ) from None
                cited_indices.append(cited_index)
                citing_indices.append(citing_index)

        # Row 0 holds the first column of the file, row 1 the second column.
        # NOTE(review): each citation is stored in this one direction only;
        # confirm whether the graph is meant to be symmetric for GCN training.
        self.edge_index = torch.tensor([cited_indices, citing_indices], dtype=torch.long)

        # --- train / validation / test split ---
        # Simple positional split: first 85% train, next 10% validation,
        # last 5% test.
        num_nodes = self.num_nodes
        train_end = int(0.85 * num_nodes)
        val_end = int(0.95 * num_nodes)

        train_mask = torch.zeros(num_nodes, dtype=torch.bool)
        val_mask = torch.zeros(num_nodes, dtype=torch.bool)
        test_mask = torch.zeros(num_nodes, dtype=torch.bool)
        train_mask[0:train_end] = True
        val_mask[train_end:val_end] = True
        test_mask[val_end:] = True

        self.train_mask = train_mask
        self.val_mask = val_mask
        self.test_mask = test_mask


# Network definition
class Net(torch.nn.Module):
    """Two-layer GCN: in_channels -> 16 -> out_channels.

    For CORA this is a 1433-dimensional input reduced to 16 features by the
    first graph convolution and to 7 outputs (one per class) by the second.
    """

    def __init__(self, in_channels, out_channels):
        """Build the two graph-convolution layers.

        Args:
            in_channels: size of each node's input feature vector.
            out_channels: number of outputs per node (number of classes).
        """
        super().__init__()
        self.conv1 = GCNConv(in_channels, 16)   # layer 1: in_channels -> 16
        self.conv2 = GCNConv(16, out_channels)  # layer 2: 16 -> out_channels

    def forward(self, x, edge_index):
        """Return per-node log-probabilities over the output classes.

        Args:
            x: node feature tensor.
            edge_index: edge tensor passed to both GCN layers.
        """
        # Layer 1: convolution followed by ReLU activation.
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Layer 2: convolution.
        x = self.conv2(x, edge_index)
        # log-softmax over the class dimension gives the classification output.
        return F.log_softmax(x, dim=1)


# Visualization: prepare the canvas with two side-by-side subplots.
fig1 = plt.figure(figsize=(7., 3.))
ax1, ax2 = fig1.add_subplot(121), fig1.add_subplot(122)

# 1.
训练是使用GPU，还是使用CPU device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print("GCN_TEST: use device=", device) # 2. 读取数据集 # # option 1 直接从本地读取CORA数据集 data = DATA() data.read(content_path='cora/cora.content', cites_path='cora/cora.cites') data.to(device) # 把数据集放到GPU上，如果有的话 # 2. 模型，实际上就是上面建立的GCN网络 model = Net(in_channels=data.num_features, out_channels=len(classes)).to(device) # 模型也放到GPU上，如果有的话 # 3. 训练过程中所使用的优化器，也就是误差怎样反向传播，这里使用Adam optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4) # 4. 模型训练 model.train() plot_loss = np.array([]) for epoch in range(200): # # 初始化梯度 optimizer.zero_grad() # 前向计算，模型的输入有节点特征还有边特征,使用的是全部数据 out = model.forward(data.x, data.edge_index) # 损失仅仅计算的是训练集的损失 loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask]) # 误差反向传播 loss.backward() optimizer.step() # 可视化 plot_loss = np.hstack((plot_loss, loss.cpu().detach().numpy())) ax1.plot(plot_loss, c='b') ax1.set_xlabel('epoch') ax1.set_ylabel('loss') ax1.set_t

评论收藏

内容反馈