程式碼地址: https://github.com/Diego999/pyGAT
我並沒有完整看過這篇論文,但是在大致瞭解其原理之後就直接看了程式碼= =。
使用的資料集:Cora dataset
import numpy as np import pickle as pkl import networkx as nx import scipy.sparse as sp from scipy.sparse.linalg.eigen.arpack import eigsh import sys import time import numpy as np import tensorflow as tf
def load_data(dataset_str):  # {'pubmed', 'citeseer', 'cora'}
    """Load one citation dataset from the ``data/`` directory.

    Reads the seven pickled ``data/ind.<dataset_str>.<part>`` files and the
    ``data/ind.<dataset_str>.test.index`` file, stacks the training and test
    parts into one feature matrix and one label matrix, and derives the
    train / validation / test splits.

    Args:
        dataset_str: Dataset name used to build the file names.

    Returns:
        A tuple ``(adj, features, y_train, y_val, y_test, train_mask,
        val_mask, test_mask)``. ``features`` is a LIL sparse matrix; each
        ``y_*`` array has the shape of the full label matrix and is zero
        outside its own split.
    """
    objects = []
    for part in ('x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph'):
        with open("data/ind.{}.{}".format(dataset_str, part), 'rb') as f:
            # NOTE(security): pickle executes arbitrary code on load; only
            # use dataset files obtained from a trusted source.
            if sys.version_info > (3, 0):
                loaded = pkl.load(f, encoding='latin1')
            else:
                loaded = pkl.load(f)
        objects.append(loaded)
    x, y, tx, ty, allx, ally, graph = objects

    test_idx_reorder = parse_index_file(
        "data/ind.{}.test.index".format(dataset_str))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph):
        # find isolated nodes, add them as zero-vecs into the right position.
        test_idx_range_full = range(min(test_idx_reorder),
                                    max(test_idx_reorder)+1)
        n_full = len(test_idx_range_full)
        rows = test_idx_range-min(test_idx_range)

        tx_extended = sp.lil_matrix((n_full, x.shape[1]))
        tx_extended[rows, :] = tx
        tx = tx_extended

        ty_extended = np.zeros((n_full, y.shape[1]))
        ty_extended[rows, :] = ty
        ty = ty_extended

    # Stack train + test rows, then move the test rows from their sorted
    # positions to the positions listed in the index file.
    features = sp.vstack((allx, tx)).tolil()
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]

    # Splits: the first len(y) rows train, the next 500 rows validate,
    # and the indexed test rows test.
    idx_test = test_idx_range.tolist()
    idx_train = range(len(y))
    idx_val = range(len(y), len(y)+500)

    train_mask, val_mask, test_mask = (
        sample_mask(idx, labels.shape[0])
        for idx in (idx_train, idx_val, idx_test)
    )

    # Per-split label matrices: full-size, zero outside the split.
    y_train, y_val, y_test = (np.zeros(labels.shape) for _ in range(3))
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]

    print(adj.shape)
    print(features.shape)
    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
{ 1:[2,3], 4:[6,7,8], }
(0, 1) 1 (0, 30) 1 (0, 33) 1 (0, 99) 1
# Node indices of each split: the first len(y) rows are the training nodes,
# the following 500 rows are the validation nodes, and the test nodes are the
# ones listed (sorted) in test_idx_range.
idx_train = range(len(y))
idx_val = range(len(y), len(y)+500)
idx_test = test_idx_range.tolist()

# One mask per split, each as long as the label matrix has rows.
train_mask, val_mask, test_mask = (
    sample_mask(idx, labels.shape[0])
    for idx in (idx_train, idx_val, idx_test)
)
def sample_mask(idx, l):
    """Create a boolean mask of length ``l`` that is True at ``idx``.

    Args:
        idx: Positions to select (anything NumPy accepts as an index, e.g. a
            list or ``range`` of ints).
        l: Length of the mask.

    Returns:
        A 1-D NumPy array of dtype ``bool`` with ``True`` at the positions in
        ``idx`` and ``False`` everywhere else.
    """
    # The builtin ``bool`` is used as dtype: the ``np.bool`` alias was
    # deprecated in NumPy 1.20 and removed in 1.24, where it raises
    # AttributeError.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask
# Load the graph, then row-normalise the node features.
(adj, features,
 y_train, y_val, y_test,
 train_mask, val_mask, test_mask) = process.load_data(dataset)
features, spars = process.preprocess_features(features)

nb_nodes, ft_size = features.shape[:2]  # number of nodes, feature dimension
nb_classes = y_train.shape[1]           # number of classes

# Densify the adjacency matrix and give every array a leading axis of size 1.
adj = adj.todense()[np.newaxis]
features = features[np.newaxis]

y_train, y_val, y_test = y_train[np.newaxis], y_val[np.newaxis], y_test[np.newaxis]
train_mask, val_mask, test_mask = (
    train_mask[np.newaxis], val_mask[np.newaxis], test_mask[np.newaxis])

# Turn the adjacency matrix into the additive attention bias.
biases = process.adj_to_bias(adj, [nb_nodes], nhood=1)
def preprocess_features(features):
    """Row-normalize the feature matrix.

    Every row is scaled by the reciprocal of its sum; rows whose reciprocal
    is infinite (sum of zero) are scaled by 0 instead.

    Args:
        features: Feature matrix supporting ``.sum(1)`` that can be
            left-multiplied by a SciPy sparse diagonal matrix.

    Returns:
        A pair ``(dense, tuple_repr)``: the normalized matrix converted with
        ``todense()``, and the result of ``sparse_to_tuple`` on the same
        normalized matrix.
    """
    row_totals = np.array(features.sum(1))
    inv_totals = np.power(row_totals, -1).flatten()
    # Empty rows produce inf; zero them so those rows stay all-zero.
    inv_totals[np.isinf(inv_totals)] = 0.
    # Left-multiplying by diag(1 / rowsum) rescales each row.
    normalized = sp.diags(inv_totals).dot(features)
    return normalized.todense(), sparse_to_tuple(normalized)
我們獲得的features是由:sp.vstack((allx, tx)).tolil()得到的,也就是一個lil_matrix。lil_matrix是基於行的連結串列(List of Lists)格式儲存的稀疏矩陣,每一行只記錄非零元素的列索引及其對應的值,看以下例子:
- 對每一行求和
- 計算每個值的倒數,然後展開成一維陣列
- 將inf型別的值置為0
- 生成一個對角矩陣,對角的值為一維陣列的值,其餘的為0
- 最後用該對角矩陣左乘特徵矩陣(r_mat_inv.dot(features),即矩陣乘法),相當於將每一行除以該行的和
- 使用todense()函式將稀疏矩陣轉換為稠密矩陣
features:(2708,1433) -> (1,2708,1433)
adj:(2708,2708) -> (1,2708,2708)
y_train:(2708,7) -> (1,2708,7)
y_val:(2708,7) -> (1,2708,7)
y_test:(2708,7) -> (1,2708,7)
train_mask:(2708,) -> (1,2708)
val_mask:(2708,) -> (1,2708)
test_mask:(2708,) -> (1,2708)
最後是:biases = process.adj_to_bias(adj, [nb_nodes], nhood=1)
看一下adj_to_bias(adj, [nb_nodes], nhood=1)函式:
def adj_to_bias(adj, sizes, nhood=1):
    """Convert batched adjacency matrices into additive attention biases.

    For each graph ``g`` the matrix ``(adj[g] + I) ** nhood`` is computed.
    Inside the top-left ``sizes[g] x sizes[g]`` block every positive entry is
    set to 1. The function returns ``-1e9 * (1 - m)``, so an entry clamped to
    1 becomes 0 and an entry equal to 0 becomes ``-1e9``.

    Args:
        adj: Array of shape ``(nb_graphs, n, n)`` holding one adjacency
            matrix per graph.
        sizes: Sequence with, for each graph, the number of leading rows and
            columns to clamp.
        nhood: Number of times to multiply by ``adj[g] + I`` (neighbourhood
            radius in hops).

    Returns:
        Float array with the same shape as ``adj``.
    """
    nb_graphs = adj.shape[0]
    identity = np.eye(adj.shape[1])
    mt = np.empty(adj.shape)
    for g in range(nb_graphs):
        mt[g] = identity  # copied into mt; ``identity`` itself is not modified
        # Adding the identity keeps every node inside its own neighbourhood.
        # This term does not change across hops, so build it once per graph.
        hop = adj[g] + identity
        for _ in range(nhood):
            mt[g] = np.matmul(mt[g], hop)
        # Vectorised replacement of the former element-by-element Python
        # loop: ``block`` is a view, so the masked assignment writes into mt.
        block = mt[g, :sizes[g], :sizes[g]]
        block[block > 0.0] = 1.0
    return -1e9 * (1.0 - mt)
- 生成一個空(1,2708,2708)的矩陣mt
- 將mt[0],也就是(2708,2708)變為對角值為1的對角矩陣
- 該對角矩陣和(adj[g] + np.eye(adj.shape[1]))進行np.matmul
- 然後將mt中大於0的元素置為1
- 最後返回-1e9*(1-mt),形狀為(1,2708,2708)
import numpy as np
import tensorflow as tf

from utils import layers
from models.base_gattn import BaseGAttN


class GAT(BaseGAttN):
    """Graph attention network assembled from ``layers.attn_head`` heads."""

    @staticmethod
    def inference(inputs, nb_classes, nb_nodes, training, attn_drop, ffd_drop,
                  bias_mat, hid_units, n_heads, activation=tf.nn.elu,
                  residual=False):
        """Build the forward pass and return the class logits.

        The method takes no ``self`` and is declared static, so it can be
        called on the class (``GAT.inference(...)``) or on an instance.

        Args:
            inputs: Node features fed to the first layer.
            nb_classes: Output size of every head in the final layer.
            nb_nodes: Accepted for interface compatibility; not used here.
            training: Accepted for interface compatibility; not used here.
            attn_drop: Passed to every head as ``coef_drop``.
            ffd_drop: Passed to every head as ``in_drop``.
            bias_mat: Additive attention bias passed to every head.
            hid_units: Output size per head for each hidden layer.
            n_heads: Number of heads per layer; the last entry is the number
                of output heads.
            activation: Activation applied by the hidden heads.
            residual: Whether hidden layers after the first use residual
                connections. The first and the output layer always pass
                ``residual=False``.

        Returns:
            The average of the output heads.
        """
        # First hidden layer: heads run on the raw inputs and are concatenated.
        attns = [
            layers.attn_head(inputs, bias_mat=bias_mat,
                             out_sz=hid_units[0], activation=activation,
                             in_drop=ffd_drop, coef_drop=attn_drop,
                             residual=False)
            for _ in range(n_heads[0])
        ]
        h_1 = tf.concat(attns, axis=-1)

        # Remaining hidden layers, each fed with the previous concatenation.
        for i in range(1, len(hid_units)):
            attns = [
                layers.attn_head(h_1, bias_mat=bias_mat,
                                 out_sz=hid_units[i], activation=activation,
                                 in_drop=ffd_drop, coef_drop=attn_drop,
                                 residual=residual)
                for _ in range(n_heads[i])
            ]
            h_1 = tf.concat(attns, axis=-1)

        # Output layer: identity activation, heads averaged, not concatenated.
        out = [
            layers.attn_head(h_1, bias_mat=bias_mat,
                             out_sz=nb_classes, activation=lambda x: x,
                             in_drop=ffd_drop, coef_drop=attn_drop,
                             residual=False)
            for _ in range(n_heads[-1])
        ]
        logits = tf.add_n(out) / n_heads[-1]

        return logits
def attn_head(seq, out_sz, bias_mat, activation, in_drop=0.0, coef_drop=0.0, residual=False):
    """Build one graph-attention head.

    Args:
        seq: Input node features (assumed (batch, nodes, features) - TODO
            confirm against the caller).
        out_sz: Number of output features produced by this head.
        bias_mat: Tensor added to the attention logits before the softmax.
        activation: Callable applied to the final result.
        in_drop: Dropout rate for the input features and the projected
            features; ``1.0 - in_drop`` is passed to ``tf.nn.dropout``.
        coef_drop: Dropout rate for the attention coefficients.
        residual: If True, add the input (projected when its last dimension
            differs from the output's) to the result before the activation.

    Returns:
        ``activation`` applied to the attention-weighted, bias-added features.
    """
    with tf.name_scope('my_attn'):
        # NOTE(review): if in_drop / coef_drop are passed as TF1 placeholders,
        # these `!= 0.0` checks are presumably evaluated on the Python object
        # at graph-construction time, not on the fed value - confirm.
        if in_drop != 0.0:
            seq = tf.nn.dropout(seq, 1.0 - in_drop)

        # Kernel-size-1 convolution without bias: the same linear projection
        # to out_sz features is applied at every position.
        seq_fts = tf.layers.conv1d(seq, out_sz, 1, use_bias=False)

        # simplest self-attention possible
        # Two independent size-1 convolutions give one scalar per position.
        f_1 = tf.layers.conv1d(seq_fts, 1, 1)
        f_2 = tf.layers.conv1d(seq_fts, 1, 1)
        # Broadcasting f_1 against the transposed f_2 yields one logit per
        # ordered pair of positions.
        logits = f_1 + tf.transpose(f_2, [0, 2, 1])
        # LeakyReLU is applied first, then bias_mat is added, then softmax.
        coefs = tf.nn.softmax(tf.nn.leaky_relu(logits) + bias_mat)

        if coef_drop != 0.0:
            coefs = tf.nn.dropout(coefs, 1.0 - coef_drop)
        if in_drop != 0.0:
            seq_fts = tf.nn.dropout(seq_fts, 1.0 - in_drop)

        # Weighted sum of the projected features using the coefficients.
        vals = tf.matmul(coefs, seq_fts)
        ret = tf.contrib.layers.bias_add(vals)

        # residual connection
        if residual:
            if seq.shape[-1] != ret.shape[-1]:
                # Project the input so its last dimension matches ret's.
                # NOTE(review): bare `conv1d` is not defined in this block;
                # presumably a module-level alias of tf.layers.conv1d - confirm.
                ret = ret + conv1d(seq, ret.shape[-1], 1)
            else:
                ret = ret + seq

        return activation(ret)  # activation
- 輸入features:(1,2708,1433)
- 第一步對特徵features卷積,得到seq_fts:(1,2708,8)
- 第二步分別對seq_fts進行卷積,得到f_1:(1,2708,1),f_2:(1,2708,1)
- 第三步將f_2調整形狀為:(1,1,2708),利用廣播機制與f_1相加得到logits:(1,2708,2708)
- 第四步先對logits使用leakyrelu進行啟用,再加上偏置項bias_mat,最後經過softmax生成注意力得分coefs:(1,2708,2708)。我們注意到偏置項在不相鄰的位置上是一個絕對值很大的負數(-1e9),在相鄰的位置上為0,這是為了在softmax中過濾掉和該節點不相鄰的節點,因為考慮的是某節點和其鄰居節點之間的注意力。
- 第五步將注意力得分與seq_fts進行tf.matmul得到帶注意力得分的vals:(1,2708,8)
- 第六步:ret = tf.contrib.layers.bias_add(vals),這裡我執行時是報錯的,說沒有什麼TPU之類的,我將其註釋掉了才執行成功(tensorflow1.14)
- 最後將ret進行啟用:tf.nn.elu
主要是如何對注意力進行建模,實際上f_1加上轉置後的f_2,也就是(1,2708,1) + (1,1,2708),透過廣播機制得到(1,2708,2708),這裡就計算了每個節點和其它節點之間的得分。
class BaseGAttN:
    """Loss, optimisation and metric helpers shared by the attention models.

    None of the helpers takes ``self``. They are declared static so they can
    be called on the class or on an instance.
    """

    @staticmethod
    def loss(logits, labels, nb_classes, class_weights):
        """Return the class-weighted mean sparse softmax cross-entropy.

        Args:
            logits: Unnormalised class scores.
            labels: Integer class ids (they are one-hot encoded here).
            nb_classes: Depth of the one-hot encoding.
            class_weights: Per-class weights, multiplied with the one-hot
                labels to obtain one weight per sample.
        """
        sample_wts = tf.reduce_sum(
            tf.multiply(tf.one_hot(labels, nb_classes), class_weights), axis=-1)
        xentropy = tf.multiply(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=labels, logits=logits),
            sample_wts)
        return tf.reduce_mean(xentropy, name='xentropy_mean')

    @staticmethod
    def training(loss, lr, l2_coef):
        """Return an Adam training op minimising ``loss`` plus an L2 penalty.

        Args:
            loss: Scalar loss tensor.
            lr: Learning rate for ``tf.train.AdamOptimizer``.
            l2_coef: Multiplier of the summed ``tf.nn.l2_loss`` terms.
        """
        # weight decay
        trainable = tf.trainable_variables()
        # NOTE(review): this compares the complete variable name against the
        # short names below. If variable names carry a scope prefix or an
        # output suffix (e.g. "conv1d/bias:0"), nothing is excluded and the
        # biases are penalised too. Kept as is to preserve training
        # behaviour - confirm the intent.
        lossL2 = tf.add_n([
            tf.nn.l2_loss(v) for v in trainable
            if v.name not in ['bias', 'gamma', 'b', 'g', 'beta']
        ]) * l2_coef

        # optimizer
        opt = tf.train.AdamOptimizer(learning_rate=lr)

        # training op
        train_op = opt.minimize(loss+lossL2)

        return train_op

    @staticmethod
    def preshape(logits, labels, nb_classes):
        """Flatten ``logits`` to ``[-1, nb_classes]`` and ``labels`` to ``[-1]``."""
        new_sh_lab = [-1]
        new_sh_log = [-1, nb_classes]
        log_resh = tf.reshape(logits, new_sh_log)
        lab_resh = tf.reshape(labels, new_sh_lab)
        return log_resh, lab_resh

    @staticmethod
    def confmat(logits, labels):
        """Return the confusion matrix of ``labels`` against argmax predictions."""
        preds = tf.argmax(logits, axis=1)
        return tf.confusion_matrix(labels, preds)

    ##########################
    # Adapted from tkipf/gcn #
    ##########################

    @staticmethod
    def masked_softmax_cross_entropy(logits, labels, mask):
        """Softmax cross-entropy loss with masking.

        The mask is divided by its own mean before it is applied, so the
        final mean over all entries equals the mean over the masked entries.
        """
        loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
        mask = tf.cast(mask, dtype=tf.float32)
        mask /= tf.reduce_mean(mask)
        loss *= mask
        return tf.reduce_mean(loss)

    @staticmethod
    def masked_sigmoid_cross_entropy(logits, labels, mask):
        """Sigmoid cross-entropy loss with masking.

        The element-wise loss is averaged over axis 1 before the mean-
        normalised mask is applied.
        """
        labels = tf.cast(labels, dtype=tf.float32)
        loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)
        loss = tf.reduce_mean(loss, axis=1)
        mask = tf.cast(mask, dtype=tf.float32)
        mask /= tf.reduce_mean(mask)
        loss *= mask
        return tf.reduce_mean(loss)

    @staticmethod
    def masked_accuracy(logits, labels, mask):
        """Accuracy with masking."""
        correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
        accuracy_all = tf.cast(correct_prediction, tf.float32)
        mask = tf.cast(mask, dtype=tf.float32)
        mask /= tf.reduce_mean(mask)
        accuracy_all *= mask
        return tf.reduce_mean(accuracy_all)

    @staticmethod
    def micro_f1(logits, labels, mask):
        """F1 score with masking, computed from pooled TP / FP / FN counts.

        Predictions are the rounded sigmoid of ``logits``.
        """
        predicted = tf.round(tf.nn.sigmoid(logits))

        # Use integers to avoid any nasty FP behaviour
        predicted = tf.cast(predicted, dtype=tf.int32)
        labels = tf.cast(labels, dtype=tf.int32)
        mask = tf.cast(mask, dtype=tf.int32)

        # expand the mask so that broadcasting works ([nb_nodes, 1])
        mask = tf.expand_dims(mask, -1)

        # Count true positives, true negatives, false positives and false
        # negatives. ``tn`` is computed but not used in the score below.
        tp = tf.count_nonzero(predicted * labels * mask)
        tn = tf.count_nonzero((predicted - 1) * (labels - 1) * mask)
        fp = tf.count_nonzero(predicted * (labels - 1) * mask)
        fn = tf.count_nonzero((predicted - 1) * labels * mask)

        # Calculate precision, recall and F1 score.
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        fmeasure = (2 * precision * recall) / (precision + recall)
        fmeasure = tf.cast(fmeasure, tf.float32)
        return fmeasure
# Build the graph, train with early stopping on the validation split, then
# restore the saved checkpoint and evaluate on the test split.
with tf.Graph().as_default():
    with tf.name_scope('input'):
        # Placeholders for one batch: features, attention bias, labels, mask.
        ftr_in = tf.placeholder(dtype=tf.float32, shape=(batch_size, nb_nodes, ft_size))
        bias_in = tf.placeholder(dtype=tf.float32, shape=(batch_size, nb_nodes, nb_nodes))
        lbl_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes, nb_classes))
        msk_in = tf.placeholder(dtype=tf.int32, shape=(batch_size, nb_nodes))
        # Scalar placeholders so dropout can be switched per sess.run call.
        attn_drop = tf.placeholder(dtype=tf.float32, shape=())
        ffd_drop = tf.placeholder(dtype=tf.float32, shape=())
        is_train = tf.placeholder(dtype=tf.bool, shape=())

    logits = model.inference(ftr_in, nb_classes, nb_nodes, is_train,
                             attn_drop, ffd_drop,
                             bias_mat=bias_in,
                             hid_units=hid_units, n_heads=n_heads,
                             residual=residual, activation=nonlinearity)
    # Merge the batch and node axes so loss and accuracy see one row per node.
    log_resh = tf.reshape(logits, [-1, nb_classes])
    lab_resh = tf.reshape(lbl_in, [-1, nb_classes])
    msk_resh = tf.reshape(msk_in, [-1])
    loss = model.masked_softmax_cross_entropy(log_resh, lab_resh, msk_resh)
    accuracy = model.masked_accuracy(log_resh, lab_resh, msk_resh)

    train_op = model.training(loss, lr, l2_coef)

    saver = tf.train.Saver()

    init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

    # Early-stopping state: best validation loss / accuracy seen so far and
    # the number of consecutive epochs without improvement.
    vlss_mn = np.inf
    vacc_mx = 0.0
    curr_step = 0

    with tf.Session() as sess:
        sess.run(init_op)

        # Running sums over the batches of the current epoch.
        train_loss_avg = 0
        train_acc_avg = 0
        val_loss_avg = 0
        val_acc_avg = 0

        for epoch in range(nb_epochs):
            # Training pass; slices are taken along axis 0 of the arrays.
            tr_step = 0
            tr_size = features.shape[0]

            while tr_step * batch_size < tr_size:
                _, loss_value_tr, acc_tr = sess.run([train_op, loss, accuracy],
                    feed_dict={
                        ftr_in: features[tr_step*batch_size:(tr_step+1)*batch_size],
                        bias_in: biases[tr_step*batch_size:(tr_step+1)*batch_size],
                        lbl_in: y_train[tr_step*batch_size:(tr_step+1)*batch_size],
                        msk_in: train_mask[tr_step*batch_size:(tr_step+1)*batch_size],
                        is_train: True,
                        attn_drop: 0.6, ffd_drop: 0.6})
                train_loss_avg += loss_value_tr
                train_acc_avg += acc_tr
                tr_step += 1

            # Validation pass: same inputs, validation labels and mask,
            # dropout rates set to 0 and no train_op.
            vl_step = 0
            vl_size = features.shape[0]

            while vl_step * batch_size < vl_size:
                loss_value_vl, acc_vl = sess.run([loss, accuracy],
                    feed_dict={
                        ftr_in: features[vl_step*batch_size:(vl_step+1)*batch_size],
                        bias_in: biases[vl_step*batch_size:(vl_step+1)*batch_size],
                        lbl_in: y_val[vl_step*batch_size:(vl_step+1)*batch_size],
                        msk_in: val_mask[vl_step*batch_size:(vl_step+1)*batch_size],
                        is_train: False,
                        attn_drop: 0.0, ffd_drop: 0.0})
                val_loss_avg += loss_value_vl
                val_acc_avg += acc_vl
                vl_step += 1

            print('Training: loss = %.5f, acc = %.5f | Val: loss = %.5f, acc = %.5f' %
                  (train_loss_avg/tr_step, train_acc_avg/tr_step,
                   val_loss_avg/vl_step, val_acc_avg/vl_step))

            # Improvement on either criterion resets the patience counter.
            if val_acc_avg/vl_step >= vacc_mx or val_loss_avg/vl_step <= vlss_mn:
                # A checkpoint is written only when both criteria improve (or
                # tie) at once; the *_early_model values describe it.
                if val_acc_avg/vl_step >= vacc_mx and val_loss_avg/vl_step <= vlss_mn:
                    vacc_early_model = val_acc_avg/vl_step
                    vlss_early_model = val_loss_avg/vl_step
                    saver.save(sess, checkpt_file)
                vacc_mx = np.max((val_acc_avg/vl_step, vacc_mx))
                vlss_mn = np.min((val_loss_avg/vl_step, vlss_mn))
                curr_step = 0
            else:
                curr_step += 1
                # Stop after `patience` consecutive epochs without improvement.
                if curr_step == patience:
                    print('Early stop! Min loss: ', vlss_mn, ', Max accuracy: ', vacc_mx)
                    print('Early stop model validation loss: ', vlss_early_model, ', accuracy: ', vacc_early_model)
                    break

            # Reset the running sums for the next epoch.
            train_loss_avg = 0
            train_acc_avg = 0
            val_loss_avg = 0
            val_acc_avg = 0

        # Evaluate the checkpointed weights, not the last-epoch weights.
        saver.restore(sess, checkpt_file)

        ts_size = features.shape[0]
        ts_step = 0
        ts_loss = 0.0
        ts_acc = 0.0

        while ts_step * batch_size < ts_size:
            loss_value_ts, acc_ts = sess.run([loss, accuracy],
                feed_dict={
                    ftr_in: features[ts_step*batch_size:(ts_step+1)*batch_size],
                    bias_in: biases[ts_step*batch_size:(ts_step+1)*batch_size],
                    lbl_in: y_test[ts_step*batch_size:(ts_step+1)*batch_size],
                    msk_in: test_mask[ts_step*batch_size:(ts_step+1)*batch_size],
                    is_train: False,
                    attn_drop: 0.0, ffd_drop: 0.0})
            ts_loss += loss_value_ts
            ts_acc += acc_ts
            ts_step += 1

        print('Test loss:', ts_loss/ts_step, '; Test accuracy:', ts_acc/ts_step)

        # NOTE(review): presumably redundant, since the enclosing `with`
        # block should already close the session on exit - confirm.
        sess.close()