1、 transE 表示学习
知识图谱中的事实是用三元组 (h, l, t) 表示的,那么如何用低维稠密向量来表示它们,才能得到这种依赖关系呢?transE 算法的思想非常简单,它受 word2vec 平移不变性的启发,希望 h + l ≈ t。

参考:https://github.com/Anery/transE

import codecs
import random
import math
import numpy as np
import copy
import timeentity2id = {}
relation2id = {}def data_loader(file):file1 = file + "train.txt"file2 = file + "entity2id.txt"file3 = file + "relation2id.txt"with open(file2, 'r') as f1, open(file3, 'r') as f2:lines1 = f1.readlines()lines2 = f2.readlines()for line in lines1:line = line.strip().split('\t')if len(line) != 2:continueentity2id[line[0]] = line[1]for line in lines2:line = line.strip().split('\t')if len(line) != 2:continuerelation2id[line[0]] = line[1]entity_set = set()relation_set = set()triple_list = []with codecs.open(file1, 'r') as f:content = f.readlines()for line in content:triple = line.strip().split("\t")if len(triple) != 3:continueh_ = entity2id[triple[0]]t_ = entity2id[triple[1]]r_ = relation2id[triple[2]]triple_list.append([h_,t_,r_])entity_set.add(h_)entity_set.add(t_)relation_set.add(r_)return entity_set, relation_set, triple_listdef distanceL2(h,r,t):#为方便求梯度,去掉sqrtreturn np.sum(np.square(h + r - t))def distanceL1(h,r,t):return np.sum(np.fabs(h+r-t))class TransE:def __init__(self, entity_set, relation_set, triple_list,embedding_dim=100, learning_rate=0.01, margin=1, L1=True):self.embedding_dim = embedding_dimself.learning_rate = learning_rateself.margin = marginself.entity = entity_setself.relation = relation_setself.triple_list = triple_listself.L1=L1self.loss = 0def emb_initialize(self):relation_dict = {}entity_dict = {}for relation in self.relation:r_emb_temp = np.random.uniform(-6/math.sqrt(self.embedding_dim) ,6/math.sqrt(self.embedding_dim) ,self.embedding_dim)relation_dict[relation] = r_emb_temp / np.linalg.norm(r_emb_temp,ord=2)for entity in self.entity:e_emb_temp = np.random.uniform(-6/math.sqrt(self.embedding_dim) ,6/math.sqrt(self.embedding_dim) ,self.embedding_dim)entity_dict[entity] = e_emb_temp / np.linalg.norm(e_emb_temp,ord=2)self.relation = relation_dictself.entity = entity_dictdef train(self, epochs):nbatches = 100batch_size = len(self.triple_list) // nbatchesprint("batch size: ", batch_size)for epoch in range(epochs):start = time.time()self.loss = 0for k 
in range(nbatches):# Sbatch:listSbatch = random.sample(self.triple_list, batch_size)Tbatch = []for triple in Sbatch:# 每个triple选3个负样例# for i in range(3):corrupted_triple = self.Corrupt(triple)if (triple, corrupted_triple) not in Tbatch:Tbatch.append((triple, corrupted_triple))self.update_embeddings(Tbatch)end = time.time()print("epoch: ", epoch , "cost time: %s"%(round((end - start),3)))print("loss: ", self.loss)#保存临时结果if epoch % 20 == 0:with codecs.open("entity_temp", "w") as f_e:for e in self.entity.keys():f_e.write(e + "\t")f_e.write(str(list(self.entity[e])))f_e.write("\n")with codecs.open("relation_temp", "w") as f_r:for r in self.relation.keys():f_r.write(r + "\t")f_r.write(str(list(self.relation[r])))f_r.write("\n")print("写入文件...")with codecs.open("entity_50dim_batch400", "w") as f1:for e in self.entity.keys():f1.write(e + "\t")f1.write(str(list(self.entity[e])))f1.write("\n")with codecs.open("relation50dim_batch400", "w") as f2:for r in self.relation.keys():f2.write(r + "\t")f2.write(str(list(self.relation[r])))f2.write("\n")print("写入完成")def Corrupt(self,triple):corrupted_triple = copy.deepcopy(triple)seed = random.random()if seed > 0.5:# 替换headrand_head = triple[0]while rand_head == triple[0]:rand_head = random.sample(self.entity.keys(),1)[0]corrupted_triple[0]=rand_headelse:# 替换tailrand_tail = triple[1]while rand_tail == triple[1]:rand_tail = random.sample(self.entity.keys(), 1)[0]corrupted_triple[1] = rand_tailreturn corrupted_tripledef update_embeddings(self, Tbatch):copy_entity = copy.deepcopy(self.entity)copy_relation = copy.deepcopy(self.relation)for triple, corrupted_triple in Tbatch:# 取copy里的vector累积更新h_correct_update = copy_entity[triple[0]]t_correct_update = copy_entity[triple[1]]relation_update = copy_relation[triple[2]]h_corrupt_update = copy_entity[corrupted_triple[0]]t_corrupt_update = copy_entity[corrupted_triple[1]]# 取原始的vector计算梯度h_correct = self.entity[triple[0]]t_correct = self.entity[triple[1]]relation = self.relation[triple[2]]h_corrupt 
= self.entity[corrupted_triple[0]]t_corrupt = self.entity[corrupted_triple[1]]if self.L1:dist_correct = distanceL1(h_correct, relation, t_correct)dist_corrupt = distanceL1(h_corrupt, relation, t_corrupt)else:dist_correct = distanceL2(h_correct, relation, t_correct)dist_corrupt = distanceL2(h_corrupt, relation, t_corrupt)err = self.hinge_loss(dist_correct, dist_corrupt)if err > 0:self.loss += errgrad_pos = 2 * (h_correct + relation - t_correct)grad_neg = 2 * (h_corrupt + relation - t_corrupt)if self.L1:for i in range(len(grad_pos)):if (grad_pos[i] > 0):grad_pos[i] = 1else:grad_pos[i] = -1for i in range(len(grad_neg)):if (grad_neg[i] > 0):grad_neg[i] = 1else:grad_neg[i] = -1# head系数为正,减梯度;tail系数为负,加梯度h_correct_update -= self.learning_rate * grad_post_correct_update -= (-1) * self.learning_rate * grad_pos# corrupt项整体为负,因此符号与correct相反if triple[0] == corrupted_triple[0]:  # 若替换的是尾实体,则头实体更新两次h_correct_update -= (-1) * self.learning_rate * grad_negt_corrupt_update -= self.learning_rate * grad_negelif triple[1] == corrupted_triple[1]:  # 若替换的是头实体,则尾实体更新两次h_corrupt_update -= (-1) * self.learning_rate * grad_negt_correct_update -= self.learning_rate * grad_neg#relation更新两次relation_update -= self.learning_rate*grad_posrelation_update -= (-1)*self.learning_rate*grad_neg#batch normfor i in copy_entity.keys():copy_entity[i] /= np.linalg.norm(copy_entity[i])for i in copy_relation.keys():copy_relation[i] /= np.linalg.norm(copy_relation[i])# 达到批量更新的目的self.entity = copy_entityself.relation = copy_relationdef hinge_loss(self,dist_correct,dist_corrupt):return max(0,dist_correct-dist_corrupt+self.margin)if __name__=='__main__':file1 = "FB15k/"entity_set, relation_set, triple_list = data_loader(file1)print("load file...")print("Complete load. 
entity : %d , relation : %d , triple : %d" % (len(entity_set),len(relation_set),len(triple_list)))transE = TransE(entity_set, relation_set, triple_list,embedding_dim=50, learning_rate=0.01, margin=1,L1=True)transE.emb_initialize()transE.train(epochs=1001)

2、node2vec 图嵌入方法

基于随机游走的图嵌入方法,通过可调的游走策略结合了 DeepWalk 的 DFS 式与 LINE 的 BFS 式邻域采样。

参考:https://github.com/eliorc/node2vec

pip install node2vec

import networkx as nx
from node2vec import Node2Vec# Create a graph
graph = nx.fast_gnp_random_graph(n=100, p=0.5)# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(graph, dimensions=64, walk_length=30, num_walks=200, workers=4)  # Use temp_folder for big graphs# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `diemnsions` and `workers` are automatically passed (from the Node2Vec constructor)# Look for most similar nodes
model.wv.most_similar('2')  # Output node names are always strings# Save embeddings for later use
model.wv.save_word2vec_format(EMBEDDING_FILENAME)# Save model for later use
model.save(EMBEDDING_MODEL_FILENAME)# Embed edges using Hadamard method
from node2vec.edges import HadamardEmbedderedges_embs = HadamardEmbedder(keyed_vectors=model.wv)# Look for embeddings on the fly - here we pass normal tuples
edges_embs[('1', '2')]
''' OUTPUT
array([ 5.75068220e-03, -1.10937878e-02,  3.76693785e-01,  2.69105062e-02,... ... ......................................................................],dtype=float32)
'''# Get all edges in a separate KeyedVectors instance - use with caution could be huge for big networks
edges_kv = edges_embs.as_keyed_vectors()# Look for most similar edges - this time tuples must be sorted and as str
edges_kv.most_similar(str(('1', '2')))# Save embeddings for later use
edges_kv.save_word2vec_format(EDGES_EMBEDDING_FILENAME)



图网络embeding transE及node2vec方法相关推荐

  1. 从数据结构到算法:图网络方法初探

    如果说 2019 年机器学习领域什么方向最火,那么必然有图神经网络的一席之地.其实早在很多年前,图神经网络就以图嵌入.图表示学习.网络嵌入等别名呈现出来,其实所有的这些方法本质上都是作用在图上的机器学 ...

  2. pagerank数据集_从数据结构到算法:图网络方法初探

    机器之心原创 作者:朱梓豪编辑:Qing Lin 如果说 2019 年机器学习领域什么方向最火,那么必然有图神经网络的一席之地.其实早在很多年前,图神经网络就以图嵌入.图表示学习.网络嵌入等别名呈现出 ...

  3. 特征图注意力_从数据结构到算法:图网络方法初探

    作者 | 朱梓豪 来源 | 机器之心 原文 | 从数据结构到算法:图网络方法初探 如果说 2019 年机器学习领域什么方向最火,那么必然有图神经网络的一席之地.其实早在很多年前,图神经网络就以图嵌入. ...

  4. 当图网络遇上计算机视觉!计算机视觉中基于图神经网络和图Transformer的方法和最新进展...

    点击下方卡片,关注"CVer"公众号 AI/CV重磅干货,第一时间送达 点击进入-> CV 微信技术交流群 可能是目前最全面的<当图网络遇上计算机视觉>综述!近四 ...

  5. 路由器连接显示多重网络连接服务器,电脑出现多重网络的原因及解决方法(图)...

    原标题:"电脑出现多重网络的原因及解决方法"相关电脑问题教程分享. - 来源:191路由网. 电脑出现多重网络要怎么办呢?我们在使用电脑上网时,难免都会遇到网络方面的问题.这不最近 ...

  6. 多重网络与计算机之间是感叹号,电脑出现多重网络的原因及解决方法(图)

    原标题:"电脑出现多重网络的原因及解决方法"相关电脑问题教程分享. - 来源:191路由网. 电脑出现多重网络要怎么办呢?我们在使用电脑上网时,难免都会遇到网络方面的问题.这不最近 ...

  7. 图谱实战 | 再谈图谱表示:图网络表示GE与知识图谱表示KGE的原理对比与实操效果分析...

    转载公众号 | 老刘说NLP 知识图谱嵌入是一个经典话题,在之前的文章<知识表示技术:图谱表示VS图网络表示及基于距离函数的表示学习总结>中,围绕知识图谱嵌入学习这一主题,对比了知识图谱嵌 ...

  8. 综述 | 生成对抗网络(GAN)在图网络中的应用

    导语: 生成对抗网络(Generative Adversarial Network,简称GAN)是非监督式学习的一种方法,通过让两个神经网络相互博弈的方式进行学习.自2014年GAN网络提出以来,其在 ...

  9. CANE:上下文相关动态图网络表示

    相关阅读:  DeepWalk:图网络与NLP的巧妙融合 Node2Vec:万物皆可Embedding LINE:不得不看的大规模信息网络嵌入 TADW:当DeepWalk加上外部文本信息 SDNE: ...

最新文章

  1. Druid:一个用于大数据实时处理的开源分布式系统——大数据实时查询和分析的高容错、高性能开源分布式系统...
  2. Spark streaming vs JStorm
  3. JMeter 阶梯式加压测试插件 Concurrency Thread Group
  4. VS中安装DevExpress后在Winform的工具箱中不显示控件
  5. Base64Util 用户名和密码Base64编码Java代码
  6. SciencePlots科研绘图
  7. 软件开发项目云端All-In-One体验
  8. python3x_Python3x 基本知识点
  9. MySQL Pool
  10. 【BZOJ1058】[ZJOI2007]报表统计 STL
  11. 高校云计算机中心建设方案,最新某大学云数据中心建设方案.pdf
  12. KKCapture视频录像软件下载
  13. PTA 7-1 输入名字,输出问候语
  14. ACID--事物具有的四个特征
  15. Ezchip Tilera Tile-Mx100: Der 100-ARM-Netzwerkprozessor
  16. python 爬虫下载网易歌单歌曲
  17. word文档编辑受限制怎么解除?
  18. 螳螂科技接入百度“文心一言”,提升营销管理AI服务能力
  19. 如何防止SP利用欠费进行套利
  20. k8s的service网络模型

热门文章

  1. 06 MySQL数据库--查询语句学习笔记
  2. HMM(隐马尔可夫)
  3. Android中InCallUI显示太慢问题分析
  4. 白盒测试方法-静态结构分析法
  5. 【three.js:语法】光源使用详解2-3(聚光灯 SpotLight、平行光 DirectionLight 、环境光 HemisphereLight、镜头光晕 LensFlare)
  6. python作业爬取xxx大学排行
  7. 计算机辅助设计绘图员技能鉴定试题(建筑类),计算机辅助设计高级绘图员技能鉴定试题...
  8. 《视觉锤》 读书笔记
  9. 黑客与画家 读书笔记
  10. jquery获取checkbox选中的值