"""RDF datasetsDatasets from "A Collection of Benchmark Datasets forSystematic Evaluations of Machine Learning onthe Semantic Web""""importabcimportitertoolsimportosimportrefromcollectionsimportOrderedDictimportnetworkxasnximportnumpyasnpimportdglimportdgl.backendasFfrom.dgl_datasetimportDGLBuiltinDatasetfrom.utilsimport(_get_dgl_url,generate_mask_tensor,idx2mask,load_graphs,load_info,save_graphs,save_info,)__all__=["AIFBDataset","MUTAGDataset","BGSDataset","AMDataset"]# Dictionary for renaming reserved node/edge type names to the ones# that are allowed by nn.Module.RENAME_DICT={"type":"rdftype","rev-type":"rev-rdftype",}classEntity:"""Class for entities Parameters ---------- id : str ID of this entity cls : str Type of this entity """def__init__(self,e_id,cls):self.id=e_idself.cls=clsdef__str__(self):return"{}/{}".format(self.cls,self.id)classRelation:"""Class for relations Parameters ---------- cls : str Type of this relation """def__init__(self,cls):self.cls=clsdef__str__(self):returnstr(self.cls)classRDFGraphDataset(DGLBuiltinDataset):"""Base graph dataset class from RDF tuples. To derive from this, implement the following abstract methods: * ``parse_entity`` * ``parse_relation`` * ``process_tuple`` * ``process_idx_file_line`` * ``predict_category`` Preprocessed graph and other data will be cached in the download folder to speedup data loading. The dataset should contain a "trainingSet.tsv" and a "testSet.tsv" file for training and testing samples. Attributes ---------- num_classes : int Number of classes to predict predict_category : str The entity category (node type) that has labels for prediction Parameters ---------- name : str Name of the dataset url : str or path URL to download the raw dataset. predict_category : str Predict category. print_every : int, optional Preprocessing log for every X tuples. insert_reverse : bool, optional If true, add reverse edge and reverse relations to the final graph. 
raw_dir : str Raw file directory to download/contains the input data directory. Default: ~/.dgl/ force_reload : bool, optional If true, force load and process from raw data. Ignore cached pre-processed data. verbose : bool Whether to print out progress information. Default: True. transform : callable, optional A transform that takes in a :class:`~dgl.DGLGraph` object and returns a transformed version. The :class:`~dgl.DGLGraph` object will be transformed before every access. """def__init__(self,name,url,predict_category,print_every=10000,insert_reverse=True,raw_dir=None,force_reload=False,verbose=True,transform=None,):self._insert_reverse=insert_reverseself._print_every=print_everyself._predict_category=predict_categorysuper(RDFGraphDataset,self).__init__(name,url,raw_dir=raw_dir,force_reload=force_reload,verbose=verbose,transform=transform,)defprocess(self):raw_tuples=self.load_raw_tuples(self.raw_path)self.process_raw_tuples(raw_tuples,self.raw_path)defload_raw_tuples(self,root_path):"""Loading raw RDF dataset Parameters ---------- root_path : str Root path containing the data Returns ------- Loaded rdf data """importrdflibasrdfraw_rdf_graphs=[]for_,filenameinenumerate(os.listdir(root_path)):fmt=Noneiffilename.endswith("nt"):fmt="nt"eliffilename.endswith("n3"):fmt="n3"iffmtisNone:continueg=rdf.Graph()print("Parsing file %s ..."%filename)g.parse(os.path.join(root_path,filename),format=fmt)raw_rdf_graphs.append(g)returnitertools.chain(*raw_rdf_graphs)defprocess_raw_tuples(self,raw_tuples,root_path):"""Processing raw RDF dataset Parameters ---------- raw_tuples: Raw rdf tuples root_path: str Root path containing the data """mg=nx.MultiDiGraph()ent_classes=OrderedDict()rel_classes=OrderedDict()entities=OrderedDict()src=[]dst=[]ntid=[]etid=[]sorted_tuples=[]fortinraw_tuples:sorted_tuples.append(t)sorted_tuples.sort()fori,(sbj,pred,obj)inenumerate(sorted_tuples):ifself.verboseandi%self._print_every==0:print("Processed %d tuples, found %d valid 
tuples."%(i,len(src)))sbjent=self.parse_entity(sbj)rel=self.parse_relation(pred)objent=self.parse_entity(obj)processed=self.process_tuple((sbj,pred,obj),sbjent,rel,objent)ifprocessedisNone:# ignoredcontinue# meta graphsbjclsid=_get_id(ent_classes,sbjent.cls)objclsid=_get_id(ent_classes,objent.cls)relclsid=_get_id(rel_classes,rel.cls)mg.add_edge(sbjent.cls,objent.cls,key=rel.cls)ifself._insert_reverse:mg.add_edge(objent.cls,sbjent.cls,key="rev-%s"%rel.cls)# instance graphsrc_id=_get_id(entities,str(sbjent))iflen(entities)>len(ntid):# found new entityntid.append(sbjclsid)dst_id=_get_id(entities,str(objent))iflen(entities)>len(ntid):# found new entityntid.append(objclsid)src.append(src_id)dst.append(dst_id)etid.append(relclsid)src=np.asarray(src)dst=np.asarray(dst)ntid=np.asarray(ntid)etid=np.asarray(etid)ntypes=list(ent_classes.keys())etypes=list(rel_classes.keys())# add reverse edge with reverse relationifself._insert_reverse:ifself.verbose:print("Adding reverse edges ...")newsrc=np.hstack([src,dst])newdst=np.hstack([dst,src])src=newsrcdst=newdstetid=np.hstack([etid,etid+len(etypes)])etypes.extend(["rev-%s"%tfortinetypes])hg=self.build_graph(mg,src,dst,ntid,etid,ntypes,etypes)ifself.verbose:print("Load training/validation/testing split ...")idmap=F.asnumpy(hg.nodes[self.predict_category].data[dgl.NID])glb2lcl={glbid:lclidforlclid,glbidinenumerate(idmap)}deffindidfn(ent):ifentnotinentities:returnNoneelse:returnglb2lcl[entities[ent]]self._hg=hgtrain_idx,test_idx,labels,num_classes=self.load_data_split(findidfn,root_path)train_mask=idx2mask(train_idx,self._hg.num_nodes(self.predict_category))test_mask=idx2mask(test_idx,self._hg.num_nodes(self.predict_category))labels=F.tensor(labels,F.data_type_dict["int64"])train_mask=generate_mask_tensor(train_mask)test_mask=generate_mask_tensor(test_mask)self._hg.nodes[self.predict_category].data["train_mask"]=train_maskself._hg.nodes[self.predict_category].data["test_mask"]=test_mask# TODO(minjie): Deprecate 'labels', use 'label' 
for consistency.self._hg.nodes[self.predict_category].data["labels"]=labelsself._hg.nodes[self.predict_category].data["label"]=labelsself._num_classes=num_classesdefbuild_graph(self,mg,src,dst,ntid,etid,ntypes,etypes):"""Build the graphs Parameters ---------- mg: MultiDiGraph Input graph src: Numpy array Source nodes dst: Numpy array Destination nodes ntid: Numpy array Node types for each node etid: Numpy array Edge types for each edge ntypes: list Node types etypes: list Edge types Returns ------- g: DGLGraph """# create homo graphifself.verbose:print("Creating one whole graph ...")g=dgl.graph((src,dst))g.ndata[dgl.NTYPE]=F.tensor(ntid)g.edata[dgl.ETYPE]=F.tensor(etid)ifself.verbose:print("Total #nodes:",g.num_nodes())print("Total #edges:",g.num_edges())# rename names such as 'type' so that they an be used as keys# to nn.ModuleDictetypes=[RENAME_DICT.get(ty,ty)fortyinetypes]mg_edges=mg.edges(keys=True)mg=nx.MultiDiGraph()forsty,dty,etyinmg_edges:mg.add_edge(sty,dty,key=RENAME_DICT.get(ety,ety))# convert to heterographifself.verbose:print("Convert to heterograph ...")hg=dgl.to_heterogeneous(g,ntypes,etypes,metagraph=mg)ifself.verbose:print("#Node types:",len(hg.ntypes))print("#Canonical edge types:",len(hg.etypes))print("#Unique edge type names:",len(set(hg.etypes)))returnhgdefload_data_split(self,ent2id,root_path):"""Load data split Parameters ---------- ent2id: func A function mapping entity to id root_path: str Root path containing the data Return ------ train_idx: Numpy array Training set test_idx: Numpy array Testing set labels: Numpy array Labels num_classes: int Number of classes 
"""label_dict={}labels=np.zeros((self._hg.num_nodes(self.predict_category),))-1train_idx=self.parse_idx_file(os.path.join(root_path,"trainingSet.tsv"),ent2id,label_dict,labels,)test_idx=self.parse_idx_file(os.path.join(root_path,"testSet.tsv"),ent2id,label_dict,labels)train_idx=np.array(train_idx)test_idx=np.array(test_idx)labels=np.array(labels)num_classes=len(label_dict)returntrain_idx,test_idx,labels,num_classesdefparse_idx_file(self,filename,ent2id,label_dict,labels):"""Parse idx files Parameters ---------- filename: str File to parse ent2id: func A function mapping entity to id label_dict: dict Map label to label id labels: dict Map entity id to label id Return ------ idx: list Entity idss """idx=[]withopen(filename,"r")asf:fori,lineinenumerate(f):ifi==0:continue# first line is the headersample,label=self.process_idx_file_line(line)# person, _, label = line.strip().split('\t')ent=self.parse_entity(sample)entid=ent2id(str(ent))ifentidisNone:print('Warning: entity "%s" does not have any valid links associated. 
Ignored.'%str(ent))else:idx.append(entid)lblid=_get_id(label_dict,label)labels[entid]=lblidreturnidxdefhas_cache(self):"""check if there is a processed data"""graph_path=os.path.join(self.save_path,self.save_name+".bin")info_path=os.path.join(self.save_path,self.save_name+".pkl")ifos.path.exists(graph_path)andos.path.exists(info_path):returnTruereturnFalsedefsave(self):"""save the graph list and the labels"""graph_path=os.path.join(self.save_path,self.save_name+".bin")info_path=os.path.join(self.save_path,self.save_name+".pkl")save_graphs(str(graph_path),self._hg)save_info(str(info_path),{"num_classes":self.num_classes,"predict_category":self.predict_category,},)defload(self):"""load the graph list and the labels from disk"""graph_path=os.path.join(self.save_path,self.save_name+".bin")info_path=os.path.join(self.save_path,self.save_name+".pkl")graphs,_=load_graphs(str(graph_path))info=load_info(str(info_path))self._num_classes=info["num_classes"]self._predict_category=info["predict_category"]self._hg=graphs[0]# For backward compatibilityif"label"notinself._hg.nodes[self.predict_category].data:self._hg.nodes[self.predict_category].data["label"]=self._hg.nodes[self.predict_category].data["labels"]def__getitem__(self,idx):r"""Gets the graph object"""g=self._hgifself._transformisnotNone:g=self._transform(g)returngdef__len__(self):r"""The number of graphs in the dataset."""return1@propertydefsave_name(self):returnself.name+"_dgl_graph"@propertydefpredict_category(self):returnself._predict_category@propertydefnum_classes(self):returnself._num_classes@abc.abstractmethoddefparse_entity(self,term):"""Parse one entity from an RDF term. Return None if the term does not represent a valid entity and the whole tuple should be ignored. Parameters ---------- term : rdflib.term.Identifier RDF term Returns ------- Entity or None An entity. """pass@abc.abstractmethoddefparse_relation(self,term):"""Parse one relation from an RDF term. 
Return None if the term does not represent a valid relation and the whole tuple should be ignored. Parameters ---------- term : rdflib.term.Identifier RDF term Returns ------- Relation or None A relation """pass@abc.abstractmethoddefprocess_tuple(self,raw_tuple,sbj,rel,obj):"""Process the tuple. Return (Entity, Relation, Entity) tuple for as the final tuple. Return None if the tuple should be ignored. Parameters ---------- raw_tuple : tuple of rdflib.term.Identifier (subject, predicate, object) tuple sbj : Entity Subject entity rel : Relation Relation obj : Entity Object entity Returns ------- (Entity, Relation, Entity) The final tuple or None if should be ignored """pass@abc.abstractmethoddefprocess_idx_file_line(self,line):"""Process one line of ``trainingSet.tsv`` or ``testSet.tsv``. Parameters ---------- line : str One line of the file Returns ------- (str, str) One sample and its label """passdef_get_id(dict,key):id=dict.get(key,None)ifidisNone:id=len(dict)dict[key]=idreturnid
class AIFBDataset(RDFGraphDataset):
    r"""AIFB dataset for node classification task

    AIFB DataSet is a Semantic Web (RDF) dataset used as a benchmark in
    data mining.  It records the organizational structure of AIFB at the
    University of Karlsruhe.

    AIFB dataset statistics:

    - Nodes: 7262
    - Edges: 48810 (including reverse edges)
    - Target Category: Personen
    - Number of Classes: 4
    - Label Split:

        - Train: 140
        - Test: 36

    Parameters
    ----------
    print_every : int
        Preprocessing log for every X tuples. Default: 10000.
    insert_reverse : bool
        If true, add reverse edge and reverse relations to the final graph.
        Default: True.
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes to predict
    predict_category : str
        The entity category (node type) that has labels for prediction

    Examples
    --------
    >>> dataset = dgl.data.rdf.AIFBDataset()
    >>> graph = dataset[0]
    >>> category = dataset.predict_category
    >>> num_classes = dataset.num_classes
    >>>
    >>> train_mask = graph.nodes[category].data['train_mask']
    >>> test_mask = graph.nodes[category].data['test_mask']
    >>> label = graph.nodes[category].data['label']
    """

    entity_prefix = "http://www.aifb.uni-karlsruhe.de/"
    relation_prefix = "http://swrc.ontoware.org/"

    def __init__(
        self,
        print_every=10000,
        insert_reverse=True,
        raw_dir=None,
        force_reload=False,
        verbose=True,
        transform=None,
    ):
        import rdflib as rdf

        # Relations used when filtering/processing raw AIFB tuples.
        self.employs = rdf.term.URIRef(
            "http://swrc.ontoware.org/ontology#employs"
        )
        self.affiliation = rdf.term.URIRef(
            "http://swrc.ontoware.org/ontology#affiliation"
        )
        url = _get_dgl_url("dataset/rdf/aifb-hetero.zip")
        name = "aifb-hetero"
        predict_category = "Personen"
        super(AIFBDataset, self).__init__(
            name,
            url,
            predict_category,
            print_every=print_every,
            insert_reverse=insert_reverse,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def __getitem__(self, idx):
        r"""Gets the graph object

        Parameters
        ----------
        idx: int
            Item index, AIFBDataset has only one graph object

        Return
        ------
        :class:`dgl.DGLGraph`

            The graph contains:

            - ``ndata['train_mask']``: mask for training node set
            - ``ndata['test_mask']``: mask for testing node set
            - ``ndata['label']``: node labels
        """
        return super(AIFBDataset, self).__getitem__(idx)

    def __len__(self):
        r"""The number of graphs in the dataset.

        Return
        ------
        int
        """
        return super(AIFBDataset, self).__len__()
class MUTAGDataset(RDFGraphDataset):
    r"""MUTAG dataset for node classification task

    Mutag dataset statistics:

    - Nodes: 27163
    - Edges: 148100 (including reverse edges)
    - Target Category: d
    - Number of Classes: 2
    - Label Split:

        - Train: 272
        - Test: 68

    Parameters
    ----------
    print_every : int
        Preprocessing log for every X tuples. Default: 10000.
    insert_reverse : bool
        If true, add reverse edge and reverse relations to the final graph.
        Default: True.
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes to predict
    predict_category : str
        The entity category (node type) that has labels for prediction
    graph : :class:`dgl.DGLGraph`
        Graph structure

    Examples
    --------
    >>> dataset = dgl.data.rdf.MUTAGDataset()
    >>> graph = dataset[0]
    >>> category = dataset.predict_category
    >>> num_classes = dataset.num_classes
    >>>
    >>> train_mask = graph.nodes[category].data['train_mask']
    >>> test_mask = graph.nodes[category].data['test_mask']
    >>> label = graph.nodes[category].data['label']
    """

    # Patterns identifying compound ('d<N>') and bond ('bond<N>') entities.
    d_entity = re.compile("d[0-9]")
    bond_entity = re.compile("bond[0-9]")

    entity_prefix = "http://dl-learner.org/carcinogenesis#"
    relation_prefix = entity_prefix

    def __init__(
        self,
        print_every=10000,
        insert_reverse=True,
        raw_dir=None,
        force_reload=False,
        verbose=True,
        transform=None,
    ):
        import rdflib as rdf

        # RDF terms consulted while processing raw MUTAG tuples.
        self.is_mutagenic = rdf.term.URIRef(
            "http://dl-learner.org/carcinogenesis#isMutagenic"
        )
        self.rdf_type = rdf.term.URIRef(
            "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
        )
        self.rdf_subclassof = rdf.term.URIRef(
            "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        )
        self.rdf_domain = rdf.term.URIRef(
            "http://www.w3.org/2000/01/rdf-schema#domain"
        )

        url = _get_dgl_url("dataset/rdf/mutag-hetero.zip")
        name = "mutag-hetero"
        predict_category = "d"
        super(MUTAGDataset, self).__init__(
            name,
            url,
            predict_category,
            print_every=print_every,
            insert_reverse=insert_reverse,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def __getitem__(self, idx):
        r"""Gets the graph object

        Parameters
        ----------
        idx: int
            Item index, MUTAGDataset has only one graph object

        Return
        ------
        :class:`dgl.DGLGraph`

            The graph contains:

            - ``ndata['train_mask']``: mask for training node set
            - ``ndata['test_mask']``: mask for testing node set
            - ``ndata['label']``: node labels
        """
        return super(MUTAGDataset, self).__getitem__(idx)

    def __len__(self):
        r"""The number of graphs in the dataset.

        Return
        ------
        int
        """
        return super(MUTAGDataset, self).__len__()
class BGSDataset(RDFGraphDataset):
    r"""BGS dataset for node classification task

    BGS namespace convention:
    ``http://data.bgs.ac.uk/(ref|id)/<Major Concept>/<Sub Concept>/INSTANCE``.
    We ignored all literal nodes and the relations connecting them in the
    output graph. We also ignored the relation used to mark whether a
    term is CURRENT or DEPRECATED.

    BGS dataset statistics:

    - Nodes: 94806
    - Edges: 672884 (including reverse edges)
    - Target Category: Lexicon/NamedRockUnit
    - Number of Classes: 2
    - Label Split:

        - Train: 117
        - Test: 29

    Parameters
    ----------
    print_every : int
        Preprocessing log for every X tuples. Default: 10000.
    insert_reverse : bool
        If true, add reverse edge and reverse relations to the final graph.
        Default: True.
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes to predict
    predict_category : str
        The entity category (node type) that has labels for prediction

    Examples
    --------
    >>> dataset = dgl.data.rdf.BGSDataset()
    >>> graph = dataset[0]
    >>> category = dataset.predict_category
    >>> num_classes = dataset.num_classes
    >>>
    >>> train_mask = graph.nodes[category].data['train_mask']
    >>> test_mask = graph.nodes[category].data['test_mask']
    >>> label = graph.nodes[category].data['label']
    """

    entity_prefix = "http://data.bgs.ac.uk/"
    # Relation marking CURRENT/DEPRECATED status; ignored during processing.
    status_prefix = "http://data.bgs.ac.uk/ref/CurrentStatus"
    relation_prefix = "http://data.bgs.ac.uk/ref"

    def __init__(
        self,
        print_every=10000,
        insert_reverse=True,
        raw_dir=None,
        force_reload=False,
        verbose=True,
        transform=None,
    ):
        import rdflib as rdf

        url = _get_dgl_url("dataset/rdf/bgs-hetero.zip")
        name = "bgs-hetero"
        predict_category = "Lexicon/NamedRockUnit"
        self.lith = rdf.term.URIRef(
            "http://data.bgs.ac.uk/ref/Lexicon/hasLithogenesis"
        )
        super(BGSDataset, self).__init__(
            name,
            url,
            predict_category,
            print_every=print_every,
            insert_reverse=insert_reverse,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def __getitem__(self, idx):
        r"""Gets the graph object

        Parameters
        ----------
        idx: int
            Item index, BGSDataset has only one graph object

        Return
        ------
        :class:`dgl.DGLGraph`

            The graph contains:

            - ``ndata['train_mask']``: mask for training node set
            - ``ndata['test_mask']``: mask for testing node set
            - ``ndata['label']``: node labels
        """
        return super(BGSDataset, self).__getitem__(idx)

    def __len__(self):
        r"""The number of graphs in the dataset.

        Return
        ------
        int
        """
        return super(BGSDataset, self).__len__()
class AMDataset(RDFGraphDataset):
    """AM dataset. for node classification task

    Namespace convention:

    - Instance: ``http://purl.org/collections/nl/am/<type>-<id>``
    - Relation: ``http://purl.org/collections/nl/am/<name>``

    We ignored all literal nodes and the relations connecting them in the
    output graph.

    AM dataset statistics:

    - Nodes: 881680
    - Edges: 5668682 (including reverse edges)
    - Target Category: proxy
    - Number of Classes: 11
    - Label Split:

        - Train: 802
        - Test: 198

    Parameters
    ----------
    print_every : int
        Preprocessing log for every X tuples. Default: 10000.
    insert_reverse : bool
        If true, add reverse edge and reverse relations to the final graph.
        Default: True.
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Attributes
    ----------
    num_classes : int
        Number of classes to predict
    predict_category : str
        The entity category (node type) that has labels for prediction

    Examples
    --------
    >>> dataset = dgl.data.rdf.AMDataset()
    >>> graph = dataset[0]
    >>> category = dataset.predict_category
    >>> num_classes = dataset.num_classes
    >>>
    >>> train_mask = graph.nodes[category].data['train_mask']
    >>> test_mask = graph.nodes[category].data['test_mask']
    >>> label = graph.nodes[category].data['label']
    """

    entity_prefix = "http://purl.org/collections/nl/am/"
    relation_prefix = entity_prefix

    def __init__(
        self,
        print_every=10000,
        insert_reverse=True,
        raw_dir=None,
        force_reload=False,
        verbose=True,
        transform=None,
    ):
        import rdflib as rdf

        # RDF terms consulted while processing raw AM tuples.
        self.objectCategory = rdf.term.URIRef(
            "http://purl.org/collections/nl/am/objectCategory"
        )
        self.material = rdf.term.URIRef(
            "http://purl.org/collections/nl/am/material"
        )
        url = _get_dgl_url("dataset/rdf/am-hetero.zip")
        name = "am-hetero"
        predict_category = "proxy"
        super(AMDataset, self).__init__(
            name,
            url,
            predict_category,
            print_every=print_every,
            insert_reverse=insert_reverse,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def __getitem__(self, idx):
        r"""Gets the graph object

        Parameters
        ----------
        idx: int
            Item index, AMDataset has only one graph object

        Return
        ------
        :class:`dgl.DGLGraph`

            The graph contains:

            - ``ndata['train_mask']``: mask for training node set
            - ``ndata['test_mask']``: mask for testing node set
            - ``ndata['label']``: node labels
        """
        return super(AMDataset, self).__getitem__(idx)

    def __len__(self):
        r"""The number of graphs in the dataset.

        Return
        ------
        int
        """
        return super(AMDataset, self).__len__()