"""Cora, citeseer, pubmed dataset.(lingfan): following dataset loading and preprocessing code from tkipf/gcnhttps://github.com/tkipf/gcn/blob/master/gcn/utils.py"""from__future__importabsolute_importimportos,sysimportpickleaspklimportwarningsimportnetworkxasnximportnumpyasnpimportscipy.sparseasspfrom..importbackendasF,convertfrom..batchimportbatchasbatch_graphsfrom..convertimportfrom_networkx,graphasdgl_graph,to_networkxfrom..transformsimportreorder_graphfrom.dgl_datasetimportDGLBuiltinDatasetfrom.utilsimport(_get_dgl_url,deprecate_function,deprecate_property,generate_mask_tensor,load_graphs,load_info,makedirs,save_graphs,save_info,)backend=os.environ.get("DGLBACKEND","pytorch")def_pickle_load(pkl_file):withwarnings.catch_warnings():warnings.simplefilter("ignore",category=DeprecationWarning)ifsys.version_info>(3,0):returnpkl.load(pkl_file,encoding="latin1")else:returnpkl.load(pkl_file)classCitationGraphDataset(DGLBuiltinDataset):r"""The citation graph dataset, including cora, citeseer and pubmeb. Nodes mean authors and edges mean citation relationships. Parameters ----------- name: str name can be 'cora', 'citeseer' or 'pubmed'. raw_dir : str Raw file directory to download/contains the input data directory. Default: ~/.dgl/ force_reload : bool Whether to reload the dataset. Default: False verbose : bool Whether to print out progress information. Default: True. reverse_edge : bool Whether to add reverse edges in graph. Default: True. transform : callable, optional A transform that takes in a :class:`~dgl.DGLGraph` object and returns a transformed version. The :class:`~dgl.DGLGraph` object will be transformed before every access. reorder : bool Whether to reorder the graph using :func:`~dgl.reorder_graph`. Default: False. 
"""_urls={"cora_v2":"dataset/cora_v2.zip","citeseer":"dataset/citeseer.zip","pubmed":"dataset/pubmed.zip",}def__init__(self,name,raw_dir=None,force_reload=False,verbose=True,reverse_edge=True,transform=None,reorder=False,):assertname.lower()in["cora","citeseer","pubmed"]# Previously we use the pre-processing in pygcn (https://github.com/tkipf/pygcn)# for Cora, which is slightly different from the one used in the GCN paperifname.lower()=="cora":name="cora_v2"url=_get_dgl_url(self._urls[name])self._reverse_edge=reverse_edgeself._reorder=reordersuper(CitationGraphDataset,self).__init__(name,url=url,raw_dir=raw_dir,force_reload=force_reload,verbose=verbose,transform=transform,)defprocess(self):"""Loads input data from data directory and reorder graph for better locality ind.name.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object; ind.name.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object; ind.name.allx => the feature vectors of both labeled and unlabeled training instances (a superset of ind.name.x) as scipy.sparse.csr.csr_matrix object; ind.name.y => the one-hot labels of the labeled training instances as numpy.ndarray object; ind.name.ty => the one-hot labels of the test instances as numpy.ndarray object; ind.name.ally => the labels for instances in ind.name.allx as numpy.ndarray object; ind.name.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict object; ind.name.test.index => the indices of test instances in graph, for the inductive setting as list object. 
"""root=self.raw_pathobjnames=["x","y","tx","ty","allx","ally","graph"]objects=[]foriinrange(len(objnames)):withopen("{}/ind.{}.{}".format(root,self.name,objnames[i]),"rb")asf:objects.append(_pickle_load(f))x,y,tx,ty,allx,ally,graph=tuple(objects)test_idx_reorder=_parse_index_file("{}/ind.{}.test.index".format(root,self.name))test_idx_range=np.sort(test_idx_reorder)ifself.name=="citeseer":# Fix citeseer dataset (there are some isolated nodes in the graph)# Find isolated nodes, add them as zero-vecs into the right positiontest_idx_range_full=range(min(test_idx_reorder),max(test_idx_reorder)+1)tx_extended=sp.lil_matrix((len(test_idx_range_full),x.shape[1]))tx_extended[test_idx_range-min(test_idx_range),:]=txtx=tx_extendedty_extended=np.zeros((len(test_idx_range_full),y.shape[1]))ty_extended[test_idx_range-min(test_idx_range),:]=tyty=ty_extendedfeatures=sp.vstack((allx,tx)).tolil()features[test_idx_reorder,:]=features[test_idx_range,:]ifself.reverse_edge:graph=nx.DiGraph(nx.from_dict_of_lists(graph))g=from_networkx(graph)else:graph=nx.Graph(nx.from_dict_of_lists(graph))edges=list(graph.edges())u,v=map(list,zip(*edges))g=dgl_graph((u,v))onehot_labels=np.vstack((ally,ty))onehot_labels[test_idx_reorder,:]=onehot_labels[test_idx_range,:]labels=np.argmax(onehot_labels,1)idx_test=test_idx_range.tolist()idx_train=range(len(y))idx_val=range(len(y),len(y)+500)train_mask=generate_mask_tensor(_sample_mask(idx_train,labels.shape[0]))val_mask=generate_mask_tensor(_sample_mask(idx_val,labels.shape[0]))test_mask=generate_mask_tensor(_sample_mask(idx_test,labels.shape[0]))g.ndata["train_mask"]=train_maskg.ndata["val_mask"]=val_maskg.ndata["test_mask"]=test_maskg.ndata["label"]=F.tensor(labels)g.ndata["feat"]=F.tensor(_preprocess_features(features),dtype=F.data_type_dict["float32"])self._num_classes=onehot_labels.shape[1]self._labels=labelsifself._reorder:self._g=reorder_graph(g,node_permute_algo="rcmk",edge_permute_algo="dst",store_ids=False,)else:self._g=gifself.verbose:print("Finish
ed data loading and preprocessing.")print(" NumNodes: {}".format(self._g.num_nodes()))print(" NumEdges: {}".format(self._g.num_edges()))print(" NumFeats: {}".format(self._g.ndata["feat"].shape[1]))print(" NumClasses: {}".format(self.num_classes))print(" NumTrainingSamples: {}".format(F.nonzero_1d(self._g.ndata["train_mask"]).shape[0]))print(" NumValidationSamples: {}".format(F.nonzero_1d(self._g.ndata["val_mask"]).shape[0]))print(" NumTestSamples: {}".format(F.nonzero_1d(self._g.ndata["test_mask"]).shape[0]))@propertydefgraph_path(self):returnos.path.join(self.save_path,self.save_name+".bin")@propertydefinfo_path(self):returnos.path.join(self.save_path,self.save_name+".pkl")defhas_cache(self):ifos.path.exists(self.graph_path)andos.path.exists(self.info_path):returnTruereturnFalsedefsave(self):"""save the graph list and the labels"""save_graphs(str(self.graph_path),self._g)save_info(str(self.info_path),{"num_classes":self.num_classes})defload(self):graphs,_=load_graphs(str(self.graph_path))info=load_info(str(self.info_path))graph=graphs[0]self._g=graph# for compatabilitygraph=graph.clone()graph.ndata.pop("train_mask")graph.ndata.pop("val_mask")graph.ndata.pop("test_mask")graph.ndata.pop("feat")graph.ndata.pop("label")graph=to_networkx(graph)self._num_classes=info["num_classes"]self._g.ndata["train_mask"]=generate_mask_tensor(F.asnumpy(self._g.ndata["train_mask"]))self._g.ndata["val_mask"]=generate_mask_tensor(F.asnumpy(self._g.ndata["val_mask"]))self._g.ndata["test_mask"]=generate_mask_tensor(F.asnumpy(self._g.ndata["test_mask"]))# hack for mxnet compatabilityifself.verbose:print(" NumNodes: {}".format(self._g.num_nodes()))print(" NumEdges: {}".format(self._g.num_edges()))print(" NumFeats: {}".format(self._g.ndata["feat"].shape[1]))print(" NumClasses: {}".format(self.num_classes))print(" NumTrainingSamples: {}".format(F.nonzero_1d(self._g.ndata["train_mask"]).shape[0]))print(" NumValidationSamples: {}".format(F.nonzero_1d(self._g.ndata["val_mask"]).shape[0]))print(" 
NumTestSamples: {}".format(F.nonzero_1d(self._g.ndata["test_mask"]).shape[0]))def__getitem__(self,idx):assertidx==0,"This dataset has only one graph"ifself._transformisNone:returnself._gelse:returnself._transform(self._g)def__len__(self):return1@propertydefsave_name(self):returnself.name+"_dgl_graph"@propertydefnum_labels(self):deprecate_property("dataset.num_labels","dataset.num_classes")returnself.num_classes@propertydefnum_classes(self):returnself._num_classes""" Citation graph is used in many examples We preserve these properties for compatability. """@propertydefreverse_edge(self):returnself._reverse_edgedef_preprocess_features(features):"""Row-normalize feature matrix and convert to tuple representation"""features=_normalize(features)returnnp.asarray(features.todense())def_parse_index_file(filename):"""Parse index file."""index=[]forlineinopen(filename):index.append(int(line.strip()))returnindexdef_sample_mask(idx,l):"""Create mask."""mask=np.zeros(l)mask[idx]=1returnmask
class CoraGraphDataset(CitationGraphDataset):
    r"""Cora citation network dataset.

    Every node is a paper and every edge is a citation relationship. Each
    node carries a predefined 1433-dimensional feature. The dataset targets
    node classification: predict the category of a given paper.

    Statistics:

    - Nodes: 2708
    - Edges: 10556
    - Number of Classes: 7
    - Label split:

        - Train: 140
        - Valid: 500
        - Test: 1000

    Parameters
    ----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    reverse_edge : bool
        Whether to add reverse edges in graph. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.
    reorder : bool
        Whether to reorder the graph using :func:`~dgl.reorder_graph`.
        Default: False.

    Attributes
    ----------
    num_classes: int
        Number of label classes

    Notes
    -----
    The node feature is row-normalized.

    Examples
    --------
    >>> dataset = CoraGraphDataset()
    >>> g = dataset[0]
    >>> num_class = dataset.num_classes
    >>>
    >>> # get node feature
    >>> feat = g.ndata['feat']
    >>>
    >>> # get data split
    >>> train_mask = g.ndata['train_mask']
    >>> val_mask = g.ndata['val_mask']
    >>> test_mask = g.ndata['test_mask']
    >>>
    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self,
        raw_dir=None,
        force_reload=False,
        verbose=True,
        reverse_edge=True,
        transform=None,
        reorder=False,
    ):
        # Forward everything to the generic citation loader, fixing the
        # dataset name to "cora".
        super(CoraGraphDataset, self).__init__(
            name="cora",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            reverse_edge=reverse_edge,
            transform=transform,
            reorder=reorder,
        )

    def __getitem__(self, idx):
        r"""Gets the graph object

        Parameters
        -----------
        idx: int
            Item index, CoraGraphDataset has only one graph object

        Return
        ------
        :class:`dgl.DGLGraph`

            graph structure, node features and labels.

            - ``ndata['train_mask']``: mask for training node set
            - ``ndata['val_mask']``: mask for validation node set
            - ``ndata['test_mask']``: mask for test node set
            - ``ndata['feat']``: node feature
            - ``ndata['label']``: ground truth labels
        """
        # Pure delegation; the override exists to carry the docstring.
        parent = super(CoraGraphDataset, self)
        return parent.__getitem__(idx)

    def __len__(self):
        r"""The number of graphs in the dataset."""
        parent = super(CoraGraphDataset, self)
        return parent.__len__()
class CiteseerGraphDataset(CitationGraphDataset):
    r"""Citeseer citation network dataset.

    Every node is a scientific publication and every edge is a citation
    relationship. Each node carries a predefined 3703-dimensional feature.
    The dataset targets node classification: predict the category of a given
    publication.

    Statistics:

    - Nodes: 3327
    - Edges: 9228
    - Number of Classes: 6
    - Label Split:

        - Train: 120
        - Valid: 500
        - Test: 1000

    Parameters
    -----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    reverse_edge : bool
        Whether to add reverse edges in graph. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.
    reorder : bool
        Whether to reorder the graph using :func:`~dgl.reorder_graph`.
        Default: False.

    Attributes
    ----------
    num_classes: int
        Number of label classes

    Notes
    -----
    The node feature is row-normalized.

    In citeseer dataset, there are some isolated nodes in the graph.
    These isolated nodes are added as zero-vecs into the right position.

    Examples
    --------
    >>> dataset = CiteseerGraphDataset()
    >>> g = dataset[0]
    >>> num_class = dataset.num_classes
    >>>
    >>> # get node feature
    >>> feat = g.ndata['feat']
    >>>
    >>> # get data split
    >>> train_mask = g.ndata['train_mask']
    >>> val_mask = g.ndata['val_mask']
    >>> test_mask = g.ndata['test_mask']
    >>>
    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self,
        raw_dir=None,
        force_reload=False,
        verbose=True,
        reverse_edge=True,
        transform=None,
        reorder=False,
    ):
        # Forward everything to the generic citation loader, fixing the
        # dataset name to "citeseer".
        super(CiteseerGraphDataset, self).__init__(
            name="citeseer",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            reverse_edge=reverse_edge,
            transform=transform,
            reorder=reorder,
        )

    def __getitem__(self, idx):
        r"""Gets the graph object

        Parameters
        -----------
        idx: int
            Item index, CiteseerGraphDataset has only one graph object

        Return
        ------
        :class:`dgl.DGLGraph`

            graph structure, node features and labels.

            - ``ndata['train_mask']``: mask for training node set
            - ``ndata['val_mask']``: mask for validation node set
            - ``ndata['test_mask']``: mask for test node set
            - ``ndata['feat']``: node feature
            - ``ndata['label']``: ground truth labels
        """
        # Pure delegation; the override exists to carry the docstring.
        parent = super(CiteseerGraphDataset, self)
        return parent.__getitem__(idx)

    def __len__(self):
        r"""The number of graphs in the dataset."""
        parent = super(CiteseerGraphDataset, self)
        return parent.__len__()
class PubmedGraphDataset(CitationGraphDataset):
    r"""Pubmed citation network dataset.

    Nodes mean scientific publications and edges mean citation
    relationships. Each node has a predefined feature with 500 dimensions.
    The dataset is designed for the node classification task. The task is
    to predict the category of certain publication.

    Statistics:

    - Nodes: 19717
    - Edges: 88651
    - Number of Classes: 3
    - Label Split:

        - Train: 60
        - Valid: 500
        - Test: 1000

    Parameters
    -----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    reverse_edge : bool
        Whether to add reverse edges in graph. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.
    reorder : bool
        Whether to reorder the graph using :func:`~dgl.reorder_graph`.
        Default: False.

    Attributes
    ----------
    num_classes: int
        Number of label classes

    Notes
    -----
    The node feature is row-normalized.

    Examples
    --------
    >>> dataset = PubmedGraphDataset()
    >>> g = dataset[0]
    >>> num_class = dataset.num_classes
    >>>
    >>> # get node feature
    >>> feat = g.ndata['feat']
    >>>
    >>> # get data split
    >>> train_mask = g.ndata['train_mask']
    >>> val_mask = g.ndata['val_mask']
    >>> test_mask = g.ndata['test_mask']
    >>>
    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self,
        raw_dir=None,
        force_reload=False,
        verbose=True,
        reverse_edge=True,
        transform=None,
        reorder=False,
    ):
        name = "pubmed"

        super(PubmedGraphDataset, self).__init__(
            name,
            raw_dir,
            force_reload,
            verbose,
            reverse_edge,
            transform,
            reorder,
        )

    def __getitem__(self, idx):
        r"""Gets the graph object

        Parameters
        -----------
        idx: int
            Item index, PubmedGraphDataset has only one graph object

        Return
        ------
        :class:`dgl.DGLGraph`

            graph structure, node features and labels.

            - ``ndata['train_mask']``: mask for training node set
            - ``ndata['val_mask']``: mask for validation node set
            - ``ndata['test_mask']``: mask for test node set
            - ``ndata['feat']``: node feature
            - ``ndata['label']``: ground truth labels
        """
        # Pure delegation; the override exists to carry the docstring.
        return super(PubmedGraphDataset, self).__getitem__(idx)

    def __len__(self):
        r"""The number of graphs in the dataset."""
        return super(PubmedGraphDataset, self).__len__()
def load_cora(
    raw_dir=None,
    force_reload=False,
    verbose=True,
    reverse_edge=True,
    transform=None,
):
    """Get CoraGraphDataset

    Parameters
    -----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    reverse_edge : bool
        Whether to add reverse edges in graph. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Return
    -------
    CoraGraphDataset
    """
    data = CoraGraphDataset(
        raw_dir, force_reload, verbose, reverse_edge, transform
    )
    return data


def load_citeseer(
    raw_dir=None,
    force_reload=False,
    verbose=True,
    reverse_edge=True,
    transform=None,
):
    """Get CiteseerGraphDataset

    Parameters
    -----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    reverse_edge : bool
        Whether to add reverse edges in graph. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Return
    -------
    CiteseerGraphDataset
    """
    data = CiteseerGraphDataset(
        raw_dir, force_reload, verbose, reverse_edge, transform
    )
    return data


def load_pubmed(
    raw_dir=None,
    force_reload=False,
    verbose=True,
    reverse_edge=True,
    transform=None,
):
    """Get PubmedGraphDataset

    Parameters
    -----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose : bool
        Whether to print out progress information. Default: True.
    reverse_edge : bool
        Whether to add reverse edges in graph. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Return
    -------
    PubmedGraphDataset
    """
    data = PubmedGraphDataset(
        raw_dir, force_reload, verbose, reverse_edge, transform
    )
    return data


class CoraBinary(DGLBuiltinDataset):
    """A mini-dataset for binary classification task using Cora.

    After loaded, it has following members:

    graphs : list of :class:`~dgl.DGLGraph`
    pmpds : list of :class:`scipy.sparse.coo_matrix`
    labels : list of :class:`numpy.ndarray`

    Parameters
    -----------
    raw_dir : str
        Raw file directory to download/contains the input data directory.
        Default: ~/.dgl/
    force_reload : bool
        Whether to reload the dataset. Default: False
    verbose: bool
        Whether to print out progress information. Default: True.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        name = "cora_binary"
        url = _get_dgl_url("dataset/cora_binary.zip")
        super(CoraBinary, self).__init__(
            name,
            url=url,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def process(self):
        """Parse ``graphs.txt``, ``pmpds.pkl`` and ``labels.txt`` from the raw directory.

        In both text files a line starting with ``graph`` closes the current
        record; every other line contributes an edge (``"u v"``) or an
        integer label to the record being built.
        """
        root = self.raw_path

        # load graphs
        self.graphs = []
        with open("{}/graphs.txt".format(root), "r") as f:
            elist = []
            for line in f:
                if line.startswith("graph"):
                    if elist:
                        self.graphs.append(dgl_graph(tuple(zip(*elist))))
                        elist = []
                else:
                    u, v = line.strip().split(" ")
                    elist.append((int(u), int(v)))
            # flush the last record, which has no trailing "graph" line
            if elist:
                self.graphs.append(dgl_graph(tuple(zip(*elist))))

        with open("{}/pmpds.pkl".format(root), "rb") as f:
            self.pmpds = _pickle_load(f)

        self.labels = []
        with open("{}/labels.txt".format(root), "r") as f:
            cur = []
            for line in f:
                if line.startswith("graph"):
                    if cur:
                        self.labels.append(np.asarray(cur))
                        cur = []
                else:
                    cur.append(int(line.strip()))
            if cur:
                self.labels.append(np.asarray(cur))

        # sanity check
        assert len(self.graphs) == len(self.pmpds)
        assert len(self.graphs) == len(self.labels)

    @property
    def graph_path(self):
        """Path of the cached graph file (``<save_name>.bin``)."""
        return os.path.join(self.save_path, self.save_name + ".bin")

    def has_cache(self):
        """Return True when the cached graph file exists."""
        if os.path.exists(self.graph_path):
            return True

        return False

    def save(self):
        """save the graph list and the labels"""
        # Labels are stored next to the graphs, keyed by the graph's
        # position rendered as a string.
        labels = {
            "{}".format(i): F.tensor(label)
            for i, label in enumerate(self.labels)
        }
        save_graphs(str(self.graph_path), self.graphs, labels)
        if self.verbose:
            print("Done saving data into cached files.")

    def load(self):
        """Restore graphs and labels from the cache and pmpds from the raw directory."""
        self.graphs, labels = load_graphs(str(self.graph_path))

        # Rebuild the list in positional order from the string-keyed dict.
        self.labels = [
            F.asnumpy(labels["{}".format(i)]) for i in range(len(labels))
        ]

        # load pmpds under self.raw_path
        with open("{}/pmpds.pkl".format(self.raw_path), "rb") as f:
            self.pmpds = _pickle_load(f)
        if self.verbose:
            print("Done loading data from cached files.")

        # sanity check
        assert len(self.graphs) == len(self.pmpds)
        assert len(self.graphs) == len(self.labels)

    def __len__(self):
        """The number of graphs in the dataset."""
        return len(self.graphs)

    def __getitem__(self, i):
        r"""Gets the i-th sample.

        Parameters
        -----------
        i : int
            The sample index.

        Returns
        -------
        (dgl.DGLGraph, scipy.sparse.coo_matrix, numpy.ndarray)
            The graph (passed through ``transform`` when one is set), the
            matching entry of ``pmpds`` and the label array of the graph.
        """
        if self._transform is None:
            g = self.graphs[i]
        else:
            g = self._transform(self.graphs[i])
        return (g, self.pmpds[i], self.labels[i])

    @property
    def save_name(self):
        """Base file name (without extension) used for the cache file."""
        return self.name + "_dgl_graph"

    @staticmethod
    def collate_fn(cur):
        """Batch a list of ``(graph, pmpd, label)`` samples.

        Graphs are batched with ``batch_graphs``, the pmpd matrices are
        combined block-diagonally, and labels are concatenated along axis 0.
        """
        graphs, pmpds, labels = zip(*cur)
        batched_graphs = batch_graphs(graphs)
        batched_pmpds = sp.block_diag(pmpds)
        batched_labels = np.concatenate(labels, axis=0)
        return batched_graphs, batched_pmpds, batched_labels


def _normalize(mx):
    """Row-normalize sparse matrix

    Each row is scaled by the inverse of its sum; rows whose sum is zero
    are left as all zeros.
    """
    rowsum = np.asarray(mx.sum(1))
    mask = np.equal(rowsum, 0.0).flatten()
    # Overwrite zero sums with NaN before inverting, then force the
    # corresponding inverse entries to 0 so empty rows stay empty.
    rowsum[mask] = np.nan
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[mask] = 0.0
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(mx)


def _encode_onehot(labels):
    """One-hot encode ``labels`` as an int32 array.

    Columns follow the sorted order of the distinct label values.
    """
    classes = sorted(set(labels))
    # Build the identity matrix once instead of once per class.
    identity = np.identity(len(classes))
    classes_dict = {c: identity[i, :] for i, c in enumerate(classes)}
    labels_onehot = np.asarray(
        list(map(classes_dict.get, labels)), dtype=np.int32
    )
    return labels_onehot