"""Datasets introduced in the `A Critical Look at the Evaluation of GNNs under
Heterophily: Are We Really Making Progress? <https://arxiv.org/abs/2302.11640>`__
paper."""
import os

import numpy as np

from ..convert import graph
from ..transforms.functional import to_bidirected
from .dgl_dataset import DGLBuiltinDataset
from .utils import download


class HeterophilousGraphDataset(DGLBuiltinDataset):
    r"""Datasets introduced in the `A Critical Look at the Evaluation of GNNs
    under Heterophily: Are We Really Making Progress?
    <https://arxiv.org/abs/2302.11640>`__ paper.

    Parameters
    ----------
    name : str
        Name of the dataset. One of 'roman-empire', 'amazon-ratings',
        'minesweeper', 'tolokers', 'questions'. The name is lower-cased and
        hyphens are replaced by underscores before it is used.
    raw_dir : str, optional
        Raw file directory to store the processed data.
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None
    """

    def __init__(
        self,
        name,
        raw_dir=None,
        force_reload=False,
        verbose=True,
        transform=None,
    ):
        # The normalized name is used both as the dataset name and as the
        # stem of the ``.npz`` file in the download URL.
        name = name.lower().replace("-", "_")
        url = f"https://github.com/yandex-research/heterophilous-graphs/raw/main/data/{name}.npz"
        super().__init__(
            name=name,
            url=url,
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def download(self):
        """Download ``<name>.npz`` from ``self.url`` into ``self.raw_path``."""
        download(
            url=self.url, path=os.path.join(self.raw_path, f"{self.name}.npz")
        )

    def process(self):
        """Load the ``.npz`` file and build the graph together with its node data.

        The graph is made bidirected and stored in ``self._g`` with node data
        ``feat``, ``label``, ``train_mask``, ``val_mask`` and ``test_mask``;
        the number of distinct labels is stored in ``self._num_classes``.

        Raises
        ------
        ModuleNotFoundError
            If PyTorch cannot be imported.
        """
        try:
            import torch
        except ImportError as err:
            raise ModuleNotFoundError(
                "This dataset requires PyTorch to be the backend."
            ) from err

        npz_path = os.path.join(self.raw_path, f"{self.name}.npz")
        # ``np.load`` on an ``.npz`` archive returns a file-backed object that
        # holds the file open; the ``with`` block closes it once every array
        # has been read into memory.
        with np.load(npz_path) as data:
            edges = data["edges"]
            features = torch.from_numpy(data["node_features"])
            labels = torch.from_numpy(data["node_labels"])
            # Masks are transposed so that they can be attached as node data;
            # presumably stored as (num_splits, num_nodes) -- verify against
            # the data files.
            train_masks = torch.from_numpy(data["train_masks"].T)
            val_masks = torch.from_numpy(data["val_masks"].T)
            test_masks = torch.from_numpy(data["test_masks"].T)

        # Column 0 holds the source node of every edge, column 1 the target.
        src = torch.from_numpy(edges[:, 0])
        dst = torch.from_numpy(edges[:, 1])

        num_nodes = len(labels)
        self._num_classes = len(labels.unique())

        self._g = to_bidirected(graph((src, dst), num_nodes=num_nodes))
        self._g.ndata["feat"] = features
        self._g.ndata["label"] = labels
        self._g.ndata["train_mask"] = train_masks
        self._g.ndata["val_mask"] = val_masks
        self._g.ndata["test_mask"] = test_masks

    def has_cache(self):
        """Return whether ``self.raw_path`` exists.

        NOTE(review): only the path itself is checked, not the ``.npz`` file
        inside it.
        """
        return os.path.exists(self.raw_path)

    def load(self):
        """Load the dataset by running :meth:`process` again."""
        self.process()

    def __getitem__(self, idx):
        """Return the single graph of the dataset.

        Parameters
        ----------
        idx : int
            Index of the graph; must be 0.

        Returns
        -------
        The stored graph, passed through ``transform`` when one was given.
        """
        assert idx == 0, "This dataset has only one graph."
        if self._transform is None:
            return self._g
        else:
            return self._transform(self._g)

    def __len__(self):
        """Return the number of graphs in the dataset, which is always 1."""
        return 1

    @property
    def num_classes(self):
        """Number of distinct node labels found by :meth:`process`."""
        return self._num_classes
class RomanEmpireDataset(HeterophilousGraphDataset):
    r"""Roman-empire dataset from the `A Critical Look at the Evaluation of
    GNNs under Heterophily: Are We Really Making Progress?
    <https://arxiv.org/abs/2302.11640>`__ paper.

    The dataset is built from the Roman Empire article of English Wikipedia,
    chosen because it is one of the longest articles on Wikipedia. Every node
    stands for one (non-unique) word of the text, so the number of nodes
    equals the length of the article. An edge joins two words when at least
    one of the following holds: the words follow each other in the text, or
    they are connected in the dependency tree of the sentence (one word
    depends on the other). The graph is therefore a chain graph with extra
    shortcut edges for syntactic dependencies between words. The class of a
    node is its syntactic role (the 17 most frequent roles are kept as
    separate classes and all other roles are grouped into the 18th class).
    Node features are word embeddings.

    Statistics:

    - Nodes: 22662
    - Edges: 65854
    - Classes: 18
    - Node features: 300
    - 10 train/val/test splits

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import RomanEmpireDataset
    >>> dataset = RomanEmpireDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get the first data split
    >>> train_mask = g.ndata["train_mask"][:, 0]
    >>> val_mask = g.ndata["val_mask"][:, 0]
    >>> test_mask = g.ndata["test_mask"][:, 0]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        # All options are forwarded unchanged; only the dataset name is fixed.
        super().__init__(
            name="roman-empire",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )
class AmazonRatingsDataset(HeterophilousGraphDataset):
    r"""Amazon-ratings dataset from the `A Critical Look at the Evaluation of
    GNNs under Heterophily: Are We Really Making Progress?
    <https://arxiv.org/abs/2302.11640>`__ paper.

    The dataset is built from the Amazon product co-purchasing data. Nodes
    are products (books, music CDs, DVDs, VHS video tapes), and an edge joins
    products that are frequently bought together. The task is to predict the
    average rating that reviewers gave to a product; all possible rating
    values were grouped into five classes. Node features are the mean of word
    embeddings for words in the product description.

    Statistics:

    - Nodes: 24492
    - Edges: 186100
    - Classes: 5
    - Node features: 300
    - 10 train/val/test splits

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import AmazonRatingsDataset
    >>> dataset = AmazonRatingsDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get the first data split
    >>> train_mask = g.ndata["train_mask"][:, 0]
    >>> val_mask = g.ndata["val_mask"][:, 0]
    >>> test_mask = g.ndata["test_mask"][:, 0]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        # All options are forwarded unchanged; only the dataset name is fixed.
        super().__init__(
            name="amazon-ratings",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )
class MinesweeperDataset(HeterophilousGraphDataset):
    r"""Minesweeper dataset from the `A Critical Look at the Evaluation of
    GNNs under Heterophily: Are We Really Making Progress?
    <https://arxiv.org/abs/2302.11640>`__ paper.

    The dataset is inspired by the Minesweeper game. The graph is a regular
    100x100 grid in which each node (cell) is connected to its eight
    neighboring nodes (except for nodes at the edge of the grid, which have
    fewer neighbors). 20% of the nodes are randomly selected as mines, and
    the task is to predict which nodes are mines. The node features are
    one-hot-encoded numbers of neighboring mines. For a randomly selected 50%
    of the nodes, however, the features are unknown, which is indicated by a
    separate binary feature.

    Statistics:

    - Nodes: 10000
    - Edges: 78804
    - Classes: 2
    - Node features: 7
    - 10 train/val/test splits

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import MinesweeperDataset
    >>> dataset = MinesweeperDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get the first data split
    >>> train_mask = g.ndata["train_mask"][:, 0]
    >>> val_mask = g.ndata["val_mask"][:, 0]
    >>> test_mask = g.ndata["test_mask"][:, 0]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        # All options are forwarded unchanged; only the dataset name is fixed.
        super().__init__(
            name="minesweeper",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )
class TolokersDataset(HeterophilousGraphDataset):
    r"""Tolokers dataset from the `A Critical Look at the Evaluation of GNNs
    under Heterophily: Are We Really Making Progress?
    <https://arxiv.org/abs/2302.11640>`__ paper.

    The dataset is built from data of the Toloka crowdsourcing platform. The
    nodes represent tolokers (workers), and an edge joins two tolokers if
    they have worked on the same task. The goal is to predict which tolokers
    have been banned in one of the projects. Node features are based on the
    worker's profile information and task performance statistics.

    Statistics:

    - Nodes: 11758
    - Edges: 1038000
    - Classes: 2
    - Node features: 10
    - 10 train/val/test splits

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import TolokersDataset
    >>> dataset = TolokersDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get the first data split
    >>> train_mask = g.ndata["train_mask"][:, 0]
    >>> val_mask = g.ndata["val_mask"][:, 0]
    >>> test_mask = g.ndata["test_mask"][:, 0]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        # All options are forwarded unchanged; only the dataset name is fixed.
        super().__init__(
            name="tolokers",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )
class QuestionsDataset(HeterophilousGraphDataset):
    r"""Questions dataset from the `A Critical Look at the Evaluation of GNNs
    under Heterophily: Are We Really Making Progress?
    <https://arxiv.org/abs/2302.11640>`__ paper.

    The dataset is built from data of the question-answering website
    Yandex Q. Nodes are users, and an edge joins two nodes if one user
    answered the other user's question. The task is to predict which users
    remained active on the website (were not deleted or blocked). Node
    features are the mean of word embeddings for words in the user
    description. Users that do not have a description are indicated by a
    separate binary feature.

    Statistics:

    - Nodes: 48921
    - Edges: 307080
    - Classes: 2
    - Node features: 301
    - 10 train/val/test splits

    Parameters
    ----------
    raw_dir : str, optional
        Raw file directory to store the processed data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download the data source. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access. Default: None

    Attributes
    ----------
    num_classes : int
        Number of node classes

    Examples
    --------

    >>> from dgl.data import QuestionsDataset
    >>> dataset = QuestionsDataset()
    >>> g = dataset[0]
    >>> num_classes = dataset.num_classes

    >>> # get node features
    >>> feat = g.ndata["feat"]

    >>> # get the first data split
    >>> train_mask = g.ndata["train_mask"][:, 0]
    >>> val_mask = g.ndata["val_mask"][:, 0]
    >>> test_mask = g.ndata["test_mask"][:, 0]

    >>> # get labels
    >>> label = g.ndata['label']
    """

    def __init__(
        self, raw_dir=None, force_reload=False, verbose=True, transform=None
    ):
        # All options are forwarded unchanged; only the dataset name is fixed.
        super().__init__(
            name="questions",
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )