"""MovieLens dataset"""importosimportnumpyasnpimportpandasaspdfromtorchimportLongTensor,Tensorfrom..baseimportdgl_warningfrom..convertimportheterographfrom.dgl_datasetimportDGLDatasetfrom.utilsimport(_get_dgl_url,download,extract_archive,load_graphs,load_info,save_graphs,save_info,split_dataset,)GENRES_ML_100K=["unknown","Action","Adventure","Animation","Children","Comedy","Crime","Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi","Thriller","War","Western",]GENRES_ML_1M=GENRES_ML_100K[1:]GENRES_ML_10M=GENRES_ML_100K+["IMAX"]try:importtorchexceptImportError:HAS_TORCH=Falseelse:HAS_TORCH=Truedefcheck_pytorch():"""Check if PyTorch is the backend."""ifnotHAS_TORCH:raiseModuleNotFoundError("MovieLensDataset requires PyTorch to be the backend.")
class MovieLensDataset(DGLDataset):
    r"""MovieLens dataset for edge prediction tasks.

    The raw datasets are extracted from `MovieLens
    <https://grouplens.org/datasets/movielens/>`_, introduced by `Movielens
    unplugged: experiences with an occasionally connected recommender system
    <https://dl.acm.org/doi/10.1145/604045.604094>`_. The datasets consist of
    user ratings for movies and incorporate additional user/movie information
    in the form of features. The nodes represent users and movies, and the
    edges store the ratings that users assign to movies.

    Statistics:

    MovieLens-100K (ml-100k)

    - Users: 943
    - Movies: 1,682
    - Ratings: 100,000 (1, 2, 3, 4, 5)

    MovieLens-1M (ml-1m)

    - Users: 6,040
    - Movies: 3,706
    - Ratings: 1,000,209 (1, 2, 3, 4, 5)

    MovieLens-10M (ml-10m)

    - Users: 69,878
    - Movies: 10,677
    - Ratings: 10,000,054 (0.5, 1, 1.5, ..., 4.5, 5.0)

    Parameters
    ----------
    name : str
        Dataset name. (:obj:`"ml-100k"`, :obj:`"ml-1m"`, :obj:`"ml-10m"`).
    valid_ratio : float
        Ratio of validation samples out of the whole dataset. Should be in
        (0.0, 1.0).
    test_ratio : float, optional
        Ratio of testing samples out of the whole dataset. Should be in
        (0.0, 1.0), and its sum with :obj:`valid_ratio` should be in
        (0.0, 1.0) as well. This parameter is ignored when :obj:`name` is
        :obj:`"ml-100k"`, since its testing samples are pre-specified.
        Default: None
    raw_dir : str, optional
        Raw file directory to download/store the data. Default: ~/.dgl/
    force_reload : bool, optional
        Whether to re-download (if the dataset has not been downloaded) and
        re-process the dataset. Default: False
    verbose : bool, optional
        Whether to print progress information. Default: True
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.
    random_state : int, optional
        Random seed used for the random dataset split. Default: 0

    Notes
    -----
    - When :obj:`name` is :obj:`"ml-100k"`, :obj:`test_ratio` is ignored and
      the training ratio is equal to 1 - :obj:`valid_ratio`. When :obj:`name`
      is :obj:`"ml-1m"` or :obj:`"ml-10m"`, :obj:`test_ratio` is used and the
      training ratio is equal to 1 - :obj:`valid_ratio` - :obj:`test_ratio`.
    - The number of edges is doubled to form an undirected (bidirected) graph
      structure.

    Examples
    --------
    >>> from dgl.data import MovieLensDataset
    >>> dataset = MovieLensDataset(name='ml-100k', valid_ratio=0.2)
    >>> g = dataset[0]
    >>> g
    Graph(num_nodes={'movie': 1682, 'user': 943},
          num_edges={('movie', 'movie-user', 'user'): 100000,
                     ('user', 'user-movie', 'movie'): 100000},
          metagraph=[('movie', 'user', 'movie-user'),
                     ('user', 'movie', 'user-movie')])

    >>> # get ratings of edges in the training graph.
    >>> rate = g.edges['user-movie'].data['rate'] # or rate = g.edges['movie-user'].data['rate']
    >>> rate
    tensor([5., 5., 3., ..., 3., 3., 5.])

    >>> # get train, valid and test mask of edges
    >>> train_mask = g.edges['user-movie'].data['train_mask']
    >>> valid_mask = g.edges['user-movie'].data['valid_mask']
    >>> test_mask = g.edges['user-movie'].data['test_mask']

    >>> # get train, valid and test ratings
    >>> train_ratings = rate[train_mask]
    >>> valid_ratings = rate[valid_mask]
    >>> test_ratings = rate[test_mask]

    >>> # get input features of users
    >>> g.nodes["user"].data["feat"] # or g.nodes["movie"].data["feat"] for movie nodes
    tensor([[0.4800, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],
            [1.0600, 1.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],
            [0.4600, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],
            ...,
            [0.4000, 0.0000, 1.0000, ..., 0.0000, 0.0000, 0.0000],
            [0.9600, 1.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],
            [0.4400, 0.0000, 1.0000, ..., 0.0000, 0.0000, 0.0000]])
    """

    _url = {
        "ml-100k": "dataset/ml-100k.zip",
        "ml-1m": "dataset/ml-1m.zip",
        "ml-10m": "dataset/ml-10m.zip",
    }

    def __init__(
        self,
        name,
        valid_ratio,
        test_ratio=None,
        raw_dir=None,
        force_reload=None,
        verbose=None,
        transform=None,
        random_state=0,
    ):
        check_pytorch()
        assert name in [
            "ml-100k",
            "ml-1m",
            "ml-10m",
        ], f"currently movielens does not support {name}"
        # check the validation and test split ratios
        assert (
            valid_ratio > 0.0 and valid_ratio < 1.0
        ), f"valid_ratio {valid_ratio} must be in (0.0, 1.0)"
        if name in ["ml-1m", "ml-10m"]:
            assert (
                test_ratio is not None
                and test_ratio > 0.0
                and test_ratio < 1.0
            ), f"test_ratio ({test_ratio}) must be set to a value in (0.0, 1.0) when using ml-1m and ml-10m"
            assert (
                test_ratio + valid_ratio > 0.0
                and test_ratio + valid_ratio < 1.0
            ), f"test_ratio ({test_ratio}) + valid_ratio ({valid_ratio}) must be in (0.0, 1.0) when using ml-1m and ml-10m"
        if name == "ml-100k" and test_ratio is not None:
            dgl_warning(
                f"test_ratio ({test_ratio}) is not set to None for ml-100k. "
                "Note that the dataset split will not be affected by test_ratio, since "
                "the testing samples of ml-100k are pre-specified."
            )

        self.valid_ratio = valid_ratio
        self.test_ratio = test_ratio
        self.random_state = random_state
        if name == "ml-100k":
            self.genres = GENRES_ML_100K
        elif name == "ml-1m":
            self.genres = GENRES_ML_1M
        elif name == "ml-10m":
            self.genres = GENRES_ML_10M
        else:
            raise NotImplementedError

        super(MovieLensDataset, self).__init__(
            name=name,
            url=_get_dgl_url(self._url[name]),
            raw_dir=raw_dir,
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    def check_version(self):
        valid_ratio, test_ratio = load_info(self.version_path)
        if self.valid_ratio == valid_ratio and (
            self.test_ratio == test_ratio if self.name != "ml-100k" else True
        ):
            return True
        else:
            if self.name == "ml-100k":
                print(
                    f"The current valid ratio ({self.valid_ratio}) "
                    "is not the same as the last setting "
                    f"(valid: {valid_ratio}). "
                    f"MovieLens {self.name} will be re-processed with the new dataset split setting."
                )
            else:
                print(
                    f"At least one of the current valid ({self.valid_ratio}) and test ({self.test_ratio}) ratios "
                    "is not the same as the last setting "
                    f"(valid: {valid_ratio}, test: {test_ratio}). "
                    f"MovieLens {self.name} will be re-processed with the new dataset split setting."
                )
            return False

    def download(self):
        zip_file_path = os.path.join(self.raw_dir, self.name + ".zip")
        download(self.url, path=zip_file_path)
        extract_archive(zip_file_path, self.raw_dir, overwrite=True)

    def process(self):
        print(f"Starting processing {self.name} ...")

        # 0. load movie features
        movie_feat = load_info(
            os.path.join(self.raw_path, "movie_feat.pkl")
        ).to(torch.float)

        # 1. dataset split: train + (valid +) test
        if self.name == "ml-100k":
            train_rating_data = self._load_raw_rates(
                os.path.join(self.raw_path, "u1.base"), "\t"
            )
            test_rating_data = self._load_raw_rates(
                os.path.join(self.raw_path, "u1.test"), "\t"
            )
            indices = np.arange(len(train_rating_data))
            train, valid, _ = split_dataset(
                indices,
                [1 - self.valid_ratio, self.valid_ratio, 0.0],
                shuffle=True,
                random_state=self.random_state,
            )
            train_rating_data, valid_rating_data = (
                train_rating_data.iloc[train.indices],
                train_rating_data.iloc[valid.indices],
            )
            all_rating_data = pd.concat(
                [train_rating_data, valid_rating_data, test_rating_data]
            )
        elif self.name == "ml-1m" or self.name == "ml-10m":
            all_rating_data = self._load_raw_rates(
                os.path.join(self.raw_path, "ratings.dat"), "::"
            )
            indices = np.arange(len(all_rating_data))
            train, valid, test = split_dataset(
                indices,
                [
                    1 - self.valid_ratio - self.test_ratio,
                    self.valid_ratio,
                    self.test_ratio,
                ],
                shuffle=True,
                random_state=self.random_state,
            )
            train_rating_data, valid_rating_data, test_rating_data = (
                all_rating_data.iloc[train.indices],
                all_rating_data.iloc[valid.indices],
                all_rating_data.iloc[test.indices],
            )

        # 2. load user and movie data, and drop those unseen in rating_data
        user_data = self._load_raw_user_data()
        movie_data = self._load_raw_movie_data()
        user_data = self._drop_unseen_nodes(
            data_df=user_data,
            col_name="id",
            reserved_ids_set=set(all_rating_data["user_id"].values),
        )
        movie_data = self._drop_unseen_nodes(
            data_df=movie_data,
            col_name="id",
            reserved_ids_set=set(all_rating_data["movie_id"].values),
        )
        user_feat = Tensor(self._process_user_feat(user_data))

        # 3. generate rating pairs
        # Map user/movie to the global id
        self._global_user_id_map = {
            ele: i for i, ele in enumerate(user_data["id"])
        }
        self._global_movie_id_map = {
            ele: i for i, ele in enumerate(movie_data["id"])
        }
        # pair value is idx rather than id
        u_indices, v_indices, labels = self._generate_pair_value(
            all_rating_data
        )
        all_rating_pairs = (
            LongTensor(u_indices),
            LongTensor(v_indices),
        )
        all_rating_values = Tensor(labels)

        graph = self.construct_g(
            all_rating_pairs, all_rating_values, user_feat, movie_feat
        )
        self.graph = self.add_masks(
            graph, train_rating_data, valid_rating_data, test_rating_data
        )
        print(f"End processing {self.name} ...")

    def construct_g(self, rate_pairs, rate_values, user_feat, movie_feat):
        g = heterograph(
            {
                ("user", "user-movie", "movie"): (
                    rate_pairs[0],
                    rate_pairs[1],
                ),
                ("movie", "movie-user", "user"): (
                    rate_pairs[1],
                    rate_pairs[0],
                ),
            }
        )
        ndata = {"user": user_feat, "movie": movie_feat}
        edata = {"user-movie": rate_values, "movie-user": rate_values}
        g.ndata["feat"] = ndata
        g.edata["rate"] = edata

        return g

    def add_masks(
        self, g, train_rating_data, valid_rating_data, test_rating_data
    ):
        train_u_indices, train_v_indices, _ = self._generate_pair_value(
            train_rating_data
        )
        valid_u_indices, valid_v_indices, _ = self._generate_pair_value(
            valid_rating_data
        )
        test_u_indices, test_v_indices, _ = self._generate_pair_value(
            test_rating_data
        )

        # user-movie
        train_mask = torch.zeros(
            (g.num_edges("user-movie"),), dtype=torch.bool
        )
        train_mask[
            g.edge_ids(train_u_indices, train_v_indices, etype="user-movie")
        ] = True
        valid_mask = torch.zeros(
            (g.num_edges("user-movie"),), dtype=torch.bool
        )
        valid_mask[
            g.edge_ids(valid_u_indices, valid_v_indices, etype="user-movie")
        ] = True
        test_mask = torch.zeros(
            (g.num_edges("user-movie"),), dtype=torch.bool
        )
        test_mask[
            g.edge_ids(test_u_indices, test_v_indices, etype="user-movie")
        ] = True
        g.edges["user-movie"].data["train_mask"] = train_mask
        g.edges["user-movie"].data["valid_mask"] = valid_mask
        g.edges["user-movie"].data["test_mask"] = test_mask

        # movie-user
        train_mask_rev = torch.zeros(
            (g.num_edges("movie-user"),), dtype=torch.bool
        )
        train_mask_rev[
            g.edge_ids(train_v_indices, train_u_indices, etype="movie-user")
        ] = True
        valid_mask_rev = torch.zeros(
            (g.num_edges("movie-user"),), dtype=torch.bool
        )
        valid_mask_rev[
            g.edge_ids(valid_v_indices, valid_u_indices, etype="movie-user")
        ] = True
        test_mask_rev = torch.zeros(
            (g.num_edges("movie-user"),), dtype=torch.bool
        )
        test_mask_rev[
            g.edge_ids(test_v_indices, test_u_indices, etype="movie-user")
        ] = True
        g.edges["movie-user"].data["train_mask"] = train_mask_rev
        g.edges["movie-user"].data["valid_mask"] = valid_mask_rev
        g.edges["movie-user"].data["test_mask"] = test_mask_rev

        return g

    def has_cache(self):
        if (
            os.path.exists(self.graph_path)
            and os.path.exists(self.version_path)
            and self.check_version()
        ):
            return True
        return False

    def save(self):
        save_graphs(self.graph_path, [self.graph])
        save_info(self.version_path, [self.valid_ratio, self.test_ratio])
        if self.verbose:
            print(f"Done saving data into {self.raw_path}.")

    def load(self):
        g_list, _ = load_graphs(self.graph_path)
        self.graph = g_list[0]
        # When loading boolean tensors from disk, they are automatically
        # converted to torch.uint8, and indexing with torch.uint8 raises a
        # deprecation warning. Convert the masks back to torch.bool to avoid
        # this problem.
        for e in self.graph.etypes:
            self.graph.edges[e].data["train_mask"] = (
                self.graph.edges[e].data["train_mask"].to(torch.bool)
            )
            self.graph.edges[e].data["valid_mask"] = (
                self.graph.edges[e].data["valid_mask"].to(torch.bool)
            )
            self.graph.edges[e].data["test_mask"] = (
                self.graph.edges[e].data["test_mask"].to(torch.bool)
            )
    def __getitem__(self, idx):
        assert (
            idx == 0
        ), "This dataset has only one set of training, validation and testing graphs"
        if self._transform is None:
            return self.graph
        else:
            return self._transform(self.graph)
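
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the class above). It shows one way
# to pull out the training edges and their ratings for an edge-regression
# task. ``MyRatingModel`` is a hypothetical placeholder; the dataset, mask,
# rating and feature names match the class docstring.
#
# >>> from dgl.data import MovieLensDataset
# >>> dataset = MovieLensDataset(name="ml-100k", valid_ratio=0.2)
# >>> g = dataset[0]
# >>> train_mask = g.edges["user-movie"].data["train_mask"]
# >>> train_eids = train_mask.nonzero(as_tuple=True)[0]
# >>> u, v = g.find_edges(train_eids, etype="user-movie")
# >>> train_ratings = g.edges["user-movie"].data["rate"][train_eids]
# >>> # feed (u, v, train_ratings) together with g.nodes["user"].data["feat"]
# >>> # and g.nodes["movie"].data["feat"] to a hypothetical MyRatingModel.
# ---------------------------------------------------------------------------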