# Source code for torch_frame.datasets.amazon_fine_food_reviews

from __future__ import annotations

import pandas as pd

import torch_frame
from torch_frame.config.text_embedder import TextEmbedderConfig
from torch_frame.config.text_tokenizer import TextTokenizerConfig


class AmazonFineFoodReviews(torch_frame.data.Dataset):
    r"""The `Amazon Fine Food Reviews <https://arxiv.org/abs/1303.4402>`_
    dataset, a collection of reviews of fine foods from Amazon.

    The ``Score`` column is used as the target column.

    Args:
        root (str): Location passed, together with :attr:`url`, to
            :meth:`download_url`; the path it returns is read as a CSV
            file.
        text_stype (torch_frame.stype): Text stype to use for text columns
            in the dataset. (default: :obj:`torch_frame.text_embedded`)
        col_to_text_embedder_cfg (dict or TextEmbedderConfig, optional):
            Text embedder configuration, forwarded unchanged to the
            :class:`torch_frame.data.Dataset` constructor.
            (default: :obj:`None`)
        col_to_text_tokenizer_cfg (dict or TextTokenizerConfig, optional):
            Text tokenizer configuration, forwarded unchanged to the
            :class:`torch_frame.data.Dataset` constructor.
            (default: :obj:`None`)

    **STATS:**

    .. list-table::
        :widths: 10 10 10 10 10 20 10
        :header-rows: 1

        * - #rows
          - #cols (numerical)
          - #cols (categorical)
          - #cols (text)
          - #classes
          - Task
          - Missing value ratio
        * - 568,454
          - 2
          - 3
          - 2
          - 5
          - multiclass_classification
          - 0.0%
    """

    url = "https://data.pyg.org/datasets/tables/amazon_fine_food_reviews.zip"

    def __init__(
        self,
        root: str,
        text_stype: torch_frame.stype = torch_frame.text_embedded,
        col_to_text_embedder_cfg: dict[str, TextEmbedderConfig]
        | TextEmbedderConfig | None = None,
        col_to_text_tokenizer_cfg: dict[str, TextTokenizerConfig]
        | TextTokenizerConfig | None = None,
    ) -> None:
        self.root = root
        self.text_stype = text_stype
        csv_path = self.download_url(self.url, root)

        categorical = torch_frame.categorical
        numerical = torch_frame.numerical
        # The insertion order of this mapping is also the column order used
        # when selecting from the CSV below.
        # TODO: add 'Time' once it can be typed as a timestamp.
        stype_by_column = {
            'ProductId': categorical,
            'UserId': categorical,
            'HelpfulnessNumerator': numerical,
            'HelpfulnessDenominator': numerical,
            'Score': categorical,
            'Summary': text_stype,
            'Text': text_stype,
        }

        # Keep only the columns that have a declared stype.
        wanted_columns = list(stype_by_column)
        reviews = pd.read_csv(csv_path)[wanted_columns]

        super().__init__(
            reviews,
            stype_by_column,
            target_col='Score',
            col_to_text_embedder_cfg=col_to_text_embedder_cfg,
            col_to_text_tokenizer_cfg=col_to_text_tokenizer_cfg,
        )