# Source code for torch_frame.datasets.amazon_fine_food_reviews

from __future__ import annotations

import pandas as pd

import torch_frame
from torch_frame.config.text_embedder import TextEmbedderConfig
from torch_frame.config.text_tokenizer import TextTokenizerConfig


class AmazonFineFoodReviews(torch_frame.data.Dataset):
    r"""The `Amazon Fine Food Reviews <https://arxiv.org/abs/1303.4402>`_
    dataset. It consists of reviews of fine foods from Amazon.

    Args:
        root (str): Root directory passed, together with :attr:`url`, to
            :meth:`download_url` when fetching the raw archive.
        text_stype (torch_frame.stype): Text stype to use for text columns
            in the dataset. (default: :obj:`torch_frame.text_embedded`)
        col_to_text_embedder_cfg (dict or TextEmbedderConfig, optional):
            Either a single text embedder config or a dictionary of
            configs keyed by string (presumably the column name). It is
            forwarded unchanged to :class:`torch_frame.data.Dataset`.
            (default: :obj:`None`)
        col_to_text_tokenizer_cfg (dict or TextTokenizerConfig, optional):
            Either a single text tokenizer config or a dictionary of
            configs keyed by string (presumably the column name). It is
            forwarded unchanged to :class:`torch_frame.data.Dataset`.
            (default: :obj:`None`)

    **STATS:**

    .. list-table::
        :widths: 10 10 10 10 10 20 10
        :header-rows: 1

        * - #rows
          - #cols (numerical)
          - #cols (categorical)
          - #cols (text)
          - #classes
          - Task
          - Missing value ratio
        * - 568,454
          - 2
          - 3
          - 2
          - 5
          - multiclass_classification
          - 0.0%
    """

    url = "https://data.pyg.org/datasets/tables/amazon_fine_food_reviews.zip"

    def __init__(
        self,
        root: str,
        text_stype: torch_frame.stype = torch_frame.text_embedded,
        col_to_text_embedder_cfg: dict[str, TextEmbedderConfig]
        | TextEmbedderConfig | None = None,
        col_to_text_tokenizer_cfg: dict[str, TextTokenizerConfig]
        | TextTokenizerConfig | None = None,
    ) -> None:
        self.root = root
        self.text_stype = text_stype
        path = self.download_url(self.url, root)

        # Semantic type of every column that is loaded.  Both free-text
        # columns share the caller-selected ``text_stype``.
        col_to_stype = {
            'ProductId': torch_frame.categorical,
            'UserId': torch_frame.categorical,
            'HelpfulnessNumerator': torch_frame.numerical,
            'HelpfulnessDenominator': torch_frame.numerical,
            'Score': torch_frame.categorical,
            # TODO: 'Time' is currently excluded; add it as a timestamp
            # column.
            'Summary': text_stype,
            'Text': text_stype,
        }

        # Only parse the columns that are actually used instead of loading
        # the full table and discarding the rest afterwards.  ``usecols``
        # yields columns in file order, so re-select to keep the column
        # order identical to ``col_to_stype``.
        columns = list(col_to_stype)
        df = pd.read_csv(path, usecols=columns)[columns]

        super().__init__(
            df,
            col_to_stype,
            target_col='Score',
            col_to_text_embedder_cfg=col_to_text_embedder_cfg,
            col_to_text_tokenizer_cfg=col_to_text_tokenizer_cfg,
        )