# Source code for torch_frame.datasets.yandex

from __future__ import annotations

import os.path as osp
import zipfile
from typing import Any

import numpy as np
import pandas as pd

import torch_frame
from torch_frame.utils.split import SPLIT_TO_NUM

# Name of the DataFrame column that stores, for every row, the numeric split
# indicator (``SPLIT_TO_NUM[split]``) of the split the row came from.
SPLIT_COL = 'split_col'
# Name of the DataFrame column that stores the label loaded from the
# ``y_{split}`` arrays.
TARGET_COL = 'target_col'


def load_numpy_dict(path: str) -> dict[str, np.ndarray]:
    r"""Load numpy files from a ZIP file.

    Every archive member whose name ends with ``.npy`` is loaded; all other
    members are ignored. The dictionary key is the member's base name (any
    directory prefix inside the archive is dropped) with only the trailing
    ``.npy`` extension removed, so a ``.npy`` substring elsewhere in the name
    is preserved.

    Args:
        path (str): A path to the ZIP file containing .npy files.

    Returns:
        numpy_dict (Dict[str, np.ndarray]): A dictionary that maps the name of
            .npy file to the loaded numpy array. If two members share the
            same base name, the one listed later in the archive wins.
    """
    suffix = '.npy'
    numpy_dict: dict[str, np.ndarray] = {}
    with zipfile.ZipFile(path, 'r') as zip_ref:
        for file_name in zip_ref.namelist():
            if not file_name.endswith(suffix):
                continue
            # Strip exactly one trailing extension. `str.replace` would also
            # remove any '.npy' occurring in the middle of the name.
            array_name = osp.basename(file_name)[:-len(suffix)]
            with zip_ref.open(file_name) as f:
                # NOTE(security): `allow_pickle=True` lets numpy unpickle
                # object arrays, which can execute arbitrary code. Only use
                # this loader on archives from a trusted source.
                numpy_dict[array_name] = np.load(f, allow_pickle=True)
    return numpy_dict


def get_df_and_col_to_stype(
        zip_file_path: str) -> tuple[pd.DataFrame, dict[str, Any]]:
    r"""Build one DataFrame and its :obj:`col_to_stype` from a ZIP file.

    For each of the ``train``, ``val`` and ``test`` splits the archive must
    provide a ``y_{split}`` array and at least one of ``C_{split}``
    (categorical features) and ``N_{split}`` (numerical features). Feature
    columns are named ``C_feature_{i}`` / ``N_feature_{i}``; categorical
    columns come first when both kinds are present. The label and the split
    indicator are appended as the ``TARGET_COL`` and ``SPLIT_COL`` columns.

    Args:
        zip_file_path (str): A path of the ZIP file containing .npy files.

    Returns:
        df (DataFrame): DataFrame containing train/val/test rows, stacked in
            that order with a fresh index.
        col_to_stype (Dict[str, torch_frame.stype]): A dictionary mapping
            feature column names to their respective semantic types. The
            target column is not included.
    """
    arrays = load_numpy_dict(zip_file_path)
    col_to_stype: dict[str, torch_frame.stype] = {}
    split_frames: list[pd.DataFrame] = []

    for split in ('train', 'val', 'test'):
        cat_feats = arrays.get(f'C_{split}')
        num_feats = arrays.get(f'N_{split}')
        targets = arrays[f'y_{split}']

        has_cat = cat_feats is not None
        has_num = num_feats is not None
        assert has_cat or has_num

        # Assemble the feature matrix: categorical block first, then the
        # numerical block, or whichever single block is available.
        if has_cat and has_num:
            feature_matrix = np.concatenate([cat_feats, num_feats], axis=1)
        elif has_cat:
            feature_matrix = cat_feats
        else:
            feature_matrix = num_feats

        cat_cols: list[str] = []
        num_cols: list[str] = []
        if has_cat:
            cat_cols = [f'C_feature_{i}' for i in range(cat_feats.shape[1])]
        if has_num:
            num_cols = [f'N_feature_{i}' for i in range(num_feats.shape[1])]

        col_to_stype.update(dict.fromkeys(cat_cols, torch_frame.categorical))
        col_to_stype.update(dict.fromkeys(num_cols, torch_frame.numerical))

        split_df = pd.DataFrame(feature_matrix, columns=cat_cols + num_cols)
        # Explicitly set dtype for numerical features (a no-op loop when the
        # split has no numerical block).
        for column in num_cols:
            split_df[column] = split_df[column].astype('float64')

        split_marker = np.full((len(split_df), ),
                               fill_value=SPLIT_TO_NUM[split])
        extras = pd.DataFrame({TARGET_COL: targets, SPLIT_COL: split_marker})
        split_frames.append(pd.concat([split_df, extras], axis=1))

    return pd.concat(split_frames, ignore_index=True), col_to_stype


class Yandex(torch_frame.data.Dataset):
    r"""The Yandex dataset collections used by `"Revisiting Deep Learning
    Models for Tabular Data" <https://arxiv.org/abs/2106.11959>`_.
    Originally downloaded from
    `github.com/yandex-research/tabular-dl-revisiting-models
    <https://github.com/yandex-research/tabular-dl-revisiting-models>`_.

    Args:
        root (str): Directory passed to :meth:`download_url` as the download
            location for the dataset ZIP file.
        name (str): Name of the dataset to load. Must be one of the names in
            :obj:`classification_datasets` or :obj:`regression_datasets`.

    **STATS:**

    .. list-table::
        :widths: 20 10 10 10 10 20 10
        :header-rows: 1

        * - Name
          - #rows
          - #cols (numerical)
          - #cols (categorical)
          - #classes
          - Task
          - Missing value ratio
        * - adult
          - 48,842
          - 6
          - 8
          - 2
          - binary_classification
          - 0.0%
        * - aloi
          - 108,000
          - 128
          - 0
          - 1,000
          - multiclass_classification
          - 0.0%
        * - covtype
          - 581,012
          - 54
          - 0
          - 7
          - multiclass_classification
          - 0.0%
        * - helena
          - 65,196
          - 27
          - 0
          - 100
          - multiclass_classification
          - 0.0%
        * - higgs_small
          - 98,050
          - 28
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - jannis
          - 83,733
          - 54
          - 0
          - 4
          - multiclass_classification
          - 0.0%
        * - california_housing
          - 20,640
          - 8
          - 0
          - 1
          - regression
          - 0.0%
        * - microsoft
          - 1,200,192
          - 136
          - 0
          - 1
          - regression
          - 0.0%
        * - yahoo
          - 709,877
          - 699
          - 0
          - 1
          - regression
          - 0.0%
        * - year
          - 515,345
          - 90
          - 0
          - 1
          - regression
          - 0.0%
    """

    # Location of the per-dataset ZIP archives ("<base_url>/<name>.zip").
    base_url = 'https://data.pyg.org/datasets/tables/revisiting_data/'
    # Datasets whose target column is given the categorical stype.
    classification_datasets = {
        'adult', 'aloi', 'covtype', 'helena', 'higgs_small', 'jannis'
    }
    # Datasets whose target column is given the numerical stype.
    regression_datasets = {'california_housing', 'microsoft', 'yahoo', 'year'}
    # All supported dataset names in alphabetical order.
    name_list = sorted(classification_datasets | regression_datasets)

    def __init__(self, root: str, name: str) -> None:
        assert name in (self.classification_datasets
                        | self.regression_datasets), (
                            f"Unknown dataset name {name!r}; expected one of "
                            f"{self.name_list}")
        self.root = root
        self.name = name
        # Build the URL with plain string formatting: `os.path.join` is meant
        # for filesystem paths and is platform-dependent.
        url = f"{self.base_url.rstrip('/')}/{self.name}.zip"
        # NOTE(review): `download_url` is not defined in this file; it is
        # presumably inherited from `torch_frame.data.Dataset`. Its return
        # value is used below as the path of the downloaded ZIP file.
        path = self.download_url(url, root)
        df, col_to_stype = get_df_and_col_to_stype(path)
        # The helper only assigns stypes to feature columns; the target stype
        # depends on the task of the selected dataset.
        if name in self.regression_datasets:
            col_to_stype[TARGET_COL] = torch_frame.numerical
        else:
            col_to_stype[TARGET_COL] = torch_frame.categorical
        super().__init__(df, col_to_stype, target_col=TARGET_COL,
                         split_col=SPLIT_COL)

    def __repr__(self) -> str:
        return f"{self.__class__.__name__}(name='{self.name}')"