Source code for torch_frame.datasets.tabular_benchmark

from __future__ import annotations

import os

import pandas as pd
from pandas.api.types import is_numeric_dtype

import torch_frame


class TabularBenchmark(torch_frame.data.Dataset):
    r"""A collection of Tabular benchmark datasets introduced in
    `"Why do tree-based models still outperform deep learning on tabular data?"
    <https://arxiv.org/abs/2207.08815>`_.

    The CSV file of the requested dataset is fetched via
    :meth:`download_url` and loaded into a :class:`pandas.DataFrame`. The last
    column is used as the target; the semantic type of every other column is
    derived from the dataset's task category and the column's dtype.

    Args:
        root (str): Passed to :meth:`download_url` together with the dataset
            URL (presumably the directory the CSV file is stored in).
        name (str): The name of the dataset. Must be one of the keys of
            :obj:`name_to_task_category` (see the table below).

    Raises:
        ValueError: If :obj:`name` is not a known dataset name.

    **STATS:**

    .. list-table::
        :widths: 20 10 10 10 10 20 10
        :header-rows: 1

        * - Name
          - #rows
          - #cols (numerical)
          - #cols (categorical)
          - #classes
          - Task
          - Missing value ratio
        * - albert
          - 58,252
          - 23
          - 8
          - 2
          - binary_classification
          - 0.0%
        * - compas-two-years
          - 4,966
          - 2
          - 9
          - 2
          - binary_classification
          - 0.0%
        * - covertype
          - 423,680
          - 10
          - 44
          - 2
          - binary_classification
          - 0.0%
        * - default-of-credit-card-clients
          - 13,272
          - 20
          - 1
          - 2
          - binary_classification
          - 0.0%
        * - electricity
          - 38,474
          - 7
          - 1
          - 2
          - binary_classification
          - 0.0%
        * - eye_movements
          - 7,608
          - 18
          - 5
          - 2
          - binary_classification
          - 0.0%
        * - road-safety
          - 111,762
          - 24
          - 8
          - 2
          - binary_classification
          - 0.0%
        * - Bioresponse
          - 3,434
          - 419
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - Diabetes130US
          - 71,090
          - 7
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - Higgs
          - 940,160
          - 24
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - MagicTelescope
          - 13,376
          - 10
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - MiniBooNE
          - 72,998
          - 50
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - bank-marketing
          - 10,578
          - 7
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - california
          - 20,634
          - 8
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - credit
          - 16,714
          - 10
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - heloc
          - 10,000
          - 22
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - house_16H
          - 13,488
          - 16
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - jannis
          - 57,580
          - 54
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - pol
          - 10,082
          - 26
          - 0
          - 2
          - binary_classification
          - 0.0%
        * - analcatdata_supreme
          - 4,052
          - 1
          - 6
          - 1
          - regression
          - 0.0%
        * - Airlines_DepDelay_1M
          - 1,000,000
          - 5
          - 0
          - 1
          - regression
          - 0.0%
        * - Allstate_Claims_Severity
          - 188,318
          - 25
          - 99
          - 1
          - regression
          - 0.0%
        * - Bike_Sharing_Demand
          - 17,379
          - 6
          - 5
          - 1
          - regression
          - 0.0%
        * - Brazilian_houses
          - 10,692
          - 7
          - 4
          - 1
          - regression
          - 0.0%
        * - Mercedes_Benz_Greener_Manufacturing
          - 4,209
          - 1
          - 358
          - 1
          - regression
          - 0.0%
        * - SGEMM_GPU_kernel_performance
          - 241,600
          - 3
          - 6
          - 1
          - regression
          - 0.0%
        * - diamonds
          - 53,940
          - 6
          - 3
          - 1
          - regression
          - 0.0%
        * - house_sales
          - 21,613
          - 15
          - 2
          - 1
          - regression
          - 0.0%
        * - medical_charges
          - 163,065
          - 3
          - 0
          - 1
          - regression
          - 0.0%
        * - particulate-matter-ukair-2017
          - 394,299
          - 4
          - 2
          - 1
          - regression
          - 0.0%
        * - seattlecrime6
          - 52,031
          - 3
          - 1
          - 1
          - regression
          - 0.0%
        * - topo_2_1
          - 8,885
          - 252
          - 3
          - 1
          - regression
          - 0.0%
        * - visualizing_soil
          - 8,641
          - 3
          - 1
          - 1
          - regression
          - 0.0%
        * - cpu_act
          - 8,192
          - 21
          - 0
          - 1
          - regression
          - 0.0%
        * - elevators
          - 16,599
          - 16
          - 0
          - 1
          - regression
          - 0.0%
        * - houses
          - 20,640
          - 8
          - 0
          - 1
          - regression
          - 0.0%
        * - delays_zurich_transport
          - 5,465,575
          - 8
          - 0
          - 1
          - regression
          - 0.0%
        * - nyc-taxi-green-dec-2016
          - 581,835
          - 9
          - 0
          - 1
          - regression
          - 0.0%
        * - sulfur
          - 10,081
          - 6
          - 0
          - 1
          - regression
          - 0.0%
        * - superconduct
          - 21,263
          - 79
          - 0
          - 1
          - regression
          - 0.0%
        * - wine_quality
          - 6,497
          - 11
          - 0
          - 1
          - regression
          - 0.0%
        * - yprop_4_1
          - 8,885
          - 42
          - 0
          - 1
          - regression
          - 0.0%
    """

    # Maps each dataset name to its task category. The category doubles as
    # the sub-directory of the CSV file in the remote repository and encodes
    # both the task ("clf"/"reg") and the feature kind ("cat"/"num").
    name_to_task_category = {
        'albert': 'clf_cat',
        'compas-two-years': 'clf_cat',
        'covertype': 'clf_cat',
        'default-of-credit-card-clients': 'clf_cat',
        'electricity': 'clf_cat',
        'eye_movements': 'clf_cat',
        'road-safety': 'clf_cat',
        'Bioresponse': 'clf_num',
        'Diabetes130US': 'clf_num',
        'Higgs': 'clf_num',
        'MagicTelescope': 'clf_num',
        'MiniBooNE': 'clf_num',
        'bank-marketing': 'clf_num',
        'california': 'clf_num',
        'credit': 'clf_num',
        'heloc': 'clf_num',
        'house_16H': 'clf_num',
        'jannis': 'clf_num',
        'pol': 'clf_num',
        'analcatdata_supreme': 'reg_cat',
        'Airlines_DepDelay_1M': 'reg_cat',
        'Allstate_Claims_Severity': 'reg_cat',
        'Bike_Sharing_Demand': 'reg_cat',
        'Brazilian_houses': 'reg_cat',
        'Mercedes_Benz_Greener_Manufacturing': 'reg_cat',
        'SGEMM_GPU_kernel_performance': 'reg_cat',
        'diamonds': 'reg_cat',
        'house_sales': 'reg_cat',
        'medical_charges': 'reg_cat',
        'particulate-matter-ukair-2017': 'reg_cat',
        'seattlecrime6': 'reg_cat',
        'topo_2_1': 'reg_cat',
        'visualizing_soil': 'reg_cat',
        'elevators': 'reg_num',
        'houses': 'reg_num',
        'cpu_act': 'reg_num',
        'delays_zurich_transport': 'reg_num',
        'nyc-taxi-green-dec-2016': 'reg_num',
        'sulfur': 'reg_num',
        'superconduct': 'reg_num',
        'wine_quality': 'reg_num',
        'yprop_4_1': 'reg_num',
    }

    # Datasets that are downloaded from `base_url_large` instead of
    # `base_url`.
    large_datasets = {
        'covertype',
        'road-safety',
        'Higgs',
        'MiniBooNE',
        'jannis',
        'delays_zurich_transport',
        'particulate-matter-ukair-2017',
        'nyc-taxi-green-dec-2016',
        'SGEMM_GPU_kernel_performance',
        'Airlines_DepDelay_1M',
        'Allstate_Claims_Severity',
        'topo_2_1',
        'superconduct',
    }

    base_url = 'https://huggingface.co/datasets/inria-soda/tabular-benchmark/raw/main/'  # noqa
    # Dedicated URLs for large datasets
    base_url_large = 'https://huggingface.co/datasets/inria-soda/tabular-benchmark/resolve/main/'  # noqa

    # Alphabetically sorted list of all available dataset names.
    name_list = sorted(name_to_task_category)

    def __init__(self, root: str, name: str) -> None:
        self.root = root
        self.name = name
        if name not in self.name_to_task_category:
            raise ValueError(
                f"The given dataset name ('{name}') is not available. It "
                f"needs to be chosen from "
                f"{list(self.name_to_task_category.keys())}.")

        base_url = (self.base_url_large
                    if name in self.large_datasets else self.base_url)
        task_category = self.name_to_task_category[name]
        # Build the URL with explicit forward slashes. `os.path.join` uses
        # the platform's path separator and would therefore produce a
        # backslash-separated (i.e., broken) URL on Windows.
        url = '/'.join([base_url.rstrip('/'), task_category, f'{name}.csv'])
        path = self.download_url(url, root)
        df = pd.read_csv(path)

        # The last column is the target column
        col_to_stype = {}
        target_col = df.columns[-1]
        if "clf" in task_category:
            col_to_stype[target_col] = torch_frame.categorical
        else:
            col_to_stype[target_col] = torch_frame.numerical

        for col in df.columns[:-1]:
            if "num" in task_category:
                # "num" implies all features are numerical.
                col_to_stype[col] = torch_frame.numerical
            elif df[col].dtype == float:
                # Floating-point columns are always treated as numerical.
                col_to_stype[col] = torch_frame.numerical
            else:
                # Heuristics to decide stype: a non-float numeric column
                # counts as numerical only if it takes more than 10 distinct
                # values; everything else is treated as categorical.
                if (is_numeric_dtype(df[col].dtype)
                        and df[col].nunique() > 10):
                    col_to_stype[col] = torch_frame.numerical
                else:
                    col_to_stype[col] = torch_frame.categorical

        super().__init__(df, col_to_stype, target_col=target_col)

    def __repr__(self) -> str:
        """Return ``ClassName(name='<dataset name>')``."""
        return f"{self.__class__.__name__}(name='{self.name}')"