Source code for torch_frame.datasets.amphibians

import os.path as osp
import zipfile

import pandas as pd

import torch_frame


class Amphibians(torch_frame.data.Dataset):
    r"""The `Amphibians
    <https://archive.ics.uci.edu/dataset/528/amphibians>`_ dataset.
    The task is to predict which of the 7 frog types appeared in the
    habitat.

    **STATS:**

    .. list-table::
        :widths: 10 10 10 10 20 10
        :header-rows: 1

        * - #rows
          - #cols (numerical)
          - #cols (categorical)
          - #cols (text_embedded)
          - Task
          - Missing value ratio
        * - 189
          - 3
          - 20
          - 0
          - multilabel classification
          - 0.0%

    Args:
        root (str): Directory passed to :meth:`download_url`. The
            post-processed CSV is also written into this directory.
    """

    url = 'https://archive.ics.uci.edu/static/public/528/amphibians.zip'

    def __init__(self, root: str):
        # Fetch the archive and unpack it next to the downloaded file.
        archive_path = self.download_url(self.url, root)
        extract_dir = osp.dirname(archive_path)
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(extract_dir)

        # Column names are supplied explicitly: 16 descriptive columns
        # followed by the 7 per-label indicator columns.
        feature_cols = [
            'ID', 'MV', 'SR', 'NR', 'TR', 'VR', 'SUR1', 'SUR2', 'SUR3',
            'UR', 'FR', 'OR', 'RR', 'BR', 'MR', 'CR',
        ]
        target_cols = ['t1', 't2', 't3', 't4', 't5', 't6', 't7']
        raw_csv = osp.join(extract_dir, 'dataset.csv')
        df = pd.read_csv(raw_csv, names=feature_cols + target_cols, sep=';')

        # Drop the first 2 rows containing metadata
        df = df.iloc[2:].reset_index(drop=True)

        def _active_labels(row):
            # Indicator cells are compared against the *string* '1'.
            return [label for label in target_cols if row[label] == '1']

        # Collapse the 7 indicator columns into one list-valued column 't'.
        df['t'] = df.apply(_active_labels, axis=1)
        df = df.drop(columns=target_cols)

        # Write the frame out and read it back so that pandas infers the
        # column dtypes automatically.
        processed_csv = osp.join(root, 'amphibians_posprocess.csv')
        df.to_csv(processed_csv, index=False)
        df = pd.read_csv(processed_csv)

        # NOTE(review): this mapping declares 4 numerical and 12 categorical
        # feature columns, while the stats table in the class docstring says
        # 3 / 20 -- confirm which one is intended.
        num = torch_frame.numerical
        cat = torch_frame.categorical
        col_to_stype = {
            'ID': num,
            'MV': cat,
            'SR': num,
            'NR': num,
            'TR': cat,
            'VR': cat,
            'SUR1': cat,
            'SUR2': cat,
            'SUR3': cat,
            'UR': cat,
            'FR': cat,
            'OR': num,
            'RR': cat,  # Support Ordinal Encoding
            'BR': cat,  # Support Ordinal Encoding
            'MR': cat,
            'CR': cat,
            't': torch_frame.multicategorical,
        }

        super().__init__(df, col_to_stype, target_col='t', col_to_sep=None)