Source code for torch_frame.datasets.adult_census_income

import pandas as pd

import torch_frame


class AdultCensusIncome(torch_frame.data.Dataset):
    r"""The `Adult Census Income
    <https://www.kaggle.com/datasets/uciml/adult-census-income>`_
    dataset from Kaggle. It is extracted from the census bureau database,
    and the task is to predict whether a person's income exceeds $50K/year.

    The raw table is fetched from :attr:`url` and read with
    :func:`pandas.read_csv` using an explicit list of column names. The
    ``income`` column is used as the target.

    Args:
        root (str): Second argument to ``download_url``; presumably the
            directory the raw file is saved to.

    **STATS:**

    .. list-table::
        :widths: 10 10 10 10 20 10
        :header-rows: 1

        * - #rows
          - #cols (numerical)
          - #cols (categorical)
          - #classes
          - Task
          - Missing value ratio
        * - 32,561
          - 4
          - 8
          - 2
          - binary_classification
          - 0.0%
    """

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'  # noqa

    def __init__(self, root: str):
        csv_path = self.download_url(self.url, root)

        # Column names are supplied explicitly via `names=` when reading.
        column_names = [
            'age',
            'workclass',
            'fnlwgt',
            'education',
            'education.num',
            'marital.status',
            'occupation',
            'relationship',
            'race',
            'sex',
            'capital.gain',
            'capital.loss',
            'hours.per.week',
            'native.country',
            'income',
        ]
        # NOTE(review): the UCI file appears to separate fields with ', ',
        # so string values may keep a leading space and unknowns may show up
        # as '?' -- confirm before relying on exact category strings.
        frame = pd.read_csv(csv_path, names=column_names)

        # These two columns are read but given no semantic type; presumably
        # the base Dataset then ignores them -- TODO confirm.
        untyped_cols = {'fnlwgt', 'education.num'}
        numerical_cols = {
            'age',
            'capital.gain',
            'capital.loss',
            'hours.per.week',
        }

        # Every remaining column (including the 'income' target) is
        # categorical. Iterating `column_names` keeps the mapping in file
        # column order.
        col_to_stype = {
            col: (torch_frame.numerical
                  if col in numerical_cols else torch_frame.categorical)
            for col in column_names
            if col not in untyped_cols
        }

        super().__init__(frame, col_to_stype, target_col='income')