Source code for torch_frame.datasets.kdd_census_income

from __future__ import annotations

import os
import os.path as osp
import tarfile
import zipfile

import pandas as pd

import torch_frame
from torch_frame import stype


[docs]class KDDCensusIncome(torch_frame.data.Dataset):
    r"""The `KDD Census Income
    <https://archive.ics.uci.edu/dataset/117/census+income+kdd>`_
    dataset. It's a task of forest cover type classification
    based on attributes such as elevation, slop and soil type etc.

    **STATS:**

    .. list-table::
        :widths: 10 10 10 10 20 10
        :header-rows: 1

        * - #rows
          - #cols (numerical)
          - #cols (categorical)
          - #classes
          - Task
          - Missing value ratio
        * - 199,523
          - 7
          - 34
          - 2
          - binary_classification
          - 0.0%
    """

    url = 'https://archive.ics.uci.edu/static/public/117/census+income+kdd.zip'

    def __init__(self, root: str):
        data_dir = osp.join(root, 'census')
        filename = osp.join(data_dir, 'census-income.data')
        if not osp.exists(filename):
            path = self.download_url(self.url, root)
            tar_gz_path = osp.join(root, 'census.tar.gz')
            with zipfile.ZipFile(path, 'r') as zip_ref:
                zip_ref.extractall(root)
            with tarfile.open(tar_gz_path, 'r:gz') as tar_ref:
                tar_ref.extractall(data_dir)
            os.remove(tar_gz_path)
            os.remove(path)

        names = [
            'age',
            'class of worker',
            'industry code',
            'occupation code',
            'education',
            'wage per hour',
            'enrolled in edu inst last wk',
            'marital status',
            'major industry code',
            'major occupation code',
            'race',
            'hispanic Origin',
            'sex',
            'member of a labor union',
            'reason for unemployment',
            'full or part time employment stat',
            'capital gains',
            'capital losses',
            'divdends from stocks',
            'tax filer status',
            'region of previous residence',
            'state of previous residence',
            'detailed household and family stat',
            'detailed household summary in household',
            'migration code-change in msa',
            'migration code-change in reg',
            'migration code-move within reg',
            'live in this house 1 year ago',
            'migration prev res in sunbelt',
            'family members under 18',
            'num persons worked for employer',
            'country of birth father',
            'country of birth mother',
            'country of birth self',
            'citizenship',
            'total person income',
            'own business or self employed',
            "fill inc questionnaire for veteran's admin",
            'veterans benefits',
            'weeks worked in year',
            'year',
            'income above 50000',
        ]

        continuous_cols = {
            'age',
            'wage per hour',
            'capital gains',
            'capital losses',
            'divdends from stocks',
            'num persons worked for employer',
            'weeks worked in year',
        }

        col_to_stype: dict[str, stype] = {}
        for name in names:
            if name in continuous_cols:
                col_to_stype[name] = torch_frame.numerical
            else:
                col_to_stype[name] = torch_frame.categorical

        df = pd.read_csv(filename, names=names)

        super().__init__(df, col_to_stype, target_col='income above 50000')