# Source code for torch_frame.datasets.bank_marketing

import os.path as osp
import zipfile

import pandas as pd

import torch_frame


class BankMarketing(torch_frame.data.Dataset):
    r"""The `Bank Marketing
    <https://github.com/LeoGrin/tabular-benchmark>`_
    dataset. It is related to direct marketing campaigns of
    a Portuguese banking institution. The marketing campaigns
    were based on phone calls. Often, more than one contact with
    the same client was required in order to assess whether the
    product (bank term deposit) would be subscribed or not.
    The classification goal is to predict if the client will
    subscribe to a term deposit.

    **STATS:**

    .. list-table::
        :widths: 10 10 10 10 20 10
        :header-rows: 1

        * - #rows
          - #cols (numerical)
          - #cols (categorical)
          - #classes
          - Task
          - Missing value ratio
        * - 45,211
          - 7
          - 9
          - 2
          - binary_classification
          - 0.0%
    """

    # Location of the outer archive that ``__init__`` downloads.
    url = 'https://archive.ics.uci.edu/static/public/222/bank+marketing.zip'  # noqa

    def __init__(self, root: str) -> None:
        r"""Download, unpack and load the dataset.

        Args:
            root (str): Location handed to :meth:`download_url` together
                with :attr:`url`. The archives are extracted into the
                directory that contains the path it returns.
        """
        path = self.download_url(self.url, root)
        folder_path = osp.dirname(path)

        # Stage 1: unpack the downloaded archive next to itself. The code
        # below expects this to produce a nested ``bank.zip``.
        with zipfile.ZipFile(path, 'r') as zip_ref:
            zip_ref.extractall(folder_path)

        # Stage 2: unpack the nested archive into its own ``bank/`` folder.
        data_path = osp.join(folder_path, 'bank.zip')
        data_subfolder_path = osp.join(folder_path, 'bank')
        with zipfile.ZipFile(data_path, 'r') as zip_ref:
            zip_ref.extractall(data_subfolder_path)

        # The table is read as semicolon-separated values.
        df = pd.read_csv(osp.join(data_subfolder_path, 'bank-full.csv'),
                         sep=';')

        # Semantic type of every column: 7 numerical features,
        # 9 categorical features and the categorical target ``y``.
        col_to_stype = {
            'age': torch_frame.numerical,
            'job': torch_frame.categorical,
            'marital': torch_frame.categorical,
            'education': torch_frame.categorical,
            'default': torch_frame.categorical,
            'balance': torch_frame.numerical,
            'housing': torch_frame.categorical,
            'loan': torch_frame.categorical,
            'contact': torch_frame.categorical,
            'day': torch_frame.numerical,
            'month': torch_frame.categorical,
            'duration': torch_frame.numerical,
            'campaign': torch_frame.numerical,
            'pdays': torch_frame.numerical,
            'previous': torch_frame.numerical,
            'poutcome': torch_frame.categorical,
            'y': torch_frame.categorical,
        }

        super().__init__(df, col_to_stype, target_col='y')