Source code for sklift.datasets.datasets

import os
import shutil
import hashlib

import pandas as pd
import requests
from sklearn.utils import Bunch
from tqdm.auto import tqdm


def get_data_dir():
    """Return the path of the scikit-uplift data dir.

    This folder is used by some large dataset loaders to avoid downloading the
    data several times.

    By default the data dir is set to a folder named ``scikit-uplift-data`` in the
    user home folder.

    Returns:
        string: The path to scikit-uplift data dir.
    """
    return os.path.join(os.path.expanduser("~"), "scikit-uplift-data")
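

# Illustrative sketch, not part of the library: the loaders below cache every
# downloaded archive under this directory, so it can be inspected (or cleared
# with clear_data_dir) between runs. `_demo_cache_info` is a hypothetical
# helper added here only as an example.
def _demo_cache_info():
    data_dir = get_data_dir()
    if os.path.isdir(data_dir):
        # sum the sizes of all cached files under the data dir
        n_bytes = sum(
            os.path.getsize(os.path.join(root, name))
            for root, _, names in os.walk(data_dir)
            for name in names
        )
        print(f"{data_dir}: {n_bytes / 2 ** 20:.1f} MiB cached")
    else:
        print(f"{data_dir} does not exist yet")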


def _create_data_dir(path):
    """Creates a directory, which stores the datasets.

    Args:
        path (str): The path to scikit-uplift data dir.
    """
    if not os.path.isdir(path):
        os.makedirs(path)


def _download(url, dest_path, content_length_header_key='Content-Length'):
    """Download the file from url and save it locally.

    Args:
        url (str): URL address, must be a string.
        dest_path (str): Destination of the file.
        content_length_header_key (str): The key in the HTTP response headers
            that lists the response size in bytes. Used for the progress bar.
    """
    if isinstance(url, str):
        req = requests.get(url, stream=True)
        req.raise_for_status()
        with open(dest_path, "wb") as fd:
            total_size_in_bytes = int(req.headers.get(content_length_header_key, 0))
            progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
            for chunk in req.iter_content(chunk_size=2 ** 20):
                progress_bar.update(len(chunk))
                fd.write(chunk)
            progress_bar.close()
    else:
        raise TypeError("URL must be a string")


def _get_data(data_home, url, dest_subdir, dest_filename, download_if_missing,
              content_length_header_key='Content-Length'):
    """Return the path to the dataset.

    Args:
        data_home (str): The path to scikit-uplift data dir.
        url (str): The URL to the dataset.
        dest_subdir (str): The name of the folder in which the dataset is stored.
        dest_filename (str): The name of the dataset.
        download_if_missing (bool): If False, raise an IOError if the data is not
            locally available instead of trying to download the data from the source site.
        content_length_header_key (str): The key in the HTTP response headers
            that lists the response size in bytes. Used for the progress bar.

    Returns:
        string: The path to the dataset.
    """
    if data_home is None:
        if dest_subdir is None:
            data_dir = get_data_dir()
        else:
            data_dir = os.path.join(get_data_dir(), dest_subdir)
    else:
        if dest_subdir is None:
            data_dir = os.path.abspath(data_home)
        else:
            data_dir = os.path.join(os.path.abspath(data_home), dest_subdir)
    _create_data_dir(data_dir)

    dest_path = os.path.join(data_dir, dest_filename)

    if not os.path.isfile(dest_path):
        if download_if_missing:
            _download(url, dest_path, content_length_header_key)
        else:
            raise IOError("Dataset missing")
    return dest_path


def _get_file_hash(csv_path):
    """Compute the MD5 hash of a file, used to verify dataset integrity."""
    with open(csv_path, 'rb') as file_to_check:
        data = file_to_check.read()
    return hashlib.md5(data).hexdigest()
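

# Illustrative sketch, not part of the library: how the private helpers above
# compose. The URL and MD5 value are hypothetical placeholders; every public
# fetch_* function below follows exactly this download-then-verify pattern.
def _demo_fetch_and_verify():
    url = 'https://example.com/some_dataset.csv.gz'  # hypothetical URL
    expected_md5 = '0123456789abcdef0123456789abcdef'  # hypothetical hash
    csv_path = _get_data(data_home=None, url=url, dest_subdir='demo',
                         dest_filename=url.split('/')[-1],
                         download_if_missing=True)
    # reject the cached file if its contents do not match the published hash
    if _get_file_hash(csv_path) != expected_md5:
        raise ValueError(f"The {csv_path} file is broken")
    return pd.read_csv(csv_path)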


def clear_data_dir(path=None):
    """Delete all the content of the data home cache.

    Args:
        path (str): The path to scikit-uplift data dir.
    """
    if path is None:
        path = get_data_dir()
    if os.path.isdir(path):
        shutil.rmtree(path, ignore_errors=True)


def fetch_lenta(data_home=None, dest_subdir=None, download_if_missing=True, return_X_y_t=False):
    """Load and return the Lenta dataset (classification).

    An uplift modeling dataset containing data about Lenta's customers' grocery shopping
    and related marketing campaigns.

    Major columns:

    - ``group`` (str): treatment/control group flag
    - ``response_att`` (binary): target
    - ``gender`` (str): customer gender
    - ``age`` (float): customer age
    - ``main_format`` (int): store type (1 - grocery store, 0 - superstore)

    Read more in the :ref:`docs <Lenta>`.

    Args:
        data_home (str): The path to the folder where datasets are stored.
        dest_subdir (str): The name of the folder in which the dataset is stored.
        download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing.
        return_X_y_t (bool): If True, returns ``(data, target, treatment)`` instead of a Bunch object.

    Returns:
        Bunch or tuple: dataset.

        Bunch:
            By default dictionary-like object, with the following attributes:

                * ``data`` (DataFrame object): Dataset without target and treatment.
                * ``target`` (Series object): Column target by values.
                * ``treatment`` (Series object): Column treatment by values.
                * ``DESCR`` (str): Description of the Lenta dataset.
                * ``feature_names`` (list): Names of the features.
                * ``target_name`` (str): Name of the target.
                * ``treatment_name`` (str): Name of the treatment.

        Tuple:
            tuple (data, target, treatment) if `return_X_y_t` is True

    Example::

        from sklift.datasets import fetch_lenta

        dataset = fetch_lenta()
        data, target, treatment = dataset.data, dataset.target, dataset.treatment

        # alternative option
        data, target, treatment = fetch_lenta(return_X_y_t=True)

    See Also:

        :func:`.fetch_x5`: Load and return the X5 RetailHero dataset (classification).

        :func:`.fetch_criteo`: Load and return the Criteo Uplift Prediction Dataset (classification).

        :func:`.fetch_hillstrom`: Load and return Kevin Hillstrom Dataset MineThatData (classification or regression).

        :func:`.fetch_megafon`: Load and return the MegaFon Uplift Competition dataset (classification).
    """
    lenta_metadata = {
        'url': 'https://sklift.s3.eu-west-2.amazonaws.com/lenta_dataset.csv.gz',
        'hash': '6ab28ff0989ed8b8647f530e2e86452f'
    }

    filename = lenta_metadata['url'].split('/')[-1]
    csv_path = _get_data(data_home=data_home, url=lenta_metadata['url'],
                         dest_subdir=dest_subdir, dest_filename=filename,
                         download_if_missing=download_if_missing)

    if _get_file_hash(csv_path) != lenta_metadata['hash']:
        raise ValueError(f"The {filename} file is broken, please clean the directory "
                         "with the clear_data_dir function, and run the function again")

    target_col = 'response_att'
    treatment_col = 'group'

    data = pd.read_csv(csv_path)
    treatment, target = data[treatment_col], data[target_col]
    data = data.drop([target_col, treatment_col], axis=1)

    feature_names = list(data.columns)

    if return_X_y_t:
        return data, target, treatment

    module_path = os.path.dirname(__file__)
    with open(os.path.join(module_path, 'descr', 'lenta.rst')) as rst_file:
        fdescr = rst_file.read()

    return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr,
                 feature_names=feature_names, target_name=target_col,
                 treatment_name=treatment_col)


def fetch_x5(data_home=None, dest_subdir=None, download_if_missing=True):
    """Load and return the X5 RetailHero dataset (classification).

    The dataset contains raw retail customer purchases, raw information about products and
    general info about customers.

    Major columns:

    - ``treatment_flg`` (binary): treatment/control group flag
    - ``target`` (binary): target
    - ``customer_id`` (str): customer id - primary key for joining

    Read more in the :ref:`docs <X5>`.

    Args:
        data_home (str, unicode): The path to the folder where datasets are stored.
        dest_subdir (str, unicode): The name of the folder in which the dataset is stored.
        download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing.

    Returns:
        Bunch: dataset.

        Dictionary-like object, with the following attributes.

            * ``data`` (Bunch object): dictionary-like object without target and treatment:

                * ``clients`` (ndarray or DataFrame object): General info about clients.
                * ``train`` (ndarray or DataFrame object): A subset of clients for training.
                * ``purchases`` (ndarray or DataFrame object): Clients' purchase history prior to communication.

            * ``target`` (Series object): Column target by values.
            * ``treatment`` (Series object): Column treatment by values.
            * ``DESCR`` (str): Description of the X5 dataset.
            * ``feature_names`` (Bunch object): Names of the features.
            * ``target_name`` (str): Name of the target.
            * ``treatment_name`` (str): Name of the treatment.

    References:
        https://ods.ai/competitions/x5-retailhero-uplift-modeling/data

    Example::

        from sklift.datasets import fetch_x5

        dataset = fetch_x5()
        data, target, treatment = dataset.data, dataset.target, dataset.treatment

        # data - dictionary-like object
        # data contains general info about clients:
        clients = data.clients

        # data contains a subset of clients for training:
        train = data.train

        # data contains clients' purchase history prior to communication:
        purchases = data.purchases

    See Also:

        :func:`.fetch_lenta`: Load and return the Lenta dataset (classification).

        :func:`.fetch_criteo`: Load and return the Criteo Uplift Prediction Dataset (classification).

        :func:`.fetch_hillstrom`: Load and return Kevin Hillstrom Dataset MineThatData (classification or regression).

        :func:`.fetch_megafon`: Load and return the MegaFon Uplift Competition dataset (classification).
    """
    x5_metadata = {
        'url_train': 'https://sklift.s3.eu-west-2.amazonaws.com/uplift_train.csv.gz',
        'url_clients': 'https://sklift.s3.eu-west-2.amazonaws.com/clients.csv.gz',
        'url_purchases': 'https://sklift.s3.eu-west-2.amazonaws.com/purchases.csv.gz',
        'uplift_hash': '2720bbb659daa9e0989b2777b6a42d19',
        'clients_hash': 'b9cdeb2806b732771de03e819b3354c5',
        'purchases_hash': '48d2de13428e24e8b61d66fef02957a8'
    }

    file_train = x5_metadata['url_train'].split('/')[-1]
    csv_train_path = _get_data(data_home=data_home, url=x5_metadata['url_train'],
                               dest_subdir=dest_subdir, dest_filename=file_train,
                               download_if_missing=download_if_missing)

    if _get_file_hash(csv_train_path) != x5_metadata['uplift_hash']:
        raise ValueError(f"The {file_train} file is broken, please clean the directory "
                         "with the clear_data_dir function, and run the function again")

    train = pd.read_csv(csv_train_path)
    train_features = list(train.columns)

    target_col = 'target'
    treatment_col = 'treatment_flg'

    treatment, target = train[treatment_col], train[target_col]
    train = train.drop([target_col, treatment_col], axis=1)

    file_clients = x5_metadata['url_clients'].split('/')[-1]
    csv_clients_path = _get_data(data_home=data_home, url=x5_metadata['url_clients'],
                                 dest_subdir=dest_subdir, dest_filename=file_clients,
                                 download_if_missing=download_if_missing)

    if _get_file_hash(csv_clients_path) != x5_metadata['clients_hash']:
        raise ValueError(f"The {file_clients} file is broken, please clean the directory "
                         "with the clear_data_dir function, and run the function again")

    clients = pd.read_csv(csv_clients_path)
    clients_features = list(clients.columns)

    file_purchases = x5_metadata['url_purchases'].split('/')[-1]
    csv_purchases_path = _get_data(data_home=data_home, url=x5_metadata['url_purchases'],
                                   dest_subdir=dest_subdir, dest_filename=file_purchases,
                                   download_if_missing=download_if_missing)

    if _get_file_hash(csv_purchases_path) != x5_metadata['purchases_hash']:
        raise ValueError(f"The {file_purchases} file is broken, please clean the directory "
                         "with the clear_data_dir function, and run the function again")

    purchases = pd.read_csv(csv_purchases_path)
    purchases_features = list(purchases.columns)

    data = Bunch(clients=clients, train=train, purchases=purchases)
    feature_names = Bunch(train_features=train_features, clients_features=clients_features,
                          purchases_features=purchases_features)

    module_path = os.path.dirname(__file__)
    with open(os.path.join(module_path, 'descr', 'x5.rst')) as rst_file:
        fdescr = rst_file.read()

    return Bunch(data=data,
                 target=target,
                 treatment=treatment,
                 DESCR=fdescr,
                 feature_names=feature_names,
                 target_name='target',
                 treatment_name='treatment_flg')


def fetch_criteo(target_col='visit', treatment_col='treatment', data_home=None, dest_subdir=None,
                 download_if_missing=True, percent10=False, return_X_y_t=False):
    """Load and return the Criteo Uplift Prediction Dataset (classification).

    This dataset is constructed by assembling data resulting from several incrementality tests,
    a particular randomized trial procedure where a random part of the population is prevented
    from being targeted by advertising.

    Major columns:

    * ``treatment`` (binary): treatment
    * ``exposure`` (binary): treatment
    * ``visit`` (binary): target
    * ``conversion`` (binary): target
    * ``f0, ... , f11`` (float): feature values

    Read more in the :ref:`docs <Criteo>`.

    Args:
        target_col (string, 'visit', 'conversion' or 'all', default='visit'): Selects which column
            from dataset will be target. If 'all', return a DataFrame with all target cols.
        treatment_col (string, 'treatment', 'exposure' or 'all', default='treatment'): Selects which
            column from dataset will be treatment. If 'all', return a DataFrame with all treatment cols.
        data_home (string): Specify a download and cache folder for the datasets.
        dest_subdir (string): The name of the folder in which the dataset is stored.
        download_if_missing (bool, default=True): If False, raise an IOError if the data is not
            locally available instead of trying to download the data from the source site.
        percent10 (bool, default=False): Whether to load only 10 percent of the data.
        return_X_y_t (bool, default=False): If True, returns (data, target, treatment) instead of a Bunch object.

    Returns:
        Bunch or tuple: dataset.

        Bunch:
            By default dictionary-like object, with the following attributes:

                * ``data`` (DataFrame object): Dataset without target and treatment.
                * ``target`` (Series or DataFrame object): Column target by values.
                * ``treatment`` (Series or DataFrame object): Column treatment by values.
                * ``DESCR`` (str): Description of the Criteo dataset.
                * ``feature_names`` (list): Names of the features.
                * ``target_name`` (str or list): Name of the target.
                * ``treatment_name`` (str or list): Name of the treatment.

        Tuple:
            tuple (data, target, treatment) if `return_X_y_t` is True

    Example::

        from sklift.datasets import fetch_criteo

        dataset = fetch_criteo(target_col='conversion', treatment_col='exposure')
        data, target, treatment = dataset.data, dataset.target, dataset.treatment

        # alternative option
        data, target, treatment = fetch_criteo(target_col='conversion', treatment_col='exposure',
                                               return_X_y_t=True)

    References:
        :cite:t:`Diemert2018`

        .. bibliography::

    See Also:

        :func:`.fetch_lenta`: Load and return the Lenta dataset (classification).

        :func:`.fetch_x5`: Load and return the X5 RetailHero dataset (classification).

        :func:`.fetch_hillstrom`: Load and return Kevin Hillstrom Dataset MineThatData (classification or regression).

        :func:`.fetch_megafon`: Load and return the MegaFon Uplift Competition dataset (classification).
    """
    treatment_cols = ['exposure', 'treatment']
    if treatment_col == 'all':
        treatment_col = treatment_cols
    elif treatment_col not in treatment_cols:
        raise ValueError(f"The treatment_col must be an element of {treatment_cols + ['all']}. "
                         f"Got value treatment_col={treatment_col}.")

    target_cols = ['visit', 'conversion']
    if target_col == 'all':
        target_col = target_cols
    elif target_col not in target_cols:
        raise ValueError(f"The target_col must be an element of {target_cols + ['all']}. "
                         f"Got value target_col={target_col}.")

    if percent10:
        criteo_metadata = {
            'url': 'https://criteo-bucket.s3.eu-central-1.amazonaws.com/criteo10.csv.gz',
            'criteo_hash': 'fe159bcee2cea57548e48eb2d7d5d00c'
        }
    else:
        criteo_metadata = {
            'url': 'https://criteo-bucket.s3.eu-central-1.amazonaws.com/criteo.csv.gz',
            'criteo_hash': 'd2236769ef69e9be52556110102911ec'
        }

    filename = criteo_metadata['url'].split('/')[-1]
    csv_path = _get_data(data_home=data_home, url=criteo_metadata['url'],
                         dest_subdir=dest_subdir, dest_filename=filename,
                         download_if_missing=download_if_missing)

    if _get_file_hash(csv_path) != criteo_metadata['criteo_hash']:
        raise ValueError(f"The {filename} file is broken, please clean the directory "
                         "with the clear_data_dir function, and run the function again")

    dtypes = {
        'exposure': 'Int8',
        'treatment': 'Int8',
        'conversion': 'Int8',
        'visit': 'Int8'
    }
    data = pd.read_csv(csv_path, dtype=dtypes)
    treatment, target = data[treatment_col], data[target_col]

    data = data.drop(target_cols + treatment_cols, axis=1)

    if return_X_y_t:
        return data, target, treatment

    feature_names = list(data.columns)

    module_path = os.path.dirname(__file__)
    with open(os.path.join(module_path, 'descr', 'criteo.rst')) as rst_file:
        fdescr = rst_file.read()

    return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr,
                 feature_names=feature_names, target_name=target_col,
                 treatment_name=treatment_col)


def fetch_hillstrom(target_col='visit', data_home=None, dest_subdir=None, download_if_missing=True,
                    return_X_y_t=False):
    """Load and return Kevin Hillstrom Dataset MineThatData (classification or regression).

    This dataset contains 64,000 customers who last purchased within twelve months.
    The customers were involved in an e-mail test.

    Major columns:

    * ``visit`` (binary): target. 1/0 indicator, 1 = Customer visited website in the following two weeks.
    * ``conversion`` (binary): target. 1/0 indicator, 1 = Customer purchased merchandise in the following two weeks.
    * ``spend`` (float): target. Actual dollars spent in the following two weeks.
    * ``segment`` (str): treatment. The e-mail campaign the customer received.

    Read more in the :ref:`docs <Hillstrom>`.

    Args:
        target_col (string, 'visit', 'conversion', 'spend' or 'all', default='visit'): Selects which
            column from dataset will be target.
        data_home (str): The path to the folder where datasets are stored.
        dest_subdir (str): The name of the folder in which the dataset is stored.
        download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing.
        return_X_y_t (bool, default=False): If True, returns (data, target, treatment) instead of a Bunch object.

    Returns:
        Bunch or tuple: dataset.

        Bunch:
            By default dictionary-like object, with the following attributes:

                * ``data`` (DataFrame object): Dataset without target and treatment.
                * ``target`` (Series or DataFrame object): Column target by values.
                * ``treatment`` (Series object): Column treatment by values.
                * ``DESCR`` (str): Description of the Hillstrom dataset.
                * ``feature_names`` (list): Names of the features.
                * ``target_name`` (str or list): Name of the target.
                * ``treatment_name`` (str): Name of the treatment.

        Tuple:
            tuple (data, target, treatment) if `return_X_y_t` is True

    References:
        https://blog.minethatdata.com/2008/03/minethatdata-e-mail-analytics-and-data.html

    Example::

        from sklift.datasets import fetch_hillstrom

        dataset = fetch_hillstrom(target_col='visit')
        data, target, treatment = dataset.data, dataset.target, dataset.treatment

        # alternative option
        data, target, treatment = fetch_hillstrom(target_col='visit', return_X_y_t=True)

    See Also:

        :func:`.fetch_lenta`: Load and return the Lenta dataset (classification).

        :func:`.fetch_x5`: Load and return the X5 RetailHero dataset (classification).

        :func:`.fetch_criteo`: Load and return the Criteo Uplift Prediction Dataset (classification).

        :func:`.fetch_megafon`: Load and return the MegaFon Uplift Competition dataset (classification).
    """
    target_cols = ['visit', 'conversion', 'spend']
    if target_col == 'all':
        target_col = target_cols
    elif target_col not in target_cols:
        raise ValueError(f"The target_col must be an element of {target_cols + ['all']}. "
                         f"Got value target_col={target_col}.")

    hillstrom_metadata = {
        'url': 'https://hillstorm1.s3.us-east-2.amazonaws.com/hillstorm_no_indices.csv.gz',
        'hillstrom_hash': 'a68a81291f53a14f4e29002629803ba3'
    }

    filename = hillstrom_metadata['url'].split('/')[-1]
    csv_path = _get_data(data_home=data_home, url=hillstrom_metadata['url'],
                         dest_subdir=dest_subdir, dest_filename=filename,
                         download_if_missing=download_if_missing)

    if _get_file_hash(csv_path) != hillstrom_metadata['hillstrom_hash']:
        raise ValueError(f"The {filename} file is broken, please clean the directory "
                         "with the clear_data_dir function, and run the function again")

    treatment_col = 'segment'

    data = pd.read_csv(csv_path)
    treatment, target = data[treatment_col], data[target_col]

    data = data.drop(target_cols + [treatment_col], axis=1)

    if return_X_y_t:
        return data, target, treatment

    feature_names = list(data.columns)

    module_path = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(module_path, 'descr', 'hillstrom.rst')) as rst_file:
        fdescr = rst_file.read()

    return Bunch(data=data, target=target, treatment=treatment, DESCR=fdescr,
                 feature_names=feature_names, target_name=target_col,
                 treatment_name=treatment_col)


def fetch_megafon(data_home=None, dest_subdir=None, download_if_missing=True, return_X_y_t=False):
    """Load and return the MegaFon Uplift Competition dataset (classification).

    An uplift modeling dataset of synthetic data generated by the MegaFon telecom company,
    constructed to resemble a real case it encountered.

    Major columns:

    - ``X_1...X_50``: anonymized feature set
    - ``conversion`` (binary): target
    - ``treatment_group`` (str): treatment/control group flag

    Read more in the :ref:`docs <MegaFon>`.

    Args:
        data_home (str): The path to the folder where datasets are stored.
        dest_subdir (str): The name of the folder in which the dataset is stored.
        download_if_missing (bool): Download the data if not present. Raises an IOError if False and data is missing.
        return_X_y_t (bool): If True, returns (data, target, treatment) instead of a Bunch object.

    Returns:
        Bunch or tuple: dataset.

        Bunch:
            By default dictionary-like object, with the following attributes:

                * ``data`` (DataFrame object): Dataset without target and treatment.
                * ``target`` (Series object): Column target by values.
                * ``treatment`` (Series object): Column treatment by values.
                * ``DESCR`` (str): Description of the MegaFon dataset.
                * ``feature_names`` (list): Names of the features.
                * ``target_name`` (str): Name of the target.
                * ``treatment_name`` (str): Name of the treatment.

        Tuple:
            tuple (data, target, treatment) if `return_X_y_t` is True

    Example::

        from sklift.datasets import fetch_megafon

        dataset = fetch_megafon()
        data, target, treatment = dataset.data, dataset.target, dataset.treatment

        # alternative option
        data, target, treatment = fetch_megafon(return_X_y_t=True)

    See Also:

        :func:`.fetch_lenta`: Load and return the Lenta dataset (classification).

        :func:`.fetch_x5`: Load and return the X5 RetailHero dataset (classification).

        :func:`.fetch_criteo`: Load and return the Criteo Uplift Prediction Dataset (classification).

        :func:`.fetch_hillstrom`: Load and return Kevin Hillstrom Dataset MineThatData (classification or regression).
    """
    megafon_metadata = {
        'url': 'https://sklift.s3.eu-west-2.amazonaws.com/megafon_dataset.csv.gz',
        'megafon_hash': 'ee8d45a343d4d2cf90bb756c93959ecd'
    }

    filename = megafon_metadata['url'].split('/')[-1]
    csv_path = _get_data(data_home=data_home, url=megafon_metadata['url'],
                         dest_subdir=dest_subdir, dest_filename=filename,
                         download_if_missing=download_if_missing)

    if _get_file_hash(csv_path) != megafon_metadata['megafon_hash']:
        raise ValueError(f"The {filename} file is broken, please clean the directory "
                         "with the clear_data_dir function, and run the function again")

    train = pd.read_csv(csv_path)

    target_col = 'conversion'
    treatment_col = 'treatment_group'

    treatment, target = train[treatment_col], train[target_col]
    train = train.drop([target_col, treatment_col], axis=1)

    if return_X_y_t:
        return train, target, treatment

    feature_names = list(train.columns)

    module_path = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(module_path, 'descr', 'megafon.rst')) as rst_file:
        fdescr = rst_file.read()

    return Bunch(data=train, target=target, treatment=treatment, DESCR=fdescr,
                 feature_names=feature_names, target_name=target_col,
                 treatment_name=treatment_col)
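

# Illustrative sketch, not part of the library: a typical next step after any
# fetch_* call is a train/test split that keeps the treatment column aligned
# with the features and target. Plain scikit-learn is assumed here.
def _demo_uplift_split():
    from sklearn.model_selection import train_test_split

    data, target, treatment = fetch_megafon(return_X_y_t=True)
    # stratify on treatment so both splits keep the same treated/control ratio
    X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(
        data, target, treatment, test_size=0.3, random_state=42,
        stratify=treatment)
    return X_train.shape, X_test.shape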