Source code for sklift.models.models

import warnings

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_consistent_length

from ..utils import check_is_binary


class SoloModel(BaseEstimator):
    """aka Treatment Dummy approach, or Single model approach, or S-Learner.

    Fit a single model on the whole dataset with 'treatment' as an additional feature.

    Each object from the test sample is scored twice: with the communication flag equal to 1
    and equal to 0. Subtracting the probabilities for each observation, we get the uplift.

    Return delta of predictions for each example.

    Read more in the :ref:`User Guide <SoloModel>`.

    Args:
        estimator (estimator object implementing 'fit'): The object to use to fit the data.
        method (string, 'dummy' or 'treatment_interaction', default='dummy'): Specifies the approach:

            * ``'dummy'``: Single model;
            * ``'treatment_interaction'``: Single model including treatment interactions.

    Attributes:
        trmnt_preds_ (array-like, shape (n_samples, )): Estimator predictions on samples when treatment.
        ctrl_preds_ (array-like, shape (n_samples, )): Estimator predictions on samples when control.

    Example::

        # import the approach
        from sklift.models import SoloModel
        # import any estimator adhering to scikit-learn conventions
        from catboost import CatBoostClassifier


        sm = SoloModel(CatBoostClassifier(verbose=100, random_state=777))  # define the approach
        sm = sm.fit(X_train, y_train, treat_train, estimator_fit_params={'plot': True})  # fit the model
        uplift_sm = sm.predict(X_val)  # predict uplift

    References:
        Lo, Victor. (2002). The True Lift Model - A Novel Data Mining Approach to Response Modeling
        in Database Marketing. SIGKDD Explorations. 4. 78-86.

    See Also:

        **Other approaches:**

        * :class:`.ClassTransformation`: Class Variable Transformation approach.
        * :class:`.TwoModels`: Double classifier approach.

        **Other:**

        * :func:`.plot_uplift_preds`: Plot histograms of treatment, control and uplift predictions.
    """

    def __init__(self, estimator, method='dummy'):
        self.estimator = estimator
        self.method = method
        self.trmnt_preds_ = None
        self.ctrl_preds_ = None
        self._type_of_target = None

        all_methods = ['dummy', 'treatment_interaction']
        if method not in all_methods:
            raise ValueError("SoloModel approach supports only methods in %s, got %s." % (all_methods, method))
    def fit(self, X, y, treatment, estimator_fit_params=None):
        """Fit the model according to the given training data.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of samples
                and n_features is the number of features.
            y (array-like, shape (n_samples,)): Binary target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X.
            estimator_fit_params (dict, optional): Parameters to pass to the fit method of the estimator.

        Returns:
            object: self
        """

        check_consistent_length(X, y, treatment)
        check_is_binary(treatment)
        treatment_values = np.unique(treatment)

        if len(treatment_values) != 2:
            raise ValueError("Expected only two unique values in treatment vector, got %s" % len(treatment_values))

        if self.method == 'dummy':
            if isinstance(X, np.ndarray):
                X_mod = np.column_stack((X, treatment))
            elif isinstance(X, pd.DataFrame):
                X_mod = X.assign(treatment=treatment)
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame in training vector X, got %s" % type(X))

        if self.method == 'treatment_interaction':
            if isinstance(X, np.ndarray):
                X_mod = np.column_stack((X, np.multiply(X, np.array(treatment).reshape(-1, 1)), treatment))
            elif isinstance(X, pd.DataFrame):
                X_mod = pd.concat([
                    X,
                    X.apply(lambda x: x * treatment)
                     .rename(columns=lambda x: str(x) + '_treatment_interaction')
                ], axis=1).assign(treatment=treatment)
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame in training vector X, got %s" % type(X))

        self._type_of_target = type_of_target(y)

        if estimator_fit_params is None:
            estimator_fit_params = {}
        self.estimator.fit(X_mod, y, **estimator_fit_params)
        return self
    def predict(self, X):
        """Perform uplift on samples in X.

        Args:
            X (array-like, shape (n_samples, n_features)): Test vector, where n_samples is the number of samples
                and n_features is the number of features.

        Returns:
            array (shape (n_samples,)): uplift
        """

        if self.method == 'dummy':
            if isinstance(X, np.ndarray):
                X_mod_trmnt = np.column_stack((X, np.ones(X.shape[0])))
                X_mod_ctrl = np.column_stack((X, np.zeros(X.shape[0])))
            elif isinstance(X, pd.DataFrame):
                X_mod_trmnt = X.assign(treatment=np.ones(X.shape[0]))
                X_mod_ctrl = X.assign(treatment=np.zeros(X.shape[0]))
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame in test vector X, got %s" % type(X))

        if self.method == 'treatment_interaction':
            if isinstance(X, np.ndarray):
                X_mod_trmnt = np.column_stack((X, np.multiply(X, np.ones((X.shape[0], 1))), np.ones(X.shape[0])))
                X_mod_ctrl = np.column_stack((X, np.multiply(X, np.zeros((X.shape[0], 1))), np.zeros(X.shape[0])))
            elif isinstance(X, pd.DataFrame):
                X_mod_trmnt = pd.concat([
                    X,
                    X.apply(lambda x: x * np.ones(X.shape[0]))
                     .rename(columns=lambda x: str(x) + '_treatment_interaction')
                ], axis=1).assign(treatment=np.ones(X.shape[0]))
                X_mod_ctrl = pd.concat([
                    X,
                    X.apply(lambda x: x * np.zeros(X.shape[0]))
                     .rename(columns=lambda x: str(x) + '_treatment_interaction')
                ], axis=1).assign(treatment=np.zeros(X.shape[0]))
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame in test vector X, got %s" % type(X))

        if self._type_of_target == 'binary':
            self.trmnt_preds_ = self.estimator.predict_proba(X_mod_trmnt)[:, 1]
            self.ctrl_preds_ = self.estimator.predict_proba(X_mod_ctrl)[:, 1]
        else:
            self.trmnt_preds_ = self.estimator.predict(X_mod_trmnt)
            self.ctrl_preds_ = self.estimator.predict(X_mod_ctrl)

        uplift = self.trmnt_preds_ - self.ctrl_preds_

        return uplift
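
# --- Illustration (editor's sketch, not part of sklift) ---------------------------------------
# A minimal end-to-end run of SoloModel with method='dummy' on synthetic data, using
# scikit-learn's LogisticRegression as the base estimator. It shows that the uplift returned
# by predict() is exactly the difference between the two scores stored in trmnt_preds_ and
# ctrl_preds_. The data and estimator choice are assumptions made purely for illustration.
if __name__ == '__main__':
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(1000, 3))
    treat_demo = rng.integers(0, 2, size=1000)
    # outcome depends on the first feature and is boosted by treatment
    y_demo = (X_demo[:, 0] + 1.5 * treat_demo + rng.normal(size=1000) > 0).astype(int)

    sm_demo = SoloModel(LogisticRegression(), method='dummy')
    sm_demo.fit(X_demo, y_demo, treat_demo)

    uplift_demo = sm_demo.predict(X_demo)
    # uplift is the delta between scoring with treatment=1 and treatment=0
    assert np.allclose(uplift_demo, sm_demo.trmnt_preds_ - sm_demo.ctrl_preds_)
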
class ClassTransformation(BaseEstimator):
    """aka Class Variable Transformation or Revert Label approach.

    Redefine the target variable so that it indicates either a positive outcome under treatment
    or a negative outcome without treatment:

    ``Z = Y * W + (1 - Y) * (1 - W)``,

    where ``Y`` is the target vector and ``W`` is the vector of binary communication flags.

    Then, ``Uplift ~ 2 * (Z == 1) - 1``

    Returns only uplift predictions.

    Read more in the :ref:`User Guide <ClassTransformation>`.

    Args:
        estimator (estimator object implementing 'fit'): The object to use to fit the data.

    Example::

        # import the approach
        from sklift.models import ClassTransformation
        # import any estimator adhering to scikit-learn conventions
        from catboost import CatBoostClassifier


        # define the approach
        ct = ClassTransformation(CatBoostClassifier(verbose=100, random_state=777))
        # fit the model
        ct = ct.fit(X_train, y_train, treat_train, estimator_fit_params={'plot': True})
        # predict uplift
        uplift_ct = ct.predict(X_val)

    References:
        Maciej Jaskowski and Szymon Jaroszewicz. Uplift modeling for clinical trial data.
        ICML Workshop on Clinical Data Analysis, 2012.

    See Also:

        **Other approaches:**

        * :class:`.SoloModel`: Single model approach.
        * :class:`.TwoModels`: Double classifier approach.
    """

    def __init__(self, estimator):
        self.estimator = estimator
        self._type_of_target = None
    def fit(self, X, y, treatment, estimator_fit_params=None):
        """Fit the model according to the given training data.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of samples
                and n_features is the number of features.
            y (array-like, shape (n_samples,)): Binary target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X.
            estimator_fit_params (dict, optional): Parameters to pass to the fit method of the estimator.

        Returns:
            object: self
        """

        check_consistent_length(X, y, treatment)
        check_is_binary(treatment)
        self._type_of_target = type_of_target(y)

        if self._type_of_target != 'binary':
            raise ValueError("This approach is only suitable for binary classification problems")

        y_mod = (np.array(y) == np.array(treatment)).astype(int)

        if estimator_fit_params is None:
            estimator_fit_params = {}
        self.estimator.fit(X, y_mod, **estimator_fit_params)
        return self
    def predict(self, X):
        """Perform uplift on samples in X.

        Args:
            X (array-like, shape (n_samples, n_features)): Test vector, where n_samples is the number of samples
                and n_features is the number of features.

        Returns:
            array (shape (n_samples,)): uplift
        """
        uplift = 2 * self.estimator.predict_proba(X)[:, 1] - 1
        return uplift
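
# --- Illustration (editor's sketch, not part of sklift) ---------------------------------------
# The class transformation Z = Y * W + (1 - Y) * (1 - W) equals 1 exactly when y == treatment,
# which is how fit() builds y_mod above; predict() then returns 2 * P(Z = 1 | X) - 1.
# The synthetic data and LogisticRegression base estimator are assumptions made for illustration.
if __name__ == '__main__':
    from sklearn.linear_model import LogisticRegression

    rng = np.random.default_rng(1)
    X_demo = rng.normal(size=(1000, 3))
    treat_demo = rng.integers(0, 2, size=1000)
    y_demo = (X_demo[:, 0] + treat_demo + rng.normal(size=1000) > 0).astype(int)

    # the two formulations of the transformed label coincide
    z_formula = y_demo * treat_demo + (1 - y_demo) * (1 - treat_demo)
    z_revert = (y_demo == treat_demo).astype(int)
    assert np.array_equal(z_formula, z_revert)

    ct_demo = ClassTransformation(LogisticRegression())
    uplift_demo = ct_demo.fit(X_demo, y_demo, treat_demo).predict(X_demo)
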
class ClassTransformationReg(BaseEstimator):
    """aka CATE-generating (Conditional Average Treatment Effect) transformation approach for continuous labels.

    Redefine the target variable so that a regression fitted on it estimates the treatment effect directly:

    ``Z = Y * (W - p) / (p * (1 - p))``,

    where ``Y`` is the target vector, ``W`` is the vector of binary communication flags, and ``p`` is the
    propensity score (the probability that each subject is assigned to the treatment group).

    Then, train a regressor on ``Z`` to predict uplift.

    Returns uplift predictions and optionally propensity predictions.

    The propensity score can be a scalar value (e.g. p = 0.5), which would mean that every subject has an
    identical probability of being assigned to the treatment group.

    Alternatively, the propensity can be learned using a classifier model. In this case, the model predicts
    the probability that a given subject would be assigned to the treatment group.

    Read more in the :ref:`User Guide <ClassTransformationReg>`.

    Args:
        estimator (estimator object implementing 'fit'): The object to use to fit the data.
        propensity_val (float): A constant propensity value, which assumes every subject has an equal
            probability of assignment to the treatment group.
        propensity_estimator (estimator object with `predict_proba`): The object used to predict the
            propensity score if `propensity_val` is not given.

    Example::

        # import the approach
        from sklift.models import ClassTransformationReg
        # import any estimator adhering to scikit-learn conventions
        from sklearn.linear_model import LinearRegression, LogisticRegression


        # define the approach
        ct = ClassTransformationReg(estimator=LinearRegression(), propensity_estimator=LogisticRegression())
        # fit the model
        ct = ct.fit(X_train, y_train, treat_train)
        # predict uplift
        uplift_ct = ct.predict(X_val)

    References:
        Maciej Jaskowski and Szymon Jaroszewicz. Uplift modeling for clinical trial data.
        ICML Workshop on Clinical Data Analysis, 2012.

    See Also:

        **Other approaches:**

        * :class:`.SoloModel`: Single model approach.
        * :class:`.TwoModels`: Double classifier approach.
        * :class:`.ClassTransformation`: Binary classifier transformation approach.
    """

    def __init__(self, estimator, propensity_val=None, propensity_estimator=None):

        if (propensity_val is None) and (propensity_estimator is None):
            raise ValueError(
                "`propensity_val` and `propensity_estimator` cannot both be equal to `None`. "
                "Both arguments are currently null."
            )
        elif (propensity_val is not None) and (propensity_estimator is not None):
            raise ValueError(
                "Exactly one of (`propensity_val`, `propensity_estimator`) must be None, and the other "
                "must be defined. Both arguments are currently non-null."
            )

        self.estimator = estimator
        self.propensity_val = propensity_val
        self.propensity_estimator = propensity_estimator

        self._type_of_target = None

    def fit(self, X, y, treatment, estimator_fit_params=None):
        """Fit the model according to the given training data.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of samples
                and n_features is the number of features.
            y (array-like, shape (n_samples,)): Target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X.
            estimator_fit_params (dict, optional): Parameters to pass to the fit method of the estimator.

        Returns:
            object: self
        """

        check_consistent_length(X, y, treatment)
        check_is_binary(treatment)
        self._type_of_target = type_of_target(y)

        if self.propensity_val is not None:
            p = self.propensity_val

        elif self.propensity_estimator is not None:
            self.propensity_estimator.fit(X, treatment)
            p = self.propensity_estimator.predict_proba(X)[:, 1]

        y_mod = y * ((treatment - p) / (p * (1 - p)))

        if estimator_fit_params is None:
            estimator_fit_params = {}
        self.estimator.fit(X, y_mod, **estimator_fit_params)

        return self

    def predict_propensity(self, X):
        """Predict propensity values.

        Args:
            X (array-like, shape (n_samples, n_features)): Test vector, where n_samples is the number of samples
                and n_features is the number of features.

        Returns:
            array (shape (n_samples,)): propensity
        """

        if self.propensity_estimator is not None:
            return self.propensity_estimator.predict_proba(X)[:, 1]
        else:
            return self.propensity_val

    def predict(self, X):
        """Perform uplift on samples in X.

        Args:
            X (array-like, shape (n_samples, n_features)): Test vector, where n_samples is the number of samples
                and n_features is the number of features.

        Returns:
            array (shape (n_samples,)): uplift
        """

        uplift = self.estimator.predict(X)
        return uplift
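
# --- Illustration (editor's sketch, not part of sklift) ---------------------------------------
# With a constant propensity p = 0.5 the transformed target simplifies to
# Z = y * (treatment - 0.5) / 0.25, i.e. +2*y for treated rows and -2*y for control rows, so
# under randomized assignment its mean estimates the average treatment effect and a regressor
# fitted on Z predicts uplift. The synthetic data (true uplift of +2.0 for every subject) and
# the LinearRegression estimator are assumptions made purely for illustration.
if __name__ == '__main__':
    from sklearn.linear_model import LinearRegression

    rng = np.random.default_rng(2)
    X_demo = rng.normal(size=(2000, 3))
    treat_demo = rng.integers(0, 2, size=2000)
    y_demo = X_demo[:, 0] + 2.0 * treat_demo + rng.normal(size=2000)

    ctr_demo = ClassTransformationReg(estimator=LinearRegression(), propensity_val=0.5)
    ctr_demo.fit(X_demo, y_demo, treat_demo)

    z_demo = y_demo * ((treat_demo - 0.5) / 0.25)
    print(z_demo.mean())                  # close to the true effect of 2.0
    print(ctr_demo.predict(X_demo).mean())  # average uplift predicted by the fitted regressor
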
class TwoModels(BaseEstimator):
    """aka naïve approach, or difference score method, or double classifier approach.

    Fit two separate models: one on the treatment data and one on the control data.

    Read more in the :ref:`User Guide <TwoModels>`.

    Args:
        estimator_trmnt (estimator object implementing 'fit'): The object to use to fit the treatment data.
        estimator_ctrl (estimator object implementing 'fit'): The object to use to fit the control data.
        method (string, 'vanilla', 'ddr_control' or 'ddr_treatment', default='vanilla'): Specifies the approach:

            * ``'vanilla'``: Two independent models;
            * ``'ddr_control'``: Dependent data representation (first train the control estimator);
            * ``'ddr_treatment'``: Dependent data representation (first train the treatment estimator).

    Attributes:
        trmnt_preds_ (array-like, shape (n_samples, )): Estimator predictions on samples when treatment.
        ctrl_preds_ (array-like, shape (n_samples, )): Estimator predictions on samples when control.

    Example::

        # import the approach
        from sklift.models import TwoModels
        # import any estimator adhering to scikit-learn conventions
        from catboost import CatBoostClassifier


        estimator_trmnt = CatBoostClassifier(silent=True, thread_count=2, random_state=42)
        estimator_ctrl = CatBoostClassifier(silent=True, thread_count=2, random_state=42)

        # define the approach
        tm_ctrl = TwoModels(
            estimator_trmnt=estimator_trmnt,
            estimator_ctrl=estimator_ctrl,
            method='ddr_control'
        )

        # fit the models
        tm_ctrl = tm_ctrl.fit(
            X_train, y_train, treat_train,
            estimator_trmnt_fit_params={'cat_features': cat_features},
            estimator_ctrl_fit_params={'cat_features': cat_features}
        )
        uplift_tm_ctrl = tm_ctrl.predict(X_val)  # predict uplift

    References:
        Betlei, Artem & Diemert, Eustache & Amini, Massih-Reza. (2018). Uplift Prediction with Dependent
        Feature Representation in Imbalanced Treatment and Control Conditions: 25th International Conference,
        ICONIP 2018, Siem Reap, Cambodia, December 13–16, 2018, Proceedings, Part V. 10.1007/978-3-030-04221-9_5.

        Zhao, Yan & Fang, Xiao & Simchi-Levi, David. (2017). Uplift Modeling with Multiple Treatments and
        General Response Types. 10.1137/1.9781611974973.66.

    See Also:

        **Other approaches:**

        * :class:`.SoloModel`: Single model approach.
        * :class:`.ClassTransformation`: Class Variable Transformation approach.

        **Other:**

        * :func:`.plot_uplift_preds`: Plot histograms of treatment, control and uplift predictions.
    """

    def __init__(self, estimator_trmnt, estimator_ctrl, method='vanilla'):
        self.estimator_trmnt = estimator_trmnt
        self.estimator_ctrl = estimator_ctrl
        self.method = method
        self.trmnt_preds_ = None
        self.ctrl_preds_ = None
        self._type_of_target = None

        all_methods = ['vanilla', 'ddr_control', 'ddr_treatment']
        if method not in all_methods:
            raise ValueError("Two models approach supports only methods in %s, got %s." % (all_methods, method))

        if estimator_trmnt is estimator_ctrl:
            raise ValueError('Control and Treatment estimators should be different objects.')
    def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_fit_params=None):
        """Fit the model according to the given training data.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number of samples
                and n_features is the number of features.
            y (array-like, shape (n_samples,)): Target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X.
            estimator_trmnt_fit_params (dict, optional): Parameters to pass to the fit method
                of the treatment estimator.
            estimator_ctrl_fit_params (dict, optional): Parameters to pass to the fit method
                of the control estimator.

        Returns:
            object: self
        """

        check_consistent_length(X, y, treatment)
        check_is_binary(treatment)
        self._type_of_target = type_of_target(y)

        y_copy = y.copy()
        treatment_copy = treatment.copy()

        if (isinstance(X, pd.Series) or isinstance(X, pd.DataFrame)) and isinstance(y_copy, pd.Series) \
                and not X.index.equals(y_copy.index):
            y_copy.index = X.index
            warnings.warn("Target indexes do not match data indexes, re-indexing has been performed")

        if (isinstance(X, pd.Series) or isinstance(X, pd.DataFrame)) and isinstance(treatment_copy, pd.Series) \
                and not X.index.equals(treatment_copy.index):
            treatment_copy.index = X.index
            warnings.warn("Treatment indexes do not match data indexes, re-indexing has been performed")

        X_ctrl, y_ctrl = X[treatment_copy == 0], y_copy[treatment_copy == 0]
        X_trmnt, y_trmnt = X[treatment_copy == 1], y_copy[treatment_copy == 1]

        if estimator_trmnt_fit_params is None:
            estimator_trmnt_fit_params = {}
        if estimator_ctrl_fit_params is None:
            estimator_ctrl_fit_params = {}

        if self.method == 'vanilla':
            self.estimator_ctrl.fit(
                X_ctrl, y_ctrl, **estimator_ctrl_fit_params
            )
            self.estimator_trmnt.fit(
                X_trmnt, y_trmnt, **estimator_trmnt_fit_params
            )

        if self.method == 'ddr_control':
            self.estimator_ctrl.fit(
                X_ctrl, y_ctrl, **estimator_ctrl_fit_params
            )
            if self._type_of_target == 'binary':
                ddr_control = self.estimator_ctrl.predict_proba(X_trmnt)[:, 1]
            else:
                ddr_control = self.estimator_ctrl.predict(X_trmnt)

            if isinstance(X_trmnt, np.ndarray):
                X_trmnt_mod = np.column_stack((X_trmnt, ddr_control))
            elif isinstance(X_trmnt, pd.DataFrame):
                X_trmnt_mod = X_trmnt.assign(ddr_control=ddr_control)
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X_trmnt))

            self.estimator_trmnt.fit(
                X_trmnt_mod, y_trmnt, **estimator_trmnt_fit_params
            )

        if self.method == 'ddr_treatment':
            self.estimator_trmnt.fit(
                X_trmnt, y_trmnt, **estimator_trmnt_fit_params
            )
            if self._type_of_target == 'binary':
                ddr_treatment = self.estimator_trmnt.predict_proba(X_ctrl)[:, 1]
            else:
                ddr_treatment = self.estimator_trmnt.predict(X_ctrl)

            if isinstance(X_ctrl, np.ndarray):
                X_ctrl_mod = np.column_stack((X_ctrl, ddr_treatment))
            elif isinstance(X_ctrl, pd.DataFrame):
                X_ctrl_mod = X_ctrl.assign(ddr_treatment=ddr_treatment)
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X_ctrl))

            self.estimator_ctrl.fit(
                X_ctrl_mod, y_ctrl, **estimator_ctrl_fit_params
            )

        return self
    def predict(self, X):
        """Perform uplift on samples in X.

        Args:
            X (array-like, shape (n_samples, n_features)): Test vector, where n_samples is the number of samples
                and n_features is the number of features.

        Returns:
            array (shape (n_samples,)): uplift
        """

        if self.method == 'ddr_control':
            if self._type_of_target == 'binary':
                self.ctrl_preds_ = self.estimator_ctrl.predict_proba(X)[:, 1]
            else:
                self.ctrl_preds_ = self.estimator_ctrl.predict(X)

            if isinstance(X, np.ndarray):
                X_mod = np.column_stack((X, self.ctrl_preds_))
            elif isinstance(X, pd.DataFrame):
                X_mod = X.assign(ddr_control=self.ctrl_preds_)
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X))

            if self._type_of_target == 'binary':
                self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X_mod)[:, 1]
            else:
                self.trmnt_preds_ = self.estimator_trmnt.predict(X_mod)

        elif self.method == 'ddr_treatment':
            if self._type_of_target == 'binary':
                self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X)[:, 1]
            else:
                self.trmnt_preds_ = self.estimator_trmnt.predict(X)

            if isinstance(X, np.ndarray):
                X_mod = np.column_stack((X, self.trmnt_preds_))
            elif isinstance(X, pd.DataFrame):
                X_mod = X.assign(ddr_treatment=self.trmnt_preds_)
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X))

            if self._type_of_target == 'binary':
                self.ctrl_preds_ = self.estimator_ctrl.predict_proba(X_mod)[:, 1]
            else:
                self.ctrl_preds_ = self.estimator_ctrl.predict(X_mod)

        else:
            if self._type_of_target == 'binary':
                self.ctrl_preds_ = self.estimator_ctrl.predict_proba(X)[:, 1]
                self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X)[:, 1]
            else:
                self.ctrl_preds_ = self.estimator_ctrl.predict(X)
                self.trmnt_preds_ = self.estimator_trmnt.predict(X)

        uplift = self.trmnt_preds_ - self.ctrl_preds_

        return uplift
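
# --- Illustration (editor's sketch, not part of sklift) ---------------------------------------
# A minimal run of the vanilla two-model approach: one classifier is fitted on the control
# rows, another on the treatment rows, and predict() returns the difference of their predicted
# probabilities. The synthetic data and GradientBoostingClassifier estimators are assumptions
# made purely for illustration.
if __name__ == '__main__':
    from sklearn.ensemble import GradientBoostingClassifier

    rng = np.random.default_rng(3)
    X_demo = rng.normal(size=(1000, 3))
    treat_demo = rng.integers(0, 2, size=1000)
    y_demo = (X_demo[:, 0] + treat_demo + rng.normal(size=1000) > 0).astype(int)

    tm_demo = TwoModels(
        estimator_trmnt=GradientBoostingClassifier(random_state=42),
        estimator_ctrl=GradientBoostingClassifier(random_state=42),
        method='vanilla',
    )
    uplift_demo = tm_demo.fit(X_demo, y_demo, treat_demo).predict(X_demo)
    # uplift is the delta between the treatment and control model scores
    assert np.allclose(uplift_demo, tm_demo.trmnt_preds_ - tm_demo.ctrl_preds_)
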