Source code for sklift.models.models

import warnings

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_consistent_length

from ..utils import check_is_binary


class SoloModel(BaseEstimator):
    """aka Treatment Dummy approach, or Single model approach, or S-Learner.

    Fit solo model on whole dataset with 'treatment' as an additional feature.

    Each object from the test sample is scored twice: with the communication
    flag equal to 1 and equal to 0. Subtracting the probabilities for each
    observation, we get the uplift.

    Return delta of predictions for each example.

    Read more in the :ref:`User Guide <SoloModel>`.

    Args:
        estimator (estimator object implementing 'fit'): The object to use to
            fit the data.
        method (string, 'dummy' or 'treatment_interaction', default='dummy'):
            Specifies the approach:

            * ``'dummy'``:
                Single model;
            * ``'treatment_interaction'``:
                Single model including treatment interactions.

    Attributes:
        trmnt_preds_ (array-like, shape (n_samples, )): Estimator predictions
            on samples when treatment.
        ctrl_preds_ (array-like, shape (n_samples, )): Estimator predictions
            on samples when control.

    Example::

        # import approach
        from sklift.models import SoloModel
        # import any estimator adheres to scikit-learn conventions
        from catboost import CatBoostClassifier


        sm = SoloModel(CatBoostClassifier(verbose=100, random_state=777))  # define approach
        sm = sm.fit(X_train, y_train, treat_train, estimator_fit_params={'plot': True})  # fit the model
        uplift_sm = sm.predict(X_val)  # predict uplift

    References:
        Lo, Victor. (2002). The True Lift Model - A Novel Data Mining Approach
        to Response Modeling in Database Marketing. SIGKDD Explorations. 4.
        78-86.

    See Also:

        **Other approaches:**

        * :class:`.ClassTransformation`: Class Variable Transformation approach.
        * :class:`.ClassTransformationReg`: Transformed Outcome approach.
        * :class:`.TwoModels`: Double classifier approach.

        **Other:**

        * :func:`.plot_uplift_preds`: Plot histograms of treatment, control and uplift predictions.
    """

    def __init__(self, estimator, method='dummy'):
        self.estimator = estimator
        self.method = method
        self.trmnt_preds_ = None
        self.ctrl_preds_ = None
        self._type_of_target = None

        all_methods = ['dummy', 'treatment_interaction']
        if method not in all_methods:
            raise ValueError("SoloModel approach supports only methods in %s, got"
                             " %s." % (all_methods, method))

    def _add_treatment_features(self, X, treatment):
        """Return X extended with the treatment flag (and interactions).

        Args:
            X (numpy.ndarray or pandas.DataFrame): Feature matrix.
            treatment (numpy.ndarray, shape (n_samples,)): Treatment flags,
                applied to the rows of X by position.

        Returns:
            Same container type as X. For ``'dummy'`` one extra ``treatment``
            column is appended; for ``'treatment_interaction'`` the
            element-wise products of every feature with the flag are
            inserted before the ``treatment`` column.

        Raises:
            ValueError: If ``self.method`` is not a supported method.
            TypeError: If X is neither a numpy.ndarray nor a pandas.DataFrame.
        """
        all_methods = ['dummy', 'treatment_interaction']
        if self.method not in all_methods:
            # `method` can be altered after construction (e.g. via
            # set_params); fail clearly instead of with an unbound local.
            raise ValueError("SoloModel approach supports only methods in %s, got"
                             " %s." % (all_methods, self.method))

        with_interactions = self.method == 'treatment_interaction'

        if isinstance(X, np.ndarray):
            if not with_interactions:
                return np.column_stack((X, treatment))
            interactions = np.multiply(X, treatment.reshape(-1, 1))
            return np.column_stack((X, interactions, treatment))

        if isinstance(X, pd.DataFrame):
            if not with_interactions:
                return X.assign(treatment=treatment)
            # `treatment` is a plain ndarray here, so the product is
            # positional and cannot be re-aligned by index labels.
            interactions = X.apply(lambda col: col * treatment).rename(
                columns=lambda name: str(name) + '_treatment_interaction'
            )
            return pd.concat([X, interactions], axis=1).assign(treatment=treatment)

        raise TypeError("Expected numpy.ndarray or pandas.DataFrame in training vector X, got %s" % type(X))

    def fit(self, X, y, treatment, estimator_fit_params=None):
        """Fit the model according to the given training data.

        The treatment flag (and, for ``'treatment_interaction'``, its
        products with every feature) is appended to X and the wrapped
        estimator is fitted on the extended matrix.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector,
                where n_samples is the number of samples and n_features is
                the number of features.
            y (array-like, shape (n_samples,)): Target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment
                vector relative to X. It is matched to the rows of X by
                position, not by index label.
            estimator_fit_params (dict, optional): Parameters to pass to the
                fit method of the estimator.

        Returns:
            object: self

        Raises:
            ValueError: If the treatment vector does not contain exactly two
                unique values.
            TypeError: If X is neither a numpy.ndarray nor a pandas.DataFrame.
        """
        check_consistent_length(X, y, treatment)
        check_is_binary(treatment)

        treatment_values = np.unique(treatment)
        if len(treatment_values) != 2:
            raise ValueError("Expected only two unique values in treatment vector, got %s" % len(treatment_values))

        if (isinstance(X, pd.DataFrame) and isinstance(treatment, pd.Series)
                and not X.index.equals(treatment.index)):
            # Label-based alignment would silently introduce NaN flags;
            # treat the vector positionally, as TwoModels does.
            warnings.warn("Treatment indexes do not match data indexes, re-indexing has been performed")

        treatment_flags = np.asarray(treatment).ravel()
        X_mod = self._add_treatment_features(X, treatment_flags)

        self._type_of_target = type_of_target(y)

        if estimator_fit_params is None:
            estimator_fit_params = {}

        self.estimator.fit(X_mod, y, **estimator_fit_params)
        return self

    def predict(self, X):
        """Perform uplift on samples in X.

        Every sample is scored twice, with the treatment flag set to 1 and
        to 0; the two prediction vectors are stored in ``trmnt_preds_`` and
        ``ctrl_preds_``.

        Args:
            X (array-like, shape (n_samples, n_features)): Samples to score,
                where n_samples is the number of samples and n_features is
                the number of features.

        Returns:
            array (shape (n_samples,)): uplift

        Raises:
            TypeError: If X is neither a numpy.ndarray nor a pandas.DataFrame.
        """
        if not isinstance(X, (np.ndarray, pd.DataFrame)):
            raise TypeError("Expected numpy.ndarray or pandas.DataFrame in training vector X, got %s" % type(X))

        n_samples = X.shape[0]
        X_mod_trmnt = self._add_treatment_features(X, np.ones(n_samples))
        X_mod_ctrl = self._add_treatment_features(X, np.zeros(n_samples))

        if self._type_of_target == 'binary':
            # Binary targets: use the probability of the second class.
            self.trmnt_preds_ = self.estimator.predict_proba(X_mod_trmnt)[:, 1]
            self.ctrl_preds_ = self.estimator.predict_proba(X_mod_ctrl)[:, 1]
        else:
            self.trmnt_preds_ = self.estimator.predict(X_mod_trmnt)
            self.ctrl_preds_ = self.estimator.predict(X_mod_ctrl)

        uplift = self.trmnt_preds_ - self.ctrl_preds_
        return uplift
class ClassTransformation(BaseEstimator):
    """aka Class Variable Transformation or Revert Label approach.

    The target is redefined as ``Z = Y * W + (1 - Y)(1 - W)``, where ``Y`` is
    the target vector and ``W`` is the vector of binary communication flags,
    i.e. ``Z`` equals 1 where the target and the treatment flag coincide.

    Then, ``Uplift ~ 2 * (Z == 1) - 1``

    Returns only uplift predictions.

    Read more in the :ref:`User Guide <ClassTransformation>`.

    Args:
        estimator (estimator object implementing 'fit'): The object to use to
            fit the data.

    Example::

        # import approach
        from sklift.models import ClassTransformation
        # import any estimator adheres to scikit-learn conventions
        from catboost import CatBoostClassifier


        # define approach
        ct = ClassTransformation(CatBoostClassifier(verbose=100, random_state=777))
        # fit the model
        ct = ct.fit(X_train, y_train, treat_train, estimator_fit_params={'plot': True})
        # predict uplift
        uplift_ct = ct.predict(X_val)

    References:
        Maciej Jaskowski and Szymon Jaroszewicz. Uplift modeling for clinical
        trial data. ICML Workshop on Clinical Data Analysis, 2012.

    See Also:

        **Other approaches:**

        * :class:`.ClassTransformationReg`: Transformed Outcome approach.
        * :class:`.SoloModel`: Single model approach.
        * :class:`.TwoModels`: Double classifier approach.
    """

    def __init__(self, estimator):
        self._type_of_target = None
        self.estimator = estimator

    def fit(self, X, y, treatment, estimator_fit_params=None):
        """Fit the wrapped estimator on the transformed target.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector,
                where n_samples is the number of samples and n_features is
                the number of features.
            y (array-like, shape (n_samples,)): Target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment
                vector relative to X.
            estimator_fit_params (dict, optional): Parameters to pass to the
                fit method of the estimator.

        Returns:
            object: self

        Raises:
            ValueError: If ``type_of_target(y)`` is not ``'binary'``.
        """
        check_consistent_length(X, y, treatment)
        check_is_binary(treatment)

        target_kind = type_of_target(y)
        self._type_of_target = target_kind
        if target_kind != 'binary':
            raise ValueError("This approach is only suitable for binary classification problem")

        fit_kwargs = {} if estimator_fit_params is None else estimator_fit_params

        # Z is 1 exactly where the outcome equals the treatment flag.
        outcome = np.array(y)
        flags = np.array(treatment)
        transformed_target = (outcome == flags).astype(int)

        self.estimator.fit(X, transformed_target, **fit_kwargs)
        return self

    def predict(self, X):
        """Perform uplift on samples in X.

        Args:
            X (array-like, shape (n_samples, n_features)): Samples to score,
                where n_samples is the number of samples and n_features is
                the number of features.

        Returns:
            array (shape (n_samples,)): uplift, computed as
            ``2 * P(Z == 1) - 1`` from the estimator's ``predict_proba``.
        """
        positive_proba = self.estimator.predict_proba(X)[:, 1]
        return 2 * positive_proba - 1
class ClassTransformationReg(BaseEstimator):
    """aka CATE-generating (Conditional Average Treatment Effect) Transformation of the Outcome.

    Redefine target variable as ``Z = Y * (W - p)/(p * (1 - p))``,

    where ``Y`` - target vector, ``W`` - vector of binary communication flags,
    and ``p`` is a propensity score (the probability that each y_i is assigned
    to the treatment group).

    Then, train a regressor on ``Z`` to predict uplift.

    Returns uplift predictions and optionally propensity predictions.

    The propensity score can be a scalar value (e.g. p = 0.5), which would
    mean that every subject has identical probability of being assigned to
    the treatment group.

    Alternatively, the propensity can be learned using a Classifier model. In
    this case, the model predicts the probability that a given subject would
    be assigned to the treatment group.

    Read more in the :ref:`User Guide <ClassTransformationReg>`.

    Args:
        estimator (estimator object implementing 'fit'): The object to use to
            fit the data.
        propensity_val (float): A constant propensity value, which assumes
            every subject has equal probability of assignment to the
            treatment group. Must lie strictly between 0 and 1.
        propensity_estimator (estimator object with `predict_proba`): The
            object used to predict the propensity score if `propensity_val`
            is not given.

    Example::

        # import approach
        from sklift.models import ClassTransformationReg
        # import any estimator adheres to scikit-learn conventions
        from sklearn.linear_model import LinearRegression, LogisticRegression


        # define approach
        ct = ClassTransformationReg(estimator=LinearRegression(), propensity_estimator=LogisticRegression())
        # fit the model
        ct = ct.fit(X_train, y_train, treat_train)
        # predict uplift
        uplift_ct = ct.predict(X_val)

    References:
        Athey, Susan & Imbens, Guido & Ramachandra, Vikas. (2015).
        Machine Learning Methods for Estimating Heterogeneous Causal Effects.

    See Also:

        **Other approaches:**

        * :class:`.SoloModel`: Single model approach.
        * :class:`.TwoModels`: Double classifier approach.
        * :class:`.ClassTransformation`: Binary classifier transformation approach.
    """

    def __init__(self, estimator, propensity_val=None, propensity_estimator=None):
        if (propensity_val is None) and (propensity_estimator is None):
            raise ValueError('`propensity_val` and `propensity_estimator` cannot both be equal to `None`. Both arguments are currently null.')
        elif (propensity_val is not None) and (propensity_estimator is not None):
            raise ValueError('Exactly one of (`propensity_val`, `propensity_estimator`) must be None, and the other must be defined. Both arguments are currently non-null.')

        self.estimator = estimator
        self.propensity_val = propensity_val
        self.propensity_estimator = propensity_estimator

        self._type_of_target = None

    def fit(self, X, y, treatment, estimator_fit_params=None):
        """Fit the model according to the given training data.

        When a propensity estimator is configured it is fitted on
        ``(X, treatment)`` first; the regressor is then fitted on the
        transformed outcome ``Z``.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector,
                where n_samples is the number of samples and n_features is
                the number of features.
            y (array-like, shape (n_samples,)): Target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment
                vector relative to X.
            estimator_fit_params (dict, optional): Parameters to pass to the
                fit method of the estimator.

        Returns:
            object: self

        Raises:
            ValueError: If the constant propensity is not strictly between 0
                and 1, or if neither propensity source is configured.
        """
        check_consistent_length(X, y, treatment)
        check_is_binary(treatment)
        self._type_of_target = type_of_target(y)

        # Work on plain arrays so list inputs are supported and pandas
        # objects are combined by position rather than by index label.
        y_values = np.asarray(y)
        treatment_values = np.asarray(treatment)

        if self.propensity_val is not None:
            p = self.propensity_val
            p_check = np.asarray(p, dtype=float)
            if np.any(p_check <= 0) or np.any(p_check >= 1):
                # p * (1 - p) would be zero (or negative) and the transformed
                # target would contain inf/NaN or flipped signs.
                raise ValueError('`propensity_val` must be strictly between 0 and 1, got %s.' % (p,))
        elif self.propensity_estimator is not None:
            self.propensity_estimator.fit(X, treatment)
            p = self.propensity_estimator.predict_proba(X)[:, 1]
        else:
            # Reachable only if the attributes were changed after __init__.
            raise ValueError('`propensity_val` and `propensity_estimator` cannot both be equal to `None`. Both arguments are currently null.')

        y_mod = y_values * ((treatment_values - p) / (p * (1 - p)))

        if estimator_fit_params is None:
            estimator_fit_params = {}

        self.estimator.fit(X, y_mod, **estimator_fit_params)
        return self

    def predict_propensity(self, X):
        """Predict propensity values.

        Args:
            X (array-like, shape (n_samples, n_features)): Samples to score,
                where n_samples is the number of samples and n_features is
                the number of features.

        Returns:
            array (shape (n_samples,)): propensity predicted by
            ``propensity_estimator`` when one is configured; otherwise the
            constant ``propensity_val`` is returned as-is (a scalar, not an
            array).
        """
        if self.propensity_estimator is not None:
            return self.propensity_estimator.predict_proba(X)[:, 1]
        return self.propensity_val

    def predict(self, X):
        """Perform uplift on samples in X.

        Args:
            X (array-like, shape (n_samples, n_features)): Samples to score,
                where n_samples is the number of samples and n_features is
                the number of features.

        Returns:
            array (shape (n_samples,)): uplift
        """
        uplift = self.estimator.predict(X)
        return uplift
class TwoModels(BaseEstimator):
    """aka naïve approach, or difference score method, or double classifier approach.

    Fit two separate models: on the treatment data and on the control data.

    Read more in the :ref:`User Guide <TwoModels>`.

    Args:
        estimator_trmnt (estimator object implementing 'fit'): The object to
            use to fit the treatment data.
        estimator_ctrl (estimator object implementing 'fit'): The object to
            use to fit the control data.
        method (string, 'vanilla', 'ddr_control' or 'ddr_treatment', default='vanilla'):
            Specifies the approach:

            * ``'vanilla'``:
                Two independent models;
            * ``'ddr_control'``:
                Dependent data representation (First train control estimator).
            * ``'ddr_treatment'``:
                Dependent data representation (First train treatment estimator).

    Attributes:
        trmnt_preds_ (array-like, shape (n_samples, )): Estimator predictions
            on samples when treatment.
        ctrl_preds_ (array-like, shape (n_samples, )): Estimator predictions
            on samples when control.

    Example::

        # import approach
        from sklift.models import TwoModels
        # import any estimator adheres to scikit-learn conventions
        from catboost import CatBoostClassifier


        estimator_trmnt = CatBoostClassifier(silent=True, thread_count=2, random_state=42)
        estimator_ctrl = CatBoostClassifier(silent=True, thread_count=2, random_state=42)

        # define approach
        tm_ctrl = TwoModels(
            estimator_trmnt=estimator_trmnt,
            estimator_ctrl=estimator_ctrl,
            method='ddr_control'
        )

        # fit the models
        tm_ctrl = tm_ctrl.fit(
            X_train, y_train, treat_train,
            estimator_trmnt_fit_params={'cat_features': cat_features},
            estimator_ctrl_fit_params={'cat_features': cat_features}
        )
        uplift_tm_ctrl = tm_ctrl.predict(X_val)  # predict uplift

    References:
        Betlei, Artem & Diemert, Eustache & Amini, Massih-Reza. (2018).
        Uplift Prediction with Dependent Feature Representation in Imbalanced
        Treatment and Control Conditions: 25th International Conference,
        ICONIP 2018, Siem Reap, Cambodia, December 13–16, 2018, Proceedings,
        Part V. 10.1007/978-3-030-04221-9_5.

        Zhao, Yan & Fang, Xiao & Simchi-Levi, David. (2017).
        Uplift Modeling with Multiple Treatments and General Response Types.
        10.1137/1.9781611974973.66.

    See Also:

        **Other approaches:**

        * :class:`.SoloModel`: Single model approach.
        * :class:`.ClassTransformation`: Class Variable Transformation approach.
        * :class:`.ClassTransformationReg`: Transformed Outcome approach.

        **Other:**

        * :func:`.plot_uplift_preds`: Plot histograms of treatment, control and uplift predictions.
    """

    def __init__(self, estimator_trmnt, estimator_ctrl, method='vanilla'):
        self.estimator_trmnt = estimator_trmnt
        self.estimator_ctrl = estimator_ctrl
        self.method = method
        self.trmnt_preds_ = None
        self.ctrl_preds_ = None
        self._type_of_target = None

        all_methods = ['vanilla', 'ddr_control', 'ddr_treatment']
        if method not in all_methods:
            raise ValueError("Two models approach supports only methods in %s, got"
                             " %s." % (all_methods, method))

        if estimator_trmnt is estimator_ctrl:
            raise ValueError('Control and Treatment estimators should be different objects.')

    def _estimator_scores(self, estimator, X):
        """Score X with `estimator`: class-1 probability for binary targets, else `predict`."""
        if self._type_of_target == 'binary':
            return estimator.predict_proba(X)[:, 1]
        return estimator.predict(X)

    @staticmethod
    def _append_feature(X, name, values):
        """Return X with `values` appended as one extra column called `name`.

        Raises:
            TypeError: If X is neither a numpy.ndarray nor a pandas.DataFrame.
        """
        if isinstance(X, np.ndarray):
            return np.column_stack((X, values))
        if isinstance(X, pd.DataFrame):
            return X.assign(**{name: values})
        raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X))

    def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_fit_params=None):
        """Fit the model according to the given training data.

        The samples are split by treatment flag. With ``'vanilla'`` each
        estimator is fitted on its own group. With ``'ddr_control'`` the
        control estimator is fitted first and its scores on the treatment
        group are appended as an extra feature for the treatment estimator;
        ``'ddr_treatment'`` does the same in the opposite direction.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector,
                where n_samples is the number of samples and n_features is
                the number of features.
            y (array-like, shape (n_samples,)): Target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment
                vector relative to X.
            estimator_trmnt_fit_params (dict, optional): Parameters to pass
                to the fit method of the treatment estimator.
            estimator_ctrl_fit_params (dict, optional): Parameters to pass
                to the fit method of the control estimator.

        Returns:
            object: self

        Raises:
            TypeError: For the ``ddr_*`` methods, if X is neither a
                numpy.ndarray nor a pandas.DataFrame.
        """
        check_consistent_length(X, y, treatment)
        check_is_binary(treatment)
        self._type_of_target = type_of_target(y)

        # Copy so re-indexing below never touches the caller's objects;
        # plain sequences are turned into arrays so boolean masking works.
        copyable = (pd.Series, pd.DataFrame, np.ndarray)
        y_copy = y.copy() if isinstance(y, copyable) else np.array(y)
        treatment_copy = treatment.copy() if isinstance(treatment, copyable) else np.array(treatment)

        X_is_pandas = isinstance(X, (pd.Series, pd.DataFrame))

        if X_is_pandas and isinstance(y_copy, pd.Series) and not X.index.equals(y_copy.index):
            y_copy.index = X.index
            warnings.warn("Target indexes do not match data indexes, re-indexing has been performed")

        if X_is_pandas and isinstance(treatment_copy, pd.Series) and not X.index.equals(treatment_copy.index):
            treatment_copy.index = X.index
            warnings.warn("Treatment indexes do not match data indexes, re-indexing has been performed")

        ctrl_mask = treatment_copy == 0
        trmnt_mask = treatment_copy == 1
        X_ctrl, y_ctrl = X[ctrl_mask], y_copy[ctrl_mask]
        X_trmnt, y_trmnt = X[trmnt_mask], y_copy[trmnt_mask]

        if estimator_trmnt_fit_params is None:
            estimator_trmnt_fit_params = {}
        if estimator_ctrl_fit_params is None:
            estimator_ctrl_fit_params = {}

        if self.method == 'vanilla':
            self.estimator_ctrl.fit(X_ctrl, y_ctrl, **estimator_ctrl_fit_params)
            self.estimator_trmnt.fit(X_trmnt, y_trmnt, **estimator_trmnt_fit_params)

        elif self.method == 'ddr_control':
            self.estimator_ctrl.fit(X_ctrl, y_ctrl, **estimator_ctrl_fit_params)

            ddr_control = self._estimator_scores(self.estimator_ctrl, X_trmnt)
            X_trmnt_mod = self._append_feature(X_trmnt, 'ddr_control', ddr_control)

            self.estimator_trmnt.fit(X_trmnt_mod, y_trmnt, **estimator_trmnt_fit_params)

        elif self.method == 'ddr_treatment':
            self.estimator_trmnt.fit(X_trmnt, y_trmnt, **estimator_trmnt_fit_params)

            ddr_treatment = self._estimator_scores(self.estimator_trmnt, X_ctrl)
            # The type check is made on X_ctrl, the matrix being extended.
            X_ctrl_mod = self._append_feature(X_ctrl, 'ddr_treatment', ddr_treatment)

            self.estimator_ctrl.fit(X_ctrl_mod, y_ctrl, **estimator_ctrl_fit_params)

        return self

    def predict(self, X):
        """Perform uplift on samples in X.

        Both estimators score X (for the ``ddr_*`` methods the first
        estimator's scores are appended as a feature for the second one);
        the results are stored in ``trmnt_preds_`` and ``ctrl_preds_``.
        Any method other than the two ``ddr_*`` ones is handled as
        ``'vanilla'``.

        Args:
            X (array-like, shape (n_samples, n_features)): Samples to score,
                where n_samples is the number of samples and n_features is
                the number of features.

        Returns:
            array (shape (n_samples,)): uplift

        Raises:
            TypeError: For the ``ddr_*`` methods, if X is neither a
                numpy.ndarray nor a pandas.DataFrame.
        """
        if self.method == 'ddr_control':
            self.ctrl_preds_ = self._estimator_scores(self.estimator_ctrl, X)
            X_mod = self._append_feature(X, 'ddr_control', self.ctrl_preds_)
            self.trmnt_preds_ = self._estimator_scores(self.estimator_trmnt, X_mod)

        elif self.method == 'ddr_treatment':
            self.trmnt_preds_ = self._estimator_scores(self.estimator_trmnt, X)
            X_mod = self._append_feature(X, 'ddr_treatment', self.trmnt_preds_)
            self.ctrl_preds_ = self._estimator_scores(self.estimator_ctrl, X_mod)

        else:
            self.ctrl_preds_ = self._estimator_scores(self.estimator_ctrl, X)
            self.trmnt_preds_ = self._estimator_scores(self.estimator_trmnt, X)

        uplift = self.trmnt_preds_ - self.ctrl_preds_
        return uplift