Source code for sklift.models.models

import warnings
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.utils.validation import check_consistent_length
from sklearn.utils.multiclass import type_of_target


class SoloModel(BaseEstimator):
    """aka Treatment Dummy approach, or Single model approach, or S-Learner.

    Fit a single model on the whole dataset with 'treatment' as an additional feature.

    For each test example, predictions are calculated twice: once with treatment == '1' and once with
    treatment == '0'. The uplift is the delta between these two predictions.

    See more details about `SoloModel in documentation`_.

    Args:
        estimator (estimator object implementing 'fit'): The object to use to fit the data.

    Attributes:
        trmnt_preds_ (array-like, shape (n_samples, )): Estimator predictions on samples when treatment.
        ctrl_preds_ (array-like, shape (n_samples, )): Estimator predictions on samples when control.

    Example::

        # import approach
        from sklift.models import SoloModel
        # import any estimator adhering to scikit-learn conventions
        from catboost import CatBoostClassifier

        sm = SoloModel(CatBoostClassifier(verbose=100, random_state=777))  # define approach
        sm = sm.fit(X_train, y_train, treat_train, estimator_fit_params={'plot': True})  # fit the model
        uplift_sm = sm.predict(X_val)  # predict uplift

    References:
        Lo, Victor. (2002). The True Lift Model - A Novel Data Mining Approach to Response Modeling
        in Database Marketing. SIGKDD Explorations. 4. 78-86.

    .. _SoloModel in documentation:
        https://scikit-uplift.readthedocs.io/en/latest/api/models.html#one-model-with-treatment-as-feature
    """

    def __init__(self, estimator):
        self.estimator = estimator
        self.trmnt_preds_ = None
        self.ctrl_preds_ = None
        self._type_of_target = None
    def fit(self, X, y, treatment, estimator_fit_params=None):
        """Fit the model according to the given training data.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number
                of samples and n_features is the number of features.
            y (array-like, shape (n_samples,)): Target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X.
            estimator_fit_params (dict, optional): Parameters to pass to the fit method of the estimator.

        Returns:
            object: self
        """
        check_consistent_length(X, y, treatment)

        treatment_values = np.unique(treatment)
        if len(treatment_values) != 2:
            raise ValueError("Expected only two unique values in treatment vector, got %s" % len(treatment_values))

        if isinstance(X, np.ndarray):
            X_mod = np.column_stack((X, treatment))
        elif isinstance(X, pd.core.frame.DataFrame):
            X_mod = X.assign(treatment=treatment)
        else:
            raise TypeError("Expected numpy.ndarray or pandas.DataFrame in training vector X, got %s" % type(X))

        self._type_of_target = type_of_target(y)

        if estimator_fit_params is None:
            estimator_fit_params = {}
        self.estimator.fit(X_mod, y, **estimator_fit_params)
        return self
    def predict(self, X):
        """Perform uplift prediction on samples in X.

        Args:
            X (array-like, shape (n_samples, n_features)): Test vector, where n_samples is the number
                of samples and n_features is the number of features.

        Returns:
            array (shape (n_samples,)): uplift
        """
        if isinstance(X, np.ndarray):
            X_mod_trmnt = np.column_stack((X, np.ones(X.shape[0])))
            X_mod_ctrl = np.column_stack((X, np.zeros(X.shape[0])))
        elif isinstance(X, pd.core.frame.DataFrame):
            X_mod_trmnt = X.assign(treatment=np.ones(X.shape[0]))
            X_mod_ctrl = X.assign(treatment=np.zeros(X.shape[0]))
        else:
            raise TypeError("Expected numpy.ndarray or pandas.DataFrame in test vector X, got %s" % type(X))

        if self._type_of_target == 'binary':
            self.trmnt_preds_ = self.estimator.predict_proba(X_mod_trmnt)[:, 1]
            self.ctrl_preds_ = self.estimator.predict_proba(X_mod_ctrl)[:, 1]
        else:
            self.trmnt_preds_ = self.estimator.predict(X_mod_trmnt)
            self.ctrl_preds_ = self.estimator.predict(X_mod_ctrl)

        uplift = self.trmnt_preds_ - self.ctrl_preds_
        return uplift
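As a quick sanity check of the approach above, here is a minimal self-contained sketch. The synthetic data and the choice of LogisticRegression are illustrative assumptions only; any estimator that follows scikit-learn conventions (such as the CatBoostClassifier from the docstring example) can be used instead.

# Minimal usage sketch (illustrative assumptions: synthetic data, LogisticRegression).
import numpy as np
from sklearn.linear_model import LogisticRegression

from sklift.models import SoloModel

rng = np.random.default_rng(42)
X = rng.normal(size=(1000, 5))                 # feature matrix
treatment = rng.integers(0, 2, size=1000)      # binary treatment flag
# outcome depends on the first feature and gets a small lift from treatment
y = (X[:, 0] + 0.5 * treatment + rng.normal(size=1000) > 0).astype(int)

sm = SoloModel(LogisticRegression())
sm.fit(X, y, treatment)
uplift = sm.predict(X)                         # trmnt_preds_ - ctrl_preds_
print(uplift[:5])

Under the hood, fit appends the treatment flag as an extra column, and predict scores every row twice, once with the flag set to 1 and once with it set to 0.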
class ClassTransformation(BaseEstimator):
    """aka Class Variable Transformation or Revert Label approach.

    Redefine the target variable so that it indicates whether the outcome and the treatment agree:

        Z = Y * W + (1 - Y) * (1 - W),

    where Y is the target and W is the communication (treatment) flag. Then

        Uplift ~ 2 * (Z == 1) - 1.

    Returns only uplift predictions.

    See more details about `ClassTransformation in documentation`_.

    Args:
        estimator (estimator object implementing 'fit'): The object to use to fit the data.

    Example::

        # import approach
        from sklift.models import ClassTransformation
        # import any estimator adhering to scikit-learn conventions
        from catboost import CatBoostClassifier

        ct = ClassTransformation(CatBoostClassifier(verbose=100, random_state=777))  # define approach
        ct = ct.fit(X_train, y_train, treat_train, estimator_fit_params={'plot': True})  # fit the model
        uplift_ct = ct.predict(X_val)  # predict uplift

    References:
        Maciej Jaskowski and Szymon Jaroszewicz. Uplift modeling for clinical trial data.
        ICML Workshop on Clinical Data Analysis, 2012.

    .. _ClassTransformation in documentation:
        https://scikit-uplift.readthedocs.io/en/latest/api/models.html#class-transformation
    """

    def __init__(self, estimator):
        self.estimator = estimator
        self._type_of_target = None
    def fit(self, X, y, treatment, estimator_fit_params=None):
        """Fit the model according to the given training data.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number
                of samples and n_features is the number of features.
            y (array-like, shape (n_samples,)): Target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X.
            estimator_fit_params (dict, optional): Parameters to pass to the fit method of the estimator.

        Returns:
            object: self
        """
        # TODO: check that the treatment is binary
        check_consistent_length(X, y, treatment)

        self._type_of_target = type_of_target(y)
        if self._type_of_target != 'binary':
            raise ValueError("This approach is only suitable for binary classification problems")

        # TODO: replace raise with a warning
        _, treatment_counts = np.unique(treatment, return_counts=True)
        if treatment_counts[0] != treatment_counts[1]:
            warnings.warn(
                "It is recommended to use this approach on treatment-balanced data. "
                "The current sample is unbalanced.",
                category=UserWarning,
                stacklevel=2
            )

        y_mod = (np.array(y) == np.array(treatment)).astype(int)

        if estimator_fit_params is None:
            estimator_fit_params = {}
        self.estimator.fit(X, y_mod, **estimator_fit_params)
        return self
    def predict(self, X):
        """Perform uplift prediction on samples in X.

        Args:
            X (array-like, shape (n_samples, n_features)): Test vector, where n_samples is the number
                of samples and n_features is the number of features.

        Returns:
            array (shape (n_samples,)): uplift
        """
        uplift = 2 * self.estimator.predict_proba(X)[:, 1] - 1
        return uplift
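The balance warning in ClassTransformation.fit follows from the derivation behind the 2 * P(Z = 1) - 1 formula. A short sketch, assuming the treatment flag W is assigned independently of the features X with P(W = 1) = 1/2:

\begin{aligned}
P(Z = 1 \mid x) &= P(Y = 1 \mid x, W = 1)\,P(W = 1) + P(Y = 0 \mid x, W = 0)\,P(W = 0) \\
                &= \tfrac{1}{2}\left[ P(Y = 1 \mid x, W = 1) + 1 - P(Y = 1 \mid x, W = 0) \right],
\end{aligned}

so that

2\,P(Z = 1 \mid x) - 1 = P(Y = 1 \mid x, W = 1) - P(Y = 1 \mid x, W = 0) = \operatorname{uplift}(x).

When the treatment groups are not balanced, P(W = 1) differs from 1/2 and the identity no longer holds exactly, which is what the UserWarning issued in fit is flagging.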
class TwoModels(BaseEstimator):
    """aka naïve approach, or difference score method, or double classifier approach.

    Fit two separate models: one on the treatment data and one on the control data.

    See more details about `TwoModels in documentation`_.

    Args:
        estimator_trmnt (estimator object implementing 'fit'): The object to use to fit the treatment data.
        estimator_ctrl (estimator object implementing 'fit'): The object to use to fit the control data.
        method (string, 'vanilla', 'ddr_control' or 'ddr_treatment', default='vanilla'): Specifies the approach:

            * 'vanilla' - two independent models
            * 'ddr_control' - dependent data representation (train the control estimator first)
            * 'ddr_treatment' - dependent data representation (train the treatment estimator first)

    Attributes:
        trmnt_preds_ (array-like, shape (n_samples, )): Estimator predictions on samples when treatment.
        ctrl_preds_ (array-like, shape (n_samples, )): Estimator predictions on samples when control.

    Example::

        # import approach
        from sklift.models import TwoModels
        # import any estimator adhering to scikit-learn conventions
        from catboost import CatBoostClassifier

        estimator_trmnt = CatBoostClassifier(silent=True, thread_count=2, random_state=42)
        estimator_ctrl = CatBoostClassifier(silent=True, thread_count=2, random_state=42)

        # define approach
        tm_ctrl = TwoModels(
            estimator_trmnt=estimator_trmnt,
            estimator_ctrl=estimator_ctrl,
            method='ddr_control'
        )

        # fit the models
        tm_ctrl = tm_ctrl.fit(
            X_train, y_train, treat_train,
            estimator_trmnt_fit_params={'cat_features': cat_features},
            estimator_ctrl_fit_params={'cat_features': cat_features}
        )
        uplift_tm_ctrl = tm_ctrl.predict(X_val)  # predict uplift

    References:
        Betlei, Artem & Diemert, Eustache & Amini, Massih-Reza. (2018). Uplift Prediction with Dependent
        Feature Representation in Imbalanced Treatment and Control Conditions: 25th International Conference,
        ICONIP 2018, Siem Reap, Cambodia, December 13–16, 2018, Proceedings, Part V. 10.1007/978-3-030-04221-9_5.

        Zhao, Yan & Fang, Xiao & Simchi-Levi, David. (2017). Uplift Modeling with Multiple Treatments and
        General Response Types. 10.1137/1.9781611974973.66.

    .. _TwoModels in documentation:
        https://scikit-uplift.readthedocs.io/en/latest/api/models.html#one-model-with-treatment-as-feature
    """

    def __init__(self, estimator_trmnt, estimator_ctrl, method='vanilla'):
        self.estimator_trmnt = estimator_trmnt
        self.estimator_ctrl = estimator_ctrl
        self.method = method
        self.trmnt_preds_ = None
        self.ctrl_preds_ = None
        self._type_of_target = None

        all_methods = ['vanilla', 'ddr_control', 'ddr_treatment']
        if method not in all_methods:
            raise ValueError("Two models approach supports only methods in %s, got %s." % (all_methods, method))

        if estimator_trmnt is estimator_ctrl:
            raise ValueError('Control and Treatment estimators should be different objects.')
    def fit(self, X, y, treatment, estimator_trmnt_fit_params=None, estimator_ctrl_fit_params=None):
        """Fit the model according to the given training data.

        Train one estimator on the control group and one on the treatment group. For the dependent data
        representation methods, the predictions of the model trained first are added as a feature for the
        second model.

        Args:
            X (array-like, shape (n_samples, n_features)): Training vector, where n_samples is the number
                of samples and n_features is the number of features.
            y (array-like, shape (n_samples,)): Target vector relative to X.
            treatment (array-like, shape (n_samples,)): Binary treatment vector relative to X.
            estimator_trmnt_fit_params (dict, optional): Parameters to pass to the fit method of the treatment estimator.
            estimator_ctrl_fit_params (dict, optional): Parameters to pass to the fit method of the control estimator.

        Returns:
            object: self
        """
        # TODO: check that the treatment is binary
        check_consistent_length(X, y, treatment)
        self._type_of_target = type_of_target(y)

        X_ctrl, y_ctrl = X[treatment == 0], y[treatment == 0]
        X_trmnt, y_trmnt = X[treatment == 1], y[treatment == 1]

        if estimator_trmnt_fit_params is None:
            estimator_trmnt_fit_params = {}
        if estimator_ctrl_fit_params is None:
            estimator_ctrl_fit_params = {}

        if self.method == 'vanilla':
            self.estimator_ctrl.fit(
                X_ctrl, y_ctrl, **estimator_ctrl_fit_params
            )
            self.estimator_trmnt.fit(
                X_trmnt, y_trmnt, **estimator_trmnt_fit_params
            )

        if self.method == 'ddr_control':
            self.estimator_ctrl.fit(
                X_ctrl, y_ctrl, **estimator_ctrl_fit_params
            )
            if self._type_of_target == 'binary':
                ddr_control = self.estimator_ctrl.predict_proba(X_trmnt)[:, 1]
            else:
                ddr_control = self.estimator_ctrl.predict(X_trmnt)

            if isinstance(X_trmnt, np.ndarray):
                X_trmnt_mod = np.column_stack((X_trmnt, ddr_control))
            elif isinstance(X_trmnt, pd.core.frame.DataFrame):
                X_trmnt_mod = X_trmnt.assign(ddr_control=ddr_control)
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X_trmnt))

            self.estimator_trmnt.fit(
                X_trmnt_mod, y_trmnt, **estimator_trmnt_fit_params
            )

        if self.method == 'ddr_treatment':
            self.estimator_trmnt.fit(
                X_trmnt, y_trmnt, **estimator_trmnt_fit_params
            )
            if self._type_of_target == 'binary':
                ddr_treatment = self.estimator_trmnt.predict_proba(X_ctrl)[:, 1]
            else:
                ddr_treatment = self.estimator_trmnt.predict(X_ctrl)

            if isinstance(X_ctrl, np.ndarray):
                X_ctrl_mod = np.column_stack((X_ctrl, ddr_treatment))
            elif isinstance(X_ctrl, pd.core.frame.DataFrame):
                X_ctrl_mod = X_ctrl.assign(ddr_treatment=ddr_treatment)
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X_ctrl))

            self.estimator_ctrl.fit(
                X_ctrl_mod, y_ctrl, **estimator_ctrl_fit_params
            )

        return self
    def predict(self, X):
        """Perform uplift prediction on samples in X.

        Args:
            X (array-like, shape (n_samples, n_features)): Test vector, where n_samples is the number
                of samples and n_features is the number of features.

        Returns:
            array (shape (n_samples,)): uplift
        """
        if self.method == 'ddr_control':
            if self._type_of_target == 'binary':
                self.ctrl_preds_ = self.estimator_ctrl.predict_proba(X)[:, 1]
            else:
                self.ctrl_preds_ = self.estimator_ctrl.predict(X)

            if isinstance(X, np.ndarray):
                X_mod = np.column_stack((X, self.ctrl_preds_))
            elif isinstance(X, pd.core.frame.DataFrame):
                X_mod = X.assign(ddr_control=self.ctrl_preds_)
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X))

            if self._type_of_target == 'binary':
                self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X_mod)[:, 1]
            else:
                self.trmnt_preds_ = self.estimator_trmnt.predict(X_mod)

        elif self.method == 'ddr_treatment':
            if self._type_of_target == 'binary':
                self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X)[:, 1]
            else:
                self.trmnt_preds_ = self.estimator_trmnt.predict(X)

            if isinstance(X, np.ndarray):
                X_mod = np.column_stack((X, self.trmnt_preds_))
            elif isinstance(X, pd.core.frame.DataFrame):
                X_mod = X.assign(ddr_treatment=self.trmnt_preds_)
            else:
                raise TypeError("Expected numpy.ndarray or pandas.DataFrame, got %s" % type(X))

            if self._type_of_target == 'binary':
                self.ctrl_preds_ = self.estimator_ctrl.predict_proba(X_mod)[:, 1]
            else:
                self.ctrl_preds_ = self.estimator_ctrl.predict(X_mod)

        else:
            if self._type_of_target == 'binary':
                self.ctrl_preds_ = self.estimator_ctrl.predict_proba(X)[:, 1]
                self.trmnt_preds_ = self.estimator_trmnt.predict_proba(X)[:, 1]
            else:
                self.ctrl_preds_ = self.estimator_ctrl.predict(X)
                self.trmnt_preds_ = self.estimator_trmnt.predict(X)

        uplift = self.trmnt_preds_ - self.ctrl_preds_
        return uplift
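For completeness, a minimal end-to-end sketch of the dependent data representation usage on synthetic data. The data generation and the choice of GradientBoostingClassifier are illustrative assumptions, not part of the library's own examples.

# Minimal usage sketch (illustrative assumptions: synthetic data, GradientBoostingClassifier).
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

from sklift.models import TwoModels

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 5))
treatment = rng.integers(0, 2, size=1000)
y = (X[:, 1] + 0.3 * treatment + rng.normal(size=1000) > 0).astype(int)

tm = TwoModels(
    estimator_trmnt=GradientBoostingClassifier(random_state=0),
    estimator_ctrl=GradientBoostingClassifier(random_state=0),
    method='ddr_control',  # the control model is trained first and feeds the treatment model
)
tm.fit(X, y, treatment)
uplift = tm.predict(X)     # trmnt_preds_ - ctrl_preds_
print(uplift[:5])

Passing method='vanilla' instead fits the two estimators independently, as in the difference score method described above.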