
xgbse._debiased_bce.XGBSEDebiasedBCE

Train a set of logistic regressions on top of leaf embeddings produced by XGBoost, each predicting survival at different user-defined discrete time windows. The classifiers remove individuals as they are censored, with targets that are indicators of surviving at each window.

Note

  • Training and scoring of logistic regression models is efficient, being performed in parallel through joblib, so the model can scale to hundreds of thousands or millions of samples.
  • However, if many windows are used and data is large, training of logistic regression models may become a bottleneck, taking more time than training of the underlying XGBoost model.

Read more in [How XGBSE works](https://loft-br.github.io/xgboost-survival-embeddings/how_xgbse_works.html).
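
For orientation, a minimal usage sketch with synthetic data (the feature values, structured-array field names, and censoring rate below are purely illustrative):

import numpy as np
import pandas as pd
from xgbse import XGBSEDebiasedBCE

# toy data for illustration: 500 samples, 5 numeric features
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(500, 5)), columns=[f"f{i}" for i in range(5)])

# structured survival target: event indicator as the first field,
# observed time (event or censoring) as the second field
y = np.empty(500, dtype=[("event", bool), ("time", float)])
y["event"] = rng.random(500) < 0.7
y["time"] = rng.exponential(scale=10, size=500)

model = XGBSEDebiasedBCE()
model.fit(X, y)

# dataframe of survival curves: one row per sample, one column per time bin
survival = model.predict(X)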

Source code in xgbse/_debiased_bce.py
class XGBSEDebiasedBCE(XGBSEBaseEstimator):
    """
    Train a set of logistic regressions on top of leaf embeddings produced by XGBoost,
    each predicting survival at different user-defined discrete time windows.
    The classifiers remove individuals as they are censored,
    with targets that are indicators of surviving at each window.

    !!! Note
        * Training and scoring of logistic regression models is efficient,
        being performed in parallel through joblib, so the model can scale to
        hundreds of thousands or millions of samples.
        * However, if many windows are used and data is large, training of
        logistic regression models may become a bottleneck, taking more time
        than training of the underlying XGBoost model.

    Read more in [How XGBSE works](https://loft-br.github.io/xgboost-survival-embeddings/how_xgbse_works.html).

    """

    def __init__(
        self,
        xgb_params: Optional[Dict[str, Any]] = None,
        lr_params: Dict[str, Any] = {},
        n_jobs: int = 1,
        enable_categorical: bool = False,
    ):
        """
        Args:
            xgb_params (Dict, None): Parameters for XGBoost model.
                If None, will use XGBoost defaults and set objective as `survival:aft`.
                Check <https://xgboost.readthedocs.io/en/latest/parameter.html> for options.

            lr_params (Dict, None): Parameters for LogisticRegression model.
                If None, will use scikit-learn default parameters.

            n_jobs (int): Number of jobs used for parallel training of logistic regressions.

            enable_categorical (bool): Enable categorical feature support on xgboost model
        """
        super().__init__(xgb_params=xgb_params, enable_categorical=enable_categorical)
        self.lr_params = lr_params
        self.n_jobs = n_jobs

    def fit(
        self,
        X,
        y,
        time_bins: Optional[Sequence] = None,
        validation_data: Optional[List[Tuple[Any, Any]]] = None,
        num_boost_round: int = 10,
        early_stopping_rounds: Optional[int] = None,
        verbose_eval: int = 0,
        persist_train: bool = False,
        index_id=None,
    ):
        """
        Transform the feature space by fitting an XGBoost model and returning its leaves.
        Leaves are one-hot encoded and used as dummy variables to fit multiple
        logistic regression models, one for each evaluated time bin.

        Args:
            X ([pd.DataFrame, np.array]): Features to be used while fitting XGBoost model

            y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
                and time of event or time of censoring as second field.

            time_bins (np.array): Specified time windows to use when making survival predictions

            validation_data (List[Tuple]): Validation data in the format of a list of tuples [(X, y)]
                if user desires to use early stopping

            num_boost_round (Int): Number of boosting iterations.

            early_stopping_rounds (Int): Activates early stopping.
                Validation metric needs to improve at least once
                in every **early_stopping_rounds** round(s) to continue training.
                See xgboost.train documentation.

            verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

            persist_train (Bool): Whether or not to persist training data to use explainability
                through prototypes

            index_id (pd.Index): User defined index if intended to use explainability
                through prototypes

        Returns:
            XGBSEDebiasedBCE: Trained XGBSEDebiasedBCE instance
        """
        self.fit_feature_extractor(
            X,
            y,
            time_bins=time_bins,
            validation_data=validation_data,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
        )

        E_train, T_train = convert_y(y)
        # predicting and encoding leaves
        self.encoder = OneHotEncoder()
        leaves = self.feature_extractor.predict_leaves(X)
        leaves_encoded = self.encoder.fit_transform(leaves)

        # convert targets for use with logistic regression:
        # a (n_samples, n_bins) matrix where -1 marks individuals
        # already censored at that time bin
        self.targets, self.time_bins = _build_multi_task_targets(
            E_train, T_train, self.feature_extractor.time_bins
        )

        # fitting LR for several targets
        self.lr_estimators_ = self._fit_all_lr(leaves_encoded, self.targets)

        if persist_train:
            self.persist_train = True
            if index_id is None:
                index_id = X.index.copy()
            self.tree = BallTree(leaves, metric="hamming")

        self.index_id = index_id

        return self

    def _fit_one_lr(self, leaves_encoded, target):
        """
        Fits a single logistic regression to predict survival probability
        at a certain time bin as target. Encoded leaves are used as features.

        Args:
            leaves_encoded (np.array): A tensor of one hot encoded leaves.

            target (np.array): An array of time targets for a specific time bin.

        Returns:
            lr (sklearn.linear_model.LogisticRegression): A fitted Logistic
            Regression model. This model outputs calibrated survival probabilities
            at a given time T.
        """

        # target == -1 flags individuals already censored at this time bin;
        # they are excluded from this classifier's training set
        mask = target != -1

        # by default we use a logistic regression
        classifier = LogisticRegression(**self.lr_params)

        if len(target[mask]) == 0:
            # If there's no observation in a time bucket we raise an error
            raise ValueError("Error: No observations in a time bucket")
        elif len(np.unique(target[mask])) == 1:
            # If there's only one class in a time bucket
            # we create a dummy classifier that predicts that class and send a warning
            warnings.warn(
                "Warning: Only one class found in a time bucket", RuntimeWarning
            )
            classifier = DummyLogisticRegression()

        classifier.fit(leaves_encoded[mask, :], target[mask])
        return classifier

    def _fit_all_lr(self, leaves_encoded, targets):
        """
        Fits multiple Logistic Regressions to predict survival probability
        for a list of time bins as target. Encoded leaves are used as features.

        Args:
            leaves_encoded (np.array): A tensor of one hot encoded leaves.

            targets (np.array): A 2D array of time targets, one column per time bin.

        Returns:
            lr_estimators (List): A list of fitted Logistic Regression models.
                These models output calibrated survival probabilities for all times
                in pre specified time bins.
        """

        with Parallel(n_jobs=self.n_jobs) as parallel:
            lr_estimators = parallel(
                delayed(self._fit_one_lr)(leaves_encoded, targets[:, i])
                for i in range(targets.shape[1])
            )

        return lr_estimators

    def _predict_from_lr_list(self, lr_estimators, leaves_encoded, time_bins):
        """
        Predicts survival probabilities from a list of fitted
        Logistic Regression models. Encoded leaves are used as features.

        Args:
            lr_estimators (List): A list of fitted Logistic Regression models.
            These models output calibrated survival probabilities for all times
            in pre specified time bins.

            leaves_encoded (np.array): A tensor of one hot encoded leaves.

            time_bins (np.array): Specified time bins to split targets.

        Returns:
            preds (pd.DataFrame): A dataframe of estimated survival probabilities
                for all times (columns), from the time_bins array, for all samples
                (rows).
        """

        with Parallel(n_jobs=self.n_jobs) as parallel:
            preds = parallel(
                delayed(m.predict_proba)(leaves_encoded) for m in lr_estimators
            )

        # organizing interval predictions from LRs
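        # each predict_proba output has shape (n_samples, 2); keep the
        # positive-class column and transpose to (n_samples, n_bins)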
        preds = np.array(preds)[:, :, 1].T
        preds = pd.DataFrame(preds, columns=time_bins)

        # converting these interval predictions
        # to cumulative survival curve
        return hazard_to_survival(preds)

    def predict(self, X: pd.DataFrame, return_interval_probs: bool = False):
        """
        Predicts survival probabilities using XGBoost + Logistic Regression pipeline.

        Args:
            X (pd.DataFrame): Dataframe of features to be used as input for the
                XGBoost model.

            return_interval_probs (Bool): Boolean indicating if interval probabilities
                are to be returned. If False the cumulative survival is returned.
                Default is False.

        Returns:
            pd.DataFrame: A dataframe of survival probabilities
            for all times (columns), from a time_bins array, for all samples of X
            (rows). If return_interval_probs is True, the interval probabilities are
            returned instead of the cumulative survival probabilities.
        """

        # predicting leaves with the trained XGBoost model
        leaves = self.feature_extractor.predict_leaves(X)

        leaves_encoded = self.encoder.transform(leaves)

        # predicting from logistic regression artifacts

        preds_df = self._predict_from_lr_list(
            self.lr_estimators_, leaves_encoded, self.time_bins
        )

        if return_interval_probs:
            preds_df = calculate_interval_failures(preds_df)

        return preds_df

__init__(self, xgb_params=None, lr_params={}, n_jobs=1, enable_categorical=False)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `xgb_params` | `Dict`, `None` | Parameters for the XGBoost model. If None, XGBoost defaults are used and the objective is set to `survival:aft`. See <https://xgboost.readthedocs.io/en/latest/parameter.html> for options. | `None` |
| `lr_params` | `Dict`, `None` | Parameters for the LogisticRegression model. If None, scikit-learn default parameters are used. | `{}` |
| `n_jobs` | `int` | Number of jobs used for parallel training of the logistic regressions. | `1` |
| `enable_categorical` | `bool` | Enable categorical feature support in the XGBoost model. | `False` |
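
For instance, a hypothetical configuration (the parameter values below are illustrative, not tuned recommendations):

from xgbse import XGBSEDebiasedBCE

xgb_params = {
    "objective": "survival:aft",  # the objective xgbse sets by default
    "learning_rate": 0.05,
    "max_depth": 4,
}
lr_params = {"C": 0.5, "max_iter": 500}  # forwarded to sklearn's LogisticRegression

model = XGBSEDebiasedBCE(
    xgb_params=xgb_params,
    lr_params=lr_params,
    n_jobs=-1,  # parallelize the per-bin logistic regressions across all cores
)
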
Source code in xgbse/_debiased_bce.py
def __init__(
    self,
    xgb_params: Optional[Dict[str, Any]] = None,
    lr_params: Dict[str, Any] = {},
    n_jobs: int = 1,
    enable_categorical: bool = False,
):
    """
    Args:
        xgb_params (Dict, None): Parameters for XGBoost model.
            If None, will use XGBoost defaults and set objective as `survival:aft`.
            Check <https://xgboost.readthedocs.io/en/latest/parameter.html> for options.

        lr_params (Dict, None): Parameters for LogisticRegression model.
            If None, will use scikit-learn default parameters.

        n_jobs (int): Number of jobs used for parallel training of logistic regressions.

        enable_categorical (bool): Enable categorical feature support on xgboost model
    """
    super().__init__(xgb_params=xgb_params, enable_categorical=enable_categorical)
    self.lr_params = lr_params
    self.n_jobs = n_jobs

fit(self, X, y, time_bins=None, validation_data=None, num_boost_round=10, early_stopping_rounds=None, verbose_eval=0, persist_train=False, index_id=None)

Transform the feature space by fitting an XGBoost model and returning its leaves. Leaves are one-hot encoded and used as dummy variables to fit multiple logistic regression models, one for each evaluated time bin.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `X` | `pd.DataFrame`, `np.array` | Features to be used while fitting the XGBoost model. | required |
| `y` | structured array (`numpy.bool_`, `numpy.number`) | Binary event indicator as the first field, and time of event or time of censoring as the second field. | required |
| `time_bins` | `np.array` | Specified time windows to use when making survival predictions. | `None` |
| `validation_data` | `List[Tuple]` | Validation data as a list of tuples `[(X, y)]`, required if early stopping is desired. | `None` |
| `num_boost_round` | `int` | Number of boosting iterations. | `10` |
| `early_stopping_rounds` | `int` | Activates early stopping. The validation metric needs to improve at least once every `early_stopping_rounds` round(s) to continue training. See the `xgboost.train` documentation. | `None` |
| `verbose_eval` | `bool`, `int` | Level of verbosity. See the `xgboost.train` documentation. | `0` |
| `persist_train` | `bool` | Whether to persist training data for explainability through prototypes. | `False` |
| `index_id` | `pd.Index` | User-defined index, used for explainability through prototypes. | `None` |

Returns:

| Type | Description |
| --- | --- |
| `XGBSEDebiasedBCE` | Trained XGBSEDebiasedBCE instance. |
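
A sketch of a fit call with early stopping, assuming X_train, y_train, X_valid, and y_valid have been prepared as in the class-level example above (the time grid and round counts are illustrative):

import numpy as np

time_bins = np.arange(1, 31)  # evaluate survival at t = 1, ..., 30

model.fit(
    X_train,
    y_train,
    time_bins=time_bins,
    validation_data=[(X_valid, y_valid)],
    num_boost_round=200,
    early_stopping_rounds=10,  # stop if the validation metric stalls
)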

Source code in xgbse/_debiased_bce.py
def fit(
    self,
    X,
    y,
    time_bins: Optional[Sequence] = None,
    validation_data: Optional[List[Tuple[Any, Any]]] = None,
    num_boost_round: int = 10,
    early_stopping_rounds: Optional[int] = None,
    verbose_eval: int = 0,
    persist_train: bool = False,
    index_id=None,
):
    """
    Transform the feature space by fitting an XGBoost model and returning its leaves.
    Leaves are one-hot encoded and used as dummy variables to fit multiple
    logistic regression models, one for each evaluated time bin.

    Args:
        X ([pd.DataFrame, np.array]): Features to be used while fitting XGBoost model

        y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
            and time of event or time of censoring as second field.

        time_bins (np.array): Specified time windows to use when making survival predictions

        validation_data (List[Tuple]): Validation data in the format of a list of tuples [(X, y)]
            if user desires to use early stopping

        num_boost_round (Int): Number of boosting iterations.

        early_stopping_rounds (Int): Activates early stopping.
            Validation metric needs to improve at least once
            in every **early_stopping_rounds** round(s) to continue training.
            See xgboost.train documentation.

        verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.

        persist_train (Bool): Whether or not to persist training data to use explainability
            through prototypes

        index_id (pd.Index): User defined index if intended to use explainability
            through prototypes

    Returns:
        XGBSEDebiasedBCE: Trained XGBSEDebiasedBCE instance
    """
    self.fit_feature_extractor(
        X,
        y,
        time_bins=time_bins,
        validation_data=validation_data,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=verbose_eval,
    )

    E_train, T_train = convert_y(y)
    # predicting and encoding leaves
    self.encoder = OneHotEncoder()
    leaves = self.feature_extractor.predict_leaves(X)
    leaves_encoded = self.encoder.fit_transform(leaves)

    # convert targets for use with logistic regression:
    # a (n_samples, n_bins) matrix where -1 marks individuals
    # already censored at that time bin
    self.targets, self.time_bins = _build_multi_task_targets(
        E_train, T_train, self.feature_extractor.time_bins
    )

    # fitting LR for several targets
    self.lr_estimators_ = self._fit_all_lr(leaves_encoded, self.targets)

    if persist_train:
        self.persist_train = True
        if index_id is None:
            index_id = X.index.copy()
        self.tree = BallTree(leaves, metric="hamming")

    self.index_id = index_id

    return self

predict(self, X, return_interval_probs=False)

Predicts survival probabilities using XGBoost + Logistic Regression pipeline.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `X` | `pd.DataFrame` | Dataframe of features to be used as input for the XGBoost model. | required |
| `return_interval_probs` | `bool` | Whether interval probabilities are to be returned. If False, the cumulative survival curve is returned. | `False` |

Returns:

| Type | Description |
| --- | --- |
| `pd.DataFrame` | A dataframe of survival probabilities for all times (columns), from a time_bins array, for all samples of X (rows). If `return_interval_probs` is True, the interval probabilities are returned instead of the cumulative survival probabilities. |
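
Both output modes in a sketch (X_test is assumed to be prepared like the training features); the interval probabilities express the chance of failing within each bin rather than the cumulative survival up to it:

# cumulative survival probabilities S(t), one column per fitted time bin
survival = model.predict(X_test)

# per-interval failure probabilities instead of the cumulative curve
interval_probs = model.predict(X_test, return_interval_probs=True)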

Source code in xgbse/_debiased_bce.py
def predict(self, X: pd.DataFrame, return_interval_probs: bool = False):
    """
    Predicts survival probabilities using XGBoost + Logistic Regression pipeline.

    Args:
        X (pd.DataFrame): Dataframe of features to be used as input for the
            XGBoost model.

        return_interval_probs (Bool): Boolean indicating if interval probabilities
            are to be returned. If False the cumulative survival is returned.
            Default is False.

    Returns:
        pd.DataFrame: A dataframe of survival probabilities
        for all times (columns), from a time_bins array, for all samples of X
        (rows). If return_interval_probs is True, the interval probabilities are
        returned instead of the cumulative survival probabilities.
    """

    # predicting leaves with the trained XGBoost model
    leaves = self.feature_extractor.predict_leaves(X)

    leaves_encoded = self.encoder.transform(leaves)

    # predicting from logistic regression artifacts

    preds_df = self._predict_from_lr_list(
        self.lr_estimators_, leaves_encoded, self.time_bins
    )

    if return_interval_probs:
        preds_df = calculate_interval_failures(preds_df)

    return preds_df

set_fit_request(self, *, early_stopping_rounds='$UNCHANGED$', index_id='$UNCHANGED$', num_boost_round='$UNCHANGED$', persist_train='$UNCHANGED$', time_bins='$UNCHANGED$', validation_data='$UNCHANGED$', verbose_eval='$UNCHANGED$')

Request metadata passed to the fit method.

Note that this method is only relevant if `enable_metadata_routing=True` (see `sklearn.set_config`). Please see the scikit-learn User Guide on metadata routing for information on how the routing mechanism works.

The options for each parameter are:

  • True: metadata is requested, and passed to fit if provided. The request is ignored if metadata is not provided.

  • False: metadata is not requested and the meta-estimator will not pass it to fit.

  • None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

  • str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in scikit-learn 1.3.

Note: This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. inside a `sklearn.pipeline.Pipeline`. Otherwise it has no effect.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `early_stopping_rounds` | `str`, `True`, `False`, or `None` | Metadata routing for the `early_stopping_rounds` parameter in `fit`. | `UNCHANGED` |
| `index_id` | `str`, `True`, `False`, or `None` | Metadata routing for the `index_id` parameter in `fit`. | `UNCHANGED` |
| `num_boost_round` | `str`, `True`, `False`, or `None` | Metadata routing for the `num_boost_round` parameter in `fit`. | `UNCHANGED` |
| `persist_train` | `str`, `True`, `False`, or `None` | Metadata routing for the `persist_train` parameter in `fit`. | `UNCHANGED` |
| `time_bins` | `str`, `True`, `False`, or `None` | Metadata routing for the `time_bins` parameter in `fit`. | `UNCHANGED` |
| `validation_data` | `str`, `True`, `False`, or `None` | Metadata routing for the `validation_data` parameter in `fit`. | `UNCHANGED` |
| `verbose_eval` | `str`, `True`, `False`, or `None` | Metadata routing for the `verbose_eval` parameter in `fit`. | `UNCHANGED` |

Returns:

| Type | Description |
| --- | --- |
| `object` | The updated object (`self`). |
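
A sketch of typical usage, assuming this estimator is wrapped by a scikit-learn meta-estimator that supports metadata routing:

import sklearn
from xgbse import XGBSEDebiasedBCE

# metadata routing is opt-in (scikit-learn >= 1.3)
sklearn.set_config(enable_metadata_routing=True)

# request that a `persist_train` value given to the wrapping meta-estimator
# be routed through to this estimator's fit method
model = XGBSEDebiasedBCE().set_fit_request(persist_train=True)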

Source code in xgbse/_debiased_bce.py
def func(*args, **kw):
    """Updates the request for provided parameters

    This docstring is overwritten below.
    See REQUESTER_DOC for expected functionality
    """
    if not _routing_enabled():
        raise RuntimeError(
            "This method is only available when metadata routing is enabled."
            " You can enable it using"
            " sklearn.set_config(enable_metadata_routing=True)."
        )

    if self.validate_keys and (set(kw) - set(self.keys)):
        raise TypeError(
            f"Unexpected args: {set(kw) - set(self.keys)} in {self.name}. "
            f"Accepted arguments are: {set(self.keys)}"
        )

    # This makes it possible to use the decorated method as an unbound method,
    # for instance when monkeypatching.
    # https://github.com/scikit-learn/scikit-learn/issues/28632
    if instance is None:
        _instance = args[0]
        args = args[1:]
    else:
        _instance = instance

    # Replicating python's behavior when positional args are given other than
    # `self`, and `self` is only allowed if this method is unbound.
    if args:
        raise TypeError(
            f"set_{self.name}_request() takes 0 positional argument but"
            f" {len(args)} were given"
        )

    requests = _instance._get_metadata_request()
    method_metadata_request = getattr(requests, self.name)

    for prop, alias in kw.items():
        if alias is not UNCHANGED:
            method_metadata_request.add_request(param=prop, alias=alias)
    _instance._metadata_request = requests

    return _instance

set_predict_request(self, *, return_interval_probs='$UNCHANGED$')

Request metadata passed to the predict method.

Note that this method is only relevant if `enable_metadata_routing=True` (see `sklearn.set_config`). Please see the scikit-learn User Guide on metadata routing for information on how the routing mechanism works.

The options for each parameter are:

  • True: metadata is requested, and passed to predict if provided. The request is ignored if metadata is not provided.

  • False: metadata is not requested and the meta-estimator will not pass it to predict.

  • None: metadata is not requested, and the meta-estimator will raise an error if the user provides it.

  • str: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (sklearn.utils.metadata_routing.UNCHANGED) retains the existing request. This allows you to change the request for some parameters and not others.

Added in scikit-learn 1.3.

Note: This method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. inside a `sklearn.pipeline.Pipeline`. Otherwise it has no effect.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `return_interval_probs` | `str`, `True`, `False`, or `None` | Metadata routing for the `return_interval_probs` parameter in `predict`. | `UNCHANGED` |

Returns:

| Type | Description |
| --- | --- |
| `object` | The updated object (`self`). |
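
Analogously for predict, under the same assumptions as the set_fit_request sketch above:

# route `return_interval_probs` through a meta-estimator to predict
model = model.set_predict_request(return_interval_probs=True)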

Source code in xgbse/_debiased_bce.py
def func(*args, **kw):
    """Updates the request for provided parameters

    This docstring is overwritten below.
    See REQUESTER_DOC for expected functionality
    """
    if not _routing_enabled():
        raise RuntimeError(
            "This method is only available when metadata routing is enabled."
            " You can enable it using"
            " sklearn.set_config(enable_metadata_routing=True)."
        )

    if self.validate_keys and (set(kw) - set(self.keys)):
        raise TypeError(
            f"Unexpected args: {set(kw) - set(self.keys)} in {self.name}. "
            f"Accepted arguments are: {set(self.keys)}"
        )

    # This makes it possible to use the decorated method as an unbound method,
    # for instance when monkeypatching.
    # https://github.com/scikit-learn/scikit-learn/issues/28632
    if instance is None:
        _instance = args[0]
        args = args[1:]
    else:
        _instance = instance

    # Replicating python's behavior when positional args are given other than
    # `self`, and `self` is only allowed if this method is unbound.
    if args:
        raise TypeError(
            f"set_{self.name}_request() takes 0 positional argument but"
            f" {len(args)} were given"
        )

    requests = _instance._get_metadata_request()
    method_metadata_request = getattr(requests, self.name)

    for prop, alias in kw.items():
        if alias is not UNCHANGED:
            method_metadata_request.add_request(param=prop, alias=alias)
    _instance._metadata_request = requests

    return _instance