xgbse._debiased_bce.XGBSEDebiasedBCE
Train a set of logistic regressions on top of leaf embeddings produced by XGBoost, each predicting survival at different user-defined discrete time windows. The classifiers remove individuals as they are censored, with targets that are indicators of surviving at each window.
Note
- Training and scoring of logistic regression models is efficient, being performed in parallel through joblib, so the model can scale to hundreds of thousands or millions of samples.
- However, if many windows are used and data is large, training of logistic regression models may become a bottleneck, taking more time than training of the underlying XGBoost model.
Read more in How XGBSE works.
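For orientation, a minimal end-to-end sketch using the public xgbse API; `X`, `T` (durations) and `E` (event indicators) are placeholder data, and `convert_to_structured` is the converter shipped with xgbse:

```python
import numpy as np
import pandas as pd

from xgbse import XGBSEDebiasedBCE
from xgbse.converters import convert_to_structured

# placeholder data: features, observed times and event indicators
X = pd.DataFrame(np.random.rand(500, 5), columns=[f"f{i}" for i in range(5)])
T = np.random.uniform(1, 100, 500)                 # time of event or censoring
E = np.random.binomial(1, 0.7, 500).astype(bool)   # True = event, False = censored

y = convert_to_structured(T, E)

model = XGBSEDebiasedBCE()
model.fit(X, y)

survival_curves = model.predict(X)  # one row per sample, one column per time bin
```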
Source code in xgbse/_debiased_bce.py
class XGBSEDebiasedBCE(XGBSEBaseEstimator):
"""
Train a set of logistic regressions on top of leaf embeddings produced by XGBoost,
each predicting survival at different user-defined discrete time windows.
The classifiers remove individuals as they are censored,
with targets that are indicators of surviving at each window.
!!! Note
* Training and scoring of logistic regression models is efficient,
being performed in parallel through joblib, so the model can scale to
hundreds of thousands or millions of samples.
* However, if many windows are used and data is large, training of
logistic regression models may become a bottleneck, taking more time
than training of the underlying XGBoost model.
Read more in [How XGBSE works](https://loft-br.github.io/xgboost-survival-embeddings/how_xgbse_works.html).
"""
def __init__(
self,
xgb_params: Optional[Dict[str, Any]] = None,
lr_params: Dict[str, Any] = {},
n_jobs: int = 1,
enable_categorical: bool = False,
):
"""
Args:
xgb_params (Dict, None): Parameters for XGBoost model.
If None, will use XGBoost defaults and set objective as `survival:aft`.
Check <https://xgboost.readthedocs.io/en/latest/parameter.html> for options.
lr_params (Dict, None): Parameters for LogisticRegression model.
If None, will use scikit-learn default parameters.
n_jobs (int): Number of jobs used for parallel training of logistic regressions.
enable_categorical (bool): Enable categorical feature support on xgboost model
"""
super().__init__(xgb_params=xgb_params, enable_categorical=enable_categorical)
self.lr_params = lr_params
self.n_jobs = n_jobs
def fit(
self,
X,
y,
time_bins: Optional[Sequence] = None,
validation_data: Optional[List[Tuple[Any, Any]]] = None,
num_boost_round: int = 10,
early_stopping_rounds: Optional[int] = None,
verbose_eval: int = 0,
persist_train: bool = False,
index_id=None,
):
"""
Transform feature space by fitting an XGBoost model and returning its leaves.
Leaves are transformed and considered as dummy variables to fit multiple
logistic regression models to each evaluated time bin.
Args:
X ([pd.DataFrame, np.array]): Features to be used while fitting XGBoost model
y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
and time of event or time of censoring as second field.
time_bins (np.array): Specified time windows to use when making survival predictions
validation_data (List[Tuple]): Validation data in the format of a list of tuples [(X, y)]
if user desires to use early stopping
num_boost_round (Int): Number of boosting iterations.
early_stopping_rounds (Int): Activates early stopping.
Validation metric needs to improve at least once
in every **early_stopping_rounds** round(s) to continue training.
See xgboost.train documentation.
verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.
persist_train (Bool): Whether or not to persist training data to use explainability
through prototypes
index_id (pd.Index): User defined index if intended to use explainability
through prototypes
Returns:
XGBSEDebiasedBCE: Trained XGBSEDebiasedBCE instance
"""
self.fit_feature_extractor(
X,
y,
time_bins=time_bins,
validation_data=validation_data,
num_boost_round=num_boost_round,
early_stopping_rounds=early_stopping_rounds,
verbose_eval=verbose_eval,
)
E_train, T_train = convert_y(y)
# predicting and encoding leaves
self.encoder = OneHotEncoder()
leaves = self.feature_extractor.predict_leaves(X)
leaves_encoded = self.encoder.fit_transform(leaves)
# convert targets for using with logistic regression
self.targets, self.time_bins = _build_multi_task_targets(
E_train, T_train, self.feature_extractor.time_bins
)
# fitting LR for several targets
self.lr_estimators_ = self._fit_all_lr(leaves_encoded, self.targets)
if persist_train:
self.persist_train = True
if index_id is None:
index_id = X.index.copy()
self.tree = BallTree(leaves, metric="hamming")
self.index_id = index_id
return self
def _fit_one_lr(self, leaves_encoded, target):
"""
Fits a single logistic regression to predict survival probability
at a certain time bin as target. Encoded leaves are used as features.
Args:
leaves_encoded (np.array): A tensor of one hot encoded leaves.
target (np.array): An array of time targets for a specific time bin.
Returns:
lr (sklearn.linear_model.LogisticRegression): A fitted Logistic
Regression model. This model outputs calibrated survival probabilities
on a time T.
"""
# masking
mask = target != -1
# by default we use a logistic regression
classifier = LogisticRegression(**self.lr_params)
if len(target[mask]) == 0:
# If there's no observation in a time bucket we raise an error
raise ValueError("Error: No observations in a time bucket")
elif len(np.unique(target[mask])) == 1:
# If there's only one class in a time bucket
# we create a dummy classifier that predicts that class and send a warning
warnings.warn(
"Warning: Only one class found in a time bucket", RuntimeWarning
)
classifier = DummyLogisticRegression()
classifier.fit(leaves_encoded[mask, :], target[mask])
return classifier
def _fit_all_lr(self, leaves_encoded, targets):
"""
Fits multiple Logistic Regressions to predict survival probability
for a list of time bins as target. Encoded leaves are used as features.
Args:
leaves_encoded (np.array): A tensor of one hot encoded leaves.
targets (np.array): A 2D array of time targets, one column per evaluated time bin.
Returns:
lr_estimators (List): A list of fitted Logistic Regression models.
These models output calibrated survival probabilities for all times
in pre specified time bins.
"""
with Parallel(n_jobs=self.n_jobs) as parallel:
lr_estimators = parallel(
delayed(self._fit_one_lr)(leaves_encoded, targets[:, i])
for i in range(targets.shape[1])
)
return lr_estimators
def _predict_from_lr_list(self, lr_estimators, leaves_encoded, time_bins):
"""
Predicts survival probabilities from a list of multiple fitted
Logistic Regressions models. Encoded leaves are used as features.
Args:
lr_estimators (List): A list of fitted Logistic Regression models.
These models output calibrated survival probabilities for all times
in pre specified time bins.
leaves_encoded (np.array): A tensor of one hot encoded leaves.
time_bins (np.array): Specified time bins to split targets.
Returns:
preds (pd.DataFrame): A dataframe of estimated survival probabilities
for all times (columns), from the time_bins array, for all samples
(rows).
"""
with Parallel(n_jobs=self.n_jobs) as parallel:
preds = parallel(
delayed(m.predict_proba)(leaves_encoded) for m in lr_estimators
)
# organizing interval predictions from LRs
preds = np.array(preds)[:, :, 1].T
preds = pd.DataFrame(preds, columns=time_bins)
# converting these interval predictions
# to cumulative survival curve
return hazard_to_survival(preds)
def predict(self, X: pd.DataFrame, return_interval_probs: bool = False):
"""
Predicts survival probabilities using XGBoost + Logistic Regression pipeline.
Args:
X (pd.DataFrame): Dataframe of features to be used as input for the
XGBoost model.
return_interval_probs (Bool): Boolean indicating if interval probabilities
are to be returned. If False the cumulative survival is returned.
Default is False.
Returns:
pd.DataFrame: A dataframe of survival probabilities
for all times (columns), from a time_bins array, for all samples of X
(rows). If return_interval_probs is True, the interval probabilities are
returned instead of the cumulative survival probabilities.
"""
# getting leaves and one-hot encoding them
leaves = self.feature_extractor.predict_leaves(X)
leaves_encoded = self.encoder.transform(leaves)
# predicting from logistic regression artifacts
preds_df = self._predict_from_lr_list(
self.lr_estimators_, leaves_encoded, self.time_bins
)
if return_interval_probs:
preds_df = calculate_interval_failures(preds_df)
return preds_df
__init__(self, xgb_params=None, lr_params={}, n_jobs=1, enable_categorical=False)
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `xgb_params` | `Dict, None` | Parameters for the XGBoost model. If None, will use XGBoost defaults and set the objective as `survival:aft`. Check <https://xgboost.readthedocs.io/en/latest/parameter.html> for options. | `None` |
| `lr_params` | `Dict, None` | Parameters for the LogisticRegression model. If None, will use scikit-learn default parameters. | `{}` |
| `n_jobs` | `int` | Number of jobs used for parallel training of the logistic regressions. | `1` |
| `enable_categorical` | `bool` | Enable categorical feature support in the XGBoost model. | `False` |
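A minimal sketch of customizing the constructor; the specific values below are illustrative, not tuned recommendations:

```python
from xgbse import XGBSEDebiasedBCE

model = XGBSEDebiasedBCE(
    xgb_params={
        "objective": "survival:aft",  # keep the AFT objective used by default
        "learning_rate": 0.05,        # illustrative value
        "max_depth": 4,               # illustrative value
    },
    lr_params={"C": 1.0, "max_iter": 500},  # forwarded to sklearn's LogisticRegression
    n_jobs=4,  # per-time-bin logistic regressions are fit in parallel via joblib
)
```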
Source code in xgbse/_debiased_bce.py
def __init__(
self,
xgb_params: Optional[Dict[str, Any]] = None,
lr_params: Dict[str, Any] = {},
n_jobs: int = 1,
enable_categorical: bool = False,
):
"""
Args:
xgb_params (Dict, None): Parameters for XGBoost model.
If None, will use XGBoost defaults and set objective as `survival:aft`.
Check <https://xgboost.readthedocs.io/en/latest/parameter.html> for options.
lr_params (Dict, None): Parameters for LogisticRegression model.
If None, will use scikit-learn default parameters.
n_jobs (int): Number of jobs used for parallel training of logistic regressions.
enable_categorical (bool): Enable categorical feature support on xgboost model
"""
super().__init__(xgb_params=xgb_params, enable_categorical=enable_categorical)
self.lr_params = lr_params
self.n_jobs = n_jobs
fit(self, X, y, time_bins=None, validation_data=None, num_boost_round=10, early_stopping_rounds=None, verbose_eval=0, persist_train=False, index_id=None)
Transform feature space by fitting an XGBoost model and returning its leaves. Leaves are transformed and considered as dummy variables to fit multiple logistic regression models, one for each evaluated time bin.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `X` | `pd.DataFrame, np.array` | Features to be used while fitting the XGBoost model. | required |
| `y` | `structured array (numpy.bool_, numpy.number)` | Binary event indicator as first field, and time of event or time of censoring as second field. | required |
| `time_bins` | `np.array` | Specified time windows to use when making survival predictions. | `None` |
| `validation_data` | `List[Tuple]` | Validation data in the format of a list of tuples [(X, y)], if the user desires to use early stopping. | `None` |
| `num_boost_round` | `Int` | Number of boosting iterations. | `10` |
| `early_stopping_rounds` | `Int` | Activates early stopping. The validation metric needs to improve at least once every `early_stopping_rounds` round(s) to continue training. See the xgboost.train documentation. | `None` |
| `verbose_eval` | `Bool, Int` | Level of verbosity. See the xgboost.train documentation. | `0` |
| `persist_train` | `Bool` | Whether or not to persist training data to use explainability through prototypes. | `False` |
| `index_id` | `pd.Index` | User-defined index, if intending to use explainability through prototypes. | `None` |

Returns:

| Type | Description |
|---|---|
| `XGBSEDebiasedBCE` | Trained XGBSEDebiasedBCE instance. |
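For illustration, a sketch of a full `fit` call; `X_train`, `y_train`, `X_valid`, `y_valid` and `T_train` are placeholders, and quantile-based time bins are one common choice rather than a library requirement:

```python
import numpy as np

# nine time windows at evenly spaced quantiles of the observed training times
time_bins = np.quantile(T_train, np.linspace(0.1, 0.9, 9))

model = model.fit(
    X_train,
    y_train,  # structured array: (event indicator, time)
    time_bins=time_bins,
    validation_data=[(X_valid, y_valid)],  # monitored when early stopping is active
    num_boost_round=200,
    early_stopping_rounds=10,
    persist_train=True,  # keep a BallTree over training leaves for prototype explanations
)
```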
Source code in xgbse/_debiased_bce.py
def fit(
self,
X,
y,
time_bins: Optional[Sequence] = None,
validation_data: Optional[List[Tuple[Any, Any]]] = None,
num_boost_round: int = 10,
early_stopping_rounds: Optional[int] = None,
verbose_eval: int = 0,
persist_train: bool = False,
index_id=None,
):
"""
Transform feature space by fitting an XGBoost model and returning its leaves.
Leaves are transformed and considered as dummy variables to fit multiple
logistic regression models to each evaluated time bin.
Args:
X ([pd.DataFrame, np.array]): Features to be used while fitting XGBoost model
y (structured array(numpy.bool_, numpy.number)): Binary event indicator as first field,
and time of event or time of censoring as second field.
time_bins (np.array): Specified time windows to use when making survival predictions
validation_data (List[Tuple]): Validation data in the format of a list of tuples [(X, y)]
if user desires to use early stopping
num_boost_round (Int): Number of boosting iterations.
early_stopping_rounds (Int): Activates early stopping.
Validation metric needs to improve at least once
in every **early_stopping_rounds** round(s) to continue training.
See xgboost.train documentation.
verbose_eval ([Bool, Int]): Level of verbosity. See xgboost.train documentation.
persist_train (Bool): Whether or not to persist training data to use explainability
through prototypes
index_id (pd.Index): User defined index if intended to use explainability
through prototypes
Returns:
XGBSEDebiasedBCE: Trained XGBSEDebiasedBCE instance
"""
self.fit_feature_extractor(
X,
y,
time_bins=time_bins,
validation_data=validation_data,
num_boost_round=num_boost_round,
early_stopping_rounds=early_stopping_rounds,
verbose_eval=verbose_eval,
)
E_train, T_train = convert_y(y)
# predicting and encoding leaves
self.encoder = OneHotEncoder()
leaves = self.feature_extractor.predict_leaves(X)
leaves_encoded = self.encoder.fit_transform(leaves)
# convert targets for using with logistic regression
self.targets, self.time_bins = _build_multi_task_targets(
E_train, T_train, self.feature_extractor.time_bins
)
# fitting LR for several targets
self.lr_estimators_ = self._fit_all_lr(leaves_encoded, self.targets)
if persist_train:
self.persist_train = True
if index_id is None:
index_id = X.index.copy()
self.tree = BallTree(leaves, metric="hamming")
self.index_id = index_id
return self
predict(self, X, return_interval_probs=False)
Predicts survival probabilities using XGBoost + Logistic Regression pipeline.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| `X` | `pd.DataFrame` | Dataframe of features to be used as input for the XGBoost model. | required |
| `return_interval_probs` | `Bool` | Whether interval probabilities should be returned. If False, the cumulative survival curve is returned. | `False` |

Returns:

| Type | Description |
|---|---|
| `pd.DataFrame` | A dataframe of survival probabilities for all times (columns), from a time_bins array, for all samples of X (rows). If return_interval_probs is True, interval probabilities are returned instead of cumulative survival probabilities. |
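Both output modes, sketched with a placeholder `X_test`:

```python
# cumulative survival probabilities: rows = samples, columns = time bins
survival = model.predict(X_test)

# per-window failure probabilities instead of the cumulative curve
interval_probs = model.predict(X_test, return_interval_probs=True)

# e.g. estimated probability that the first sample survives past the third window
p_third_window = survival.iloc[0, 2]
```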
Source code in xgbse/_debiased_bce.py
def predict(self, X: pd.DataFrame, return_interval_probs: bool = False):
"""
Predicts survival probabilities using XGBoost + Logistic Regression pipeline.
Args:
X (pd.DataFrame): Dataframe of features to be used as input for the
XGBoost model.
return_interval_probs (Bool): Boolean indicating if interval probabilities
are to be returned. If False the cumulative survival is returned.
Default is False.
Returns:
pd.DataFrame: A dataframe of survival probabilities
for all times (columns), from a time_bins array, for all samples of X
(rows). If return_interval_probs is True, the interval probabilities are
returned instead of the cumulative survival probabilities.
"""
# getting leaves and one-hot encoding them
leaves = self.feature_extractor.predict_leaves(X)
leaves_encoded = self.encoder.transform(leaves)
# predicting from logistic regression artifacts
preds_df = self._predict_from_lr_list(
self.lr_estimators_, leaves_encoded, self.time_bins
)
if return_interval_probs:
preds_df = calculate_interval_failures(preds_df)
return preds_df
set_fit_request(self, *, early_stopping_rounds='$UNCHANGED$', index_id='$UNCHANGED$', num_boost_round='$UNCHANGED$', persist_train='$UNCHANGED$', time_bins='$UNCHANGED$', validation_data='$UNCHANGED$', verbose_eval='$UNCHANGED$')
Request metadata passed to the `fit` method.

Note that this method is only relevant if `enable_metadata_routing=True` (see `sklearn.set_config`). Please see the scikit-learn User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

- `True`: metadata is requested, and passed to `fit` if provided. The request is ignored if metadata is not provided.
- `False`: metadata is not requested and the meta-estimator will not pass it to `fit`.
- `None`: metadata is not requested, and the meta-estimator will raise an error if the user provides it.
- `str`: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (`sklearn.utils.metadata_routing.UNCHANGED`) retains the existing request. This allows you to change the request for some parameters and not others.

New in scikit-learn 1.3.

Note: this method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. inside a `sklearn.pipeline.Pipeline`. Otherwise it has no effect.
Parameters:

early_stopping_rounds : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
    Metadata routing for the `early_stopping_rounds` parameter in `fit`.
index_id : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
    Metadata routing for the `index_id` parameter in `fit`.
num_boost_round : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
    Metadata routing for the `num_boost_round` parameter in `fit`.
persist_train : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
    Metadata routing for the `persist_train` parameter in `fit`.
time_bins : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
    Metadata routing for the `time_bins` parameter in `fit`.
validation_data : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
    Metadata routing for the `validation_data` parameter in `fit`.
verbose_eval : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
    Metadata routing for the `verbose_eval` parameter in `fit`.

Returns:

self : object
    The updated object.
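A sketch of requesting routing for a single `fit` parameter; this is standard scikit-learn metadata-routing usage and only has an effect inside a meta-estimator:

```python
import sklearn
from xgbse import XGBSEDebiasedBCE

sklearn.set_config(enable_metadata_routing=True)

# ask meta-estimators (e.g. a Pipeline) to route `time_bins` through to fit()
model = XGBSEDebiasedBCE().set_fit_request(time_bins=True)
```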
Source code in xgbse/_debiased_bce.py
def func(*args, **kw):
"""Updates the request for provided parameters
This docstring is overwritten below.
See REQUESTER_DOC for expected functionality
"""
if not _routing_enabled():
raise RuntimeError(
"This method is only available when metadata routing is enabled."
" You can enable it using"
" sklearn.set_config(enable_metadata_routing=True)."
)
if self.validate_keys and (set(kw) - set(self.keys)):
raise TypeError(
f"Unexpected args: {set(kw) - set(self.keys)} in {self.name}. "
f"Accepted arguments are: {set(self.keys)}"
)
# This makes it possible to use the decorated method as an unbound method,
# for instance when monkeypatching.
# https://github.com/scikit-learn/scikit-learn/issues/28632
if instance is None:
_instance = args[0]
args = args[1:]
else:
_instance = instance
# Replicating python's behavior when positional args are given other than
# `self`, and `self` is only allowed if this method is unbound.
if args:
raise TypeError(
f"set_{self.name}_request() takes 0 positional argument but"
f" {len(args)} were given"
)
requests = _instance._get_metadata_request()
method_metadata_request = getattr(requests, self.name)
for prop, alias in kw.items():
if alias is not UNCHANGED:
method_metadata_request.add_request(param=prop, alias=alias)
_instance._metadata_request = requests
return _instance
set_predict_request(self, *, return_interval_probs='$UNCHANGED$')
Request metadata passed to the `predict` method.

Note that this method is only relevant if `enable_metadata_routing=True` (see `sklearn.set_config`). Please see the scikit-learn User Guide on metadata routing for how the routing mechanism works.

The options for each parameter are:

- `True`: metadata is requested, and passed to `predict` if provided. The request is ignored if metadata is not provided.
- `False`: metadata is not requested and the meta-estimator will not pass it to `predict`.
- `None`: metadata is not requested, and the meta-estimator will raise an error if the user provides it.
- `str`: metadata should be passed to the meta-estimator with this given alias instead of the original name.

The default (`sklearn.utils.metadata_routing.UNCHANGED`) retains the existing request. This allows you to change the request for some parameters and not others.

New in scikit-learn 1.3.

Note: this method is only relevant if this estimator is used as a sub-estimator of a meta-estimator, e.g. inside a `sklearn.pipeline.Pipeline`. Otherwise it has no effect.
Parameters:

return_interval_probs : str, True, False, or None, default=sklearn.utils.metadata_routing.UNCHANGED
    Metadata routing for the `return_interval_probs` parameter in `predict`.

Returns:

self : object
    The updated object.
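Analogously for `predict` (routing must already be enabled as shown for `set_fit_request`):

```python
model = model.set_predict_request(return_interval_probs=True)
```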
Source code in xgbse/_debiased_bce.py
def func(*args, **kw):
"""Updates the request for provided parameters
This docstring is overwritten below.
See REQUESTER_DOC for expected functionality
"""
if not _routing_enabled():
raise RuntimeError(
"This method is only available when metadata routing is enabled."
" You can enable it using"
" sklearn.set_config(enable_metadata_routing=True)."
)
if self.validate_keys and (set(kw) - set(self.keys)):
raise TypeError(
f"Unexpected args: {set(kw) - set(self.keys)} in {self.name}. "
f"Accepted arguments are: {set(self.keys)}"
)
# This makes it possible to use the decorated method as an unbound method,
# for instance when monkeypatching.
# https://github.com/scikit-learn/scikit-learn/issues/28632
if instance is None:
_instance = args[0]
args = args[1:]
else:
_instance = instance
# Replicating python's behavior when positional args are given other than
# `self`, and `self` is only allowed if this method is unbound.
if args:
raise TypeError(
f"set_{self.name}_request() takes 0 positional argument but"
f" {len(args)} were given"
)
requests = _instance._get_metadata_request()
method_metadata_request = getattr(requests, self.name)
for prop, alias in kw.items():
if alias is not UNCHANGED:
method_metadata_request.add_request(param=prop, alias=alias)
_instance._metadata_request = requests
return _instance