xgbse._base.XGBSEBaseEstimator¶
Base class for all estimators in xgbse. Implements explainability through prototypes
Source code in xgbse/_base.py
          class XGBSEBaseEstimator(BaseEstimator):
    """
    Base class for all estimators in xgbse. Implements explainability through prototypes
    """
    def __init__(
        self,
        xgb_params: Optional[Dict[str, Any]] = None,
        enable_categorical: bool = False,
    ):
        self.enable_categorical = enable_categorical
        self.feature_extractor = FeatureExtractor(
            xgb_params=xgb_params, enable_categorical=enable_categorical
        )
        self.xgb_params = self.feature_extractor.xgb_params
        self.feature_importances_ = None
        self.persist_train = False
        self.index_id = None
        self.tree = None
    def fit_feature_extractor(
        self,
        X,
        y,
        time_bins: Optional[Sequence] = None,
        validation_data: Optional[List[Tuple[Any, Any]]] = None,
        num_boost_round: int = 10,
        early_stopping_rounds: Optional[int] = None,
        verbose_eval: int = 0,
    ):
        self.feature_extractor.fit(
            X,
            y,
            time_bins=time_bins,
            validation_data=validation_data,
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=verbose_eval,
        )
        self.feature_importances_ = self.feature_extractor.feature_importances_
        self.time_bins = self.feature_extractor.time_bins
    def get_neighbors(
        self,
        query_data,
        index_data=None,
        query_id=None,
        index_id=None,
        n_neighbors: int = 30,
    ):
        """
        Search for portotypes (size: n_neighbors) for each unit in a
        dataframe X. If units array is specified, comparables will be returned using
        its identifiers. If not, a dataframe of comparables indexes for each sample
        in X is returned.
        Args:
            query_data (pd.DataFrame): Dataframe of features to be used as input
            query_id ([pd.Series, np.array]): Series or array of identification for
                each sample of query_data. Will be used in set_index if specified.
            index_id ([pd.Series, np.array]): Series or array of identification for
                each sample of index_id.
                If specified, comparables will be returned using this identifier.
            n_neighbors (int): Number of neighbors/comparables to be considered.
        Returns:
            comps_df (pd.DataFrame): A dataframe of comparables/neighbors for each
            evaluated sample. If units identifier is specified, the output dataframe
            is converted to use units the proper identifier for each sample. The
            reference sample is considered to be the index of the dataframe and
            its comparables are its specific row values.
        """
        if index_data is None and not self.persist_train:
            raise ValueError("please specify the index_data")
        if index_id is None and not self.persist_train:
            index_id = index_data.index.copy()
        if query_id is None:
            query_id = query_data.index.copy()
        if self.persist_train:
            index_id = self.index_id
            index = self.tree
        else:
            index_leaves = self.feature_extractor.predict_leaves(index_data)
            if len(index_leaves.shape) == 1:
                index_leaves = index_leaves.reshape(-1, 1)
            index = BallTree(index_leaves, metric="hamming")
        query_leaves = self.feature_extractor.predict_leaves(query_data)
        if len(query_leaves.shape) == 1:
            query_leaves = query_leaves.reshape(-1, 1)
        compset = index.query(query_leaves, k=n_neighbors + 1, return_distance=False)
        map_to_id = np.vectorize(lambda x: index_id[x])
        comparables = map_to_id(compset)
        comps_df = pd.DataFrame(comparables[:, 1:]).set_index(query_id)
        comps_df.columns = [f"neighbor_{n + 1}" for n in comps_df.columns]
        return comps_df
        
get_neighbors(self, query_data, index_data=None, query_id=None, index_id=None, n_neighbors=30)
¶
    Search for portotypes (size: n_neighbors) for each unit in a dataframe X. If units array is specified, comparables will be returned using its identifiers. If not, a dataframe of comparables indexes for each sample in X is returned.
Parameters:
| Name | Type | Description | Default | 
|---|---|---|---|
query_data | 
        pd.DataFrame | 
        Dataframe of features to be used as input  | 
        required | 
query_id | 
        [pd.Series, np.array] | 
        Series or array of identification for each sample of query_data. Will be used in set_index if specified.  | 
        None | 
      
index_id | 
        [pd.Series, np.array] | 
        Series or array of identification for each sample of index_id. If specified, comparables will be returned using this identifier.  | 
        None | 
      
n_neighbors | 
        int | 
        Number of neighbors/comparables to be considered.  | 
        30 | 
      
Returns:
| Type | Description | 
|---|---|
comps_df (pd.DataFrame) | 
      A dataframe of comparables/neighbors for each evaluated sample. If units identifier is specified, the output dataframe is converted to use units the proper identifier for each sample. The reference sample is considered to be the index of the dataframe and its comparables are its specific row values.  | 
    
Source code in xgbse/_base.py
          def get_neighbors(
    self,
    query_data,
    index_data=None,
    query_id=None,
    index_id=None,
    n_neighbors: int = 30,
):
    """
    Search for portotypes (size: n_neighbors) for each unit in a
    dataframe X. If units array is specified, comparables will be returned using
    its identifiers. If not, a dataframe of comparables indexes for each sample
    in X is returned.
    Args:
        query_data (pd.DataFrame): Dataframe of features to be used as input
        query_id ([pd.Series, np.array]): Series or array of identification for
            each sample of query_data. Will be used in set_index if specified.
        index_id ([pd.Series, np.array]): Series or array of identification for
            each sample of index_id.
            If specified, comparables will be returned using this identifier.
        n_neighbors (int): Number of neighbors/comparables to be considered.
    Returns:
        comps_df (pd.DataFrame): A dataframe of comparables/neighbors for each
        evaluated sample. If units identifier is specified, the output dataframe
        is converted to use units the proper identifier for each sample. The
        reference sample is considered to be the index of the dataframe and
        its comparables are its specific row values.
    """
    if index_data is None and not self.persist_train:
        raise ValueError("please specify the index_data")
    if index_id is None and not self.persist_train:
        index_id = index_data.index.copy()
    if query_id is None:
        query_id = query_data.index.copy()
    if self.persist_train:
        index_id = self.index_id
        index = self.tree
    else:
        index_leaves = self.feature_extractor.predict_leaves(index_data)
        if len(index_leaves.shape) == 1:
            index_leaves = index_leaves.reshape(-1, 1)
        index = BallTree(index_leaves, metric="hamming")
    query_leaves = self.feature_extractor.predict_leaves(query_data)
    if len(query_leaves.shape) == 1:
        query_leaves = query_leaves.reshape(-1, 1)
    compset = index.query(query_leaves, k=n_neighbors + 1, return_distance=False)
    map_to_id = np.vectorize(lambda x: index_id[x])
    comparables = map_to_id(compset)
    comps_df = pd.DataFrame(comparables[:, 1:]).set_index(query_id)
    comps_df.columns = [f"neighbor_{n + 1}" for n in comps_df.columns]
    return comps_df