Source code for plaid.pipelines.sklearn_block_wrappers

"""Wrapped scikit-learn transformers and regressors for PLAID Dataset compatibility.

Provides adapters to use scikit-learn estimators within the PLAID feature/block system:

- WrappedPlaidSklearnTransformer: wraps a TransformerMixin

- WrappedPlaidSklearnRegressor: wraps a RegressorMixin
"""

# -*- coding: utf-8 -*-
#
# This file is subject to the terms and conditions defined in
# file 'LICENSE.txt', which is part of this source code package.
#
#

import copy
import sys
from typing import Optional

if sys.version_info >= (3, 11):
    from typing import Self
else:  # pragma: no cover
    from typing import TypeVar


[docs]
    Self = TypeVar("Self")



from sklearn.base import (
    BaseEstimator,
    RegressorMixin,
    TransformerMixin,
    clone,
)
from sklearn.utils.validation import check_is_fitted

from plaid import Dataset
from plaid.containers import FeatureIdentifier
from plaid.containers.utils import check_features_type_homogeneity
from plaid.types import Array, SklearnBlock



[docs]
def get_2Darray_from_homogeneous_identifiers(
    dataset: Dataset, features_identifiers: list[FeatureIdentifier]
) -> Array:
    """Returns a 2D array from a Dataset and a feature id.

    The function calls `dataset.get_tabular_from_homogeneous_identifiers(...)`, then removes
    either the second or third dimension if it has size 1, so that the output is 2D.

    Args:
        dataset (Dataset): A Dataset object exposing `get_tabular_from_homogeneous_identifiers`.
        features_identifiers (list[FeatureIdentifier]): a list of input feature identifiers.

    Returns:
        A NumPy array of shape (n_samples, n_features).

    Raises:
        AssertionError: If the number of features in the output does not match the identifiers.
        ValueError: If both the second and third dimensions have size greater than 1.
    """
    X = dataset.get_tabular_from_homogeneous_identifiers(features_identifiers)
    # X is of size (nb_sample, nb_features, dim_features), either nb_features or dim_features should be 1 to be compatible with scikit-learn blocks
    if X.shape[1] == 1:
        X = X[:, 0, :]
    elif X.shape[2] == 1:
        X = X[:, :, 0]
    else:
        raise ValueError(
            "X (generate by dataset.get_tabular_from_homogeneous_identifiers) is expected to have its second or third dimension equal to 1"
        )

    return X




[docs]
class WrappedSklearnTransformer(TransformerMixin, BaseEstimator):
    """Adapter for using a scikit-learn transformer on PLAID Datasets.

    Transforms tabular data extracted from homogeneous feature identifiers,
    and returns results as a `Dataset`. Supports forward and inverse transforms.

    Args:
        sklearn_block (SklearnBlock): A scikit-learn Transformer implementing fit/transform APIs.
        in_features_identifiers (list[FeatureIdentifier]): List of feature identifiers to extract input data from.
        out_features_identifiers (list[FeatureIdentifier], optional): List of feature identifiers used for outputs. If None,
            defaults to `in_features_identifiers`.
    """

    # TODO: check if restrict_to_features=True can be used to reduce further memory consumption
    def __init__(
        self,
        sklearn_block: SklearnBlock,
        in_features_identifiers: list[FeatureIdentifier],
        out_features_identifiers: Optional[list[FeatureIdentifier]] = None,
    ):

[docs]
        self.sklearn_block = sklearn_block


[docs]
        self.in_features_identifiers = in_features_identifiers


[docs]
        self.out_features_identifiers = out_features_identifiers



[docs]
    def fit(self, dataset: Dataset, _y=None) -> Self:
        """Fits the underlying scikit-learn transformer on selected input features.

        Args:
            dataset: A `Dataset` object containing the features to transform.
            _y: Ignored.

        Returns:
            self: The fitted transformer.
        """
        self.in_features_identifiers_ = copy.deepcopy(self.in_features_identifiers)
        check_features_type_homogeneity(self.in_features_identifiers_)

        if self.out_features_identifiers:
            self.out_features_identifiers_ = copy.deepcopy(
                self.out_features_identifiers
            )
            check_features_type_homogeneity(self.out_features_identifiers_)
        else:
            self.out_features_identifiers_ = copy.deepcopy(self.in_features_identifiers)

        X = get_2Darray_from_homogeneous_identifiers(
            dataset, self.in_features_identifiers_
        )

        self.sklearn_block_ = clone(self.sklearn_block).fit(X, _y)

        return self



[docs]
    def transform(self, dataset: Dataset) -> Dataset:
        """Applies the fitted transformer to the selected input features.

        Args:
            dataset: A `Dataset` object to transform.

        Returns:
            Dataset: Transformed features wrapped as a new `Dataset`.
        """
        check_is_fitted(self, "sklearn_block_")

        X = get_2Darray_from_homogeneous_identifiers(
            dataset, self.in_features_identifiers_
        )

        X_transformed = self.sklearn_block_.transform(X)
        X_transformed = X_transformed.reshape(
            (len(dataset), len(self.out_features_identifiers_), -1)
        )

        dataset_transformed = dataset.add_features_from_tabular(
            X_transformed, self.out_features_identifiers_, restrict_to_features=False
        )

        return dataset_transformed



[docs]
    def inverse_transform(self, dataset: Dataset) -> Dataset:
        """Applies inverse transformation to the output features.

        Args:
            dataset: A `Dataset` object with transformed output features.

        Returns:
            Dataset: Dataset with inverse-transformed features.
        """
        check_is_fitted(self, "sklearn_block_")

        X = get_2Darray_from_homogeneous_identifiers(
            dataset, self.out_features_identifiers_
        )

        X_inv_transformed = self.sklearn_block_.inverse_transform(X)

        X_inv_transformed = X_inv_transformed.reshape(
            (len(dataset), len(self.in_features_identifiers_), -1)
        )

        dataset_inv_transformed = dataset.add_features_from_tabular(
            X_inv_transformed, self.in_features_identifiers_, restrict_to_features=False
        )

        return dataset_inv_transformed





[docs]
class WrappedSklearnRegressor(RegressorMixin, BaseEstimator):
    """Adapter for using a scikit-learn regressor with PLAID Dataset.

    Fits and predicts on tabular arrays extracted from stacked features,
    while preserving the feature/block structure expected by PLAID.

    Args:
        sklearn_block: A scikit-learn regressor with fit/predict API.
        in_features_identifiers: List of feature identifiers for inputs.
        out_features_identifiers: List of feature identifiers for outputs.
    """

    # TODO: remove transform and inv tranf

    def __init__(
        self,
        sklearn_block: SklearnBlock,
        in_features_identifiers: list[FeatureIdentifier],
        out_features_identifiers: list[FeatureIdentifier],
    ):

[docs]
        self.sklearn_block = sklearn_block


[docs]
        self.in_features_identifiers = in_features_identifiers


[docs]
        self.out_features_identifiers = out_features_identifiers



[docs]
    def fit(self, dataset: Dataset, _y=None) -> Self:
        """Fits the wrapped scikit-learn regressor on the stacked input/output data.

        Args:
            dataset: A `Dataset` containing both input and output features.
            _y: Ignored.

        Returns:
            self: The fitted regressor.
        """
        self.sklearn_block_ = clone(self.sklearn_block)
        self.in_features_identifiers_ = self.in_features_identifiers.copy()
        self.out_features_identifiers_ = self.out_features_identifiers.copy()

        X, _ = dataset.get_tabular_from_stacked_identifiers(
            self.in_features_identifiers_
        )
        y, self.cumulated_feat_dims = dataset.get_tabular_from_stacked_identifiers(
            self.out_features_identifiers_
        )

        self.sklearn_block_.fit(X, y)

        return self



[docs]
    def predict(self, dataset: Dataset) -> Dataset:
        """Predicts target values using the fitted regressor.

        Args:
            dataset: A `Dataset` with input features.

        Returns:
            Dataset: A new `Dataset` containing predicted target features.
        """
        check_is_fitted(self, "sklearn_block_")

        X, _ = dataset.get_tabular_from_stacked_identifiers(
            self.in_features_identifiers_
        )

        y = self.sklearn_block_.predict(X)
        y = y.reshape((len(dataset), -1))

        dataset_predicted = Dataset.merge_dataset_by_features(
            [
                dataset.from_tabular(
                    y[
                        :,
                        None,
                        self.cumulated_feat_dims[i_feat] : self.cumulated_feat_dims[
                            i_feat + 1
                        ],
                    ],
                    feature_identifiers=[feat_ids],
                )
                for i_feat, feat_ids in enumerate(self.out_features_identifiers_)
            ]
        )
        # dataset_predicted = dataset.add_features_from_tabular(
        #     y, self.out_features_identifiers_, restrict_to_features=False
        # )
        dataset_predicted = dataset.merge_features(dataset_predicted)

        return dataset_predicted