Example Experiment
##################

Below is a fully functioning experiment script example that implements all of the
necessary stages for training an sklearn model on the iris dataset. The script
implements both ``run()`` and ``get_params()`` and is fully self-contained. It can
be found in the curifactory repo under ``examples/experiments/iris.py``.

.. code-block:: python

    from dataclasses import dataclass
    from typing import Any

    from sklearn.base import ClassifierMixin
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    import curifactory as cf
    from curifactory.caching import PickleCacher
    from curifactory.reporting import JsonReporter


    @dataclass
    class Params(cf.ExperimentParameters):
        balanced: bool = False
        """Whether class weights should be balanced or not."""
        n: int = 100
        """The number of trees for a random forest."""
        seed: int = 42
        """The random state seed for data splitting and model training."""
        model_type: type[ClassifierMixin] = LogisticRegression
        """The sklearn model class to use."""
        test_percent: float = 0.25
        """The percentage of data to use for testing."""


    @cf.stage(
        inputs=None, outputs=["training_data", "testing_data"], cachers=[PickleCacher] * 2
    )
    def load_data(record):
        params: Params = record.params

        # split the iris data into train/test sets according to the parameters
        data = load_iris()
        x_train, x_test, y_train, y_test = train_test_split(
            data.data,
            data.target,
            test_size=params.test_percent,
            random_state=params.seed,
        )
        return (x_train, y_train), (x_test, y_test)


    @cf.stage(inputs=["training_data"], outputs=["model"], cachers=[PickleCacher])
    def train_model(record, training_data):
        params: Params = record.params

        # set up common arguments from passed parameters
        weight = "balanced" if params.balanced else None
        model_args = dict(class_weight=weight, random_state=params.seed)

        # set up model-specific arguments from parameters
        if params.model_type is RandomForestClassifier:
            model_args.update(dict(n_estimators=params.n))

        # fit the parameterized model
        clf = params.model_type(**model_args).fit(training_data[0], training_data[1])
        return clf


    @cf.aggregate(inputs=["model", "testing_data"], outputs=["scores"], cachers=None)
    def test_models(
        record: cf.Record,
        records: list[cf.Record],
        model: dict[cf.Record, Any],
        testing_data: dict[cf.Record, Any],
    ):
        scores = {}

        # iterate through every record and score its associated model
        for r, r_model in model.items():
            score = r_model.score(testing_data[r][0], testing_data[r][1])
            # store the result keyed to the parameter set name
            scores[r.params.name] = score

        print(scores)
        record.report(JsonReporter(scores))
        return scores


    def get_params():
        return [
            Params(name="simple_lr", balanced=True, model_type=LogisticRegression, seed=1),
            Params(name="simple_rf", model_type=RandomForestClassifier, seed=1),
        ]


    def run(param_sets, manager):
        # run the data and training stages once per parameter set...
        for param_set in param_sets:
            record = cf.Record(manager, param_set)
            train_model(load_data(record))
        # ...then score every trained model in a single aggregate stage
        test_models(cf.Record(manager, None))
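In normal use this script is executed through curifactory's ``experiment`` command
line tool, but the ``run()`` entry point can also be driven directly, which can be
convenient for quick testing or notebook exploration. The snippet below is a minimal
sketch of that, assuming the script above is importable as an ``iris`` module and
that an ``ArtifactManager`` can be constructed from just an experiment name; check
the curifactory documentation for the options your version supports.

.. code-block:: python

    import curifactory as cf

    # hypothetical import path: assumes the script above is saved as iris.py
    # somewhere on the Python path
    from iris import get_params, run

    # create an artifact manager to track records, caching, and reporting
    # for this run (assumed constructor usage)
    manager = cf.ArtifactManager("iris")

    # execute every parameter set, roughly what the experiment CLI would do
    run(get_params(), manager)

Driving ``run()`` by hand like this mainly helps when iterating on individual stages;
the CLI additionally handles run tracking and report generation for you.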