Example Experiment
##################

Below is a fully functioning experiment script example that implements all of the
necessary stages for training an sklearn model on the iris dataset. The script
implements both ``run()`` and ``get_params()`` and is fully self-contained. It can
be found in the curifactory repo under ``examples/experiments/iris.py``.

.. code-block:: python

    from dataclasses import dataclass
    from typing import Any

    from sklearn.base import ClassifierMixin
    from sklearn.datasets import load_iris
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    import curifactory as cf
    from curifactory.caching import PickleCacher
    from curifactory.reporting import JsonReporter


    @dataclass
    class Params(cf.ExperimentParameters):
        balanced: bool = False
        """Whether class weights should be balanced or not."""
        n: int = 100
        """The number of trees for a random forest."""
        seed: int = 42
        """The random state seed for data splitting and model training."""
        model_type: type[ClassifierMixin] = LogisticRegression
        """The sklearn model class to use."""
        test_percent: float = 0.25
        """The percentage of data to use for testing."""


    @cf.stage(
        inputs=None, outputs=["training_data", "testing_data"], cachers=[PickleCacher] * 2
    )
    def load_data(record):
        params: Params = record.params

        # split the iris data into train/test sets according to the parameters
        data = load_iris()
        x_train, x_test, y_train, y_test = train_test_split(
            data.data,
            data.target,
            test_size=params.test_percent,
            random_state=params.seed,
        )
        return (x_train, y_train), (x_test, y_test)


    @cf.stage(inputs=["training_data"], outputs=["model"], cachers=[PickleCacher])
    def train_model(record, training_data):
        params: Params = record.params

        # set up common arguments from passed parameters
        weight = "balanced" if params.balanced else None
        model_args = dict(class_weight=weight, random_state=params.seed)

        # set up model-specific arguments from parameters
        if params.model_type is RandomForestClassifier:
            model_args.update(dict(n_estimators=params.n))

        # fit the parameterized model
        clf = params.model_type(**model_args).fit(training_data[0], training_data[1])
        return clf


    @cf.aggregate(inputs=["model", "testing_data"], outputs=["scores"], cachers=None)
    def test_models(
        record: cf.Record,
        records: list[cf.Record],
        model: dict[cf.Record, Any],
        testing_data: dict[cf.Record, Any],
    ):
        scores = {}

        # iterate through every record and score its associated model
        for r, r_model in model.items():
            score = r_model.score(testing_data[r][0], testing_data[r][1])
            # store the result keyed to the parameter set name
            scores[r.params.name] = score

        print(scores)
        record.report(JsonReporter(scores))
        return scores


    def get_params():
        return [
            Params(name="simple_lr", balanced=True, model_type=LogisticRegression, seed=1),
            Params(name="simple_rf", model_type=RandomForestClassifier, seed=1),
        ]


    def run(param_sets, manager):
        # run the data and training stages once per parameter set...
        for param_set in param_sets:
            record = cf.Record(manager, param_set)
            train_model(load_data(record))
        # ...then score every trained model in a single aggregate stage
        test_models(cf.Record(manager, None))
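In normal use this script is executed through curifactory's ``experiment`` command
line tool, but the ``run()`` entry point can also be driven directly, which can be
convenient for quick testing or notebook exploration. The snippet below is a minimal
sketch of that, assuming the script above is importable as an ``iris`` module and
that an ``ArtifactManager`` can be constructed from just an experiment name; check
the curifactory documentation for the options your version supports.

.. code-block:: python

    import curifactory as cf

    # hypothetical import path: assumes the script above is saved as iris.py
    # somewhere on the Python path
    from iris import get_params, run

    # create an artifact manager to track records, caching, and reporting
    # for this run (assumed constructor usage)
    manager = cf.ArtifactManager("iris")

    # execute every parameter set, roughly what the experiment CLI would do
    run(get_params(), manager)

Driving ``run()`` by hand like this mainly helps when iterating on individual stages;
the CLI additionally handles run tracking and report generation for you.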