Example Experiment
Below is a fully functioning example experiment script that implements all the
necessary stages for training an sklearn model on the iris dataset. The script
implements both run() and get_params(), and is fully self-contained. It can be
found in the curifactory repo under examples/experiments/iris.py.
from dataclasses import dataclass
from sklearn.base import ClassifierMixin
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import curifactory as cf
from curifactory.caching import PickleCacher
from curifactory.reporting import JsonReporter
@dataclass
class Params(cf.ExperimentParameters):
    balanced: bool = False
    """Whether class weights should be balanced or not."""

    n: int = 100
    """The number of trees for a random forest."""

    seed: int = 42
    """The random state seed for data splitting and model training."""

    model_type: ClassifierMixin = LogisticRegression
    """The sklearn model to use."""

    test_percent: float = 0.25
    """The percentage of data to use for testing."""
@cf.stage(
    inputs=None, outputs=["training_data", "testing_data"], cachers=[PickleCacher] * 2
)
def load_data(record):
    params: Params = record.params

    # split the iris dataset into train and test portions based on the parameters
    data = load_iris()
    x_train, x_test, y_train, y_test = train_test_split(
        data.data, data.target, test_size=params.test_percent, random_state=params.seed
    )
    return (x_train, y_train), (x_test, y_test)
@cf.stage(inputs=["training_data"], outputs=["model"], cachers=[PickleCacher])
def train_model(record, training_data):
    params: Params = record.params

    # set up common arguments from passed parameters
    weight = "balanced" if params.balanced else None
    model_args = dict(class_weight=weight, random_state=params.seed)

    # set up model-specific arguments from parameters
    if params.model_type == RandomForestClassifier:
        model_args.update(dict(n_estimators=params.n))

    # fit the parameterized model
    clf = params.model_type(**model_args).fit(training_data[0], training_data[1])
    return clf
@cf.aggregate(inputs=["model", "testing_data"], outputs=["scores"], cachers=None)
def test_models(
    record: cf.Record,
    records: list[cf.Record],
    model: dict[cf.Record, any],
    testing_data: dict[cf.Record, any],
):
    scores = {}

    # iterate through every record and score its associated model
    for r, r_model in model.items():
        score = r_model.score(testing_data[r][0], testing_data[r][1])

        # store the result keyed to the argument set name
        scores[r.params.name] = score

    print(scores)
    record.report(JsonReporter(scores))
    return scores
def get_params():
    return [
        Params(name="simple_lr", balanced=True, model_type=LogisticRegression, seed=1),
        Params(name="simple_rf", model_type=RandomForestClassifier, seed=1),
    ]
def run(param_sets, manager):
    # run the load and train stages once per parameter set, each on its own record
    for param_set in param_sets:
        record = cf.Record(manager, param_set)
        train_model(load_data(record))

    # run the aggregate stage once, across all of the records created above
    test_models(cf.Record(manager, None))
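Because run() and get_params() are plain functions, the experiment can also be
driven directly, for example from a notebook or a quick test script, rather than
through curifactory's experiment CLI. The snippet below is a minimal sketch of
that idea, assuming iris.py is importable from the working directory and that
cf.ArtifactManager can be constructed with just an experiment name; the "iris"
string here is simply an illustrative label for the run.

import curifactory as cf

from iris import get_params, run

# construct a manager to track records, caching, and reporting for this run
manager = cf.ArtifactManager("iris")

# execute every parameter set and then the final aggregate stage
run(get_params(), manager)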