# Evaluate

## evaluate(cfg)

Evaluates a dataset containing predictions and references using a specified metric.

Parameters:

| Name  | Type         | Description                                                        | Default    |
| ----- | ------------ | ------------------------------------------------------------------ | ---------- |
| `cfg` | `DictConfig` | OmegaConf configuration. See `configs/evaluate.yaml` for details.  | *required* |

Returns: A dictionary with evaluation results.
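
Example:

A minimal usage sketch. The `_target_` paths below are hypothetical placeholders, not real `kibad_llm` classes; in practice the config comes from `configs/evaluate.yaml`, e.g. loaded with `OmegaConf.load` or composed by Hydra.

```python
from omegaconf import OmegaConf

from kibad_llm.evaluate import evaluate

# Hypothetical config mirroring the structure evaluate() expects: a "dataset"
# node and a "metric" node, each instantiable by Hydra via a _target_ path.
# The dataset target must return a mapping of record_id to examples with
# "prediction" and "reference" keys.
cfg = OmegaConf.create(
    {
        "dataset": {"_target_": "my_project.data.load_predictions", "path": "preds.json"},
        "metric": {"_target_": "my_project.metrics.ExactMatch"},
    }
)

results = evaluate(cfg)  # dict of metric values, e.g. {"exact_match": 0.87}
```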

Source code in `src/kibad_llm/evaluate.py`:
```python
from typing import Any

from hydra.utils import instantiate
from omegaconf import DictConfig, OmegaConf

# Note: logger, Metric, and DictWithMetadata come from module-level imports
# in evaluate.py that are not part of this rendered snippet.


def evaluate(cfg: DictConfig) -> dict[str, Any]:
    """Evaluates a dataset containing predictions and references using a specified metric.

    Args:
        cfg: OmegaConf configuration. See configs/evaluate.yaml for details.

    Returns:
        A dictionary with evaluation results.
    """
    logger.info("Loading dataset with predictions and references ...")
    logger.info(f"Dataset config: {OmegaConf.to_container(cfg.dataset, resolve=True)}")
    dataset = instantiate(cfg.dataset, _convert_="all")

    logger.info("Instantiating metric ...")
    logger.info(f"Metric config: {OmegaConf.to_container(cfg.metric, resolve=True)}")
    metric: Metric = instantiate(cfg.metric, _convert_="all")

    logger.info("Computing metric ...")
    for record_id, example in dataset.items():
        metric.update(
            prediction=example["prediction"], reference=example["reference"], record_id=record_id
        )
    metric_dict = metric.compute()

    metric.show_result(metric_dict)

    if isinstance(dataset, DictWithMetadata):
        if "prediction" in metric_dict:
            raise ValueError(
                "Cannot attach metadata to 'prediction' key in metric_dict because it already "
                "exists as output from the metric computation. Please adjust the metric computation."
            )
        metric_dict["prediction"] = dataset.metadata

    return metric_dict
```
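
The loop in `evaluate` only relies on the metric exposing `update`, `compute`, and `show_result`. A minimal sketch of a conforming metric, assuming that interface (the class below is illustrative, not part of `kibad_llm`):

```python
from typing import Any


class ExactMatch:
    """Illustrative metric matching the interface evaluate() calls."""

    def __init__(self) -> None:
        self.correct = 0
        self.total = 0

    def update(self, prediction: str, reference: str, record_id: str) -> None:
        # Called once per record; record_id allows per-example bookkeeping.
        self.correct += int(prediction == reference)
        self.total += 1

    def compute(self) -> dict[str, Any]:
        # Avoid a "prediction" key here: evaluate() reserves it for dataset
        # metadata when the dataset is a DictWithMetadata (see the check above).
        return {"exact_match": self.correct / max(self.total, 1)}

    def show_result(self, metric_dict: dict[str, Any]) -> None:
        print(metric_dict)
```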