Evaluates a dataset containing predictions and references using a specified metric.
Parameters:
| Name | Type | Description | Default |
|------|------|-------------|---------|
| `cfg` | `DictConfig` | OmegaConf configuration. See `configs/evaluate.yaml` for details. | *required* |
Returns:
A dictionary with evaluation results.
Source code in src/kibad_llm/evaluate.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
def evaluate(cfg: DictConfig) -> dict[str, Any]:
    """Evaluates a dataset containing predictions and references using a specified metric.

    Args:
        cfg: OmegaConf configuration. See configs/evaluate.yaml for details.

    Returns:
        A dictionary with evaluation results.
    """
    # Build the dataset of (prediction, reference) pairs from its Hydra config.
    logger.info("Loading dataset with predictions and references ...")
    logger.info(f"Dataset config: {OmegaConf.to_container(cfg.dataset, resolve=True)}")
    dataset = instantiate(cfg.dataset, _convert_="all")

    # Build the metric object from its Hydra config.
    logger.info("Instantiating metric ...")
    logger.info(f"Metric config: {OmegaConf.to_container(cfg.metric, resolve=True)}")
    metric: Metric = instantiate(cfg.metric, _convert_="all")

    # Feed every record into the metric, then aggregate.
    logger.info("Computing metric ...")
    for rec_id, rec in dataset.items():
        metric.update(prediction=rec["prediction"], reference=rec["reference"], record_id=rec_id)
    result = metric.compute()
    metric.show_result(result)

    # If the dataset carries metadata, attach it under the reserved "prediction"
    # key — refuse to clobber a metric output of the same name.
    if isinstance(dataset, DictWithMetadata):
        if "prediction" in result:
            raise ValueError(
                "Cannot attach metadata to 'prediction' key in metric_dict because it already "
                "exists as output from the metric computation. Please adjust the metric computation."
            )
        result["prediction"] = dataset.metadata

    return result