F1

F1MicroMultipleFieldsMetric

Bases: MetricCollection[F1MicroSingleFieldMetric]

Source code in src/kibad_llm/metrics/f1.py
class F1MicroMultipleFieldsMetric(MetricCollection[F1MicroSingleFieldMetric]):

    def __init__(
        self,
        fields: list[str] | None = None,
        format_as_markdown: bool = True,
        subfield_keys: dict[str, list[str]] | None = None,
        subfield_values: dict[str, list[str]] | None = None,
        sort_fields: bool = False,
        **kwargs,
    ) -> None:
        """Computes F1MicroSingleFieldMetric for multiple fields at once as well as micro (ALL)
        and macro (AVG) over all fields.

        Args:
            fields: List of fields to compute F1MicroSingleFieldMetric for. If not provided,
                the metric will be computed for all fields found in the data.
            format_as_markdown: Whether to format the result as a markdown table. Defaults to True.
            subfield_keys: Optional dict mapping field names to lists of keys used to split
                dict-like entries into separate generated fields. For a configured field, the
                values of these keys are removed from each nested dict and appended to the field
                name, while the remaining key-value pairs are scored as that generated field's
                payload. This makes it possible to compute metrics separately for entries such as
                ``field1.A&B`` and ``field1.C&D`` instead of scoring the whole original field as
                one unit.
            subfield_values: Optional dict mapping field names to lists of keys that should be
                retained as the payload of generated fields after extracting ``subfield_keys``.
                This allows restricting evaluation to selected nested values, e.g. scoring only
                ``Antwortvariable`` or only ``Antwortvariable`` and ``Trend`` within each
                generated field.
            sort_fields: Whether to sort the fields in the output. Defaults to False.
            **kwargs: Additional keyword arguments for F1MicroSingleFieldMetric, e.g.,
                ``ignore_subfields`` or ``ignore_missing_entries``.
        """
        # for now, just raise an error if fields contain the reserved names "ALL" (micro) or "AVG" (macro)
        if fields is not None and ("ALL" in fields or "AVG" in fields):
            raise ValueError("Fields cannot contain 'ALL' or 'AVG' as field names.")

        self.fields = fields
        self.subfield_keys = subfield_keys
        self.subfield_values = subfield_values
        self.metric_kwargs = kwargs
        super().__init__(sort_fields=sort_fields)

        self.format_as_markdown = format_as_markdown

    @property
    def ignore_missing_entries(self) -> bool:
        return self.metric_kwargs.get("ignore_missing_entries", False)

    def _update(self, prediction: Any, reference: Any, record_id: Hashable | None = None) -> None:
        if prediction is None:
            prediction = dict()
        if reference is None:
            reference = dict()
        if not isinstance(prediction, dict) or not isinstance(reference, dict):
            raise TypeError(
                f"Prediction and reference should be dicts, but got {type(prediction)} and {type(reference)}."
            )
        if self.fields is None:
            fields = list(prediction.keys() | reference.keys())
        else:
            fields = self.fields

        if self.subfield_keys is not None:
            new_fields = []
            subfield_values = self.subfield_values or {}
            for field in fields:
                if field in self.subfield_keys:
                    prediction, new_prediction_fields = _expand_field_by_key_values(
                        entry=prediction,
                        field=field,
                        key_entries=self.subfield_keys[field],
                        value_entries=subfield_values.get(field, None),
                    )
                    reference, new_reference_fields = _expand_field_by_key_values(
                        entry=reference,
                        field=field,
                        key_entries=self.subfield_keys[field],
                        value_entries=subfield_values.get(field, None),
                    )
                    new_fields.extend(new_prediction_fields | new_reference_fields)
                else:
                    new_fields.append(field)
            fields = new_fields

        # check if all required metrics exist and create missing ones via self.add_metric
        for field in fields:
            if field not in self.metrics:
                self.add_metric(field, F1MicroSingleFieldMetric(field=field, **self.metric_kwargs))

        super()._update(prediction=prediction, reference=reference, record_id=record_id)

    def _compute(self, *args, **kwargs) -> dict[str, Any]:
        """Computes the results for all sub-metrics and micro average over all instances.

        Returns:
            A dictionary mapping field names to their computed results.
        """
        result = super()._compute(*args, **kwargs)
        if self.ignore_missing_entries:
            # remove results from metrics with empty states to get correct AVG values and shorten the result
            result = {
                name: field_result
                for name, field_result in result.items()
                if any(self.metrics[name].state[key] > 0 for key in ("tp", "fp", "fn"))
            }
        # compute mean for precision, recall, f1 over all fields
        scores_list = defaultdict(list)
        for field_result in result.values():
            for key, value in field_result.items():
                scores_list[key].append(value)
        result["AVG"] = {key: sum(values) / len(values) for key, values in scores_list.items()}

        # compute micro average over all instances based on states of all sub-metrics
        state_total = {
            "tp": sum(metric.state["tp"] for metric in self.metrics.values()),
            "fp": sum(metric.state["fp"] for metric in self.metrics.values()),
            "fn": sum(metric.state["fn"] for metric in self.metrics.values()),
        }
        result["ALL"] = F1MicroSingleFieldMetric.calculate_scores(state=state_total)
        return result

    def _format_result(self, result: dict[str, Any]) -> str:
        """Formats the result as a markdown table if specified, otherwise as pretty-printed JSON.

        Args:
            result: The result dictionary to format.

        Returns:
            A string representation of the result.
        """
        if self.format_as_markdown:
            # create pandas DataFrame and convert to markdown table
            df = DataFrame.from_dict(result, orient="index")
            df.index.name = "field"
            # round to 3 decimal places
            df = df.round(3)
            return df.to_markdown()
        else:
            return super()._format_result(result)

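A minimal usage sketch (the records and field names are hypothetical; it assumes the MetricCollection base class exposes public update/compute wrappers around _update/_compute, and that plain lists are accepted by the entry-to-set preparation):

from kibad_llm.metrics.f1 import F1MicroMultipleFieldsMetric

# fields are inferred from the data because fields=None
metric = F1MicroMultipleFieldsMetric(format_as_markdown=False)
metric.update(
    prediction={"labels": ["A", "B"], "topics": ["X"]},
    reference={"labels": ["A"], "topics": ["X", "Y"]},
)
result = metric.compute()
# result maps "labels" and "topics" to precision/recall/f1/support scores,
# plus "AVG" (macro average over fields) and "ALL" (micro average over all instances)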
__init__(fields=None, format_as_markdown=True, subfield_keys=None, subfield_values=None, sort_fields=False, **kwargs)

Computes F1MicroSingleFieldMetric for multiple fields at once, as well as the micro (ALL) and macro (AVG) averages over all fields.

Parameters:

    fields (list[str] | None, default: None)
        List of fields to compute F1MicroSingleFieldMetric for. If not provided, the metric will be computed for all fields found in the data.

    format_as_markdown (bool, default: True)
        Whether to format the result as a markdown table.

    subfield_keys (dict[str, list[str]] | None, default: None)
        Optional dict mapping field names to lists of keys used to split dict-like entries into separate generated fields. For a configured field, the values of these keys are removed from each nested dict and appended to the field name, while the remaining key-value pairs are scored as that generated field's payload. This makes it possible to compute metrics separately for entries such as field1.A&B and field1.C&D instead of scoring the whole original field as one unit.

    subfield_values (dict[str, list[str]] | None, default: None)
        Optional dict mapping field names to lists of keys that should be retained as the payload of generated fields after extracting subfield_keys. This allows restricting evaluation to selected nested values, e.g. scoring only Antwortvariable or only Antwortvariable and Trend within each generated field.

    sort_fields (bool, default: False)
        Whether to sort the fields in the output.

    **kwargs (default: {})
        Additional keyword arguments for F1MicroSingleFieldMetric, e.g., ignore_subfields or ignore_missing_entries.
Source code in src/kibad_llm/metrics/f1.py
def __init__(
    self,
    fields: list[str] | None = None,
    format_as_markdown: bool = True,
    subfield_keys: dict[str, list[str]] | None = None,
    subfield_values: dict[str, list[str]] | None = None,
    sort_fields: bool = False,
    **kwargs,
) -> None:
    """Computes F1MicroSingleFieldMetric for multiple fields at once as well as micro (ALL)
    and macro (AVG) over all fields.

    Args:
        fields: List of fields to compute F1MicroSingleFieldMetric for. If not provided,
            the metric will be computed for all fields found in the data.
        format_as_markdown: Whether to format the result as a markdown table. Defaults to True.
        subfield_keys: Optional dict mapping field names to lists of keys used to split
            dict-like entries into separate generated fields. For a configured field, the
            values of these keys are removed from each nested dict and appended to the field
            name, while the remaining key-value pairs are scored as that generated field's
            payload. This makes it possible to compute metrics separately for entries such as
            ``field1.A&B`` and ``field1.C&D`` instead of scoring the whole original field as
            one unit.
        subfield_values: Optional dict mapping field names to lists of keys that should be
            retained as the payload of generated fields after extracting ``subfield_keys``.
            This allows restricting evaluation to selected nested values, e.g. scoring only
            ``Antwortvariable`` or only ``Antwortvariable`` and ``Trend`` within each
            generated field.
        sort_fields: Whether to sort the fields in the output. Defaults to False.
        **kwargs: Additional keyword arguments for F1MicroSingleFieldMetric, e.g.,
            ``ignore_subfields`` or ``ignore_missing_entries``.
    """
    # for now, just raise an error if fields contain the reserved names "ALL" (micro) or "AVG" (macro)
    if fields is not None and ("ALL" in fields or "AVG" in fields):
        raise ValueError("Fields cannot contain 'ALL' or 'AVG' as field names.")

    self.fields = fields
    self.subfield_keys = subfield_keys
    self.subfield_values = subfield_values
    self.metric_kwargs = kwargs
    super().__init__(sort_fields=sort_fields)

    self.format_as_markdown = format_as_markdown

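To make the subfield_keys/subfield_values mechanics concrete, a hedged sketch (field, key, and value names are hypothetical; the "&" join in the generated field name is inferred from the field1.A&B example in the docstring, and the behaviour of the private helper _expand_field_by_key_values is assumed from that description):

metric = F1MicroMultipleFieldsMetric(
    subfield_keys={"field1": ["key1", "key2"]},
    subfield_values={"field1": ["Antwortvariable"]},
    format_as_markdown=False,
)
# per the docstring, a nested entry such as
#   {"field1": [{"key1": "A", "key2": "B", "Antwortvariable": "x", "Trend": "up"}]}
# would be expanded into a generated field "field1.A&B" whose scored payload
# is restricted to the retained key "Antwortvariable"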
F1MicroSingleFieldMetric

Bases: MetricWithPrepareEntryAsSet

Computes micro-averaged precision, recall, and F1 score for single- and multi-label classification tasks.

The metric operates on sets and allows for simple preprocessing; see _prepare_entry_as_set for details.

WARNING: Since the metric operates on sets, it can obfuscate cases where the LLM produces duplicate labels in multi-label settings. E.g., prediction = ["A", "A", "B"] and reference = ["A", "B"] will be treated as a perfect prediction with tp=2, fp=0, fn=0, even though the prediction contains the duplicate label "A".

Parameters:

    ignore_missing_entries (bool, default: False)
        If True, instances where either prediction or reference is empty will be ignored in the metric calculation.

    **kwargs (default: {})
        Keyword arguments for entry-to-set preparation. See MetricWithPrepareEntryAsSet for supported options.
Source code in src/kibad_llm/metrics/f1.py
class F1MicroSingleFieldMetric(MetricWithPrepareEntryAsSet):
    """Computes micro averaged precision, recall, and F1 score for single- and multi-label
    classification tasks.

    The metric operates on sets and allows for simple preprocessing; see _prepare_entry_as_set for details.

    WARNING:
    Since the metric operates on sets, it can obfuscate cases where the LLM produces duplicate
    labels in multi-label settings. E.g., prediction = ["A", "A", "B"] and reference = ["A", "B"]
    will be treated as a perfect prediction with tp=2, fp=0, fn=0, even though the prediction
    contains the duplicate label "A".

    Args:
        ignore_missing_entries: If True, instances where either prediction or reference is empty
            will be ignored in the metric calculation.
        **kwargs: Keyword arguments for entry-to-set preparation. See
            `MetricWithPrepareEntryAsSet` for supported options.
    """

    def __init__(self, ignore_missing_entries: bool = False, **kwargs) -> None:
        super().__init__(**kwargs)
        self.ignore_missing_entries = ignore_missing_entries
        self.reset()

    def reset(self) -> None:
        """Resets all values of the internal state to zero"""
        self.state: dict[str, int] = {"tp": 0, "fp": 0, "fn": 0}

    def _update(self, prediction: Any, reference: Any, record_id: Hashable | None = None) -> None:
        """Updates the internal state with the given prediction(s) and reference(s).
        See `_prepare_entry_as_set` for accepted input formats.
        """
        prediction_set = self._prepare_entry_as_set(prediction)
        reference_set = self._prepare_entry_as_set(reference)
        if self.ignore_missing_entries and (len(prediction_set) == 0 or len(reference_set) == 0):
            return

        self.state["tp"] += len(prediction_set & reference_set)
        self.state["fp"] += len(prediction_set - reference_set)
        self.state["fn"] += len(reference_set - prediction_set)

    @staticmethod
    def calculate_scores(state: dict[str, int]) -> dict[str, float]:
        """Calculates precision, recall and f1 from true positives, false positives and false negatives.

        Args:
            state: dictionary with keys "tp", "fp", "fn"

        returns: dictionary with precision, recall and f1
        """
        tp, fp, fn = state["tp"], state["fp"], state["fn"]
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": tp + fn,
        }

    def _compute(self, *args, **kwargs) -> dict[str, Any]:
        """Computes the micro average of precision, recall and f1 score."""
        return self.calculate_scores(state=self.state)

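A short sketch of the duplicate-label caveat from the warning above (it assumes public update/compute wrappers from the base class and that plain lists are an accepted input format for _prepare_entry_as_set):

from kibad_llm.metrics.f1 import F1MicroSingleFieldMetric

metric = F1MicroSingleFieldMetric()
metric.update(prediction=["A", "A", "B"], reference=["A", "B"])
print(metric.compute())
# expected: {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'support': 2}
# the duplicate "A" collapses when the prediction is converted to a set,
# so the state becomes tp=2, fp=0, fn=0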
calculate_scores(state) staticmethod

Calculates precision, recall, and f1 from true positives, false positives, and false negatives.

Parameters:

    state (dict[str, int], required)
        Dictionary with keys "tp", "fp", "fn".

Returns:

    Dictionary with keys "precision", "recall", "f1", and "support".

Source code in src/kibad_llm/metrics/f1.py
@staticmethod
def calculate_scores(state: dict[str, int]) -> dict[str, float]:
    """Calculates precision, recall and f1 from true positives, false positives and false negatives.

    Args:
        state: dictionary with keys "tp", "fp", "fn"

    returns: dictionary with precision, recall and f1
    """
    tp, fp, fn = state["tp"], state["fp"], state["fn"]
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "support": tp + fn,
    }

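For concreteness, a worked example with tp=2, fp=1, fn=1 (calculate_scores is a staticmethod, so no instance is needed):

from kibad_llm.metrics.f1 import F1MicroSingleFieldMetric

scores = F1MicroSingleFieldMetric.calculate_scores({"tp": 2, "fp": 1, "fn": 1})
# precision = 2 / (2 + 1) ≈ 0.667
# recall    = 2 / (2 + 1) ≈ 0.667
# f1        = 2 * (precision * recall) / (precision + recall) ≈ 0.667
# support   = tp + fn = 3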
reset()

Resets all values of the internal state to zero.

Source code in src/kibad_llm/metrics/f1.py
def reset(self) -> None:
    """Resets all values of the internal state to zero"""
    self.state: dict[str, int] = {"tp": 0, "fp": 0, "fn": 0}
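
A reset sketch, useful when reusing one metric instance across separate evaluation runs (as above, update is assumed to accept plain lists):

metric = F1MicroSingleFieldMetric()
metric.update(prediction=["A"], reference=["A"])
metric.reset()
assert metric.state == {"tp": 0, "fp": 0, "fn": 0}  # accumulated counts cleared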