Skip to content

F1

F1MicroMultipleFieldsMetric

Bases: MetricCollection

Source code in src/kibad_llm/metrics/f1.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
class F1MicroMultipleFieldsMetric(MetricCollection):
    """Collection of per-field F1 metrics with micro ("ALL") and macro ("AVG") aggregates.

    Wraps one ``F1MicroSingleFieldMetric`` per field and, on compute, adds a
    macro average over the per-field scores ("AVG") and a micro average over
    all instances ("ALL") based on the summed tp/fp/fn states.
    """

    def __init__(
        self,
        fields: list[str],
        format_as_markdown: bool = True,
        sort_fields: bool = False,
        **kwargs,
    ) -> None:
        """Computes F1MicroSingleFieldMetric for multiple fields at once as well as micro (ALL)
        and macro (AVG) over all fields.

        Args:
            fields: List of fields to compute F1MicroSingleFieldMetric for.
            format_as_markdown: Whether to format the result as a markdown table. Defaults to True.
            sort_fields: Whether to sort the fields alphabetically before creating the
                sub-metrics. Defaults to False.
            **kwargs: Additional keyword arguments for F1MicroSingleFieldMetric, e.g., ignore_subfields.

        Raises:
            ValueError: If `fields` contains "ALL" or "AVG", which are reserved for the
                aggregated result rows added by `_compute`.
        """
        # "ALL" and "AVG" are reserved keys for the aggregates added in _compute()
        if "ALL" in fields or "AVG" in fields:
            raise ValueError("Fields cannot contain 'ALL' or 'AVG' as field names.")

        if sort_fields:
            fields = sorted(fields)
        super().__init__(
            metrics={field: F1MicroSingleFieldMetric(field=field, **kwargs) for field in fields}
        )

        self.format_as_markdown = format_as_markdown

    def _compute(self, *args, **kwargs) -> dict[str, Any]:
        """Computes the results for all sub-metrics and micro average over all instances.

        Returns:
            A dictionary mapping field names (plus "AVG" and "ALL") to their computed results.
        """
        result = super()._compute(*args, **kwargs)
        # macro average: mean of each score (precision, recall, f1, support) over all fields
        scores_list = defaultdict(list)
        for field_result in result.values():
            for key, value in field_result.items():
                scores_list[key].append(value)
        result["AVG"] = {key: sum(values) / len(values) for key, values in scores_list.items()}

        # micro average over all instances: sum the tp/fp/fn states of all
        # sub-metrics in a single pass, then derive the scores from the totals
        state_total = {"tp": 0, "fp": 0, "fn": 0}
        for metric in self.metrics.values():
            for key in state_total:
                state_total[key] += metric.state[key]
        result["ALL"] = F1MicroSingleFieldMetric.calculate_scores(state=state_total)
        return result

    def _format_result(self, result: dict[str, Any]) -> str:
        """Formats the result as a markdown table if specified, otherwise as pretty-printed JSON.

        Args:
            result: The result dictionary to format.
        Returns: A string representation of the result.
        """
        if self.format_as_markdown:
            # create pandas DataFrame and convert to markdown table
            df = DataFrame.from_dict(result, orient="index")
            df.index.name = "field"
            # round to 3 decimal places
            df = df.round(3)
            return df.to_markdown()
        else:
            return super()._format_result(result)

__init__(fields, format_as_markdown=True, sort_fields=False, **kwargs)

Computes F1MicroSingleFieldMetric for multiple fields at once as well as micro (ALL) and macro (AVG) over all fields.

Parameters:

Name Type Description Default
fields list[str]

List of fields to compute F1MicroSingleFieldMetric for.

required
format_as_markdown bool

Whether to format the result as a markdown table. Defaults to True.

True
sort_fields bool

Whether to sort the fields alphabetically before creating the sub-metrics.

False
**kwargs

Additional keyword arguments for F1MicroSingleFieldMetric, e.g., ignore_subfields.

{}
Source code in src/kibad_llm/metrics/f1.py
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def __init__(
    self,
    fields: list[str],
    format_as_markdown: bool = True,
    sort_fields: bool = False,
    **kwargs,
) -> None:
    """Computes F1MicroSingleFieldMetric for multiple fields at once as well as micro (ALL)
    and macro (AVG) over all fields.

    Args:
        fields: List of fields to compute F1MicroSingleFieldMetric for.
        format_as_markdown: Whether to format the result as a markdown table. Defaults to True.
        sort_fields: Whether to sort the fields alphabetically before creating the
            sub-metrics. Defaults to False.
        **kwargs: Additional keyword arguments for F1MicroSingleFieldMetric, e.g., ignore_subfields.

    Raises:
        ValueError: If `fields` contains "ALL" or "AVG", which are reserved for the
            aggregated result rows.
    """
    # "ALL" and "AVG" are reserved keys for the aggregates added in _compute()
    if "ALL" in fields or "AVG" in fields:
        raise ValueError("Fields cannot contain 'ALL' or 'AVG' as field names.")

    if sort_fields:
        fields = sorted(fields)
    super().__init__(
        metrics={field: F1MicroSingleFieldMetric(field=field, **kwargs) for field in fields}
    )

    self.format_as_markdown = format_as_markdown

F1MicroSingleFieldMetric

Bases: MetricWithPrepareEntryAsSet

Computes micro averaged precision, recall, and F1 score for single- and multi-label classification tasks.

The metric operates on sets and allows for simple preprocessing, see _prepare_entry for details.

WARNING: Since the metric operates on sets, duplicate labels produced by the LLM in multi-label settings are silently collapsed. E.g., prediction = ["A", "A", "B"] and reference = ["A", "B"] will be treated as a perfect prediction with tp=2, fp=0, fn=0 even though the prediction contains a duplicate label "A".

Parameters:

Name Type Description Default
**kwargs

Keyword arguments for entry-to-set preparation. See MetricWithPrepareEntryAsSet for supported options.

{}
Source code in src/kibad_llm/metrics/f1.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
class F1MicroSingleFieldMetric(MetricWithPrepareEntryAsSet):
    """Micro-averaged precision, recall, and F1 score for single- and multi-label
    classification tasks.

    Predictions and references are converted to sets before comparison; simple
    preprocessing options are available, see _prepare_entry for details.

    WARNING:
    Because inputs are converted to sets, duplicate labels produced by the LLM
    in multi-label settings are silently collapsed. E.g., prediction
    ["A", "A", "B"] vs. reference ["A", "B"] counts as a perfect prediction
    with tp=2, fp=0, fn=0 even though the prediction repeats the label "A".

    Args:
        **kwargs: Keyword arguments for entry-to-set preparation. See
            `MetricWithPrepareEntryAsSet` for supported options.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.reset()

    def reset(self) -> None:
        """Resets all values of the internal state to zero"""
        self.state: dict[str, int] = dict.fromkeys(("tp", "fp", "fn"), 0)

    def _update(self, prediction: Any, reference: Any, record_id: Hashable | None = None) -> None:
        """Updates the internal state with the given prediction(s) and reference(s).
        See `_prepare_entry_as_set` for accepted input formats.
        """
        predicted = self._prepare_entry_as_set(prediction)
        expected = self._prepare_entry_as_set(reference)

        counters = self.state
        counters["tp"] += len(predicted.intersection(expected))
        counters["fp"] += len(predicted.difference(expected))
        counters["fn"] += len(expected.difference(predicted))

    @staticmethod
    def calculate_scores(state: dict[str, int]) -> dict[str, float]:
        """Calculates precision, recall and f1 from true positives, false positives
        and false negatives.

        Args:
            state: dictionary with keys "tp", "fp", "fn"

        Returns:
            Dictionary with precision, recall, f1, and support (tp + fn).
        """
        tp = state["tp"]
        n_predicted = tp + state["fp"]
        n_expected = tp + state["fn"]
        precision = tp / n_predicted if n_predicted > 0 else 0.0
        recall = tp / n_expected if n_expected > 0 else 0.0
        denominator = precision + recall
        f1 = 2 * (precision * recall) / denominator if denominator > 0 else 0.0
        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": n_expected,
        }

    def _compute(self, *args, **kwargs) -> dict[str, Any]:
        """Computes the micro average of precision, recall and f1 score."""
        return self.calculate_scores(state=self.state)

calculate_scores(state) staticmethod

Calculates precision, recall and f1 from true positives, false positives and false negatives.

Parameters:

Name Type Description Default
state dict[str, int]

dictionary with keys "tp", "fp", "fn"

required

Returns: dictionary with precision, recall, f1, and support.

Source code in src/kibad_llm/metrics/f1.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
@staticmethod
def calculate_scores(state: dict[str, int]) -> dict[str, float]:
    """Calculates precision, recall and f1 from true positives, false positives and false negatives.

    Args:
        state: dictionary with keys "tp", "fp", "fn"

    returns: dictionary with precision, recall and f1
    """
    tp, fp, fn = state["tp"], state["fp"], state["fn"]
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "support": tp + fn,
    }

reset()

Resets all values of the internal state to zero

Source code in src/kibad_llm/metrics/f1.py
32
33
34
def reset(self) -> None:
    """Resets the tp/fp/fn counters of the internal state to zero."""
    self.state: dict[str, int] = dict.fromkeys(("tp", "fp", "fn"), 0)