Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion src/lighteval/metrics/metrics_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,11 @@ def compute_corpus(self, items: list[LogprobCorpusMetricInput]):
# Single f1
if self.num_classes == 2:
fscore = sklearn.metrics.f1_score(golds, preds, average=self.average)
return np.max(fscore)
# average=None returns a per-class F1 array: report the positive class, not the
# best class (np.max inflated the score). Scalar averages pass through unchanged.
if self.average is None:
return float(fscore[1])
return float(fscore)

# Multi f1
f1s = []
Expand Down
13 changes: 13 additions & 0 deletions tests/test_unit_base_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,19 @@ def test_prefix_quasi_exact_match(self):
res = em.compute_one_item("", "")
assert res == 0

def test_corpus_level_f1_binary_positive_class(self):
    """Check that binary corpus-level F1 with ``average=None`` scores the positive class.

    Regression guard: the two-class path used to return ``np.max`` over the
    per-class F1 values, which reported the better-scoring class instead of
    the positive one and therefore inflated the metric.
    """
    from types import SimpleNamespace

    from lighteval.metrics.metrics_corpus import CorpusLevelF1Score

    # Six negatives followed by four positives. The predictions contain one
    # false positive and three false negatives, so the two per-class scores
    # differ clearly and a "max over classes" result is distinguishable.
    gold_labels = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1]
    predicted_labels = [0, 0, 0, 0, 0, 1, 0, 0, 0, 1]
    corpus_items = [
        SimpleNamespace(golds=gold, preds=pred)
        for gold, pred in zip(gold_labels, predicted_labels)
    ]

    metric = CorpusLevelF1Score(None)
    positive_f1 = metric.compute_corpus(corpus_items)

    # Positive class: TP=1, FP=1, FN=3 -> precision 1/2, recall 1/4, F1 = 1/3.
    # The negative class would score 5/7 (~0.714), which is what np.max returned.
    assert positive_f1 == pytest.approx(1 / 3)

def test_prob(self):
doc = Doc(query="Test query", choices=["A", "B", "C"], gold_index=0, task_name="test")

Expand Down