Fix the boolq:contrastset gold index so it is looked up in the same order as
the choices list (["Yes", "No"]), and add a regression test for it.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch. The indentation of the context lines
in boolq.py is inferred -- confirm against the target file before applying
(or apply with `git apply --ignore-whitespace`).

diff --git a/src/lighteval/tasks/tasks/boolq.py b/src/lighteval/tasks/tasks/boolq.py
index f927f9491..102e3c0e2 100644
--- a/src/lighteval/tasks/tasks/boolq.py
+++ b/src/lighteval/tasks/tasks/boolq.py
@@ -48,7 +48,7 @@ def boolq_contrastset_prompt(line, task_name: str = None):
             task_name=task_name,
             query=f"{passage}\nQuestion: {question}\nAnswer:",
             choices=["Yes", "No"],
-            gold_index=["No", "Yes"].index(line["answer"]),
+            gold_index=["Yes", "No"].index(line["answer"]),
         )
         for passage, question in zip(line["contrast_inputs"]["passage"], line["contrast_inputs"]["question"])
     ][0]
diff --git a/tests/unit/tasks/test_boolq.py b/tests/unit/tasks/test_boolq.py
new file mode 100644
index 000000000..ce7f9b2d2
--- /dev/null
+++ b/tests/unit/tasks/test_boolq.py
@@ -0,0 +1,17 @@
+from lighteval.tasks.tasks.boolq import boolq_contrastset_prompt
+
+
+def test_boolq_contrastset_gold_index_matches_answer():
+    # boolq:contrastset derived its gold index from a reversed ["No", "Yes"]
+    # lookup while its choices were ["Yes", "No"], so every sample was graded
+    # against the opposite answer. The gold choice must equal the answer.
+    for answer in ("Yes", "No"):
+        line = {
+            "answer": answer,
+            "contrast_inputs": {
+                "passage": ["A passage."],
+                "question": ["A question?"],
+            },
+        }
+        doc = boolq_contrastset_prompt(line)
+        assert doc.choices[doc.gold_index] == answer