diff --git a/src/lighteval/tasks/tasks/boolq.py b/src/lighteval/tasks/tasks/boolq.py
index f927f9491..102e3c0e2 100644
--- a/src/lighteval/tasks/tasks/boolq.py
+++ b/src/lighteval/tasks/tasks/boolq.py
@@ -48,7 +48,7 @@ def boolq_contrastset_prompt(line, task_name: str = None):
             task_name=task_name,
             query=f"{passage}\nQuestion: {question}\nAnswer:",
             choices=["Yes", "No"],
-            gold_index=["No", "Yes"].index(line["answer"]),
+            gold_index=["Yes", "No"].index(line["answer"]),
         )
         for passage, question in zip(line["contrast_inputs"]["passage"], line["contrast_inputs"]["question"])
     ][0]
diff --git a/tests/unit/tasks/test_boolq.py b/tests/unit/tasks/test_boolq.py
new file mode 100644
index 000000000..ce7f9b2d2
--- /dev/null
+++ b/tests/unit/tasks/test_boolq.py
@@ -0,0 +1,17 @@
+from lighteval.tasks.tasks.boolq import boolq_contrastset_prompt
+
+
+def test_boolq_contrastset_gold_index_matches_answer():
+    """Check that the gold choice of a boolq:contrastset doc is the row's answer.
+
+    Regression guard: the gold index used to be looked up in ["No", "Yes"] while
+    the choices were ["Yes", "No"], so the gold pointed at the opposite answer.
+    """
+    for expected in ("Yes", "No"):
+        # Fresh single-pair row per label so the iterations share no state.
+        contrast_inputs = {"passage": ["A passage."], "question": ["A question?"]}
+        row = {"answer": expected, "contrast_inputs": contrast_inputs}
+        prompt_doc = boolq_contrastset_prompt(row)
+
+        gold_choice = prompt_doc.choices[prompt_doc.gold_index]
+        assert gold_choice == expected