cvs-health · vgyani · Nov 12, 2025
diff --git a/tests/data/scorers/ensemble_results_file.json b/tests/data/scorers/ensemble_results_file.json
@@ -1 +1 @@
-{"ensemble1": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [0.9999998323932312, 0.9999993853802055, 0.9997710794982175, 0.7865512731195814, 0.9999998323932312], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.99908431799287, 0.14620509247832553, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.25, 0.25, 0.25, 0.25], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "5", "bytes": [53], "logprob": -1.9227449, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}}, "bsdetector": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5599999999999999, 0.13999999999999999, 0.3], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -3.1737043e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.080879845, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.001702437, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -5.5122365e-07, "top_logprobs": []}]]}}, "ensemble2": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [null, null, null, null, null], "ensemble_scores": [0.9999996647864624, 0.999998770760411, 0.5911959418126744, 0.9399798095276081, 0.9999996647864624], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.1823918836253489, 0.8799596190552161, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5, 0.5], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0012972581, "top_logprobs": []}, {"token": " mar", "bytes": [32, 109, 97, 114], "logprob": -1.7015977, "top_logprobs": []}, {"token": "bles", "bytes": [98, 108, 101, 115], "logprob": 0.0, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.69366264, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.12787926, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.0036003059, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}}}
+{"ensemble1": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [0.9999998323932312, 0.9999993853802055, 0.9997710794982175, 0.7865512731195814, 0.9999998323932312], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.99908431799287, 0.14620509247832553, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "5", "bytes": [53], "logprob": -1.9227449, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.25, 0.25, 0.25, 0.25]}}, "bsdetector": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -3.1737043e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.080879845, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.001702437, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -5.5122365e-07, "top_logprobs": []}]]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5599999999999999, 0.13999999999999999, 0.3]}}, "ensemble2": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [null, null, null, null, null], "ensemble_scores": [0.9999996647864624, 0.999998770760411, 0.5911959418126744, 0.9399798095276081, 0.9999996647864624], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.1823918836253489, 0.8799596190552161, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0012972581, "top_logprobs": []}, {"token": " mar", "bytes": [32, 109, 97, 114], "logprob": -1.7015977, "top_logprobs": []}, {"token": "bles", "bytes": [98, 108, 101, 115], "logprob": 0.0, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.69366264, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.12787926, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.0036003059, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5, 0.5]}}}
diff --git a/tests/data/scorers/semanticentropy_results_file.json b/tests/data/scorers/semanticentropy_results_file.json
@@ -1 +1 @@
-{"data": {"responses": ["5", "$3", "12", "308", "35"], "entropy_values": [0.0, 1.3296613488547582, 0.6365141682948128, 0.45056120886630463, 0.8675632284814612], "confidence_scores": [1.0, 0.25790187148969435, 0.644754678724236, 0.7485370014199393, 0.5158037429793888], "sampled_responses": [["5", "5", "5", "5", "5 miles"], ["$9", "$6", "Josh makes 12 bracelets. His cost for supplies is:\n12 bracelets * $1/bracelet = $12\n\nHe sells each bracelet for $1.50, so his revenue from selling 12 bracelets is:\n12 bracelets * $1.50/bracelet = $18\n\nHis profit, therefore, is:\n$18 - $12 = $6\n\nAfter buying the cookies, he still has $3, so the cost of the cookies is:\n$6 - $3 = $3\n\nTherefore, the cost of the box of cookies is $3.", "$6", "$9"], ["12", "36", "12", "12", "36"], ["308", "308", "308", "315", "308"], ["32.5", "32", "32.5", "30.", "32"]], "prompts": ["Solve the math problem, but return only the numerical answer.\nVery early this morning, Elise left home in a cab headed for the hospital. Fortunately, the roads were clear, and the cab company only charged her a base price of $3, and $4 for every mile she traveled. If Elise paid a total of $23, how far is the hospital from her house?", "Solve the math problem, but return only the numerical answer.\nJosh is saving up for a box of cookies. To raise the money, he is going to make bracelets and sell them. It costs $1 for supplies for each bracelet and he sells each one for $1.5. If he makes 12 bracelets and after buying the cookies still has $3, how much did the box of cookies cost?", "Solve the math problem, but return only the numerical answer.\nColin can skip at six times the speed that Brandon can.  Brandon can skip at one-third the speed that Tony can.  And Tony can skip at twice the speed that Bruce can.  At what speed, in miles per hour, can Colin skip if Bruce skips at 1 mile per hour?", "Solve the math problem, but return only the numerical answer.\nJanet, a third grade teacher, is picking up the sack lunch order from a local deli for the field trip she is taking her class on. There are 35 children in her class, 5 volunteer chaperones, and herself. She she also ordered three additional sack lunches, just in case there was a problem. Each sack lunch costs $7. How much do all the lunches cost in total?", "Solve the math problem, but return only the numerical answer.\nAt 30, Anika is 4/3 the age of Maddie. What would be their average age in 15 years?"]}, "metadata": {"parameters": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5}}}
+{"data": {"responses": ["5", "$3", "12", "308", "35"], "entropy_values": [0.0, 1.3296613488547582, 0.6365141682948128, 0.45056120886630463, 0.8675632284814612], "confidence_scores": [1.0, 0.25790187148969435, 0.644754678724236, 0.7485370014199393, 0.5158037429793888], "sampled_responses": [["5", "5", "5", "5", "5 miles"], ["$9", "$6", "Josh makes 12 bracelets. His cost for supplies is:\n12 bracelets * $1/bracelet = $12\n\nHe sells each bracelet for $1.50, so his revenue from selling 12 bracelets is:\n12 bracelets * $1.50/bracelet = $18\n\nHis profit, therefore, is:\n$18 - $12 = $6\n\nAfter buying the cookies, he still has $3, so the cost of the cookies is:\n$6 - $3 = $3\n\nTherefore, the cost of the box of cookies is $3.", "$6", "$9"], ["12", "36", "12", "12", "36"], ["308", "308", "308", "315", "308"], ["32.5", "32", "32.5", "30.", "32"]], "prompts": ["Solve the math problem, but return only the numerical answer.\nVery early this morning, Elise left home in a cab headed for the hospital. Fortunately, the roads were clear, and the cab company only charged her a base price of $3, and $4 for every mile she traveled. If Elise paid a total of $23, how far is the hospital from her house?", "Solve the math problem, but return only the numerical answer.\nJosh is saving up for a box of cookies. To raise the money, he is going to make bracelets and sell them. It costs $1 for supplies for each bracelet and he sells each one for $1.5. If he makes 12 bracelets and after buying the cookies still has $3, how much did the box of cookies cost?", "Solve the math problem, but return only the numerical answer.\nColin can skip at six times the speed that Brandon can.  Brandon can skip at one-third the speed that Tony can.  And Tony can skip at twice the speed that Bruce can.  At what speed, in miles per hour, can Colin skip if Bruce skips at 1 mile per hour?", "Solve the math problem, but return only the numerical answer.\nJanet, a third grade teacher, is picking up the sack lunch order from a local deli for the field trip she is taking her class on. There are 35 children in her class, 5 volunteer chaperones, and herself. She she also ordered three additional sack lunches, just in case there was a problem. Each sack lunch costs $7. How much do all the lunches cost in total?", "Solve the math problem, but return only the numerical answer.\nAt 30, Anika is 4/3 the age of Maddie. What would be their average age in 15 years?"]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5}}
diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py
@@ -34,7 +34,7 @@
 MOCKED_RESPONSES = data["responses"]
 MOCKED_SAMPLED_RESPONSES = data["sampled_responses"]
 MOCKED_JUDGE_SCORES = data["judge_1"]
-MOCKED_LOGPROBS = metadata["logprobs"]
+MOCKED_LOGPROBS = data["logprobs"]
 
 
 @pytest.fixture
@@ -156,7 +156,7 @@ async def test_ensemble2(monkeypatch, mock_llm):
     PROMPTS = data["prompts"]
     MOCKED_RESPONSES = data["responses"]
     MOCKED_JUDGE_SCORES = data["judge_1"]
-    MOCKED_LOGPROBS = metadata["logprobs"]
+    MOCKED_LOGPROBS = data["logprobs"]
     uqe = UQEnsemble(llm=mock_llm, scorers=["min_probability", mock_llm])
 
     async def mock_generate_original_responses(*args, **kwargs):
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"ensemble1": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [0.9999998323932312, 0.9999993853802055, 0.9997710794982175, 0.7865512731195814, 0.9999998323932312], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.99908431799287, 0.14620509247832553, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.25, 0.25, 0.25, 0.25], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "5", "bytes": [53], "logprob": -1.9227449, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}}, "bsdetector": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5599999999999999, 0.13999999999999999, 0.3], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -3.1737043e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.080879845, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.001702437, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -5.5122365e-07, "top_logprobs": []}]]}}, "ensemble2": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [null, null, null, null, null], "ensemble_scores": [0.9999996647864624, 0.999998770760411, 0.5911959418126744, 0.9399798095276081, 0.9999996647864624], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.1823918836253489, 0.8799596190552161, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5, 0.5], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0012972581, "top_logprobs": []}, {"token": " mar", "bytes": [32, 109, 97, 114], "logprob": -1.7015977, "top_logprobs": []}, {"token": "bles", "bytes": [98, 108, 101, 115], "logprob": 0.0, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.69366264, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.12787926, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.0036003059, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}}}
		{"ensemble1": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [0.9999998323932312, 0.9999993853802055, 0.9997710794982175, 0.7865512731195814, 0.9999998323932312], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.99908431799287, 0.14620509247832553, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "5", "bytes": [53], "logprob": -1.9227449, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.25, 0.25, 0.25, 0.25]}}, "bsdetector": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [["30", "30", "30", "30", "30"], ["8", "8", "8", "8", "8"], ["17", "17", "17", "17", "17"], ["5", "5", "5", "5", "5"], ["11", "11", "11", "11", "11"]], "ensemble_scores": [1.0, 1.0, 1.0, 1.0, 1.0], "noncontradiction": [1.0, 1.0, 1.0, 1.0, 1.0], "exact_match": [1.0, 1.0, 1.0, 1.0, 1.0], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -3.1737043e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0009161015, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.080879845, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.001702437, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -5.5122365e-07, "top_logprobs": []}]]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5599999999999999, 0.13999999999999999, 0.3]}}, "ensemble2": {"data": {"prompts": ["When you solve this math problem only return the answer with no additional text.\n7 red peaches, 15 yellow peaches and 8 green peaches are in the basket. How many peaches are in the basket?", "When you solve this math problem only return the answer with no additional text.\nJack received 3 emails and 64 letters in the morning. He then received 5 emails and 54 letters in the afternoon. How many emails did jack receive in the day?", "When you solve this math problem only return the answer with no additional text.\nEd had 2 more marbles than Doug. Doug lost some of his marbles at the playground. Now Ed has 19 more marbles than doug. How many marbles did Doug lose?", "When you solve this math problem only return the answer with no additional text.\nDan has $ 3 left with him after he bought a candy bar for $ 2. How much money did he have initially?", "When you solve this math problem only return the answer with no additional text.\nJake has 13 fewer peaches and 3 more apples than Steven. Steven has 9 peaches and 8 apples. How many apples does Jake have?"], "responses": ["30", "8", "17", "5", "11"], "sampled_responses": [null, null, null, null, null], "ensemble_scores": [0.9999996647864624, 0.999998770760411, 0.5911959418126744, 0.9399798095276081, 0.9999996647864624], "min_probability": [0.9999993295729247, 0.9999975415208221, 0.1823918836253489, 0.8799596190552161, 0.9999993295729247], "judge_1": [1.0, 1.0, 1.0, 1.0, 1.0], "logprobs": [[{"token": "30", "bytes": [51, 48], "logprob": -6.704273e-07, "top_logprobs": []}], [{"token": "8", "bytes": [56], "logprob": -2.4584822e-06, "top_logprobs": []}], [{"token": "17", "bytes": [49, 55], "logprob": -0.0012972581, "top_logprobs": []}, {"token": " mar", "bytes": [32, 109, 97, 114], "logprob": -1.7015977, "top_logprobs": []}, {"token": "bles", "bytes": [98, 108, 101, 115], "logprob": 0.0, "top_logprobs": []}, {"token": ".", "bytes": [46], "logprob": -0.69366264, "top_logprobs": []}], [{"token": "$", "bytes": [36], "logprob": -0.12787926, "top_logprobs": []}, {"token": "5", "bytes": [53], "logprob": -0.0036003059, "top_logprobs": []}], [{"token": "11", "bytes": [49, 49], "logprob": -6.704273e-07, "top_logprobs": []}]]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5, "thresh": 0.5, "weights": [0.5, 0.5]}}}
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		{"data": {"responses": ["5", "$3", "12", "308", "35"], "entropy_values": [0.0, 1.3296613488547582, 0.6365141682948128, 0.45056120886630463, 0.8675632284814612], "confidence_scores": [1.0, 0.25790187148969435, 0.644754678724236, 0.7485370014199393, 0.5158037429793888], "sampled_responses": [["5", "5", "5", "5", "5 miles"], ["$9", "$6", "Josh makes 12 bracelets. His cost for supplies is:\n12 bracelets * $1/bracelet = $12\n\nHe sells each bracelet for $1.50, so his revenue from selling 12 bracelets is:\n12 bracelets * $1.50/bracelet = $18\n\nHis profit, therefore, is:\n$18 - $12 = $6\n\nAfter buying the cookies, he still has $3, so the cost of the cookies is:\n$6 - $3 = $3\n\nTherefore, the cost of the box of cookies is $3.", "$6", "$9"], ["12", "36", "12", "12", "36"], ["308", "308", "308", "315", "308"], ["32.5", "32", "32.5", "30.", "32"]], "prompts": ["Solve the math problem, but return only the numerical answer.\nVery early this morning, Elise left home in a cab headed for the hospital. Fortunately, the roads were clear, and the cab company only charged her a base price of $3, and $4 for every mile she traveled. If Elise paid a total of $23, how far is the hospital from her house?", "Solve the math problem, but return only the numerical answer.\nJosh is saving up for a box of cookies. To raise the money, he is going to make bracelets and sell them. It costs $1 for supplies for each bracelet and he sells each one for $1.5. If he makes 12 bracelets and after buying the cookies still has $3, how much did the box of cookies cost?", "Solve the math problem, but return only the numerical answer.\nColin can skip at six times the speed that Brandon can. Brandon can skip at one-third the speed that Tony can. And Tony can skip at twice the speed that Bruce can. At what speed, in miles per hour, can Colin skip if Bruce skips at 1 mile per hour?", "Solve the math problem, but return only the numerical answer.\nJanet, a third grade teacher, is picking up the sack lunch order from a local deli for the field trip she is taking her class on. There are 35 children in her class, 5 volunteer chaperones, and herself. She she also ordered three additional sack lunches, just in case there was a problem. Each sack lunch costs $7. How much do all the lunches cost in total?", "Solve the math problem, but return only the numerical answer.\nAt 30, Anika is 4/3 the age of Maddie. What would be their average age in 15 years?"]}, "metadata": {"parameters": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5}}}
		{"data": {"responses": ["5", "$3", "12", "308", "35"], "entropy_values": [0.0, 1.3296613488547582, 0.6365141682948128, 0.45056120886630463, 0.8675632284814612], "confidence_scores": [1.0, 0.25790187148969435, 0.644754678724236, 0.7485370014199393, 0.5158037429793888], "sampled_responses": [["5", "5", "5", "5", "5 miles"], ["$9", "$6", "Josh makes 12 bracelets. His cost for supplies is:\n12 bracelets * $1/bracelet = $12\n\nHe sells each bracelet for $1.50, so his revenue from selling 12 bracelets is:\n12 bracelets * $1.50/bracelet = $18\n\nHis profit, therefore, is:\n$18 - $12 = $6\n\nAfter buying the cookies, he still has $3, so the cost of the cookies is:\n$6 - $3 = $3\n\nTherefore, the cost of the box of cookies is $3.", "$6", "$9"], ["12", "36", "12", "12", "36"], ["308", "308", "308", "315", "308"], ["32.5", "32", "32.5", "30.", "32"]], "prompts": ["Solve the math problem, but return only the numerical answer.\nVery early this morning, Elise left home in a cab headed for the hospital. Fortunately, the roads were clear, and the cab company only charged her a base price of $3, and $4 for every mile she traveled. If Elise paid a total of $23, how far is the hospital from her house?", "Solve the math problem, but return only the numerical answer.\nJosh is saving up for a box of cookies. To raise the money, he is going to make bracelets and sell them. It costs $1 for supplies for each bracelet and he sells each one for $1.5. If he makes 12 bracelets and after buying the cookies still has $3, how much did the box of cookies cost?", "Solve the math problem, but return only the numerical answer.\nColin can skip at six times the speed that Brandon can. Brandon can skip at one-third the speed that Tony can. And Tony can skip at twice the speed that Bruce can. At what speed, in miles per hour, can Colin skip if Bruce skips at 1 mile per hour?", "Solve the math problem, but return only the numerical answer.\nJanet, a third grade teacher, is picking up the sack lunch order from a local deli for the field trip she is taking her class on. There are 35 children in her class, 5 volunteer chaperones, and herself. She she also ordered three additional sack lunches, just in case there was a problem. Each sack lunch costs $7. How much do all the lunches cost in total?", "Solve the math problem, but return only the numerical answer.\nAt 30, Anika is 4/3 the age of Maddie. What would be their average age in 15 years?"]}, "metadata": {"temperature": 1.0, "sampling_temperature": 1.0, "num_responses": 5}}