Skip to content

Commit 8c4838b

Browse files
authored
Merge pull request #144 from Multi-Agent-LLMs/feat/challenge-results
Feat/challenge results
2 parents 12d6ae1 + 15d072e commit 8c4838b

5 files changed

Lines changed: 281 additions & 25 deletions

File tree

mallm/coordinator.py

Lines changed: 86 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
)
2828
from mallm.utils.types import (
2929
Agreement,
30+
ChallengeResult,
3031
InputExample,
3132
Memory,
3233
VotingResultList,
@@ -200,7 +201,7 @@ def discuss(
200201
float,
201202
bool,
202203
dict[int, Optional[VotingResultList]],
203-
dict[str, Optional[str]],
204+
ChallengeResult,
204205
]:
205206
"""
206207
The routine responsible for the discussion between agents to solve a task.
@@ -285,26 +286,50 @@ def discuss(
285286
)
286287
)
287288

288-
challenged_answers: dict[str, Optional[str]] = {}
289+
challenged_answers: ChallengeResult = ChallengeResult(
290+
answer or "No answer was provided."
291+
)
289292
if config.challenge_final_results:
290293
logger.info("Challenging final results...")
291-
for panelist in self.panelists:
292-
challenge_result = panelist.llm.invoke(
293-
panelist.response_generator.generate_challenge_prompt(
294-
panelist,
295-
input_str,
296-
sample_instruction,
297-
(answer or "No answer was provided."),
298-
)
294+
challenged_answers.additional_information = (
295+
worker_functions.worker_context_function(input_str)
296+
)
297+
challenged_answers.wrong_answer = self.llm.invoke(
298+
self.response_generator.generate_wrong_answer_prompt(
299+
sample_instruction, input_str
299300
)
300-
if "agree" in challenge_result.lower():
301-
logger.info(f"{panelist.persona} agrees with the final result.")
302-
challenged_answers[panelist.id] = None
303-
else:
304-
logger.info(
305-
f"{panelist.persona} disagrees with the final result and proposes a new solution:\n{challenge_result}"
306-
)
307-
challenged_answers[panelist.id] = challenge_result
301+
)
302+
challenged_answers.irrelevant_answer = "I) I don't know."
303+
304+
challenged_answers.challenged_answers = self.challenge_solution(
305+
answer, input_str, sample_instruction, None, False
306+
)
307+
challenged_answers.challenged_answers_wrong = self.challenge_solution(
308+
challenged_answers.wrong_answer,
309+
input_str,
310+
sample_instruction,
311+
None,
312+
False,
313+
)
314+
challenged_answers.challenged_answers_irrelevant = self.challenge_solution(
315+
challenged_answers.irrelevant_answer,
316+
input_str,
317+
sample_instruction,
318+
None,
319+
False,
320+
)
321+
challenged_answers.challenged_answers_history = self.challenge_solution(
322+
answer, input_str, sample_instruction, None, True
323+
)
324+
challenged_answers.challenged_answers_additional_information = (
325+
self.challenge_solution(
326+
answer,
327+
input_str,
328+
sample_instruction,
329+
challenged_answers.additional_information,
330+
False,
331+
)
332+
)
308333

309334
discussion_time = timedelta(
310335
seconds=time.perf_counter() - start_time
@@ -326,6 +351,49 @@ def discuss(
326351
challenged_answers,
327352
)
328353

354+
def challenge_solution(
355+
self,
356+
answer: Optional[str],
357+
input_str: str,
358+
sample_instruction: str,
359+
additional_information: Optional[str],
360+
history: bool,
361+
) -> dict[str, Optional[str]]:
362+
challenged_answers: dict[str, Optional[str]] = {}
363+
for panelist in self.panelists:
364+
agreement = panelist.llm.invoke(
365+
panelist.response_generator.generate_challenge_prompt(
366+
panelist,
367+
input_str,
368+
sample_instruction,
369+
(answer or "No answer was provided."),
370+
history,
371+
additional_information,
372+
)
373+
)
374+
if "disagree" in agreement.lower():
375+
challenge_result = panelist.llm.invoke(
376+
panelist.response_generator.generate_challenge_new_answer_prompt(
377+
panelist,
378+
input_str,
379+
sample_instruction,
380+
(answer or "No answer was provided."),
381+
history,
382+
additional_information,
383+
)
384+
)
385+
logger.info(
386+
f"{panelist.persona} disagrees with the final result and proposes a new solution:\n{challenge_result}"
387+
)
388+
challenged_answers[panelist.id] = challenge_result
389+
elif "agree" in agreement.lower():
390+
logger.info(f"{panelist.persona} agrees with the final result.")
391+
challenged_answers[panelist.id] = None
392+
else:
393+
logger.info(f"{panelist.persona} failed to challenge the final result.")
394+
challenged_answers[panelist.id] = None
395+
return challenged_answers
396+
329397
def get_memories(
330398
self,
331399
context_length: Optional[int] = None,

mallm/evaluation/evaluator.py

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,11 +114,12 @@ def calculate_scores(
114114

115115
def add_scores(self) -> None:
116116
for item in tqdm(self.data, desc=f"Calculating scores of {self.input_file_path}: "):
117-
answer = item.get("finalAnswer", "")
117+
main_answer = item.get("finalAnswer", "")
118118
references = item.get("references", [])
119119
dataset_id = item.get("datasetId", None)
120-
if answer:
121-
item["scores"] = self.calculate_scores(answer, references, "", dataset_id)
120+
if main_answer:
121+
item["scores"] = self.calculate_scores(main_answer, references, "", dataset_id)
122+
122123
votes_each_turn = item.get("votesEachTurn", None)
123124
if votes_each_turn:
124125
alterations: dict[str, Any] = votes_each_turn[
@@ -136,6 +137,90 @@ def add_scores(self) -> None:
136137
self.calculate_scores(answer, references, alteration)
137138
)
138139

140+
challenged_answers: Any = item.get("challengedAnswers", None)
141+
if challenged_answers:
142+
if "scores" not in item:
143+
continue
144+
if "correct" not in item["scores"] and "f1" not in item["scores"]:
145+
continue
146+
if challenged_answers["challenged_answers"]:
147+
self.analyze_challenged_answers(
148+
"normal",
149+
challenged_answers["challenged_answers"],
150+
item,
151+
references,
152+
item["scores"],
153+
)
154+
if challenged_answers["challenged_answers_wrong"]:
155+
self.analyze_challenged_answers(
156+
"wrong",
157+
challenged_answers["challenged_answers_wrong"],
158+
item,
159+
references,
160+
self.calculate_scores(
161+
challenged_answers["wrong_answer"], references
162+
),
163+
)
164+
if challenged_answers["challenged_answers_irrelevant"]:
165+
self.analyze_challenged_answers(
166+
"irrelevant",
167+
challenged_answers["challenged_answers_irrelevant"],
168+
item,
169+
references,
170+
self.calculate_scores(
171+
challenged_answers["irrelevant_answer"], references
172+
),
173+
)
174+
if challenged_answers["challenged_answers_history"]:
175+
self.analyze_challenged_answers(
176+
"history",
177+
challenged_answers["challenged_answers_history"],
178+
item,
179+
references,
180+
item["scores"],
181+
)
182+
if challenged_answers["challenged_answers_additional_information"]:
183+
self.analyze_challenged_answers(
184+
"information",
185+
challenged_answers["challenged_answers_additional_information"],
186+
item,
187+
references,
188+
item["scores"],
189+
)
190+
191+
def analyze_challenged_answers(
192+
self,
193+
name: str,
194+
challenged_answers: dict[str, Optional[str]],
195+
item: Any,
196+
references: list[str],
197+
previous_score: Any,
198+
) -> None:
199+
new_answer = {
200+
f"{name}_no_challenge": True,
201+
f"{name}_challenge_failed": False,
202+
f"{name}_challenge_higher": False,
203+
f"{name}_challenge_lower": False,
204+
f"{name}_challenge_same": False,
205+
}
206+
previous_score = previous_score.get("f1", None) or previous_score.get(
207+
"correct", None
208+
)
209+
answer = next(iter(challenged_answers.values()))
210+
if answer:
211+
score = self.calculate_scores(answer, references)
212+
current_score = score.get("f1", None) or score.get("correct", None)
213+
if current_score is None or previous_score is None:
214+
new_answer[f"{name}_challenge_failed"] = True
215+
elif current_score > previous_score:
216+
new_answer[f"{name}_challenge_higher"] = True
217+
elif current_score < previous_score:
218+
new_answer[f"{name}_challenge_lower"] = True
219+
elif current_score == previous_score:
220+
new_answer[f"{name}_challenge_same"] = True
221+
new_answer[f"{name}_no_challenge"] = False
222+
item["scores"].update(new_answer)
223+
139224
def add_scores_extensive(self) -> None:
140225
for item in tqdm(self.data, desc="Extensive scores: "):
141226
references = item.get("references", [])

mallm/models/discussion/ResponseGenerator.py

Lines changed: 93 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -253,21 +253,85 @@ def generate_challenge_prompt(
253253
question: str,
254254
task: str,
255255
final_answer: str,
256+
history: bool = False,
257+
facts: Optional[str] = None,
256258
) -> list[dict[str, str]]:
257-
return [
259+
agent_history = panelist.get_discussion_history()[0] if history else []
260+
prompts = [
258261
{
259262
"role": "system",
260263
"content": f"You are a participant in a group discussion. Your role: {panelist.persona} ({panelist.persona_description})",
261-
},
264+
}
265+
]
266+
if history and agent_history:
267+
prompts.append(
268+
{
269+
"role": "system",
270+
"content": "This is the discussion to the current point:",
271+
}
272+
)
273+
prompts.extend(agent_history)
274+
if facts:
275+
prompts.append(
276+
{
277+
"role": "system",
278+
"content": f"Here is some helpful additional information to improve your answer quality: {facts}",
279+
}
280+
)
281+
prompts.append(
262282
{
263283
"role": "user",
264284
"content": (
265285
f"The task is: {task}. The question is: {question}. "
266286
f"This is the final answer generated by the discussion: '{final_answer}'. "
267-
"Please critically evaluate this answer. If you do not agree, provide a new solution based on the task and question. If you agree with the final answer, respond with the exact word 'AGREE' to confirm."
287+
"Please critically evaluate this answer. If you agree with the final answer, respond with the exact word 'AGREE' to confirm. If you do not agree, respond with the exact word 'DISAGREE' to challenge the answer."
268288
),
269289
},
290+
)
291+
return prompts
292+
293+
@staticmethod
294+
def generate_challenge_new_answer_prompt(
295+
panelist: Panelist,
296+
question: str,
297+
task: str,
298+
final_answer: str,
299+
history: bool = False,
300+
facts: Optional[str] = None,
301+
) -> list[dict[str, str]]:
302+
agent_history = panelist.get_discussion_history()[0] if history else []
303+
prompts = [
304+
{
305+
"role": "system",
306+
"content": f"You are a participant in a group discussion. Your role: {panelist.persona} ({panelist.persona_description})",
307+
}
270308
]
309+
if history and agent_history:
310+
prompts.append(
311+
{
312+
"role": "system",
313+
"content": "This is the discussion to the current point:",
314+
}
315+
)
316+
prompts.extend(agent_history)
317+
if facts:
318+
prompts.append(
319+
{
320+
"role": "system",
321+
"content": f"Here is some helpful additional information to improve your answer quality: {facts}",
322+
}
323+
)
324+
prompts.append(
325+
{
326+
"role": "user",
327+
"content": (
328+
f"The task is: {task}. The question is: {question}. "
329+
f"This is the final answer generated by the discussion: '{final_answer}'. "
330+
"You dont agree with the final answer. Please provide a new answer to the question. Include the letter corresponding to your answer in the solution."
331+
),
332+
},
333+
)
334+
return prompts
271335

272336
@staticmethod
273337
def voting_base_prompt(
@@ -520,3 +584,29 @@ def generate_summary_prompt(
520584

521585
# Return the prompts list
522586
return prompts
587+
588+
@staticmethod
589+
def generate_wrong_answer_prompt(task: str, question: str) -> list[dict[str, str]]:
590+
return [
591+
{
592+
"role": "system",
593+
"content": "You are tasked with providing an incorrect or wrong response to the given task and question.",
594+
},
595+
{
596+
"role": "user",
597+
"content": f"Task: {task}\nQuestion: {question}. Please provide an answer that is deliberately incorrect or inaccurate. Only answer with the incorrect response.",
598+
},
599+
]
600+
601+
@staticmethod
602+
def generate_irrelevant_answer_prompt(question: str) -> list[dict[str, str]]:
603+
return [
604+
{
605+
"role": "system",
606+
"content": "You are tasked with providing a completely unrelated response to the given question.",
607+
},
608+
{
609+
"role": "user",
610+
"content": f"Question: {question} \n\nPlease provide an answer that is irrelevant to the question. Only answer with the irrelevant response.",
611+
},
612+
]

mallm/scheduler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ def run_discussion(
233233
for voting_round, voting_result in voting_results_per_turn.items()
234234
if voting_result is not None
235235
},
236-
"challengedAnswers": challenged_answers,
236+
"challengedAnswers": dataclasses.asdict(challenged_answers),
237237
"references": sample.references,
238238
"metadata": sample.metadata,
239239
"decisionSuccess": decision_success,

mallm/utils/types.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,19 @@ class VotingResultList:
6262
alterations: dict[str, VotingResult]
6363

6464

65+
@dataclass
66+
class ChallengeResult:
67+
answer: str
68+
additional_information: Optional[str] = None
69+
wrong_answer: Optional[str] = None
70+
irrelevant_answer: Optional[str] = None
71+
challenged_answers: Optional[dict[str, Optional[str]]] = None
72+
challenged_answers_history: Optional[dict[str, Optional[str]]] = None
73+
challenged_answers_wrong: Optional[dict[str, Optional[str]]] = None
74+
challenged_answers_irrelevant: Optional[dict[str, Optional[str]]] = None
75+
challenged_answers_additional_information: Optional[dict[str, Optional[str]]] = None
76+
77+
6578
@dataclass
6679
class InputExample:
6780
example_id: str

0 commit comments

Comments (0)