Resolve consensus provider and parsing fixes

cafferychen777 · cafferychen777 · commit 5f90bdeb86fd · 2026-02-08T23:52:09.000-06:00
Add a _resolve_consensus_provider helper to centralize resolution of the provider, model, and API key (it prefers an explicit consensus_model, then falls back to the first provider in api_keys that has a non-empty key). Refactor functions to accept consensus_model (previously consensus_check_model) and use the new resolver. Fix the proportion-parsing regexes to accept a bare 0 in addition to 1, improving the robustness of metric extraction. Update resolve_provider_base_url to return None when the provider is None. Improve format_results: make cluster-prefix matching case-insensitive, have line-by-line parsing always strip "Cluster X:" prefixes, log a warning when there are fewer result lines than clusters, and mark the missing entries as "Unknown". Update tests to reflect the prefix-stripping behavior.
diff --git a/python/mllmcelltype/consensus.py b/python/mllmcelltype/consensus.py
@@ -38,6 +38,42 @@
 }
 
 
+def _resolve_consensus_provider(
+    consensus_model: dict[str, str] | None,
+    api_keys: dict[str, str] | None,
+) -> tuple[str | None, str | None, str | None]:
+    """Resolve provider, model, and API key for consensus checking.
+
+    Resolution order: explicit ``consensus_model`` dict → first provider in
+    ``api_keys`` with a non-empty key. A ``None`` ``api_keys`` acts as empty.
+
+    Args:
+        consensus_model: Optional dict with 'provider' and/or 'model' keys
+        api_keys: Dictionary mapping provider names to API keys, or None
+
+    Returns:
+        tuple of (provider, model, api_key) — any element may be None
+    """
+    if consensus_model:
+        provider = consensus_model.get("provider")
+        model = consensus_model.get("model")
+        if not provider and model:
+            provider = get_provider(model)
+        if provider and not model:
+            model = get_default_model(provider)
+    else:
+        provider = None
+        model = None
+        for p, key in (api_keys or {}).items():
+            if key:
+                provider = p
+                model = get_default_model(p)
+                break
+
+    api_key = (api_keys or {}).get(provider) if provider else None
+    return provider, model, api_key
+
+
 def _call_llm_with_retry(
     prompt: str,
     provider: str,
@@ -188,7 +224,7 @@ def _extract_metrics_from_text(
 
     # Regex patterns (mirroring R's .CONSENSUS_CONSTANTS)
     consensus_indicator_pattern = r"^\s*[01]\s*$"
-    proportion_pattern = r"^\s*(0\.\d+|1\.0*|1)\s*$"
+    proportion_pattern = r"^\s*(0\.\d+|1\.0*|[01])\s*$"
     entropy_pattern = r"^\s*(\d+\.\d+|\d+)\s*$"
     general_numeric_pattern = r"^\s*\d+(\.\d+)?\s*$"
 
@@ -224,7 +260,7 @@ def _extract_metrics_from_text(
             parts = line.split("=")
             if len(parts) > 1:
                 last_part = parts[-1].strip()
-                value_match = re.search(r"(0\.\d+|1\.0*|1)", last_part)
+                value_match = re.search(r"(0\.\d+|1\.0*|[01])", last_part)
                 if value_match:
                     with contextlib.suppress(ValueError):
                         potential_cp = float(value_match.group(1))
@@ -407,26 +443,9 @@ def check_consensus(
         # Use LLM to check consensus among annotations
         prompt = create_consensus_check_prompt(cluster_annotations)
 
-        # Determine which model to use: explicit consensus_model → api_keys
-        if consensus_model:
-            primary_provider = consensus_model.get("provider")
-            primary_model = consensus_model.get("model")
-            if not primary_provider and primary_model:
-                primary_provider = get_provider(primary_model)
-            if primary_provider and not primary_model:
-                primary_model = get_default_model(primary_provider)
-        else:
-            # Pick from user's available api_keys
-            primary_provider = None
-            primary_model = None
-            if api_keys:
-                for provider, key in api_keys.items():
-                    if key:
-                        primary_provider = provider
-                        primary_model = get_default_model(provider)
-                        break
-
-        primary_api_key = api_keys.get(primary_provider) if primary_provider else None
+        primary_provider, primary_model, primary_api_key = _resolve_consensus_provider(
+            consensus_model, api_keys
+        )
 
         llm_response = _call_llm_with_retry(
             prompt=prompt,
@@ -528,7 +547,7 @@ def check_consensus_for_discussion_round(
     consensus_threshold: float = 0.7,
     entropy_threshold: float = 1.0,
     api_keys: dict[str, str] | None = None,
-    consensus_check_model: dict[str, str] | None = None,
+    consensus_model: dict[str, str] | None = None,
     base_urls: str | dict[str, str] | None = None,
 ) -> dict[str, Any]:
     """Check consensus among model responses for a single discussion round.
@@ -547,7 +566,7 @@ def check_consensus_for_discussion_round(
         consensus_threshold: Agreement threshold (default: 0.7)
         entropy_threshold: Entropy threshold (default: 1.0)
         api_keys: Dictionary mapping provider names to API keys
-        consensus_check_model: Optional dict with 'provider' and 'model' keys
+        consensus_model: Optional dict with 'provider' and 'model' keys
         base_urls: Custom base URLs for API endpoints
 
     Returns:
@@ -562,28 +581,12 @@ def check_consensus_for_discussion_round(
         write_log("No responses to check consensus", level="warning")
         return DEFAULT_CONSENSUS_RESULT.copy()
 
-    # Resolve LLM parameters: explicit consensus_check_model → api_keys → give up
     if api_keys is None:
         api_keys = {}
 
-    if consensus_check_model:
-        primary_provider = consensus_check_model.get("provider")
-        primary_model = consensus_check_model.get("model")
-        if not primary_provider and primary_model:
-            primary_provider = get_provider(primary_model)
-        if primary_provider and not primary_model:
-            primary_model = get_default_model(primary_provider)
-    else:
-        # Pick from user's available api_keys
-        primary_provider = None
-        primary_model = None
-        for provider, key in api_keys.items():
-            if key:
-                primary_provider = provider
-                primary_model = get_default_model(provider)
-                break
-
-    primary_api_key = api_keys.get(primary_provider) if primary_provider else None
+    primary_provider, primary_model, primary_api_key = _resolve_consensus_provider(
+        consensus_model, api_keys
+    )
 
     # Single response — extract label but cannot establish consensus
     if len(round_responses) == 1:
@@ -669,7 +672,7 @@ def process_controversial_clusters(
     cache_dir: str | None = None,
     base_urls: str | dict[str, str] | None = None,
     force_rerun: bool = False,
-    consensus_check_model: dict[str, str] | None = None,
+    consensus_model: dict[str, str] | None = None,
 ) -> tuple[dict[str, str], dict[str, list[dict]], dict[str, float], dict[str, float]]:
     """Process controversial clusters through multi-model discussion.
 
@@ -694,7 +697,7 @@ def process_controversial_clusters(
         cache_dir: Directory to store cache files
         base_urls: Custom base URLs for API endpoints
         force_rerun: If True, ignore cached results
-        consensus_check_model: Optional dict with 'provider' and 'model' keys
+        consensus_model: Optional dict with 'provider' and 'model' keys
             to specify which model to use for consensus checking with LLM.
             If not provided, picks from the caller's api_keys.
 
@@ -872,7 +875,7 @@ def process_controversial_clusters(
                     consensus_threshold=consensus_threshold,
                     entropy_threshold=entropy_threshold,
                     api_keys=api_keys,
-                    consensus_check_model=consensus_check_model,
+                    consensus_model=consensus_model,
                     base_urls=base_urls,
                 )
 
@@ -906,7 +909,7 @@ def process_controversial_clusters(
                         consensus_threshold=consensus_threshold,
                         entropy_threshold=entropy_threshold,
                         api_keys=api_keys,
-                        consensus_check_model=consensus_check_model,
+                        consensus_model=consensus_model,
                         base_urls=base_urls,
                     )
                     final_decision = last_consensus["majority_prediction"]
@@ -926,7 +929,7 @@ def process_controversial_clusters(
                         consensus_threshold=consensus_threshold,
                         entropy_threshold=entropy_threshold,
                         api_keys=api_keys,
-                        consensus_check_model=consensus_check_model,
+                        consensus_model=consensus_model,
                         base_urls=base_urls,
                     )
                     cell_type = last_consensus["majority_prediction"]
@@ -1241,7 +1244,7 @@ def interactive_consensus_annotation(
                 cache_dir=cache_dir,
                 base_urls=base_urls,
                 force_rerun=force_rerun,
-                consensus_check_model=consensus_model_dict,  # Pass consensus model for LLM verification
+                consensus_model=consensus_model_dict,
             )
 
             # Update consensus proportion and entropy for resolved clusters
diff --git a/python/mllmcelltype/url_utils.py b/python/mllmcelltype/url_utils.py
@@ -22,7 +22,7 @@ def resolve_provider_base_url(provider: str, base_urls: str | dict | None) -> st
     Returns:
         Resolved base URL or None
     """
-    if base_urls is None:
+    if base_urls is None or provider is None:
         return None
 
     if isinstance(base_urls, str):
diff --git a/python/mllmcelltype/utils.py b/python/mllmcelltype/utils.py
@@ -281,7 +281,7 @@ def format_results(results: list[str], clusters: list[str]) -> dict[str, str]:
 
     # Case 1: Try to parse the format "Cluster X: Annotation" (most common format from our prompts)
     result = {}
-    cluster_pattern = r"Cluster\s+(.+?):\s*(.*)"
+    cluster_pattern = r"(?i)Cluster\s+(.+?):\s*(.*)"
 
     # First pass: try to find annotations for each cluster by ID
     for cluster in clusters:
@@ -354,45 +354,27 @@ def format_results(results: list[str], clusters: list[str]) -> dict[str, str]:
     except (json.JSONDecodeError, ValueError, KeyError, TypeError, AttributeError) as e:
         write_log(f"Failed to parse JSON response: {e!s}", level="debug")
 
-    # Case 3: Check if this is a simple response where each line corresponds to a cluster
-    # This is the expected format from the R version
-    if len(clean_results) >= len(clusters):
-        # Simple case: one result per cluster
-        simple_result = {}
-        for i, cluster in enumerate(clusters):
-            if i < len(clean_results):
-                # Check if this line contains a cluster prefix and remove it
-                line = clean_results[i]
-                match = re.match(cluster_pattern, line)
-                if match:
-                    simple_result[str(cluster)] = match.group(2).strip()
-                else:
-                    simple_result[str(cluster)] = line.strip()
-            else:
-                simple_result[str(cluster)] = "Unknown"
-
-        write_log("Successfully parsed response as simple line-by-line format", level="info")
-        return simple_result
+    # Case 3: Line-by-line mapping — each line corresponds to a cluster
+    if len(clean_results) < len(clusters):
+        write_log(
+            f"Fewer result lines ({len(clean_results)}) than clusters ({len(clusters)}), "
+            "remaining clusters will be marked Unknown",
+            level="warning",
+        )
 
-    # Case 4: Fall back to the original method
-    write_log(
-        "Could not parse complex LLM response, falling back to simple mapping",
-        level="warning",
-    )
     result = {}
     for i, cluster in enumerate(clusters):
         if i < len(clean_results):
-            result[str(cluster)] = clean_results[i]
+            line = clean_results[i]
+            match = re.match(cluster_pattern, line)
+            if match:
+                result[str(cluster)] = match.group(2).strip()
+            else:
+                result[str(cluster)] = line.strip()
         else:
             result[str(cluster)] = "Unknown"
 
-    # Check if number of results matches number of clusters
-    if len(result) != len(clusters):
-        write_log(
-            f"Number of results ({len(result)}) does not match number of clusters ({len(clusters)})",
-            level="warning",
-        )
-
+    write_log("Parsed response as line-by-line format", level="info")
     return result
 
 
diff --git a/python/tests/test_core.py b/python/tests/test_core.py
@@ -175,9 +175,9 @@ def test_format_results_mismatched():
     assert "2" in formatted
     # The function adds "Unknown" for missing clusters
     assert "3" in formatted
-    # In simple mapping mode, it doesn't clean prefixes
-    assert "Cluster 1: T cells" in formatted["1"]
-    assert "Cluster 2: B cells" in formatted["2"]
+    # Line-by-line mapping strips "Cluster X:" prefix when present
+    assert formatted["1"] == "T cells"
+    assert formatted["2"] == "B cells"
     assert formatted["3"] == "Unknown"