Remove unused prompt and prediction selection functions

cafferychen777 · cafferychen777 · commit 24cfd12c1b96 · 2026-01-26T02:16:47.000-06:00
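Deleted the select_best_prediction, create_json_prompt, and create_initial_discussion_prompt functions, together with the DEFAULT_JSON_PROMPT_TEMPLATE constant that create_json_prompt used and the related package-level imports and exports, to clean up unused code and simplify the API.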
diff --git a/python/mllmcelltype/__init__.py b/python/mllmcelltype/__init__.py
@@ -13,15 +13,12 @@
 from .functions import (
     get_provider,
     identify_controversial_clusters,
-    select_best_prediction,
 )
 from .logger import setup_logging, write_log
 from .prompts import (
     create_batch_prompt,
     create_consensus_check_prompt,
     create_discussion_prompt,
-    create_initial_discussion_prompt,
-    create_json_prompt,
     create_prompt,
 )
 from .url_utils import (
@@ -66,7 +63,6 @@
     "get_provider",
     "clean_annotation",
     "identify_controversial_clusters",
-    "select_best_prediction",
     # Logging
     "setup_logging",
     "write_log",
@@ -85,10 +81,8 @@
     # Prompts
     "create_prompt",
     "create_batch_prompt",
-    "create_json_prompt",
     "create_discussion_prompt",
     "create_consensus_check_prompt",
-    "create_initial_discussion_prompt",
     # Consensus
     "check_consensus",
     "process_controversial_clusters",
diff --git a/python/mllmcelltype/functions.py b/python/mllmcelltype/functions.py
@@ -277,44 +277,6 @@ def get_provider(model: str) -> str:
     )
 
 
-def select_best_prediction(predictions: list[dict[str, str]]) -> dict[str, str]:
-    """Select the best prediction from multiple models.
-
-    Args:
-        predictions: List of dictionaries mapping cluster IDs to cell type annotations
-
-    Returns:
-        dict[str, str]: Dictionary mapping cluster IDs to best predictions
-
-    """
-    if not predictions:
-        return {}
-
-    # Get all cluster IDs
-    all_clusters = set()
-    for prediction in predictions:
-        all_clusters.update(prediction.keys())
-
-    # For each cluster, select the most specific prediction
-    best_predictions = {}
-    for cluster in all_clusters:
-        cluster_predictions = [pred.get(cluster, "") for pred in predictions if cluster in pred]
-
-        # Filter out empty predictions
-        cluster_predictions = [pred for pred in cluster_predictions if pred]
-
-        if not cluster_predictions:
-            best_predictions[cluster] = "Unknown"
-            continue
-
-        # Select the longest prediction (assuming it's more specific)
-        # This is a simple heuristic and could be improved
-        best_pred = max(cluster_predictions, key=len)
-        best_predictions[cluster] = best_pred
-
-    return best_predictions
-
-
 def identify_controversial_clusters(
     annotations: dict[str, dict[str, str]], threshold: float = 0.6
 ) -> list[str]:
diff --git a/python/mllmcelltype/prompts.py b/python/mllmcelltype/prompts.py
@@ -104,43 +104,6 @@ def create_consensus_check_prompt(annotations: list[str]) -> str:
     return prompt.replace("{annotations}", formatted_annotations)
 
 
-# Default JSON format prompt template
-DEFAULT_JSON_PROMPT_TEMPLATE = """You are an expert single-cell RNA-seq analyst specializing in cell type annotation.
-I need you to identify cell types of {species} cells from {tissue}.
-Below is a list of marker genes for each cluster.
-Please assign the most likely cell type to each cluster based on the marker genes.
-
-IMPORTANT: Format your response as a valid JSON object as follows, using the EXACT SAME cluster IDs as provided in the input, and maintaining NUMERICAL ORDER:
-```json
-{{
-  "annotations": [
-    {{
-      "cluster": "0",
-      "cell_type": "T cells",
-      "confidence": "high",
-      "key_markers": ["CD3D", "CD3G", "CD3E"]
-    }},
-    {{
-      "cluster": "1",
-      "cell_type": "B cells",
-      "confidence": "high",
-      "key_markers": ["CD19", "CD79A", "MS4A1"]
-    }},
-    ...
-  ]
-}}
-```
-
-For each cluster, provide:
-1. The cluster ID (use the SAME ID as in the input)
-2. The cell type name (be concise but specific)
-3. Your confidence level (high, medium, low)
-4. A list of 2-4 key markers that support your annotation
-
-Here are the marker genes for each cluster:
-{markers}
-"""
-
 # Template for facilitating discussion for controversial clusters
 DEFAULT_DISCUSSION_TEMPLATE = """You are an expert in single-cell RNA-seq cell type annotation tasked with resolving disagreements between model predictions.
 
@@ -335,33 +298,6 @@ def create_batch_prompt(
     return prompt
 
 
-def create_json_prompt(
-    marker_genes: dict[str, list[str]],
-    species: str,
-    tissue: Optional[str] = None,
-    additional_context: Optional[str] = None,
-) -> str:
-    """Create a prompt for cell type annotation with JSON output format.
-
-    Args:
-        marker_genes: Dictionary mapping cluster names to lists of marker genes
-        species: Species name (e.g., 'human', 'mouse')
-        tissue: Tissue name (e.g., 'brain', 'blood')
-        additional_context: Additional context to include in the prompt
-
-    Returns:
-        str: The generated prompt
-
-    """
-    return create_prompt(
-        marker_genes=marker_genes,
-        species=species,
-        tissue=tissue,
-        additional_context=additional_context,
-        prompt_template=DEFAULT_JSON_PROMPT_TEMPLATE,
-    )
-
-
 def create_discussion_prompt(
     cluster_id: str,
     marker_genes: list[str],
@@ -526,57 +462,3 @@ def create_discussion_consensus_check_prompt(
 
     write_log(f"Generated discussion consensus check prompt with {len(prompt)} characters")
     return prompt
-
-
-def create_initial_discussion_prompt(
-    cluster_id: str, marker_genes: list[str], species: str, tissue: Optional[str] = None
-) -> str:
-    """Create a prompt for initial cell type discussion about a cluster.
-
-    Args:
-        cluster_id: ID of the cluster
-        marker_genes: List of marker genes for the cluster
-        species: Species name (e.g., 'human', 'mouse')
-        tissue: Tissue name (e.g., 'brain', 'blood')
-
-    Returns:
-        str: The generated prompt
-
-    """
-    write_log(f"Creating initial discussion prompt for cluster {cluster_id}")
-
-    # Default tissue if none provided
-    tissue_text = tissue if tissue else "unknown tissue"
-
-    # Format marker genes text
-    marker_genes_text = ", ".join(marker_genes)
-
-    # Template for initial discussion
-    template = """You are an expert in single-cell RNA-seq analysis, assigned to identify the cell type for a specific cluster.
-
-Cluster ID: {cluster_id}
-Species: {species}
-Tissue: {tissue}
-
-Marker genes: {marker_genes}
-
-Your task:
-1. Analyze these marker genes and their expression patterns
-2. Consider the cell types that might express this combination of genes
-3. Provide a detailed reasoning process
-4. Determine the most likely cell type for this cluster
-
-Give a thorough analysis, explaining which genes are most informative and why.
-End with a clear cell type determination.
-"""
-
-    # Fill in the template
-    prompt = template.format(
-        cluster_id=cluster_id,
-        species=species,
-        tissue=tissue_text,
-        marker_genes=marker_genes_text,
-    )
-
-    write_log(f"Generated initial discussion prompt with {len(prompt)} characters")
-    return prompt