Skip to content

Commit 24cfd12

Browse files
Remove unused prompt and prediction selection functions
Deleted select_best_prediction, create_json_prompt, and create_initial_discussion_prompt functions, along with related imports and exports, to clean up unused code and simplify the API.
1 parent b7c7262 commit 24cfd12

File tree

3 files changed: 0 additions (+0) and 162 deletions (-162)

python/mllmcelltype/__init__.py

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -13,15 +13,12 @@
1313
from .functions import (
1414
get_provider,
1515
identify_controversial_clusters,
16-
select_best_prediction,
1716
)
1817
from .logger import setup_logging, write_log
1918
from .prompts import (
2019
create_batch_prompt,
2120
create_consensus_check_prompt,
2221
create_discussion_prompt,
23-
create_initial_discussion_prompt,
24-
create_json_prompt,
2522
create_prompt,
2623
)
2724
from .url_utils import (
@@ -66,7 +63,6 @@
6663
"get_provider",
6764
"clean_annotation",
6865
"identify_controversial_clusters",
69-
"select_best_prediction",
7066
# Logging
7167
"setup_logging",
7268
"write_log",
@@ -85,10 +81,8 @@
8581
# Prompts
8682
"create_prompt",
8783
"create_batch_prompt",
88-
"create_json_prompt",
8984
"create_discussion_prompt",
9085
"create_consensus_check_prompt",
91-
"create_initial_discussion_prompt",
9286
# Consensus
9387
"check_consensus",
9488
"process_controversial_clusters",

python/mllmcelltype/functions.py

Lines changed: 0 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -277,44 +277,6 @@ def get_provider(model: str) -> str:
277277
)
278278

279279

280-
def select_best_prediction(predictions: list[dict[str, str]]) -> dict[str, str]:
281-
"""Select the best prediction from multiple models.
282-
283-
Args:
284-
predictions: List of dictionaries mapping cluster IDs to cell type annotations
285-
286-
Returns:
287-
dict[str, str]: Dictionary mapping cluster IDs to best predictions
288-
289-
"""
290-
if not predictions:
291-
return {}
292-
293-
# Get all cluster IDs
294-
all_clusters = set()
295-
for prediction in predictions:
296-
all_clusters.update(prediction.keys())
297-
298-
# For each cluster, select the most specific prediction
299-
best_predictions = {}
300-
for cluster in all_clusters:
301-
cluster_predictions = [pred.get(cluster, "") for pred in predictions if cluster in pred]
302-
303-
# Filter out empty predictions
304-
cluster_predictions = [pred for pred in cluster_predictions if pred]
305-
306-
if not cluster_predictions:
307-
best_predictions[cluster] = "Unknown"
308-
continue
309-
310-
# Select the longest prediction (assuming it's more specific)
311-
# This is a simple heuristic and could be improved
312-
best_pred = max(cluster_predictions, key=len)
313-
best_predictions[cluster] = best_pred
314-
315-
return best_predictions
316-
317-
318280
def identify_controversial_clusters(
319281
annotations: dict[str, dict[str, str]], threshold: float = 0.6
320282
) -> list[str]:

python/mllmcelltype/prompts.py

Lines changed: 0 additions & 118 deletions
Original file line number | Diff line number | Diff line change
@@ -104,43 +104,6 @@ def create_consensus_check_prompt(annotations: list[str]) -> str:
104104
return prompt.replace("{annotations}", formatted_annotations)
105105

106106

107-
# Default JSON format prompt template
108-
DEFAULT_JSON_PROMPT_TEMPLATE = """You are an expert single-cell RNA-seq analyst specializing in cell type annotation.
109-
I need you to identify cell types of {species} cells from {tissue}.
110-
Below is a list of marker genes for each cluster.
111-
Please assign the most likely cell type to each cluster based on the marker genes.
112-
113-
IMPORTANT: Format your response as a valid JSON object as follows, using the EXACT SAME cluster IDs as provided in the input, and maintaining NUMERICAL ORDER:
114-
```json
115-
{{
116-
"annotations": [
117-
{{
118-
"cluster": "0",
119-
"cell_type": "T cells",
120-
"confidence": "high",
121-
"key_markers": ["CD3D", "CD3G", "CD3E"]
122-
}},
123-
{{
124-
"cluster": "1",
125-
"cell_type": "B cells",
126-
"confidence": "high",
127-
"key_markers": ["CD19", "CD79A", "MS4A1"]
128-
}},
129-
...
130-
]
131-
}}
132-
```
133-
134-
For each cluster, provide:
135-
1. The cluster ID (use the SAME ID as in the input)
136-
2. The cell type name (be concise but specific)
137-
3. Your confidence level (high, medium, low)
138-
4. A list of 2-4 key markers that support your annotation
139-
140-
Here are the marker genes for each cluster:
141-
{markers}
142-
"""
143-
144107
# Template for facilitating discussion for controversial clusters
145108
DEFAULT_DISCUSSION_TEMPLATE = """You are an expert in single-cell RNA-seq cell type annotation tasked with resolving disagreements between model predictions.
146109
@@ -335,33 +298,6 @@ def create_batch_prompt(
335298
return prompt
336299

337300

338-
def create_json_prompt(
339-
marker_genes: dict[str, list[str]],
340-
species: str,
341-
tissue: Optional[str] = None,
342-
additional_context: Optional[str] = None,
343-
) -> str:
344-
"""Create a prompt for cell type annotation with JSON output format.
345-
346-
Args:
347-
marker_genes: Dictionary mapping cluster names to lists of marker genes
348-
species: Species name (e.g., 'human', 'mouse')
349-
tissue: Tissue name (e.g., 'brain', 'blood')
350-
additional_context: Additional context to include in the prompt
351-
352-
Returns:
353-
str: The generated prompt
354-
355-
"""
356-
return create_prompt(
357-
marker_genes=marker_genes,
358-
species=species,
359-
tissue=tissue,
360-
additional_context=additional_context,
361-
prompt_template=DEFAULT_JSON_PROMPT_TEMPLATE,
362-
)
363-
364-
365301
def create_discussion_prompt(
366302
cluster_id: str,
367303
marker_genes: list[str],
@@ -526,57 +462,3 @@ def create_discussion_consensus_check_prompt(
526462

527463
write_log(f"Generated discussion consensus check prompt with {len(prompt)} characters")
528464
return prompt
529-
530-
531-
def create_initial_discussion_prompt(
532-
cluster_id: str, marker_genes: list[str], species: str, tissue: Optional[str] = None
533-
) -> str:
534-
"""Create a prompt for initial cell type discussion about a cluster.
535-
536-
Args:
537-
cluster_id: ID of the cluster
538-
marker_genes: List of marker genes for the cluster
539-
species: Species name (e.g., 'human', 'mouse')
540-
tissue: Tissue name (e.g., 'brain', 'blood')
541-
542-
Returns:
543-
str: The generated prompt
544-
545-
"""
546-
write_log(f"Creating initial discussion prompt for cluster {cluster_id}")
547-
548-
# Default tissue if none provided
549-
tissue_text = tissue if tissue else "unknown tissue"
550-
551-
# Format marker genes text
552-
marker_genes_text = ", ".join(marker_genes)
553-
554-
# Template for initial discussion
555-
template = """You are an expert in single-cell RNA-seq analysis, assigned to identify the cell type for a specific cluster.
556-
557-
Cluster ID: {cluster_id}
558-
Species: {species}
559-
Tissue: {tissue}
560-
561-
Marker genes: {marker_genes}
562-
563-
Your task:
564-
1. Analyze these marker genes and their expression patterns
565-
2. Consider the cell types that might express this combination of genes
566-
3. Provide a detailed reasoning process
567-
4. Determine the most likely cell type for this cluster
568-
569-
Give a thorough analysis, explaining which genes are most informative and why.
570-
End with a clear cell type determination.
571-
"""
572-
573-
# Fill in the template
574-
prompt = template.format(
575-
cluster_id=cluster_id,
576-
species=species,
577-
tissue=tissue_text,
578-
marker_genes=marker_genes_text,
579-
)
580-
581-
write_log(f"Generated initial discussion prompt with {len(prompt)} characters")
582-
return prompt

0 commit comments

Comments
 (0)