diff --git a/mapper.py b/mapper.py
new file mode 100644
--- /dev/null
+++ b/mapper.py
@@ -0,0 +1,143 @@
+"""Map raw CSV column names onto schema fields with fuzzy matching."""
+
+import sys
+
+import pandas as pd
+from rapidfuzz import fuzz
+
+from schemas import SCHEMA_SYNONYMS
+from normalizer import normalize_column
+from validator import REQUIRED_FIELDS
+
+# Minimum fuzz.ratio score (0-100) required for each mapping status.
+AUTO_MAP_THRESHOLD = 90
+REVIEW_THRESHOLD = 70
+
+# Normalized column names whose presence marks a file as mutation data.
+MUTATION_KEYWORDS = frozenset({
+    "gene",
+    "gene_symbol",
+    "hugo_symbol",
+    "chromosome",
+    "start_position",
+    "end_position",
+})
+
+
+def map_column(column_name):
+    """Return the best schema match for one raw column name.
+
+    The name is normalized and scored against every synonym in
+    SCHEMA_SYNONYMS with fuzz.ratio; the highest score wins and the first
+    synonym seen wins ties.
+
+    Returns a dict holding the original and normalized names, the matched
+    schema field and synonym, the score as "confidence", and a "status" of
+    AUTO_MAPPED, REVIEW_REQUIRED or UNKNOWN.
+    """
+    normalized = normalize_column(column_name)
+
+    best_match = None
+    highest_score = 0
+    matched_synonym = None
+
+    for schema_field, synonyms in SCHEMA_SYNONYMS.items():
+        for synonym in synonyms:
+            score = fuzz.ratio(normalized, synonym)
+            if score > highest_score:
+                highest_score = score
+                best_match = schema_field
+                matched_synonym = synonym
+
+    if highest_score >= AUTO_MAP_THRESHOLD:
+        status = "AUTO_MAPPED"
+    elif highest_score >= REVIEW_THRESHOLD:
+        status = "REVIEW_REQUIRED"
+    else:
+        status = "UNKNOWN"
+
+    return {
+        "original_column": column_name,
+        "normalized_column": normalized,
+        "mapped_to": best_match,
+        "matched_synonym": matched_synonym,
+        "confidence": highest_score,
+        "status": status,
+    }
+
+
+def detect_schema(columns):
+    """Return "MUTATION" if any normalized column is a mutation keyword.
+
+    Returns "UNKNOWN" otherwise.
+    """
+    if any(normalize_column(col) in MUTATION_KEYWORDS for col in columns):
+        return "MUTATION"
+
+    return "UNKNOWN"
+
+
+def validate_mappings(mapped_results, schema_type):
+    """Return an error message for every required field left unmapped.
+
+    A match whose status is UNKNOWN scored below the review threshold, so it
+    does not count as providing a required field.
+    """
+    mapped_fields = {
+        item["mapped_to"]
+        for item in mapped_results
+        if item.get("status") != "UNKNOWN"
+    }
+
+    return [
+        f"Missing required field: {field}"
+        for field in REQUIRED_FIELDS.get(schema_type, [])
+        if field not in mapped_fields
+    ]
+
+
+def print_preview(results):
+    """Print each original column with its mapped field and confidence."""
+    print("\nCOLUMN MAPPINGS:\n")
+
+    for result in results:
+        print(
+            f"{result['original_column']}"
+            f" --> "
+            f"{result['mapped_to']}"
+            f" ({result['confidence']}%)"
+        )
+
+
+def process_file(csv_file):
+    """Detect a CSV file's schema, map its columns and print a report."""
+    # Only the header row is needed, so do not parse any data rows.
+    columns = list(pd.read_csv(csv_file, nrows=0).columns)
+
+    schema_type = detect_schema(columns)
+
+    print(f"\nDetected Schema: {schema_type}")
+
+    mapped_results = [map_column(col) for col in columns]
+
+    print_preview(mapped_results)
+
+    errors = validate_mappings(mapped_results, schema_type)
+
+    print("\nVALIDATION:\n")
+
+    if not errors:
+        print("Validation Passed ✅")
+    else:
+        for error in errors:
+            print(error)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print(
+            "Usage:\n"
+            "python mapper.py input.csv"
+        )
+    else:
+        process_file(sys.argv[1])
diff --git a/normalizer.py b/normalizer.py
new file mode 100644
--- /dev/null
+++ b/normalizer.py
@@ -0,0 +1,208 @@
+"""Normalize column names and categorical values to cBioPortal conventions."""
+
+import re
+
+import pandas as pd
+
+
+# -----------------------------------------------------------------------------
+# Canonical cBioPortal column mappings
+# -----------------------------------------------------------------------------
+
+# normalize_columns() looks names up in their underscore-separated form.
+COLUMN_MAP = {
+
+    # Patient identifiers
+    "patient": "PATIENT_ID",
+    "patient_id": "PATIENT_ID",
+    "patient id": "PATIENT_ID",
+    "sample_patient": "PATIENT_ID",
+
+    # Sample identifiers
+    "sample": "SAMPLE_ID",
+    "sample_id": "SAMPLE_ID",
+    "sample id": "SAMPLE_ID",
+
+    # Sex / gender
+    "gender": "SEX",
+    "sex": "SEX",
+
+    # Age
+    "age": "AGE",
+    "age_at_diagnosis": "AGE",
+
+    # Cancer type
+    "cancer_type": "CANCER_TYPE",
+    "tumor_type": "CANCER_TYPE",
+
+    # Survival
+    "os_status": "OS_STATUS",
+    "os_months": "OS_MONTHS",
+}
+
+# Lowercased raw value -> canonical value for the categorical columns.
+SEX_MAP = {
+    "m": "MALE",
+    "male": "MALE",
+    "f": "FEMALE",
+    "female": "FEMALE",
+}
+
+OS_STATUS_MAP = {
+    "0": "LIVING",
+    "1": "DECEASED",
+    "living": "LIVING",
+    "deceased": "DECEASED",
+}
+
+
+# -----------------------------------------------------------------------------
+# Normalize text helper
+# -----------------------------------------------------------------------------
+
+def normalize_text(text):
+    """
+    Normalize text for matching:
+    - lowercase
+    - strip surrounding whitespace
+    - replace underscores with spaces
+    """
+
+    return (
+        str(text)
+        .strip()
+        .lower()
+        .replace("_", " ")
+    )
+
+
+# -----------------------------------------------------------------------------
+# Single column normalizer
+# -----------------------------------------------------------------------------
+
+def normalize_column(column_name):
+    """
+    Normalize a single column name to lowercase snake_case.
+
+    Runs of whitespace and hyphens become one underscore, and any other
+    character outside a-z, 0-9 and "_" is dropped.
+
+    Example:
+        Tumor Sample Barcode -> tumor_sample_barcode
+    """
+
+    column_name = str(column_name).strip().lower()
+
+    column_name = re.sub(r"[\s\-]+", "_", column_name)
+
+    column_name = re.sub(r"[^a-z0-9_]", "", column_name)
+
+    column_name = re.sub(r"_+", "_", column_name)
+
+    return column_name
+
+
+# -----------------------------------------------------------------------------
+# Normalize column names in dataframe
+# -----------------------------------------------------------------------------
+
+def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Rename messy columns into canonical cBioPortal columns.
+
+    Each name is stripped, lowercased and underscore-joined before the
+    COLUMN_MAP lookup, so "Age at Diagnosis" and "age_at_diagnosis" both
+    become AGE. Names without a mapping fall back to the uppercased form of
+    that cleaned name.
+    """
+
+    new_columns = {}
+
+    for col in df.columns:
+
+        key = normalize_text(col).replace(" ", "_")
+
+        new_columns[col] = COLUMN_MAP.get(key, key.upper())
+
+    return df.rename(columns=new_columns)
+
+
+# -----------------------------------------------------------------------------
+# Normalize values inside columns
+# -----------------------------------------------------------------------------
+
+def _map_categorical(series, value_map):
+    """
+    Map a column through value_map, uppercasing values it does not list.
+    """
+
+    def convert(value):
+        # A numeric column with missing values is read as float, so 1 arrives
+        # as 1.0; fold it back so that it still matches the "1" key.
+        if isinstance(value, float) and value.is_integer():
+            value = int(value)
+
+        key = str(value).strip().lower()
+
+        return value_map.get(key, key.upper())
+
+    # na_action="ignore" leaves missing values missing instead of turning
+    # them into the literal string "NAN".
+    return series.map(convert, na_action="ignore")
+
+
+def normalize_values(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Normalize the categorical SEX and OS_STATUS columns, when present.
+
+    The columns are replaced on the frame that is passed in, which is also
+    returned.
+    """
+
+    if "SEX" in df.columns:
+        df["SEX"] = _map_categorical(df["SEX"], SEX_MAP)
+
+    if "OS_STATUS" in df.columns:
+        df["OS_STATUS"] = _map_categorical(df["OS_STATUS"], OS_STATUS_MAP)
+
+    return df
+
+
+# -----------------------------------------------------------------------------
+# Main normalization pipeline
+# -----------------------------------------------------------------------------
+
+def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Full normalization pipeline: rename columns, then normalize values.
+    """
+
+    df = normalize_columns(df)
+
+    df = normalize_values(df)
+
+    return df
+
+
+# -----------------------------------------------------------------------------
+# Example local testing
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+
+    data = {
+        "patient": ["P1", "P2"],
+        "gender": ["male", "f"],
+        "age": [45, 60],
+    }
+
+    df = pd.DataFrame(data)
+
+    print("ORIGINAL DATAFRAME")
+    print(df)
+
+    print("\nNORMALIZED DATAFRAME")
+
+    normalized_df = normalize_dataframe(df)
+
+    print(normalized_df)
diff --git a/requirements.txt b/requirements.txt
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ python-docx>=1.1.0
 plotly>=5.18.0
 requests>=2.31.0
 python-dotenv>=1.0.0
+rapidfuzz>=3.0.0
diff --git a/schemas.py b/schemas.py
new file mode 100644
index 0000000..9228496
--- /dev/null
+++ b/schemas.py
@@ -0,0 +1,53 @@
+SCHEMA_SYNONYMS = {
+
+    "SAMPLE_ID": [
+        "sample_id",
+        "sample",
+        "tumor_sample_barcode",
+        "sample_barcode",
+        "tumor_sample",
+        "sampleid"
+    ],
+
+    "PATIENT_ID": [
+        "patient_id",
+        "patient",
+        "patient_identifier",
+        "case_id",
+        "case",
+        "patientid"
+    ],
+
+    "HUGO_SYMBOL": [
+        "gene",
+        "gene_name",
+        "gene_symbol",
+        "hugo_symbol",
+        "symbol",
+        "geneid"
+    ],
+
+    "CHROMOSOME": [
+        "chromosome",
+        "chr",
+        "chrom"
+    ],
+
+    "START_POSITION": [
+        "start_position",
+        "start",
+        "position_start"
+    ],
+
+    "END_POSITION": [
+        "end_position",
+        "end",
+        "position_end"
+    ],
+
+    "VARIANT_CLASSIFICATION": [
+        "variant_classification",
+        "mutation_type",
+        "variant_type"
+    ]
+}
\ No newline at end of file
diff --git a/streamlit_app.py b/streamlit_app.py
index 2e55835..b6ee401 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -11,6 +11,7 @@
 """
 
 from __future__ import annotations
+from normalizer import normalize_dataframe
 
 import json
 import os
@@ -496,66 +497,158 @@ def _render_inline_report(meta: dict[str, Any], summary: dict[str, Any]) -> None
         type=["xlsx", "xls", "csv", "tsv", "txt", "tab", "maf"],
         key="detect_file",
     )
-    use_ai = st.checkbox("Use AI for ambiguous files", value=True, key="use_ai_detection")
+
+    use_ai = st.checkbox(
+        "Use AI for ambiguous files",
+        value=True,
+        key="use_ai_detection"
+    )
 
     if st.button("Classify File", disabled=detect_file is None):
+
         try:
             from cbio_detector import detect_file_type
             from file_parser import parse_file
+
         except Exception as exc:
             st.error(f"Could not load classification modules: {exc}")
             st.stop()
 
+        # ------------------------------------------------------------------
+        # Parse + Normalize
+        # ------------------------------------------------------------------
         with st.spinner("Parsing file..."):
+
             try:
-                df = parse_file(detect_file.getvalue(), detect_file.name)
+                df = parse_file(
+                    detect_file.getvalue(),
+                    detect_file.name
+                )
+
+                # Normalize dataframe
+                normalized_df = normalize_dataframe(df)
+
             except Exception as exc:
                 st.error(f"Could not read file: {exc}")
                 st.stop()
 
-        st.markdown("#### File Preview")
-        st.dataframe(df.head(10), use_container_width=True)
+        # ------------------------------------------------------------------
+        # Preview Tables
+        # ------------------------------------------------------------------
+        st.markdown("### Original File")
+        st.dataframe(
+            df.head(10),
+            use_container_width=True
+        )
 
+        st.markdown("### Normalized File")
+        st.dataframe(
+            normalized_df.head(10),
+            use_container_width=True
+        )
+
+        # ------------------------------------------------------------------
+        # Classification
+        # ------------------------------------------------------------------
         api_key = _get_api_key() if use_ai else None
+
         with st.spinner("Classifying file..."):
+
             try:
-                result = detect_file_type(df, anthropic_api_key=api_key)
+                result = detect_file_type(
+                    normalized_df,
+                    anthropic_api_key=api_key
+                )
+
             except Exception as exc:
                 st.error(f"Classification failed: {exc}")
                 st.stop()
 
         st.divider()
+
         col1, col2, col3 = st.columns(3)
-        col1.metric("Detected Format", result.get("type", "—"))
-        col2.metric("Confidence", f"{float(result.get('confidence', 0)) * 100:.0f}%")
-        col3.metric("Method", "Rule-based" if result.get("method") == "heuristic" else result.get("method", "—"))
+
+        col1.metric(
+            "Detected Format",
+            result.get("type", "—")
+        )
+
+        col2.metric(
+            "Confidence",
+            f"{float(result.get('confidence', 0)) * 100:.0f}%"
+        )
+
+        col3.metric(
+            "Method",
+            "Rule-based"
+            if result.get("method") == "heuristic"
+            else result.get("method", "—")
+        )
 
         if result.get("reasoning"):
             st.info(result["reasoning"])
+
         if result.get("low_confidence"):
-            st.warning("Confidence is low — please verify the detected format manually.")
+            st.warning(
+                "Confidence is low — please verify the detected format manually."
+            )
 
         mappings = result.get("column_mappings") or {}
+
         if mappings:
             st.markdown("#### Suggested Column Mappings")
+
             st.dataframe(
-                pd.DataFrame(list(mappings.items()), columns=["Original Column", "cBioPortal Column"]),
+                pd.DataFrame(
+                    list(mappings.items()),
+                    columns=[
+                        "Original Column",
+                        "cBioPortal Column"
+                    ]
+                ),
                 use_container_width=True,
                 hide_index=True,
             )
 
+        # ------------------------------------------------------------------
+        # Detailed Spec Matching
+        # ------------------------------------------------------------------
         try:
             from spec_match import classify_sheet
-            spec_result = classify_sheet(df)
+            spec_result = classify_sheet(normalized_df)
+
             with st.expander("Detailed classification scores"):
-                st.markdown(f"**Best match:** {spec_result.format_key} ({spec_result.confidence:.1f}% confidence)")
-                st.markdown(f"**Target file:** {spec_result.target_file}")
+
+                st.markdown(
+                    f"**Best match:** "
+                    f"{spec_result.format_key} "
+                    f"({spec_result.confidence:.1f}% confidence)"
+                )
+
+                st.markdown(
+                    f"**Target file:** "
+                    f"{spec_result.target_file}"
+                )
+
                 if spec_result.required_missing:
-                    st.warning("Missing required columns: " + ", ".join(spec_result.required_missing))
+                    st.warning(
+                        "Missing required columns: "
+                        + ", ".join(spec_result.required_missing)
+                    )
+
                 if spec_result.required_present:
-                    st.success("Required columns found: " + ", ".join(spec_result.required_present))
+                    st.success(
+                        "Required columns found: "
+                        + ", ".join(spec_result.required_present)
+                    )
+
                 if spec_result.all_scores:
-                    st.dataframe(pd.DataFrame(spec_result.all_scores), use_container_width=True, hide_index=True)
+                    st.dataframe(
+                        pd.DataFrame(spec_result.all_scores),
+                        use_container_width=True,
+                        hide_index=True
+                    )
+
         except Exception:
             pass
 
diff --git a/test.csv b/test.csv
new file mode 100644
index 0000000..8b482f7
--- /dev/null
+++ b/test.csv
@@ -0,0 +1,3 @@
+Tumor Sample Barcode,Gene Name,Patient Identifier
+S1,TP53,P1
+S2,BRCA1,P2
\ No newline at end of file
diff --git a/validator.py b/validator.py
new file mode 100644
index 0000000..66fa156
--- /dev/null
+++ b/validator.py
@@ -0,0 +1,10 @@
+REQUIRED_FIELDS = {
+    "MUTATION": [
+        "HUGO_SYMBOL",
+        "SAMPLE_ID"
+    ],
+
+    "CNA": [
+        "SAMPLE_ID"
+    ]
+}
\ No newline at end of file