diff --git a/__pycache__/normalizer.cpython-314.pyc b/__pycache__/normalizer.cpython-314.pyc
new file mode 100644
index 0000000..afd2926
Binary files /dev/null and b/__pycache__/normalizer.cpython-314.pyc differ
diff --git a/__pycache__/schemas.cpython-314.pyc b/__pycache__/schemas.cpython-314.pyc
new file mode 100644
index 0000000..8ec9823
Binary files /dev/null and b/__pycache__/schemas.cpython-314.pyc differ
diff --git a/__pycache__/validator.cpython-314.pyc b/__pycache__/validator.cpython-314.pyc
new file mode 100644
index 0000000..daea8fd
Binary files /dev/null and b/__pycache__/validator.cpython-314.pyc differ
diff --git a/mapper.py b/mapper.py
new file mode 100644
index 0000000..38fa932
--- /dev/null
+++ b/mapper.py
@@ -0,0 +1,162 @@
+import pandas as pd
+from rapidfuzz import fuzz
+
+from schemas import SCHEMA_SYNONYMS
+from normalizer import normalize_column
+from validator import REQUIRED_FIELDS
+
+
+def map_column(column_name):
+    """
+    Fuzzy-match one raw column header against every schema synonym.
+
+    The header is cleaned with normalize_column() and scored against each
+    entry of SCHEMA_SYNONYMS with fuzz.ratio; the first synonym to reach
+    the highest non-zero score wins. Returns a dict whose status is
+    AUTO_MAPPED (score >= 90), REVIEW_REQUIRED (score >= 70) or UNKNOWN;
+    mapped_to holds the best candidate even when the status is UNKNOWN.
+    """
+    cleaned = normalize_column(column_name)
+    # Flatten the synonym table into (score, field, synonym) candidates.
+    scored = (
+        (fuzz.ratio(cleaned, alias), field, alias)
+        for field, aliases in SCHEMA_SYNONYMS.items()
+        for alias in aliases
+    )
+    top_score, top_field, top_alias = 0, None, None
+    for candidate in scored:
+        # Strict comparison keeps the earliest candidate on ties.
+        if candidate[0] > top_score:
+            top_score, top_field, top_alias = candidate
+
+    status = (
+        "AUTO_MAPPED" if top_score >= 90
+        else "REVIEW_REQUIRED" if top_score >= 70
+        else "UNKNOWN"
+    )
+    return {
+        "original_column": column_name,
+        "normalized_column": cleaned,
+        "mapped_to": top_field,
+        "matched_synonym": top_alias,
+        "confidence": top_score,
+        "status": status,
+    }
+
+
+def detect_schema(columns):
+    """
+    Guess the schema type of a file from its raw column headers.
+
+    Headers are cleaned with normalize_column() and matched exactly
+    against the SCHEMA_SYNONYMS lists of the mutation-specific fields, so
+    detection stays in step with map_column() (a hand-kept keyword list
+    missed synonyms such as "gene_name").
+
+    Returns "MUTATION" if any header matches, otherwise "UNKNOWN".
+    """
+    mutation_fields = (
+        "HUGO_SYMBOL",
+        "CHROMOSOME",
+        "START_POSITION",
+        "END_POSITION",
+    )
+    # Set membership: one O(1) lookup per header.
+    mutation_keywords = {
+        synonym
+        for field in mutation_fields
+        for synonym in SCHEMA_SYNONYMS.get(field, ())
+    }
+    if any(normalize_column(col) in mutation_keywords for col in columns):
+        return "MUTATION"
+
+    return "UNKNOWN"
+
+
+def validate_mappings(mapped_results, schema_type):
+    """
+    List the required fields that no column was credibly mapped to.
+
+    Results with status "UNKNOWN" are skipped: map_column() fills in
+    mapped_to even for those, so counting them would let a poor fuzzy
+    guess satisfy a required field. Schema types missing from
+    REQUIRED_FIELDS require nothing. Returns a list of
+    "Missing required field: <name>" strings, empty when all are present.
+    """
+    mapped_fields = {
+        item["mapped_to"]
+        for item in mapped_results
+        if item.get("status") != "UNKNOWN"
+    }
+    return [
+        f"Missing required field: {field}"
+        for field in REQUIRED_FIELDS.get(schema_type, [])
+        if field not in mapped_fields
+    ]
+
+
+def print_preview(results):
+    """
+    Print one "<original> --> <mapped> (<confidence>%)" line per result.
+
+    Each result must provide the keys original_column, mapped_to and
+    confidence. A "COLUMN MAPPINGS:" heading is printed first.
+    """
+    print("\nCOLUMN MAPPINGS:\n")
+
+    for entry in results:
+        source, target = entry["original_column"], entry["mapped_to"]
+        print(f"{source} --> {target} ({entry['confidence']}%)")
+
+
+def process_file(csv_file):
+    """
+    Map the headers of a CSV file to schema fields and print a report.
+
+    Prints the detected schema type, one mapping line per column and the
+    validation outcome. Nothing is returned.
+
+    Args:
+        csv_file: path or buffer accepted by pandas.read_csv; only its
+            header row is read.
+    """
+
+    # Only the column names are used, so nrows=0 avoids loading every data
+    # row of a potentially large file into memory.
+    header = pd.read_csv(csv_file, nrows=0)
+    columns = list(header.columns)
+
+    schema_type = detect_schema(columns)
+    print(f"\nDetected Schema: {schema_type}")
+
+    mapped_results = [map_column(col) for col in columns]
+    print_preview(mapped_results)
+
+    # NOTE: a schema type with no entry in REQUIRED_FIELDS (for example
+    # "UNKNOWN") has no required fields, so it always passes validation.
+    errors = validate_mappings(mapped_results, schema_type)
+
+    print("\nVALIDATION:\n")
+
+    if not errors:
+        print("Validation Passed ✅")
+        return
+
+    for error in errors:
+        print(error)
+
+
+if __name__ == "__main__":
+    import sys
+
+    # Command-line entry point: expects exactly one argument, the CSV path.
+    if len(sys.argv) != 2:
+        # sys.exit() with a string writes it to stderr and exits with
+        # status 1, so shells and scripts can detect the misuse (a plain
+        # print() left the exit status at 0).
+        sys.exit(
+            "Usage:\n"
+            "python mapper.py input.csv"
+        )
+
+    process_file(sys.argv[1])
\ No newline at end of file
diff --git a/normalizer.py b/normalizer.py
new file mode 100644
index 0000000..76f76ed
--- /dev/null
+++ b/normalizer.py
@@ -0,0 +1,203 @@
+import re
+import pandas as pd
+
+
+# -----------------------------------------------------------------------------
+# Canonical cBioPortal column mappings
+# -----------------------------------------------------------------------------
+
+# COLUMN_MAP: raw header spelling -> canonical cBioPortal column name.
+#
+# The table is written per canonical column and flattened by the
+# comprehension, so all accepted spellings of one column sit together
+# and the resulting key order follows the rows below.
+# normalize_columns() looks headers up here after passing them through
+# normalize_text().
+COLUMN_MAP = {
+    alias: canonical
+    for canonical, aliases in (
+        # Patient identifiers
+        (
+            "PATIENT_ID",
+            ("patient", "patient_id", "patient id", "sample_patient"),
+        ),
+        # Sample identifiers
+        ("SAMPLE_ID", ("sample", "sample_id", "sample id")),
+        # Sex / gender
+        ("SEX", ("gender", "sex")),
+        # Age
+        ("AGE", ("age", "age_at_diagnosis")),
+        # Cancer type
+        ("CANCER_TYPE", ("cancer_type", "tumor_type")),
+        # Survival
+        ("OS_STATUS", ("os_status",)),
+        ("OS_MONTHS", ("os_months",)),
+    )
+    for alias in aliases
+}
+
+
+# -----------------------------------------------------------------------------
+# Normalize text helper
+# -----------------------------------------------------------------------------
+
+def normalize_text(text):
+    """
+    Prepare a value for lookup-style matching.
+
+    The value is converted with str(), stripped of surrounding
+    whitespace, lower-cased, and every underscore is turned into a
+    single space.
+    """
+    as_string = str(text)
+    trimmed = as_string.strip()
+    lowered = trimmed.lower()
+
+    # Underscores become spaces, so "patient_id" -> "patient id".
+    return lowered.replace("_", " ")
+
+
+# -----------------------------------------------------------------------------
+# Normalize a single column name
+# -----------------------------------------------------------------------------
+
+def normalize_column(column_name):
+    """
+    Normalize a SINGLE column name to lower-case snake_case.
+
+    Runs of whitespace/hyphens become one underscore, every character
+    outside [a-z0-9_] is dropped, repeated underscores are collapsed and
+    underscores left dangling at either end are trimmed (previously
+    "Gene #" produced "gene_", which breaks exact keyword matching).
+
+    Example:
+    Tumor Sample Barcode
+    ->
+    tumor_sample_barcode
+    """
+    name = str(column_name).strip().lower()
+    name = re.sub(r"[\s\-]+", "_", name)
+    name = re.sub(r"[^a-z0-9_]", "", name)
+    name = re.sub(r"_+", "_", name)
+    return name.strip("_")
+
+
+# -----------------------------------------------------------------------------
+# Normalize column names in dataframe
+# -----------------------------------------------------------------------------
+
+def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Rename messy columns into canonical cBioPortal columns.
+
+    Headers are compared through normalize_text(), and so are the
+    COLUMN_MAP keys: normalize_text() turns "_" into " ", so keys written
+    with underscores ("age_at_diagnosis", "tumor_type", ...) could
+    otherwise never match and those columns silently kept their own
+    upper-cased name instead of the canonical one.
+
+    Args:
+        df: frame whose column labels should be canonicalized.
+
+    Returns:
+        A renamed frame (the result of DataFrame.rename). Unmapped
+        columns get their cleaned name, upper-cased with "_" separators.
+    """
+    canonical = {normalize_text(key): name for key, name in COLUMN_MAP.items()}
+
+    new_columns = {}
+    for col in df.columns:
+        cleaned = normalize_text(col)
+        # Fallback: upper-cased, underscore-separated version of the header.
+        fallback = cleaned.replace(" ", "_").upper()
+        new_columns[col] = canonical.get(cleaned, fallback)
+
+    return df.rename(columns=new_columns)
+
+
+# -----------------------------------------------------------------------------
+# Normalize values inside columns
+# -----------------------------------------------------------------------------
+
+def _normalize_category(series, mapping):
+    """Map a column's values through *mapping*, keeping missing values."""
+    cleaned = series.astype(str).str.strip().str.lower()
+    mapped = cleaned.map(
+        lambda value: mapping.get(value, value.upper()),
+        na_action="ignore",
+    )
+    # astype(str) can render NaN/None as text; put real missing values back.
+    return mapped.where(series.notna())
+
+
+def normalize_values(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Normalize categorical values.
+
+    SEX becomes MALE/FEMALE and OS_STATUS becomes LIVING/DECEASED; other
+    values are stripped and upper-cased, missing values stay missing.
+    The frame passed in is modified and returned.
+    """
+    if "SEX" in df.columns:
+        sex_map = {
+            "m": "MALE",
+            "male": "MALE",
+            "f": "FEMALE",
+            "female": "FEMALE",
+        }
+        df["SEX"] = _normalize_category(df["SEX"], sex_map)
+
+    if "OS_STATUS" in df.columns:
+        os_map = {
+            "0": "LIVING",
+            "1": "DECEASED",
+            # A 0/1 column containing blanks is read as float by pandas,
+            # so its values stringify as "0.0" / "1.0".
+            "0.0": "LIVING",
+            "1.0": "DECEASED",
+            "living": "LIVING",
+            "deceased": "DECEASED",
+        }
+        df["OS_STATUS"] = _normalize_category(df["OS_STATUS"], os_map)
+
+    return df
+
+
+# -----------------------------------------------------------------------------
+# Main normalization pipeline
+# -----------------------------------------------------------------------------
+
+def normalize_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Full normalization pipeline.
+
+    Column names are canonicalized first (normalize_columns) because the
+    value clean-up (normalize_values) looks for the canonical names.
+    """
+    renamed = normalize_columns(df)
+
+    return normalize_values(renamed)
+
+
+# -----------------------------------------------------------------------------
+# Example local testing
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+
+    # Smoke test for running this module directly: push a tiny hand-made
+    # frame through the whole pipeline and print it before and after.
+    data = {
+        "patient": ["P1", "P2"],
+        "gender": ["male", "f"],
+        "age": [45, 60],
+    }
+    df = pd.DataFrame(data)
+
+    print("ORIGINAL DATAFRAME", df, sep="\n")
+
+    # The heading is printed before the pipeline runs.
+    print("\nNORMALIZED DATAFRAME")
+    normalized_df = normalize_dataframe(df)
+
+    print(normalized_df)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 179b8ff..454dff1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ python-docx>=1.1.0
 plotly>=5.18.0
 requests>=2.31.0
 python-dotenv>=1.0.0
+rapidfuzz
diff --git a/schemas.py b/schemas.py
new file mode 100644
index 0000000..9228496
--- /dev/null
+++ b/schemas.py
@@ -0,0 +1,53 @@
+SCHEMA_SYNONYMS = {
+    # Sample identifiers
+    "SAMPLE_ID": [
+        "sample_id",
+        "sample",
+        "tumor_sample_barcode",
+        "sample_barcode",
+        "tumor_sample",
+        "sampleid",
+    ],
+    # Patient / case identifiers
+    "PATIENT_ID": [
+        "patient_id",
+        "patient",
+        "patient_identifier",
+        "case_id",
+        "case",
+        "patientid",
+    ],
+    # Gene symbol. NOTE(review): "geneid" may hold numeric IDs, not symbols - confirm.
+    "HUGO_SYMBOL": [
+        "gene",
+        "gene_name",
+        "gene_symbol",
+        "hugo_symbol",
+        "symbol",
+        "geneid",
+    ],
+    # Chromosome
+    "CHROMOSOME": [
+        "chromosome",
+        "chr",
+        "chrom",
+    ],
+    # Start coordinate
+    "START_POSITION": [
+        "start_position",
+        "start",
+        "position_start",
+    ],
+    # End coordinate
+    "END_POSITION": [
+        "end_position",
+        "end",
+        "position_end",
+    ],
+    # Variant classification
+    "VARIANT_CLASSIFICATION": [
+        "variant_classification",
+        "mutation_type",
+        "variant_type",
+    ],
+}
\ No newline at end of file
diff --git a/streamlit_app.py b/streamlit_app.py
index 2e55835..b6ee401 100644
--- a/streamlit_app.py
+++ b/streamlit_app.py
@@ -11,6 +11,7 @@
 """
 
 from __future__ import annotations
+from normalizer import normalize_dataframe
 
 import json
 import os
@@ -496,66 +497,158 @@ def _render_inline_report(meta: dict[str, Any], summary: dict[str, Any]) -> None
         type=["xlsx", "xls", "csv", "tsv", "txt", "tab", "maf"],
         key="detect_file",
     )
-    use_ai = st.checkbox("Use AI for ambiguous files", value=True, key="use_ai_detection")
+
+    use_ai = st.checkbox(
+        "Use AI for ambiguous files",
+        value=True,
+        key="use_ai_detection"
+    )
 
     if st.button("Classify File", disabled=detect_file is None):
+
         try:
             from cbio_detector import detect_file_type
             from file_parser import parse_file
+
         except Exception as exc:
             st.error(f"Could not load classification modules: {exc}")
             st.stop()
 
+        # ------------------------------------------------------------------
+        # Parse + Normalize
+        # ------------------------------------------------------------------
         with st.spinner("Parsing file..."):
+
             try:
-                df = parse_file(detect_file.getvalue(), detect_file.name)
+                df = parse_file(
+                    detect_file.getvalue(),
+                    detect_file.name
+                )
+
+                # Normalize dataframe
+                normalized_df = normalize_dataframe(df)
+
             except Exception as exc:
                 st.error(f"Could not read file: {exc}")
                 st.stop()
 
-        st.markdown("#### File Preview")
-        st.dataframe(df.head(10), use_container_width=True)
+        # ------------------------------------------------------------------
+        # Preview Tables
+        # ------------------------------------------------------------------
+        st.markdown("### Original File")
+        st.dataframe(
+            df.head(10),
+            use_container_width=True
+        )
 
+        st.markdown("### Normalized File")
+        st.dataframe(
+            normalized_df.head(10),
+            use_container_width=True
+        )
+
+        # ------------------------------------------------------------------
+        # Classification
+        # ------------------------------------------------------------------
         api_key = _get_api_key() if use_ai else None
+
         with st.spinner("Classifying file..."):
+
             try:
-                result = detect_file_type(df, anthropic_api_key=api_key)
+                result = detect_file_type(
+                    normalized_df,
+                    anthropic_api_key=api_key
+                )
+
             except Exception as exc:
                 st.error(f"Classification failed: {exc}")
                 st.stop()
 
         st.divider()
+
         col1, col2, col3 = st.columns(3)
-        col1.metric("Detected Format", result.get("type", "—"))
-        col2.metric("Confidence", f"{float(result.get('confidence', 0)) * 100:.0f}%")
-        col3.metric("Method", "Rule-based" if result.get("method") == "heuristic" else result.get("method", "—"))
+
+        col1.metric(
+            "Detected Format",
+            result.get("type", "—")
+        )
+
+        col2.metric(
+            "Confidence",
+            f"{float(result.get('confidence', 0)) * 100:.0f}%"
+        )
+
+        col3.metric(
+            "Method",
+            "Rule-based"
+            if result.get("method") == "heuristic"
+            else result.get("method", "—")
+        )
 
         if result.get("reasoning"):
             st.info(result["reasoning"])
+
         if result.get("low_confidence"):
-            st.warning("Confidence is low — please verify the detected format manually.")
+            st.warning(
+                "Confidence is low — please verify the detected format manually."
+            )
 
         mappings = result.get("column_mappings") or {}
+
         if mappings:
             st.markdown("#### Suggested Column Mappings")
+
             st.dataframe(
-                pd.DataFrame(list(mappings.items()), columns=["Original Column", "cBioPortal Column"]),
+                pd.DataFrame(
+                    list(mappings.items()),
+                    columns=[
+                        "Original Column",
+                        "cBioPortal Column"
+                    ]
+                ),
                 use_container_width=True,
                 hide_index=True,
             )
 
+        # ------------------------------------------------------------------
+        # Detailed Spec Matching
+        # ------------------------------------------------------------------
         try:
             from spec_match import classify_sheet
 
-            spec_result = classify_sheet(df)
+            spec_result = classify_sheet(normalized_df)
+
             with st.expander("Detailed classification scores"):
-                st.markdown(f"**Best match:** {spec_result.format_key} ({spec_result.confidence:.1f}% confidence)")
-                st.markdown(f"**Target file:** {spec_result.target_file}")
+
+                st.markdown(
+                    f"**Best match:** "
+                    f"{spec_result.format_key} "
+                    f"({spec_result.confidence:.1f}% confidence)"
+                )
+
+                st.markdown(
+                    f"**Target file:** "
+                    f"{spec_result.target_file}"
+                )
+
                 if spec_result.required_missing:
-                    st.warning("Missing required columns: " + ", ".join(spec_result.required_missing))
+                    st.warning(
+                        "Missing required columns: "
+                        + ", ".join(spec_result.required_missing)
+                    )
+
                 if spec_result.required_present:
-                    st.success("Required columns found: " + ", ".join(spec_result.required_present))
+                    st.success(
+                        "Required columns found: "
+                        + ", ".join(spec_result.required_present)
+                    )
+
                 if spec_result.all_scores:
-                    st.dataframe(pd.DataFrame(spec_result.all_scores), use_container_width=True, hide_index=True)
+                    st.dataframe(
+                        pd.DataFrame(spec_result.all_scores),
+                        use_container_width=True,
+                        hide_index=True
+                    )
+
         except Exception:
             pass
diff --git a/test.csv b/test.csv
new file mode 100644
index 0000000..8b482f7
--- /dev/null
+++ b/test.csv
@@ -0,0 +1,3 @@
+Tumor Sample Barcode,Gene Name,Patient Identifier
+S1,TP53,P1
+S2,BRCA1,P2
\ No newline at end of file
diff --git a/validator.py b/validator.py
new file mode 100644
index 0000000..66fa156
--- /dev/null
+++ b/validator.py
@@ -0,0 +1,10 @@
+# Required canonical fields per schema type, read by validate_mappings().
+# Schema types without an entry here have no required fields.
+# NOTE(review): detect_schema() in mapper.py never yields "CNA" yet.
+REQUIRED_FIELDS = {
+    "MUTATION": [
+        "HUGO_SYMBOL",
+        "SAMPLE_ID",
+    ],
+    "CNA": ["SAMPLE_ID"],
+}
\ No newline at end of file