GeiserX
diff --git a/‎.coderabbit.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.coderabbit.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/container-test.yml‎
Lines changed: 2 additions & 3 deletions b/‎.github/workflows/container-test.yml‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎.github/workflows/nextflow.yml‎
Lines changed: 86 additions & 0 deletions b/‎.github/workflows/nextflow.yml‎
Lines changed: 86 additions & 0 deletions
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎.nf-core.yml‎
Lines changed: 18 additions & 0 deletions b/‎.nf-core.yml‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 32 additions & 7 deletions b/‎README.md‎
Lines changed: 32 additions & 7 deletions
diff --git a/‎ROADMAP.md‎
Lines changed: 31 additions & 6 deletions b/‎ROADMAP.md‎
Lines changed: 31 additions & 6 deletions
diff --git a/‎assets/schema_input.json‎
Lines changed: 38 additions & 0 deletions b/‎assets/schema_input.json‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎assets/stub/EMPTY‎ b/‎assets/stub/EMPTY‎
diff --git a/‎assets/stub/EMPTY_CLINICAL‎ b/‎assets/stub/EMPTY_CLINICAL‎
@@ -11,6 +11,7 @@ reviews:
   auto_review:
     enabled: true
     drafts: false
+    auto_pause_after_reviewed_commits: 0
     ignore_title_keywords:
       - "WIP"
       - "DO NOT REVIEW"
 
@@ -39,12 +39,11 @@ jobs:
             cmd: "java -version"
           - image: pgkb/pharmcat:3.2.0
             cmd: "pharmcat_pipeline --version"
-          - image: python:3.11-slim
+          - image: python:3.11
             cmd: "python3 --version"
           - image: quay.io/biocontainers/vcfanno:0.3.7--he881be0_0
             cmd: "bash -c 'vcfanno 2>&1 | grep -q \"vcfanno version\"'"
-          - image: quay.io/biocontainers/slivar:0.3.3--h5f107b1_0
-            cmd: "bash -c 'slivar 2>&1 | grep -q \"slivar version\"'"
+          # slivar uses staphb/bcftools + pre-built binary (no dedicated container)
           - image: quay.io/biocontainers/pypgx:0.26.0--pyh7e72e81_0
             cmd: "pypgx --version"
     steps:
 
@@ -0,0 +1,86 @@
+name: Nextflow
+
+# Triggers: pushes to main that touch one of the pipeline paths listed
+# under `paths`, pull requests targeting main, and manual dispatch.
+on:
+  push:
+    branches: [main]
+    paths: ['main.nf', 'nextflow.config', 'modules/**', 'workflows/**', 'conf/**', 'nextflow_schema.json', '.github/workflows/nextflow.yml']
+  # NOTE(review): unlike `push`, this trigger carries no `paths` filter, so
+  # the workflow runs for every PR against main — confirm that is intended.
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  nextflow-validate:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+
+      - uses: actions/setup-java@c5195efecf7bdfc987ee8bae7a71cb8b11521c00 # v4
+        with:
+          distribution: 'temurin'
+          java-version: '17'
+
+      - uses: nf-core/setup-nextflow@561fcfc7146dcb12e3871909b635ab092a781f34 # v2.0.0
+
+      - name: Validate Nextflow config
+        run: nextflow config main.nf -profile docker
+
+      # NOTE(review): `-help` (single dash) is an option of the `nextflow run`
+      # command itself, so this step appears to only print that command's
+      # usage and may never parse main.nf — confirm it validates what the
+      # step name promises.
+      - name: Validate pipeline entry point
+        run: nextflow run main.nf -help
+
+      - name: Validate nextflow_schema.json syntax
+        run: python3 -m json.tool nextflow_schema.json > /dev/null
+
+      - name: Validate samplesheet schema syntax
+        run: python3 -m json.tool assets/schema_input.json > /dev/null
+
+      # Runs main.nf with the `test` and `docker` profiles and the `-stub`
+      # flag, layering a throwaway ci.config on top of the pipeline
+      # configuration via `-c`. The heredoc delimiter is quoted ('CICONF'),
+      # so the shell writes the config lines verbatim without expanding
+      # anything inside them.
+      - name: Run stub test
+        run: |
+          # Override Docker user + disable reports for CI:
+          # Many bioinformatics images run as non-root, causing permission
+          # issues with Nextflow work directories on GitHub Actions runners.
+          cat > ci.config <<'CICONF'
+          trace.enabled = false
+          dag.enabled = false
+          timeline.enabled = false
+          report.enabled = false
+          docker.userEmulation = false
+          docker.runOptions = '-u 0:0'
+          CICONF
+          nextflow run main.nf -profile test,docker -stub -c ci.config
+
+      # Guards against version drift between the Nextflow modules and
+      # versions.env: every tagged image named in a single-quoted
+      # `container '...'` directive under modules/local/*/main.nf must carry
+      # the same tag as the first versions.env line mentioning that image.
+      # Untagged images and images with no versions.env entry are skipped.
+      # The step exits 1 if any mismatch is found, 0 otherwise.
+      - name: Check Nextflow module container tags match versions.env
+        run: |
+          echo "Checking Nextflow module container tags against versions.env..."
+          . ./versions.env
+          FAIL=0
+
+          for nf_file in modules/local/*/main.nf; do
+            [ -f "$nf_file" ] || continue
+            # Extract the image from each single-quoted container directive.
+            # \K discards the `container '` prefix from the match whatever
+            # whitespace separates the keyword from the quote, so tab- or
+            # multi-space-aligned directives are compared as well.
+            # `|| true`: a module without such a directive is not an error.
+            # A temp file (rather than a pipe) keeps the while loop in this
+            # shell, so FAIL set inside the loop survives it.
+            grep -oP "container\s+'\K[^']+(?=')" "$nf_file" > /tmp/nf_images.txt || true
+            while read -r image; do
+              base="${image%%:*}"
+              tag="${image##*:}"
+              # No colon in the image string: there is no tag to compare.
+              [ "$base" = "$tag" ] && continue
+
+              # First versions.env line mentioning this image, if any.
+              match=$(grep -F "$base" versions.env | head -1 || true)
+              if [ -n "$match" ]; then
+                # Expected tag: text after the first colon, up to a closing quote.
+                env_tag=$(echo "$match" | grep -oP ':\K[^"]+' | tr -d '"')
+                if [ "$tag" != "$env_tag" ]; then
+                  echo "FAIL: $nf_file uses ${base}:${tag} but versions.env has ${base}:${env_tag}"
+                  FAIL=1
+                fi
+              fi
+            done < /tmp/nf_images.txt
+          done
+
+          [ "$FAIL" -eq 0 ] && echo "OK: Nextflow container tags match versions.env"
+          exit "$FAIL"
@@ -30,6 +30,9 @@ __pycache__/
 *.mmi
 *.dict
 
+# Stub test data (override genomics ignores above)
+!assets/stub/**
+
 # Archives
 *.tar.gz
 *.tgz
 
@@ -0,0 +1,18 @@
+# nf-core lint configuration
+# This pipeline uses nf-core template patterns but is NOT an official nf-core pipeline.
+# Skip branding and naming checks that only apply to nf-core organization pipelines.
+
+repository_type: pipeline
+nf_core_version: '3.2.0'
+
+lint:
+  # Skip checks that require nf-core organization membership
+  pipeline_name_conventions: false
+  # Skip checks for nf-core-specific CI workflows
+  actions_ci: false
+  actions_awstest: false
+  actions_awsfulltest: false
+  # Skip nf-core branding requirements
+  readme: false
+  # Skip multiqc (will be added in PR 3)
+  multiqc_config: false
@@ -14,6 +14,7 @@
   <a href="https://github.com/GeiserX/Personal-Genome-Pipeline/actions/workflows/lint.yml"><img src="https://img.shields.io/github/actions/workflow/status/GeiserX/Personal-Genome-Pipeline/lint.yml?style=flat-square&label=CI" alt="CI"></a>
   <a href="https://github.com/GeiserX/Personal-Genome-Pipeline/stargazers"><img src="https://img.shields.io/github/stars/GeiserX/Personal-Genome-Pipeline?style=flat-square&logo=github" alt="GitHub Stars"></a>
   <a href="https://www.docker.com/"><img src="https://img.shields.io/badge/runs%20with-Docker-0db7ed?style=flat-square&logo=docker&logoColor=white" alt="Docker"></a>
+  <a href="https://www.nextflow.io/"><img src="https://img.shields.io/badge/runs%20with-Nextflow-3ac486?style=flat-square&logo=data:image/svg%2bxml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgMCAyNCAyNCIgZmlsbD0id2hpdGUiPjxwYXRoIGQ9Ik0xMiAyTDIgMTlIMjJMMTIgMloiLz48L3N2Zz4=&logoColor=white" alt="Nextflow"></a>
   <a href="https://geiserx.github.io/personal-genome-pipeline"><img src="https://img.shields.io/badge/docs-GitHub%20Pages-blue?style=flat-square&logo=github" alt="Docs"></a>
 </p>
 
@@ -79,23 +80,21 @@ graph LR
     vcfanno --> slivar["slivar<br/><small>Prioritization</small>"]
     vcfanno --> clinical["Clinical Filter"]
     VCF --> cpsr["CPSR<br/><small>Cancer predisposition</small>"]
-    VCF --> eh["ExpansionHunter<br/><small>STRs</small>"]
     VCF --> roh["ROH Analysis"]
     VCF --> prs["PRS<br/><small>Polygenic risk</small>"]
     VCF --> ancestry["Ancestry SNPs"]
 
     %% BAM-based analyses
     BAM --> manta["Manta<br/><small>SVs</small>"]
     BAM --> delly["Delly<br/><small>SVs</small>"]
-    BAM --> gridss["GRIDSS<br/><small>SVs</small>"]
     BAM --> cnvnator["CNVnator<br/><small>CNVs</small>"]
+    manta --> duphold["duphold"]
+    duphold --> annotsv["AnnotSV"]
     manta --> consensus["SV Consensus"]
     delly --> consensus
-    gridss --> consensus
     cnvnator --> consensus
-    consensus --> duphold["duphold"]
-    duphold --> annotsv["AnnotSV"]
 
+    BAM --> eh["ExpansionHunter<br/><small>STRs</small>"]
     BAM --> pypgx["pypgx<br/><small>23-gene PGx<br/>+ CYP2D6 SV</small>"]
     BAM --> cyrius["Cyrius<br/><small>CYP2D6</small>"]
     BAM --> telomere["TelomereHunter"]
@@ -121,7 +120,7 @@ graph LR
     class FASTQ,BAM,VCF input
     class fastp,align,DV core
     class clinvar,pharmcat,cpic,cpsr,eh,roh,prs,ancestry,pypgx,cyrius,telomere,coverage,mito,haplo analysis
-    class manta,delly,gridss,cnvnator,consensus,duphold,annotsv sv
+    class manta,delly,cnvnator,consensus,duphold,annotsv sv
     class vep,vcfanno,slivar,clinical annotation
     class report report
 ```
@@ -142,7 +141,7 @@ graph LR
 | 9 | [STR Expansions](docs/09-str-expansions.md) | ExpansionHunter | `quay.io/biocontainers/expansionhunter:5.0.0` | ~15 min | Recommended |
 | 10 | [Telomere Length](docs/10-telomere-analysis.md) | TelomereHunter | `lgalarno/telomerehunter:latest` | ~1 hr | Optional |
 | 11 | [ROH Analysis](docs/11-roh-analysis.md) | bcftools roh | `staphb/bcftools:1.21` | ~5 min | Recommended |
-| 12 | [Mito Haplogroup](docs/12-mito-haplogroup.md) | haplogrep3 | `genepi/haplogrep3:latest`\* | ~1 min | Optional |
+| 12 | [Mito Haplogroup](docs/12-mito-haplogroup.md) | haplogrep3 | `jtb114/haplogrep3:latest`\* | ~1 min | Optional |
 | 13 | [VEP Annotation](docs/13-vep-annotation.md) | VEP | `ensemblorg/ensembl-vep:release_112.0` | ~2-4 hr | Recommended |
 | 14 | [Imputation Prep](docs/14-imputation-prep.md) | bcftools | `staphb/bcftools:1.21` | ~10 min | Optional |
 | 15 | [SV Quality](docs/15-duphold.md) | duphold | `brentp/duphold:v0.2.3` | ~20 min | If step 4 run |
@@ -286,6 +285,32 @@ ORA is Illumina's proprietary compressed FASTQ format. Decompress first, then fo
 # ... continue as Path A
 ```
 
+### Nextflow
+
+A Nextflow DSL2 execution path (v0.5.0) covers post-calling interpretation and clinical analysis — it accepts VCF + BAM from any upstream caller and runs the same pharmacogenomics, annotation, and clinical steps as the bash scripts. Both paths are maintained and produce biologically equivalent results (output file names and report scope may differ).
+
+```bash
+# Minimal run — default tools need no external databases
+nextflow run main.nf --input samplesheet.csv --reference /path/to/GRCh38.fasta -profile docker
+
+# Enable database-requiring tools (VEP, CPSR, ClinVar, ExpansionHunter)
+nextflow run main.nf --input samplesheet.csv --reference /path/to/GRCh38.fasta \
+    --tools 'pharmcat,cpic,vcfanno,roh,prs,mito_haplogroup,hla_typing,telomere_hunter,mosdepth,mito_variants,cyrius,html_report,multiqc,vep,slivar,clinical_filter,cpsr,clinvar,expansion_hunter,pypgx,ancestry' \
+    --vep_cache /path/to/vep_cache \
+    --pcgr_data /path/to/pcgr_data \
+    --vep_cache_cpsr /path/to/vep_cache_113 \
+    --clinvar /path/to/clinvar.vcf.gz \
+    --clinvar_index /path/to/clinvar.vcf.gz.tbi \
+    --expansion_catalog /path/to/variant_catalog.json \
+    --hla_dat /path/to/hla.dat \
+    --slivar_bin /path/to/slivar \
+    --pypgx_bundle /path/to/pypgx-bundle \
+    --ancestry_ref /path/to/1kg_common_snps.vcf.gz \
+    -profile docker
+```
+
+See [docs/nextflow.md](docs/nextflow.md) for samplesheet format, tool selection, sarek integration, and bash vs Nextflow comparison.
+
 ---
 
 ## Prerequisites
 
@@ -44,14 +44,39 @@ Deep pathogenicity scoring, structured variant querying, and broader pharmacogen
 - [x] **Variant prioritization with inheritance queries** (`scripts/31-slivar.sh`) — slivar (GEMINI successor) for streaming VCF filtering with JS expressions. Rare HIGH/MODERATE variants, ClinVar pathogenic, compound het detection, gene constraint enrichment
 - [x] **pypgx alongside PharmCAT** (`scripts/32-pypgx.sh`) — 23-gene curated star allele calling including CYP2D6 structural variation from BAM read depth. Cross-validates with PharmCAT on shared genes
 
-## v0.5.0 — Workflow engine integration
+## v0.5.0 — Nextflow workflow engine
 
-The 44 bash scripts work but lack built-in parallelism, resume-on-failure, and HPC portability. The [nf-core](https://nf-co.re/) ecosystem (147 community pipelines including [sarek](https://nf-co.re/sarek) with 15 variant callers and [raredisease](https://github.com/nf-core/raredisease) for clinical genomics) demonstrates the community standard.
+The bash scripts work but lack built-in parallelism, resume-on-failure, and HPC portability. v0.5.0 adds a [Nextflow](https://www.nextflow.io/) DSL2 execution path alongside the existing bash scripts (which remain first-class).
 
-- [ ] **Nextflow DSL2 wrapper** — convert the pipeline into a Nextflow workflow with channels and processes, preserving the current Docker-based execution model
-- [ ] **nf-core module compatibility** — use [nf-core/modules](https://github.com/nf-core/modules) where they exist (BWA-MEM2, DeepVariant, VEP, Manta, bcftools) for community-maintained containers and automated testing
-- [ ] **Snakemake alternative** — optional Snakemake wrapper for HPC environments that prefer it over Nextflow
-- [ ] This unlocks: automatic parallelism via DAG-based step ordering, resume on failure, Singularity/Apptainer for HPC clusters, and optional cloud portability
+### Why Nextflow over Snakemake?
+
+Both are mature workflow engines. We chose Nextflow because:
+
+- **nf-core ecosystem**: 147 community pipelines including [sarek](https://nf-co.re/sarek) (WGS variant calling) and [raredisease](https://github.com/nf-core/raredisease) (clinical genomics). Sarek's output is our primary input — channel compatibility matters.
+- **nf-core/modules**: 1000+ reusable modules. We can both use existing modules and contribute novel ones (PharmCAT, pypgx, slivar) under MIT.
+- **Container-first design**: Nextflow's `container` directive maps directly to our Docker-based architecture. Singularity support is automatic.
+- **Resume**: Content-hash caching is more robust than file-existence checks.
+- **Industry momentum**: Seqera/Nextflow has commercial backing; major sequencing centers standardize on Nextflow.
+
+Snakemake's Python DSL and HPC scheduler integration are genuine strengths, but the nf-core ecosystem size and sarek compatibility are decisive.
+
+### Scope: Post-processing focus
+
+Steps 1-6 (alignment, variant calling) are already covered by nf-core/sarek. Rather than duplicate that work, this pipeline focuses on what sarek does NOT cover: pharmacogenomics, PRS, ancestry, telomere, repeat expansions, clinical interpretation, and reporting. The Nextflow pipeline accepts sarek output (VCF + BAM) as its primary input.
+
+### Delivery
+
+All 27 modules across 6 workflows are implemented. The stub-testable subset (tools that do not require external databases) is CI-validated; database-dependent tools (vep, cpsr, clinvar, expansion_hunter) are validated manually. The Nextflow path is usable for post-calling interpretation and produces biologically equivalent results to the bash scripts. See [docs/nextflow.md](docs/nextflow.md) for known limitations.
+
+- [x] **PR #17 — Full Nextflow pipeline** (v0.5.0): All 6 workflows (PGX, ANNOTATION, CLINICAL, BAM_ANALYSIS, SV, REPORTING) with 27 modules, `--tools` gating, stub CI, Docker + Singularity profiles
+
+### Parallel track: nf-core module contributions
+
+PharmCAT, pypgx, and slivar modules will be contributed to [nf-core/modules](https://github.com/nf-core/modules) under MIT license — independent of the pipeline's GPL-3.0 license. Once merged, these modules will be available to all nf-core pipelines.
+
+### Bash scripts
+
+The bash scripts remain in `scripts/` as a maintained, simpler alternative for users who do not need workflow orchestration. After PR 3 validates the Nextflow path end-to-end, new features will be Nextflow-first. Bash scripts will continue to receive bug fixes and tool version bumps but not new analysis steps.
 
 ## v0.6.0 — Multi-sample & joint analysis
 
 
@@ -0,0 +1,38 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema",
+    "$id": "https://github.com/GeiserX/Personal-Genome-Pipeline/blob/main/assets/schema_input.json",
+    "title": "Personal Genome Pipeline — Samplesheet Schema",
+    "description": "Schema for the input samplesheet CSV",
+    "type": "array",
+    "items": {
+        "type": "object",
+        "properties": {
+            "sample": {
+                "type": "string",
+                "description": "Sample identifier (used as output prefix)",
+                "pattern": "^[a-zA-Z0-9._-]+$"
+            },
+            "vcf": {
+                "type": "string",
+                "description": "Path to sample VCF file (bgzipped)",
+                "pattern": "^\\S+\\.vcf\\.gz$"
+            },
+            "vcf_index": {
+                "type": "string",
+                "description": "Path to VCF tabix index (.tbi)",
+                "pattern": "^\\S+\\.vcf\\.gz\\.tbi$"
+            },
+            "bam": {
+                "type": "string",
+                "description": "Path to aligned BAM file (optional, needed for BAM-based steps)",
+                "pattern": "^\\S+\\.bam$"
+            },
+            "bam_index": {
+                "type": "string",
+                "description": "Path to BAM index (.bai)",
+                "pattern": "^\\S+\\.bam\\.bai$"
+            }
+        },
+        "required": ["sample", "vcf", "vcf_index"]
+    }
+}