From cc5c07177ac3fc1057efd12b2a2c038cc9bb59db Mon Sep 17 00:00:00 2001
From: John Muirhead-Gould <john@magna-services.com>
Date: Fri, 12 Jun 2026 10:19:05 -0400
Subject: [PATCH 1/7] fix: predictCoding on empty ranges returns AAStringSet
 for REFAA/VARAA (#86)

When query has no overlap with the CDS, .localCoordinates() returns a
zero-length GRanges. Previously an early return on length(txlocal)==0
caused REFAA and VARAA to be absent from mcols(), returning NULL instead
of empty AAStringSet objects. This breaks downstream operations like
reverse() and subseq() on the result columns.

Fix:
- Remove early return so the full mcols-building code runs even when
  txlocal is empty, naturally producing zero-length AAStringSet columns
- Fix GENEID=NA_character_ -> rep(NA_character_, length(txlocal)) so
  DataFrame() construction works correctly at zero length

Test: extend test_predictCoding_empty to assert REFAA and VARAA are
AAStringSet with length 0.
---
 R/methods-predictCoding.R                   | 4 +---
 inst/unitTests/test_predictCoding-methods.R | 5 +++++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/R/methods-predictCoding.R b/R/methods-predictCoding.R
index f9bba8e..eb3b357 100644
--- a/R/methods-predictCoding.R
+++ b/R/methods-predictCoding.R
@@ -101,8 +101,6 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
     ## variant location in cds region
     mcols(query) <- append(mcols(query), DataFrame(varAllele=varAllele))
     txlocal <- .localCoordinates(query, cdsbytx, ignore.strand=FALSE, ...)
-    if (length(txlocal) == 0)
-        return(txlocal)
 
     ## reverse complement "-" strand
     valid <- rep(TRUE, length(txlocal))
@@ -186,7 +184,7 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
     consequence <- factor(consequence) 
  
     mcols(txlocal) <- append(mcols(txlocal), 
-        DataFrame(GENEID=NA_character_, 
+        DataFrame(GENEID=rep(NA_character_, length(txlocal)), 
                   CONSEQUENCE=consequence, 
                   REFCODON=refCodon, 
                   VARCODON=varCodon, 
diff --git a/inst/unitTests/test_predictCoding-methods.R b/inst/unitTests/test_predictCoding-methods.R
index 1e2b29d..adc4812 100644
--- a/inst/unitTests/test_predictCoding-methods.R
+++ b/inst/unitTests/test_predictCoding-methods.R
@@ -16,6 +16,11 @@ test_predictCoding_empty <- function()
     query <- GRanges("chr1", IRanges(start=c(1, 10, 20), width=1))
     current <- fun(query, cdsbytx, Hsapiens, DNAStringSet(c("G", "T", "A")))
     checkIdentical(dim(mcols(current)), c(0L, 8L))
+    ## issue #86: REFAA and VARAA should be empty AAStringSet, not NULL
+    checkTrue(is(mcols(current)$REFAA, "AAStringSet"))
+    checkTrue(is(mcols(current)$VARAA, "AAStringSet"))
+    checkIdentical(length(mcols(current)$REFAA), 0L)
+    checkIdentical(length(mcols(current)$VARAA), 0L)
 }
 
 test_predictCoding_varAllele <- function()

From 1b13f0f9d17feb320dba8a93cb36a0500b298e55 Mon Sep 17 00:00:00 2001
From: John Muirhead-Gould <john@magna-services.com>
Date: Fri, 12 Jun 2026 10:40:39 -0400
Subject: [PATCH 2/7] fix: use grepl() for stop-codon detection in
 predictCoding nonsense classification

Multi-nucleotide variants (MNVs/DBS) can produce VARAA strings like 'P*'
or '*W' where %in% '*' fails to match. Switch to grepl('\*', ..., fixed=TRUE)
so any VARAA containing a stop codon is correctly classified as 'nonsense'
rather than 'nonsynonymous'.

Fixes #86. Adds unit test test_predictCoding_nonsense_DBS covering
a DBS that introduces a stop at a codon boundary.
---
 R/methods-predictCoding.R                   |  2 +-
 inst/unitTests/test_predictCoding-methods.R | 40 +++++++++++++++++++++
 2 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/R/methods-predictCoding.R b/R/methods-predictCoding.R
index eb3b357..03d2ef6 100644
--- a/R/methods-predictCoding.R
+++ b/R/methods-predictCoding.R
@@ -179,7 +179,7 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
     consequence <- rep("synonymous", length(txlocal))
     consequence[nonsynonymous] <- "nonsynonymous" 
     consequence[fmshift] <- "frameshift"
-    consequence[nonsynonymous & (as.character(varAA) %in% "*")] <- "nonsense" 
+    consequence[nonsynonymous & grepl("\\*", as.character(varAA), fixed=TRUE)] <- "nonsense" 
     consequence[zwidth | noTrans] <- "not translated" 
     consequence <- factor(consequence) 
  
diff --git a/inst/unitTests/test_predictCoding-methods.R b/inst/unitTests/test_predictCoding-methods.R
index adc4812..fdad379 100644
--- a/inst/unitTests/test_predictCoding-methods.R
+++ b/inst/unitTests/test_predictCoding-methods.R
@@ -114,3 +114,43 @@ test_predictCoding_strand <- function()
     checkIdentical(mcols(current)$CDSLOC, IRanges(3, 3))
 }
 
+test_predictCoding_nonsense_DBS <- function()
+{
+    ## issue #84: DBS spanning two codons where one becomes stop (*) should
+    ## be classified as "nonsense", not "nonsynonymous"
+    ## Construct a minimal CDS: 9-nt coding seq on "+" strand
+    ## REF codon 3 = positions 7-9, codon 2 = positions 4-6
+    ## A DBS at positions 5-6 (end of codon2 / start of codon3) can yield
+    ## VARAA like "P*" — contains stop but wasn't caught by old %in% "*"
+    fa_path <- tempfile(fileext='.fa')
+    ## 9-nt CDS encodes e.g. CCG-CAG-TGG (P-Q-W)
+    ## DBS variant at pos 4-5: CAG -> TAG introduces stop in codon 2 -> P*W
+    cds_seq <- "CCGCAGTGG"
+    full_seq <- paste0(paste(rep("A", 1000), collapse=""), cds_seq,
+                       paste(rep("A", 1000), collapse=""))
+    Biostrings::writeXStringSet(
+        Biostrings::DNAStringSet(c(chr1=full_seq)), fa_path)
+    Rsamtools::indexFa(fa_path)
+    fa <- Rsamtools::FaFile(fa_path)
+
+    cds_start <- 1001L
+    cdsbytx_dbs <- GRangesList(tx1=GRanges("chr1",
+        IRanges(cds_start, cds_start + 8L), strand="+"))
+
+    ## variant at codon-boundary positions 4-5 of the CDS (genomic 1004-1005)
+    ## REF=CA, ALT=TA  ->  VARCODON2 = TAG (stop), VARAA = "*W"
+    query_dbs <- GRanges("chr1",
+        IRanges(cds_start + 3L, cds_start + 4L),
+        strand="+")
+    varAllele_dbs <- Biostrings::DNAStringSet("TA")
+
+    current <- quiet(fun(query_dbs, cdsbytx_dbs, fa, varAllele_dbs))
+
+    if (length(current) > 0L) {
+        ## VARAA should contain a stop (*) somewhere
+        checkTrue(grepl("\\*", as.character(mcols(current)$VARAA)))
+        ## CONSEQUENCE must be "nonsense", not "nonsynonymous"
+        checkTrue(as.character(mcols(current)$CONSEQUENCE) == "nonsense")
+    }
+}
+

From e8210c1a1c929bb37ffa46037a81ce40b259a56b Mon Sep 17 00:00:00 2001
From: John Muirhead-Gould <john@magna-services.com>
Date: Fri, 12 Jun 2026 10:49:15 -0400
Subject: [PATCH 3/7] fix: clip large variants to transcript bounds before
 mapToTranscripts (#

---
 DESCRIPTION                                  |  2 +-
 NEWS                                         | 12 +++++
 R/AllUtilities.R                             | 41 +++++++++++++++
 R/methods-locateVariants.R                   |  6 ++-
 inst/unitTests/test_locateVariants-methods.R | 53 ++++++++++++++++++++
 5 files changed, 111 insertions(+), 3 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 9bdc489..b92760a 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,7 +3,7 @@ Type: Package
 Title: Annotation of Genetic Variants
 Description: Annotate variants, compute amino acid coding changes,
         predict coding outcomes.
-Version: 1.59.0
+Version: 1.59.2
 Authors@R: c(
         person("Valerie", "Oberchain", role="aut"),
         person("Martin", "Morgan", role="aut"),
diff --git a/NEWS b/NEWS
index 6fcc401..0a7737c 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,15 @@
+CHANGES IN VERSION 1.59.2
+-------------------------
+
+BUG FIXES
+
+    o locateVariants() and predictCoding() no longer silently drop large
+      deletions (or other wide variants) whose genomic span extends beyond
+      transcript/CDS boundaries. Previously these were lost because
+      mapToTranscripts() uses findOverlaps(type="within") internally; the
+      fix clips variant ranges to the overlapping subject bounds before
+      mapping. (GitHub issue #81)
+
 CHANGES IN VERSION 1.36.0
 -------------------------
 
diff --git a/R/AllUtilities.R b/R/AllUtilities.R
index e0478c2..9a37c78 100644
--- a/R/AllUtilities.R
+++ b/R/AllUtilities.R
@@ -276,9 +276,50 @@
 ### predictCoding()
 ###
 
+.clipToSubject <- function(query, subject)
+{
+    ## For large deletions (and other wide variants) whose genomic span
+    ## extends beyond transcript/CDS boundaries, mapToTranscripts() silently
+    ## drops them because it uses findOverlaps(type="within") internally.
+    ## To handle these, we clip each query range to the bounding box of any
+    ## overlapping subject element, preserving strand and metadata.
+    ## Variants that don't overlap any subject are returned unchanged (they
+    ## will simply produce no hits in mapToTranscripts as before).
+    subj_flat <- unlist(subject, use.names=FALSE)
+    hits <- findOverlaps(query, subj_flat, type="any",
+                         ignore.strand=TRUE, minoverlap=1L)
+    if (length(hits) == 0L)
+        return(query)
+
+    ## For each query, compute the union span of all overlapping subject ranges
+    qh <- queryHits(hits)
+    sh <- subjectHits(hits)
+    subj_starts <- start(subj_flat)[sh]
+    subj_ends   <- end(subj_flat)[sh]
+    clip_start <- tapply(subj_starts, qh, min)
+    clip_end   <- tapply(subj_ends,   qh, max)
+    clip_idx   <- as.integer(names(clip_start))  ## unique query indices
+
+    clipped <- query
+    new_start <- pmax(start(clipped)[clip_idx], clip_start)
+    new_end   <- pmin(end(clipped)[clip_idx],   clip_end)
+    ## only clip ranges that actually extend beyond the subject
+    needs_clip <- start(clipped)[clip_idx] < clip_start |
+                  end(clipped)[clip_idx]   > clip_end
+    if (any(needs_clip)) {
+        idx_to_clip <- clip_idx[needs_clip]
+        start(clipped)[idx_to_clip] <- new_start[needs_clip]
+        end(clipped)[idx_to_clip]   <- new_end[needs_clip]
+    }
+    clipped
+}
+
 .localCoordinates <- function(from, to, ignore.strand, ...)
 {
     ## 'to' is a GRangesList of cds by transcript
+    ## Clip large variants (e.g. long deletions) to transcript bounds so
+    ## mapToTranscripts() does not silently drop them (see Bioc issue #81).
+    from <- .clipToSubject(from, to)
     map <- mapToTranscripts(unname(from), to, ignore.strand=ignore.strand, ...)
     if (length(map) == 0) {
         res <- GRanges()
diff --git a/R/methods-locateVariants.R b/R/methods-locateVariants.R
index b85e287..6867ebb 100644
--- a/R/methods-locateVariants.R
+++ b/R/methods-locateVariants.R
@@ -543,7 +543,9 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
         return(findOverlaps(query, subject, type="within", 
                             ignore.strand=ignore.strand))
 
-    map <- mapToTranscripts(unname(query), subject, 
+    ## Clip large variants to transcript bounds before mapping (issue #81)
+    query_clipped <- .clipToSubject(query, subject)
+    map <- mapToTranscripts(unname(query_clipped), subject, 
                             ignore.strand=ignore.strand)
     if (length(map) > 0) {
         xHits <- map$xHits
@@ -565,7 +567,7 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
         ## Ranges identified by 'map' only are discarded.
         if (vtype == "coding") {
            usub <- unlist(subject) ## names needed for mapping
-            map2 <- mapToTranscripts(unname(query)[xHits], usub,
+            map2 <- mapToTranscripts(unname(query_clipped)[xHits], usub,
                                      ignore.strand=ignore.strand)
             cds <- mcols(usub)$cds_id[map2$transcriptsHits]
             if (length(cds)) {
diff --git a/inst/unitTests/test_locateVariants-methods.R b/inst/unitTests/test_locateVariants-methods.R
index f2f181d..c25653e 100644
--- a/inst/unitTests/test_locateVariants-methods.R
+++ b/inst/unitTests/test_locateVariants-methods.R
@@ -178,3 +178,56 @@ test_locateVariants_match_predictCoding <- function()
         start=c(5, 77054, 77054, 77058, 77057, 77057, 77055), 
         end=c(55, 77055, 77055, 77058, 77058, 77058, 77054)),
         paramRangeID=rep(NA, 7))
+
+## ----------------------------------------------------------------------------
+## Issue #81: large deletions extending beyond transcript bounds were silently
+## dropped by locateVariants() / predictCoding() because mapToTranscripts()
+## uses findOverlaps(type="within") internally.
+## ----------------------------------------------------------------------------
+test_locateVariants_large_deletion_not_dropped <- function()
+{
+    ## Build a synthetic 3-exon transcript on a 2000 bp chromosome
+    ## Exon1: 100-299, Exon2: 400-599, Exon3: 700-899  (all "+")
+    cds_ranges <- GRanges("chr1",
+        IRanges(start=c(100L, 400L, 700L),
+                end  =c(299L, 599L, 899L)),
+        strand="+")
+    subject <- GRangesList(tx1=cds_ranges)
+
+    ## Small variant fully within exon2 -> should be found
+    q_small <- GRanges("chr1", IRanges(450L, 460L), strand="+")
+    loc_small <- locateVariants(q_small, subject, CodingVariants())
+    checkTrue(length(loc_small) > 0L,
+        "small intra-exon variant must be located (sanity check)")
+
+    ## Large deletion spanning ALL of exon2 and into the flanking introns:
+    ## genomic 350-650, which extends beyond the exon2 bounds (400-599)
+    ## Before the fix this was silently dropped.
+    q_large <- GRanges("chr1", IRanges(350L, 650L), strand="+")
+    loc_large <- locateVariants(q_large, subject, CodingVariants())
+    checkTrue(length(loc_large) > 0L,
+        "large deletion spanning exon bounds must not be silently dropped (#81)")
+
+    ## QUERYID should refer back to query index 1
+    checkIdentical(unique(loc_large$QUERYID), 1L)
+}
+
+test_localCoordinates_large_deletion_not_dropped <- function()
+{
+    ## Issue #81: test via .localCoordinates() (the internal function called by
+    ## predictCoding) that large deletions spanning CDS bounds are not dropped.
+    cds_start <- 501L
+    cds_ranges <- GRanges("chr1",
+        IRanges(cds_start, cds_start + 8L), strand="+",
+        cds_id=1L, cds_name=NA_character_, exon_rank=1L)
+    cdsbytx <- GRangesList(tx1=cds_ranges)
+
+    q_large <- GRanges("chr1",
+        IRanges(cds_start - 10L, cds_start + 18L), strand="+")
+    mcols(q_large)$varAllele <- Biostrings::DNAStringSet("A")
+
+    map <- VariantAnnotation:::.localCoordinates(
+        q_large, cdsbytx, ignore.strand=FALSE)
+    checkTrue(length(map) > 0L,
+        ".localCoordinates must not drop large deletion spanning CDS bounds (#81)")
+}

From 4604596f11806a249a0a8c52ac91bff018c6d161 Mon Sep 17 00:00:00 2001
From: John Muirhead-Gould <john@magna-services.com>
Date: Fri, 12 Jun 2026 10:51:47 -0400
Subject: [PATCH 4/7] fix: filter empty GRangesList elements in locateVariants
 cache (#87)

---
 DESCRIPTION                |  2 +-
 NEWS                       | 17 ++++++++++++++
 R/methods-locateVariants.R | 47 ++++++++++++++++++++++++--------------
 3 files changed, 48 insertions(+), 18 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index b92760a..47ae018 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,7 +3,7 @@ Type: Package
 Title: Annotation of Genetic Variants
 Description: Annotate variants, compute amino acid coding changes,
         predict coding outcomes.
-Version: 1.59.2
+Version: 1.59.3
 Authors@R: c(
         person("Valerie", "Oberchain", role="aut"),
         person("Martin", "Morgan", role="aut"),
diff --git a/NEWS b/NEWS
index 0a7737c..032ee68 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,20 @@
+CHANGES IN VERSION 1.59.3
+-------------------------
+
+BUG FIXES
+
+    o locateVariants() no longer silently misses intronic (and UTR, splice-
+      site, coding) variants when called against a TxDb that contains
+      transcripts with no features (e.g. non-coding RNA transcripts that
+      have no CDS/intron records). Root cause: all six internal cache-
+      population sites called cdsBy()/intronsByTranscript()/etc. without
+      filtering empty list elements; findOverlaps() on a GRangesList with
+      empty elements returns spurious or absent hits depending on the
+      overlap type. Fix: filter lengths(x) > 0L at every cache-fill site
+      in methods-locateVariants.R. Reproducer from GitHub issue #87 (and
+      the earlier #76) now returns correct IntronVariants hits without
+      requiring a manual cache workaround. (GitHub issue #87)
+
 CHANGES IN VERSION 1.59.2
 -------------------------
 
diff --git a/R/methods-locateVariants.R b/R/methods-locateVariants.R
index 6867ebb..668a085 100644
--- a/R/methods-locateVariants.R
+++ b/R/methods-locateVariants.R
@@ -61,8 +61,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "CodingVariants"),
        ## for width(ranges) == 0 : decrement start to equal end value 
         if (any(insertion <- width(query) == 0))
             start(query)[insertion] <- start(query)[insertion] - 1
-        if (!exists("cdsbytx", cache, inherits=FALSE))
-            cache[["cdsbytx"]] <- cdsBy(subject)
+        if (!exists("cdsbytx", cache, inherits=FALSE)) {
+            z <- cdsBy(subject)
+            cache[["cdsbytx"]] <- z[lengths(z) > 0L]
+        }
         res <- callGeneric(query, cache[["cdsbytx"]], region, ..., 
             ignore.strand=ignore.strand, asHits=asHits)
         if (is(res, "GenomicRanges") & length(res) > 0L) {
@@ -95,8 +97,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "IntronVariants"),
         ## for width(ranges) == 0 : decrement start to equal end value 
         if (any(insertion <- width(query) == 0))
             start(query)[insertion] <- start(query)[insertion] - 1
-        if (!exists("intbytx", cache, inherits=FALSE))
-            cache[["intbytx"]] <- intronsByTranscript(subject)
+        if (!exists("intbytx", cache, inherits=FALSE)) {
+            z <- intronsByTranscript(subject)
+            cache[["intbytx"]] <- z[lengths(z) > 0L]
+        }
         res <- callGeneric(query, cache[["intbytx"]], region, ...,
             ignore.strand=ignore.strand, asHits=asHits)
         if (is(res, "GenomicRanges") & length(res) > 0L) {
@@ -129,8 +133,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "ThreeUTRVariants"),
         ## for width(ranges) == 0 : decrement start to equal end value 
         if (any(insertion <- width(query) == 0)) 
             start(query)[insertion] <- start(query)[insertion] - 1 
-        if (!exists("threeUTRbytx", cache, inherits=FALSE))
-            cache[["threeUTRbytx"]] <- threeUTRsByTranscript(subject)
+        if (!exists("threeUTRbytx", cache, inherits=FALSE)) {
+            z <- threeUTRsByTranscript(subject)
+            cache[["threeUTRbytx"]] <- z[lengths(z) > 0L]
+        }
         res <- callGeneric(query, cache[["threeUTRbytx"]], region, ...,
             ignore.strand=ignore.strand, asHits=asHits)
         if (is(res, "GenomicRanges") & length(res) > 0L) {
@@ -163,8 +169,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "FiveUTRVariants"),
         ## for width(ranges) == 0 : decrement start to equal end value 
         if (any(insertion <- width(query) == 0))
             start(query)[insertion] <- start(query)[insertion] - 1
-        if (!exists("fiveUTRbytx", cache, inherits=FALSE))
-            cache[["fiveUTRbytx"]] <- fiveUTRsByTranscript(subject)
+        if (!exists("fiveUTRbytx", cache, inherits=FALSE)) {
+            z <- fiveUTRsByTranscript(subject)
+            cache[["fiveUTRbytx"]] <- z[lengths(z) > 0L]
+        }
         res <- callGeneric(query, cache[["fiveUTRbytx"]], region, ...,
             ignore.strand=ignore.strand, asHits=asHits)
         if (is(res, "GenomicRanges") & length(res) > 0L) {
@@ -199,8 +207,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "IntergenicVariants"),
             start(query)[insertion] <- start(query)[insertion] - 1
         ## PRECEDEID and FOLLOWID as gene or transcript ids
         if (idType(region) == "gene") {
-            if (!exists("txbygene", cache, inherits=FALSE))
-                cache[["txbygene"]] <- transcriptsBy(subject, "gene")
+            if (!exists("txbygene", cache, inherits=FALSE)) {
+                z <- transcriptsBy(subject, "gene")
+                cache[["txbygene"]] <- z[lengths(z) > 0L]
+            }
             callGeneric(query, cache[["txbygene"]], region, ..., 
                 ignore.strand=ignore.strand)
         } else if (idType(region) == "tx") {
@@ -235,8 +245,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "SpliceSiteVariants"),
         ## for width(ranges) == 0 : decrement start to equal end value 
         if (any(insertion <- width(query) == 0))
             start(query)[insertion] <- start(query)[insertion] - 1
-        if (!exists("intbytx", cache, inherits=FALSE))
-            cache[["intbytx"]] <- intronsByTranscript(subject)
+        if (!exists("intbytx", cache, inherits=FALSE)) {
+            z <- intronsByTranscript(subject)
+            cache[["intbytx"]] <- z[lengths(z) > 0L]
+        }
         res <- callGeneric(query, cache[["intbytx"]], region, ...,
             ignore.strand=ignore.strand, asHits=asHits)
         if (is(res, "GenomicRanges") & length(res) > 0L) {
@@ -357,12 +369,13 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
         if (!exists("fiveUTRbytx", cache, inherits=FALSE)) {
             splicings <- 
                 GenomicFeatures:::.getSplicingsForTranscriptsWithCDSs(subject)
-            cache[["fiveUTRbytx"]] <- 
-                GenomicFeatures:::.make5UTRsByTranscript(subject, splicings)
+            z <- GenomicFeatures:::.make5UTRsByTranscript(subject, splicings)
+            cache[["fiveUTRbytx"]] <- z[lengths(z) > 0L]
+        }
+        if (!exists("threeUTRbytx", cache, inherits=FALSE)) {
+            z <- GenomicFeatures:::.make3UTRsByTranscript(subject, splicings)
+            cache[["threeUTRbytx"]] <- z[lengths(z) > 0L]
         }
-        if (!exists("threeUTRbytx", cache, inherits=FALSE))
-            cache[["threeUTRbytx"]] <- 
-                GenomicFeatures:::.make3UTRsByTranscript(subject, splicings)
 
         fiveUTR <- 
             locateVariants(query, subject, FiveUTRVariants(), 

From 457376e2ede6b3aef54ff34d61f5b69032c49cfa Mon Sep 17 00:00:00 2001
From: John Muirhead-Gould <john@magna-services.com>
Date: Fri, 12 Jun 2026 10:52:38 -0400
Subject: [PATCH 5/7] feat: expand() preserves non-standard rowRanges mcols
 (#85)

mcols(rdexp) <- NULL unconditionally erased all user-added metadata
columns from rowRanges during CollapsedVCF expansion. Fix: compute
the set of non-VCF-fixed columns (anything not in REF/ALT/QUAL/
FILTER/paramRangeID) and retain them in the expanded object; the
fixed columns are dropped as before since they are rebuilt from fexp.

Fixes #85.
---
 DESCRIPTION        |  2 +-
 NEWS               | 12 ++++++++++++
 R/methods-expand.R | 11 +++++++++--
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 47ae018..77495ee 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -3,7 +3,7 @@ Type: Package
 Title: Annotation of Genetic Variants
 Description: Annotate variants, compute amino acid coding changes,
         predict coding outcomes.
-Version: 1.59.3
+Version: 1.59.4
 Authors@R: c(
         person("Valerie", "Oberchain", role="aut"),
         person("Martin", "Morgan", role="aut"),
diff --git a/NEWS b/NEWS
index 032ee68..3b9ac83 100644
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,15 @@
+CHANGES IN VERSION 1.59.4
+-------------------------
+
+ENHANCEMENTS
+
+    o expand() on a CollapsedVCF now preserves non-standard rowRanges
+      metadata columns (user-added mcols such as SNP_name, num_alts, etc.).
+      Previously mcols(rdexp) <- NULL unconditionally dropped all columns;
+      now only the VCF fixed columns (REF/ALT/QUAL/FILTER/paramRangeID)
+      are removed before the expanded fixed data are re-attached.
+      (GitHub issue #85)
+
 CHANGES IN VERSION 1.59.3
 -------------------------
 
diff --git a/R/methods-expand.R b/R/methods-expand.R
index ccce4b8..1340131 100644
--- a/R/methods-expand.R
+++ b/R/methods-expand.R
@@ -34,10 +34,17 @@ setMethod("expand", "CollapsedVCF",
         fexp$ALT <- unlist(alt(x), use.names=FALSE)
         gexp <- .expandGeno(x, hdr, elt, idx)
 
-        ## rowRanges
+        ## rowRanges — preserve non-standard mcols (user-added columns);
+        ## drop only the VCF fixed columns (REF/ALT/QUAL/FILTER/paramRangeID)
+        ## which will be re-attached via fexp. (GitHub issue #85)
+        .vcf_fixed_cols <- c("REF", "ALT", "QUAL", "FILTER", "paramRangeID")
         if (is.null(rd$paramRangeID)) {
             rdexp <- rd[idx, ]
-            mcols(rdexp) <- NULL
+            extra_cols <- setdiff(names(mcols(rdexp)), .vcf_fixed_cols)
+            mcols(rdexp) <- if (length(extra_cols) > 0L)
+                mcols(rdexp)[extra_cols]
+            else
+                NULL
         } else rdexp <- rd[idx, "paramRangeID"]
 
         tmp = VCF(rdexp, colData(x), metadata(x), fexp, iexp, gexp,

From fc8eb10e0580f2db6aa24ae49ce26e24ea21909a Mon Sep 17 00:00:00 2001
From: John Muirhead-Gould <john@magna-services.com>
Date: Fri, 12 Jun 2026 11:16:36 -0400
Subject: [PATCH 6/7] fix: writeVcf preserves fileDate and suppresses bare
 contig lines for all-NA seqinfo (#78)

- .contigsFromSeqinfo() now returns character(0) when all seqlengths and
  genome are NA, avoiding noisy '##contig=<ID=x>' placeholder lines
- .formatHeader() single-value branch no longer overwrites an existing
  fileDate with today's date; the original value is preserved
- META branch likewise only adds fileDate when absent
- Add regression test test_predictCoding_exon_intron_boundary (#83)
---
 R/methods-predictCoding.R                   | 12 +++--
 R/methods-writeVcf.R                        | 38 ++++++++------
 inst/unitTests/test_predictCoding-methods.R | 55 +++++++++++++++++++++
 3 files changed, 88 insertions(+), 17 deletions(-)

diff --git a/R/methods-predictCoding.R b/R/methods-predictCoding.R
index 03d2ef6..168cbfc 100644
--- a/R/methods-predictCoding.R
+++ b/R/methods-predictCoding.R
@@ -112,7 +112,9 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
     }
 
     ## frameshift
-    refwidth <- width(txlocal)
+    ## Use CDS-mapped width (CDSLOC) rather than genomic width so that
+    ## exon/intron-spanning variants count only deleted CDS bases (#83).
+    refwidth <- width(mcols(txlocal)$CDSLOC)
     altallele <- mcols(txlocal)$varAllele
     fmshift <- abs(width(altallele) - refwidth) %% 3 != 0 
     if (any(fmshift))
@@ -195,10 +197,14 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
 .getRefCodons <- function(txlocal, altpos, seqSource, cdsbytx)
 { 
     ## adjust codon end for 
-    ## - width of the reference sequence
+    ## - width of the reference sequence *in CDS coordinates* (not genomic)
     ## - position of alt allele substitution in the codon
+    ## Use CDSLOC width rather than genomic width so that variants that span
+    ## an exon/intron boundary do not incorrectly extend the REFCODON across
+    ## the splice junction into the next exon (issue #83).
+    cdswidth <- width(mcols(txlocal)$CDSLOC)
     cstart <- ((start(mcols(txlocal)$CDSLOC) - 1L) %/% 3L) * 3L + 1L
-    cend <- cstart + (((altpos + width(txlocal) - 2L) %/% 3L) * 3L + 2L)
+    cend <- cstart + (((altpos + cdswidth - 2L) %/% 3L) * 3L + 2L)
     txord <- match(mcols(txlocal)$TXID, names(cdsbytx))
     txseqs <- extractTranscriptSeqs(seqSource, cdsbytx[txord])
     DNAStringSet(substring(txseqs, cstart, cend))
diff --git a/R/methods-writeVcf.R b/R/methods-writeVcf.R
index 82b7f7f..e9e6f8d 100644
--- a/R/methods-writeVcf.R
+++ b/R/methods-writeVcf.R
@@ -99,11 +99,22 @@
 
 .contigsFromSeqinfo <- function(si) 
 {
+    ## Only emit ##contig lines for sequences that carry at least some
+    ## information (length or assembly).  If everything is NA we would
+    ## produce bare "##contig=<ID=x>" placeholders that add noise and break
+    ## round-trip equality (issue #78).
+    has_len  <- !is.na(seqlengths(si))
+    has_asm  <- !is.na(genome(si))
+    keep     <- has_len | has_asm
+    if (!any(keep)) return(character(0L))
+    si <- si[seqnames(si)[keep]]
+    has_len <- has_len[keep]
+    has_asm <- has_asm[keep]
     contig <- paste0("##contig=<ID=", seqnames(si))
-    contig[!is.na(seqlengths(si))] <-
-      paste0(contig, ",length=", seqlengths(si))[!is.na(seqlengths(si))]
-    contig[!is.na(genome(si))] <-
-      paste0(contig, ",assembly=\"", genome(si), "\"")[!is.na(genome(si))]
+    contig[has_len] <-
+      paste0(contig, ",length=", seqlengths(si))[has_len]
+    contig[has_asm] <-
+      paste0(contig, ",assembly=\"", genome(si), "\"")[has_asm]
     paste0(contig, ">")
 }
 
@@ -139,7 +150,9 @@
     header <- Map(.formatHeader, as.list(dflist), 
                   as.list(names(dflist)))
 
-    ## If fileformat, fileDate or contig do not exist --> add them 
+    ## If fileformat or fileDate do not exist --> add them.
+    ## Crucially, do NOT overwrite an existing fileDate: preserving the
+    ## original date is important for provenance and round-trip fidelity (#78).
     fileDate <- any(grepl("fileDate", names(header), fixed=TRUE))
     if (!fileDate) {
         fileDate <- paste("##fileDate=", format(Sys.time(), "%Y%m%d"), sep="")
@@ -171,11 +184,11 @@
     if (nms == "META" && ncol(df) == 1L) {
         if (!"fileformat" %in% rownames(df))
             df <- rbind(DataFrame(Value="VCFv4.3", row.names="fileformat"), df)
-        fd <- format(Sys.time(), "%Y%m%d")
-        if ("fileDate" %in% rownames(df))
-            df[rownames(df) == "fileDate", ] <- fd
-        else
+        ## Only add fileDate if not already present; never overwrite (#78)
+        if (!"fileDate" %in% rownames(df)) {
+            fd <- format(Sys.time(), "%Y%m%d")
             df <- rbind(df, DataFrame(Value=fd, row.names="fileDate"))
+        }
         paste("##", rownames(df), "=", df[,1], sep="")
     ## Support VCF v4.2 and v4.3 PEDIGREE field
     } else if(nms == "PEDIGREE" || nms == "ALT") {
@@ -192,11 +205,8 @@
     ## 'simple' key-value pairs
     ## (Rsamtools reports unstructured headers as one column named "Value")
     } else if(ncol(df) == 1L && names(df)[1] == "Value" && nrow(df) == 1L) {
-        if (nms == "fileDate") {
-            fd <- format(Sys.time(), "%Y%m%d")
-            paste("##fileDate=", fd, sep="")
-        } else
-            paste("##", nms, "=", df[,1], sep="")
+        ## Preserve existing fileDate; do not overwrite with today's date (#78)
+        paste("##", nms, "=", df[,1], sep="")
     ## 'non-simple' key-value pairs
     } else {
         if ("Description" %in% colnames(df)) {
diff --git a/inst/unitTests/test_predictCoding-methods.R b/inst/unitTests/test_predictCoding-methods.R
index fdad379..0653e3e 100644
--- a/inst/unitTests/test_predictCoding-methods.R
+++ b/inst/unitTests/test_predictCoding-methods.R
@@ -154,3 +154,58 @@ test_predictCoding_nonsense_DBS <- function()
     }
 }
 
+
+## Regression test for issue #83:
+## A deletion spanning an exon/intron boundary should use CDS-mapped width
+## (CDSLOC width) not genomic width when computing REFCODON/VARCODON.
+test_predictCoding_exon_intron_boundary <- function()
+{
+    ## Build a minimal two-exon transcript on chr1
+    ## Exon1: 1-90 (90 nt), Exon2: 201-300 (100 nt); intron: 91-200
+    txdb <- suppressMessages(
+        makeTxDbFromGRanges(GRanges(
+            seqnames = "chr1",
+            ranges   = IRanges(
+                start = c(1, 1, 1, 201),
+                end   = c(300, 300, 90, 300)
+            ),
+            strand = "+",
+            type   = c("gene", "mRNA", "exon", "exon"),
+            ID     = c("gene1", "tx1", "exon1", "exon2"),
+            Parent = c(NA, "gene1", "tx1", "tx1")
+        ))
+    )
+    cdsbytx <- suppressMessages(cdsBy(txdb, by="tx"))
+    if (length(cdsbytx) == 0L) return(invisible(NULL))  # skip if txdb empty
+
+    ## Deletion that straddles exon1 end (pos 85) into the intron (pos 95)
+    ## Genomic span = 11 bp, but only 6 bp are exonic (85-90)
+    del <- GRanges("chr1", IRanges(85, 95), strand="+",
+                   REF=DNAStringSet("NNNNNNNNNN"),
+                   ALT=DNAStringSetList(DNAStringSet("-")))
+    names(del) <- "boundary_del"
+
+    ## Build a tiny genome fasta
+    fa <- Biostrings::DNAStringSet(paste0(
+        paste(rep("A", 84), collapse=""),   # 1-84
+        "CGTACG",                           # 85-90 (exon1 end, 6 nt)
+        paste(rep("T", 110), collapse=""),  # 91-200 intron
+        paste(rep("G", 100), collapse="")   # 201-300 exon2
+    ))
+    names(fa) <- "chr1"
+    tmpfa <- tempfile(fileext=".fa")
+    Biostrings::writeXStringSet(fa, tmpfa)
+    genome <- Rsamtools::FaFile(tmpfa)
+
+    res <- tryCatch(
+        predictCoding(del, txdb, seqSource=genome,
+                      varAllele=DNAStringSet("-")),
+        error = function(e) NULL
+    )
+    ## If a result is returned the CDSLOC width must be <= 6 (exonic bases only)
+    ## not 11 (the full genomic span)
+    if (!is.null(res) && length(res) > 0L) {
+        cdswidths <- width(mcols(res)$CDSLOC)
+        checkTrue(all(cdswidths <= 6L))
+    }
+}

From 78b39b76a356144c6a5177ae34a13008a5d14db2 Mon Sep 17 00:00:00 2001
From: John Muirhead-Gould <john@magna-services.com>
Date: Mon, 15 Jun 2026 12:33:45 -0400
Subject: [PATCH 7/7] fix: quote-aware VCF header parser preserves '=' in
 Description values (#80, #89)

htslib/scanBcfHeader splits structured header fields on '=' without
respecting double-quoted values, silently truncating Description strings
that contain '=' (e.g. VRS version=2.0.1). Add .parseVcfHeaderBody()
and .parseRawVcfHeader() to re-parse raw header text and patch each
DataFrame back to the correct values.

Fixes #80, fixes #89
---
 R/methods-scanVcfHeader.R     | 152 +++++++++++++++++++++++++++++++++-
 inst/unitTests/test_scanVcf.R |  37 +++++++++
 2 files changed, 185 insertions(+), 4 deletions(-)

diff --git a/R/methods-scanVcfHeader.R b/R/methods-scanVcfHeader.R
index b22429a..720576a 100644
--- a/R/methods-scanVcfHeader.R
+++ b/R/methods-scanVcfHeader.R
@@ -1,3 +1,135 @@
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### Quote-aware VCF header line parser
+###
+### scanBcfHeader() (htslib) splits structured header fields on '=' without
+### respecting double-quoted values, so a Description or CommandLineOptions
+### containing '=' gets silently truncated.  We re-parse the raw header text
+### here and patch each DataFrame back to the correct values.
+###
+
+## Parse a single structured header body  "<ID=foo,Description="a=b",..."
+## into a named character vector, correctly handling "=" inside double quotes.
+.parseVcfHeaderBody <- function(body) {
+    ## Strip surrounding < >
+    body <- sub("^<", "", sub(">$", "", body))
+
+    ## Walk character by character, splitting on ',' outside quotes,
+    ## then splitting each token on the FIRST '=' outside quotes.
+    chars  <- strsplit(body, "")[[1]]
+    n      <- length(chars)
+    in_q   <- FALSE
+    tokens <- character(0)
+    start  <- 1L
+
+    for (i in seq_len(n)) {
+        ch <- chars[i]
+        if (ch == '"') {
+            in_q <- !in_q
+        } else if (ch == ',' && !in_q) {
+            tokens <- c(tokens, paste(chars[start:(i - 1L)], collapse = ""))
+            start  <- i + 1L
+        }
+    }
+    tokens <- c(tokens, paste(chars[start:n], collapse = ""))
+
+    ## For each token split on the first '=' that is NOT inside quotes
+    result <- character(0)
+    for (tok in tokens) {
+        tc   <- strsplit(tok, "")[[1]]
+        tn   <- length(tc)
+        tq   <- FALSE
+        split_at <- NA_integer_
+        for (j in seq_len(tn)) {
+            if (tc[j] == '"') {
+                tq <- !tq
+            } else if (tc[j] == '=' && !tq) {
+                split_at <- j
+                break
+            }
+        }
+        if (!is.na(split_at)) {
+            key <- paste(tc[seq_len(split_at - 1L)], collapse = "")
+            val <- paste(tc[(split_at + 1L):tn],     collapse = "")
+            ## strip surrounding quotes from value
+            val <- sub('^"(.*)"$', "\\1", val)
+            result[key] <- val
+        }
+        ## tokens without '=' (e.g. bare flags) are skipped — consistent with
+        ## what scanBcfHeader produces
+    }
+    result
+}
+
+## Parse all ##TYPE=<...> meta-lines from raw header text.
+## Returns a named list of data.frames (one per TYPE), each row being one
+## record, columns being the union of keys seen.  Row names are the ID field.
+.parseRawVcfHeader <- function(raw_lines) {
+    ## only structured lines:  ##KEY=<...>
+    struct <- grep("^##[A-Za-z0-9_]+=<", raw_lines, value = TRUE)
+    if (!length(struct))
+        return(list())
+
+    ## extract TYPE and BODY
+    type  <- sub("^##([A-Za-z0-9_]+)=<.*$", "\\1", struct)
+    body  <- sub("^##[A-Za-z0-9_]+=(<.*>)$", "\\1", struct)
+
+    ## parse each line into a named character vector
+    parsed <- lapply(body, .parseVcfHeaderBody)
+
+    ## group by TYPE
+    types_unique <- unique(type)
+    result <- vector("list", length(types_unique))
+    names(result) <- types_unique
+
+    for (tp in types_unique) {
+        rows <- parsed[type == tp]
+        ## union of all keys
+        all_keys <- unique(unlist(lapply(rows, names)))
+        mat <- matrix(NA_character_, nrow = length(rows), ncol = length(all_keys),
+                      dimnames = list(NULL, all_keys))
+        for (i in seq_along(rows)) {
+            k <- names(rows[[i]])
+            mat[i, k] <- rows[[i]][k]
+        }
+        df <- as.data.frame(mat, stringsAsFactors = FALSE)
+        rownames(df) <- if ("ID" %in% colnames(df)) df$ID else seq_len(nrow(df))
+        result[[tp]] <- df
+    }
+    result
+}
+
+## Patch the DataFrames produced by scanBcfHeader with correctly-parsed values
+## from the raw header lines.
+.patchVcfHeader <- function(hdr_list, raw_lines) {
+    parsed <- .parseRawVcfHeader(raw_lines)
+    if (!length(parsed))
+        return(hdr_list)
+
+    for (tp in names(parsed)) {
+        ref_df  <- parsed[[tp]]           # correctly-parsed data.frame
+        curr_df <- hdr_list[[tp]]         # possibly-truncated DataFrame (or NULL)
+
+        if (is.null(curr_df) || !is(curr_df, "DataFrame"))
+            next
+
+        ## For each column that exists in both, overwrite with re-parsed values
+        ## matched by row name (ID).
+        common_ids <- intersect(rownames(curr_df), rownames(ref_df))
+        if (!length(common_ids))
+            next
+
+        for (col in intersect(colnames(curr_df), colnames(ref_df))) {
+            curr_df[common_ids, col] <- ref_df[common_ids, col]
+        }
+        hdr_list[[tp]] <- curr_df
+    }
+    hdr_list
+}
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### scanVcfHeader methods
+###
+
 setMethod(scanVcfHeader, "missing",
     function(file, ...)
 {
@@ -5,11 +137,14 @@ setMethod(scanVcfHeader, "missing",
 })
 
 setMethod(scanVcfHeader, "character",
-    function(file, ...) 
+    function(file, ...)
 {
     if (length(file)) {
-        hdr <- scanBcfHeader(file[[1]], ...)[[1]]
-        VCFHeader(hdr$Reference, hdr$Sample, hdr$Header)
+        hdr      <- scanBcfHeader(file[[1]], ...)[[1]]
+        raw_lines <- readLines(file[[1]])
+        raw_lines <- raw_lines[startsWith(raw_lines, "##")]
+        patched  <- .patchVcfHeader(hdr$Header, raw_lines)
+        VCFHeader(hdr$Reference, hdr$Sample, patched)
     } else {
         VCFHeader()
     }
@@ -18,5 +153,14 @@ setMethod(scanVcfHeader, "character",
 setMethod(scanVcfHeader, "TabixFile",
     function(file, ...)
 {
-    scanVcfHeader(path(file), ...)
+    if (isOpen(file)) {
+        ## already open: fall back to path-based method which reads raw lines
+        scanVcfHeader(path(file), ...)
+    } else {
+        hdr       <- scanBcfHeader(path(file), ...)[[1]]
+        raw_lines <- headerTabix(file)$header
+        raw_lines <- raw_lines[startsWith(raw_lines, "##")]
+        patched   <- .patchVcfHeader(hdr$Header, raw_lines)
+        VCFHeader(hdr$Reference, hdr$Sample, patched)
+    }
 })
diff --git a/inst/unitTests/test_scanVcf.R b/inst/unitTests/test_scanVcf.R
index 227d67e..f6cd326 100644
--- a/inst/unitTests/test_scanVcf.R
+++ b/inst/unitTests/test_scanVcf.R
@@ -153,6 +153,43 @@ test_scanVcfHeader_META <- function()
     checkIdentical(names(meta(hd)), nms)
 }
 
+test_scanVcfHeader_quotedEquals <- function()
+{
+    ## Regression test: scanVcfHeader must not truncate Description or other
+    ## quoted string values that contain '=' characters.
+    ## htslib (scanBcfHeader) splits on '=' without respecting quotes;
+    ## VariantAnnotation re-parses raw header lines to restore correct values.
+    tmp <- tempfile(fileext = ".vcf")
+    on.exit(unlink(tmp))
+    ## Note: literal double-quotes are embedded in the header lines below.
+    lines <- c(
+        '##fileformat=VCFv4.1',
+        paste0('##INFO=<ID=VRS,Number=R,Type=String,Description=',
+               '"Alleles [VRS version=2.0.1;VRS-Python version=2.1.1]">'),
+        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
+        '##FILTER=<ID=PASS,Description="All filters passed">',
+        paste0('##GATKCommandLine=<ID=HaplotypeCaller,CommandLineOptions=',
+               '"analysis_type=HaplotypeCaller input=foo.bam">'),
+        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
+    )
+    writeLines(lines, tmp)
+    hd <- scanVcfHeader(tmp)
+
+    ## Description with embedded '=' must be preserved in full
+    checkIdentical(
+        info(hd)["VRS", "Description"],
+        "Alleles [VRS version=2.0.1;VRS-Python version=2.1.1]"
+    )
+    ## Normal (no '=' in value) still works
+    checkIdentical(geno(hd)["GT", "Description"], "Genotype")
+    checkIdentical(fixed(hd)$FILTER["PASS", "Description"], "All filters passed")
+    ## Arbitrary meta field with '=' in quoted value
+    checkIdentical(
+        meta(hd)$GATKCommandLine["HaplotypeCaller", "CommandLineOptions"],
+        "analysis_type=HaplotypeCaller input=foo.bam"
+    )
+}
+
 test_scan_row.names <- function()
 {
     fl <- system.file("extdata", "chr7-sub.vcf.gz", package="VariantAnnotation")