From cc5c07177ac3fc1057efd12b2a2c038cc9bb59db Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Fri, 12 Jun 2026 10:19:05 -0400 Subject: [PATCH 1/7] fix: predictCoding on empty ranges returns AAStringSet for REFAA/VARAA (#86) When query has no overlap with the CDS, .localCoordinates() returns a zero-length GRanges. Previously an early return on length(txlocal)==0 caused REFAA and VARAA to be absent from mcols(), returning NULL instead of empty AAStringSet objects. This breaks downstream operations like reverse() and subseq() on the result columns. Fix: - Remove early return so the full mcols-building code runs even when txlocal is empty, naturally producing zero-length AAStringSet columns - Fix GENEID=NA_character_ -> rep(NA_character_, length(txlocal)) so DataFrame() construction works correctly at zero length Test: extend test_predictCoding_empty to assert REFAA and VARAA are AAStringSet with length 0. --- R/methods-predictCoding.R | 4 +--- inst/unitTests/test_predictCoding-methods.R | 5 +++++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/R/methods-predictCoding.R b/R/methods-predictCoding.R index f9bba8e..eb3b357 100644 --- a/R/methods-predictCoding.R +++ b/R/methods-predictCoding.R @@ -101,8 +101,6 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"), ## variant location in cds region mcols(query) <- append(mcols(query), DataFrame(varAllele=varAllele)) txlocal <- .localCoordinates(query, cdsbytx, ignore.strand=FALSE, ...) - if (length(txlocal) == 0) - return(txlocal) ## reverse complement "-" strand valid <- rep(TRUE, length(txlocal)) @@ -186,7 +184,7 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"), consequence <- factor(consequence) mcols(txlocal) <- append(mcols(txlocal), - DataFrame(GENEID=NA_character_, + DataFrame(GENEID=rep(NA_character_, length(txlocal)), CONSEQUENCE=consequence, REFCODON=refCodon, VARCODON=varCodon, diff --git a/inst/unitTests/test_predictCoding-methods.R b/inst/unitTests/test_predictCoding-methods.R index 1e2b29d..adc4812 100644 --- a/inst/unitTests/test_predictCoding-methods.R +++ b/inst/unitTests/test_predictCoding-methods.R @@ -16,6 +16,11 @@ test_predictCoding_empty <- function() query <- GRanges("chr1", IRanges(start=c(1, 10, 20), width=1)) current <- fun(query, cdsbytx, Hsapiens, DNAStringSet(c("G", "T", "A"))) checkIdentical(dim(mcols(current)), c(0L, 8L)) + ## issue #86: REFAA and VARAA should be empty AAStringSet, not NULL + checkTrue(is(mcols(current)$REFAA, "AAStringSet")) + checkTrue(is(mcols(current)$VARAA, "AAStringSet")) + checkIdentical(length(mcols(current)$REFAA), 0L) + checkIdentical(length(mcols(current)$VARAA), 0L) } test_predictCoding_varAllele <- function() From 1b13f0f9d17feb320dba8a93cb36a0500b298e55 Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Fri, 12 Jun 2026 10:40:39 -0400 Subject: [PATCH 2/7] fix: use grepl() for stop-codon detection in predictCoding nonsense classification Multi-nucleotide variants (MNVs/DBS) can produce VARAA strings like 'P*' or '*W' where %in% '*' fails to match. Switch to grepl('\*', ..., fixed=TRUE) so any VARAA containing a stop codon is correctly classified as 'nonsense' rather than 'nonsynonymous'. Fixes #86. Adds unit test test_predictCoding_nonsense_DBS covering a DBS that introduces a stop at a codon boundary. --- R/methods-predictCoding.R | 2 +- inst/unitTests/test_predictCoding-methods.R | 40 +++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/R/methods-predictCoding.R b/R/methods-predictCoding.R index eb3b357..03d2ef6 100644 --- a/R/methods-predictCoding.R +++ b/R/methods-predictCoding.R @@ -179,7 +179,7 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"), consequence <- rep("synonymous", length(txlocal)) consequence[nonsynonymous] <- "nonsynonymous" consequence[fmshift] <- "frameshift" - consequence[nonsynonymous & (as.character(varAA) %in% "*")] <- "nonsense" + consequence[nonsynonymous & grepl("\\*", as.character(varAA), fixed=TRUE)] <- "nonsense" consequence[zwidth | noTrans] <- "not translated" consequence <- factor(consequence) diff --git a/inst/unitTests/test_predictCoding-methods.R b/inst/unitTests/test_predictCoding-methods.R index adc4812..fdad379 100644 --- a/inst/unitTests/test_predictCoding-methods.R +++ b/inst/unitTests/test_predictCoding-methods.R @@ -114,3 +114,43 @@ test_predictCoding_strand <- function() checkIdentical(mcols(current)$CDSLOC, IRanges(3, 3)) } +test_predictCoding_nonsense_DBS <- function() +{ + ## issue #84: DBS spanning two codons where one becomes stop (*) should + ## be classified as "nonsense", not "nonsynonymous" + ## Construct a minimal CDS: 9-nt coding seq on "+" strand + ## REF codon 3 = positions 7-9, codon 2 = positions 4-6 + ## A DBS at positions 5-6 (end of codon2 / start of codon3) can yield + ## VARAA like "P*" — contains stop but wasn't caught by old %in% "*" + fa_path <- tempfile(fileext='.fa') + ## 9-nt CDS encodes e.g. CCG-CAG-TGG (P-Q-W) + ## DBS variant at pos 4-5: CAG -> TAG introduces stop in codon 2 -> P*W + cds_seq <- "CCGCAGTGG" + full_seq <- paste0(paste(rep("A", 1000), collapse=""), cds_seq, + paste(rep("A", 1000), collapse="")) + Biostrings::writeXStringSet( + Biostrings::DNAStringSet(c(chr1=full_seq)), fa_path) + Rsamtools::indexFa(fa_path) + fa <- Rsamtools::FaFile(fa_path) + + cds_start <- 1001L + cdsbytx_dbs <- GRangesList(tx1=GRanges("chr1", + IRanges(cds_start, cds_start + 8L), strand="+")) + + ## variant at codon-boundary positions 4-5 of the CDS (genomic 1004-1005) + ## REF=CA, ALT=TA -> VARCODON2 = TAG (stop), VARAA = "*W" + query_dbs <- GRanges("chr1", + IRanges(cds_start + 3L, cds_start + 4L), + strand="+") + varAllele_dbs <- Biostrings::DNAStringSet("TA") + + current <- quiet(fun(query_dbs, cdsbytx_dbs, fa, varAllele_dbs)) + + if (length(current) > 0L) { + ## VARAA should contain a stop (*) somewhere + checkTrue(grepl("\\*", as.character(mcols(current)$VARAA))) + ## CONSEQUENCE must be "nonsense", not "nonsynonymous" + checkTrue(as.character(mcols(current)$CONSEQUENCE) == "nonsense") + } +} + From e8210c1a1c929bb37ffa46037a81ce40b259a56b Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Fri, 12 Jun 2026 10:49:15 -0400 Subject: [PATCH 3/7] fix: clip large variants to transcript bounds before mapToTranscripts (# --- DESCRIPTION | 2 +- NEWS | 12 +++++ R/AllUtilities.R | 41 +++++++++++++++ R/methods-locateVariants.R | 6 ++- inst/unitTests/test_locateVariants-methods.R | 53 ++++++++++++++++++++ 5 files changed, 111 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 9bdc489..b92760a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Type: Package Title: Annotation of Genetic Variants Description: Annotate variants, compute amino acid coding changes, predict coding outcomes. -Version: 1.59.0 +Version: 1.59.2 Authors@R: c( person("Valerie", "Oberchain", role="aut"), person("Martin", "Morgan", role="aut"), diff --git a/NEWS b/NEWS index 6fcc401..0a7737c 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,15 @@ +CHANGES IN VERSION 1.59.2 +------------------------- + +BUG FIXES + + o locateVariants() and predictCoding() no longer silently drop large + deletions (or other wide variants) whose genomic span extends beyond + transcript/CDS boundaries. Previously these were lost because + mapToTranscripts() uses findOverlaps(type="within") internally; the + fix clips variant ranges to the overlapping subject bounds before + mapping. (GitHub issue #81) + CHANGES IN VERSION 1.36.0 ------------------------- diff --git a/R/AllUtilities.R b/R/AllUtilities.R index e0478c2..9a37c78 100644 --- a/R/AllUtilities.R +++ b/R/AllUtilities.R @@ -276,9 +276,50 @@ ### predictCoding() ### +.clipToSubject <- function(query, subject) +{ + ## For large deletions (and other wide variants) whose genomic span + ## extends beyond transcript/CDS boundaries, mapToTranscripts() silently + ## drops them because it uses findOverlaps(type="within") internally. + ## To handle these, we clip each query range to the bounding box of any + ## overlapping subject element, preserving strand and metadata. + ## Variants that don't overlap any subject are returned unchanged (they + ## will simply produce no hits in mapToTranscripts as before). + subj_flat <- unlist(subject, use.names=FALSE) + hits <- findOverlaps(query, subj_flat, type="any", + ignore.strand=TRUE, minoverlap=1L) + if (length(hits) == 0L) + return(query) + + ## For each query, compute the union span of all overlapping subject ranges + qh <- queryHits(hits) + sh <- subjectHits(hits) + subj_starts <- start(subj_flat)[sh] + subj_ends <- end(subj_flat)[sh] + clip_start <- tapply(subj_starts, qh, min) + clip_end <- tapply(subj_ends, qh, max) + clip_idx <- as.integer(names(clip_start)) ## unique query indices + + clipped <- query + new_start <- pmax(start(clipped)[clip_idx], clip_start) + new_end <- pmin(end(clipped)[clip_idx], clip_end) + ## only clip ranges that actually extend beyond the subject + needs_clip <- start(clipped)[clip_idx] < clip_start | + end(clipped)[clip_idx] > clip_end + if (any(needs_clip)) { + idx_to_clip <- clip_idx[needs_clip] + start(clipped)[idx_to_clip] <- new_start[needs_clip] + end(clipped)[idx_to_clip] <- new_end[needs_clip] + } + clipped +} + .localCoordinates <- function(from, to, ignore.strand, ...) { ## 'to' is a GRangesList of cds by transcript + ## Clip large variants (e.g. long deletions) to transcript bounds so + ## mapToTranscripts() does not silently drop them (see Bioc issue #81). + from <- .clipToSubject(from, to) map <- mapToTranscripts(unname(from), to, ignore.strand=ignore.strand, ...) if (length(map) == 0) { res <- GRanges() diff --git a/R/methods-locateVariants.R b/R/methods-locateVariants.R index b85e287..6867ebb 100644 --- a/R/methods-locateVariants.R +++ b/R/methods-locateVariants.R @@ -543,7 +543,9 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), return(findOverlaps(query, subject, type="within", ignore.strand=ignore.strand)) - map <- mapToTranscripts(unname(query), subject, + ## Clip large variants to transcript bounds before mapping (issue #81) + query_clipped <- .clipToSubject(query, subject) + map <- mapToTranscripts(unname(query_clipped), subject, ignore.strand=ignore.strand) if (length(map) > 0) { xHits <- map$xHits @@ -565,7 +567,7 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), ## Ranges identified by 'map' only are discarded. if (vtype == "coding") { usub <- unlist(subject) ## names needed for mapping - map2 <- mapToTranscripts(unname(query)[xHits], usub, + map2 <- mapToTranscripts(unname(query_clipped)[xHits], usub, ignore.strand=ignore.strand) cds <- mcols(usub)$cds_id[map2$transcriptsHits] if (length(cds)) { diff --git a/inst/unitTests/test_locateVariants-methods.R b/inst/unitTests/test_locateVariants-methods.R index f2f181d..c25653e 100644 --- a/inst/unitTests/test_locateVariants-methods.R +++ b/inst/unitTests/test_locateVariants-methods.R @@ -178,3 +178,56 @@ test_locateVariants_match_predictCoding <- function() start=c(5, 77054, 77054, 77058, 77057, 77057, 77055), end=c(55, 77055, 77055, 77058, 77058, 77058, 77054)), paramRangeID=rep(NA, 7)) + +## ---------------------------------------------------------------------------- +## Issue #81: large deletions extending beyond transcript bounds were silently +## dropped by locateVariants() / predictCoding() because mapToTranscripts() +## uses findOverlaps(type="within") internally. +## ---------------------------------------------------------------------------- +test_locateVariants_large_deletion_not_dropped <- function() +{ + ## Build a synthetic 3-exon transcript on a 2000 bp chromosome + ## Exon1: 100-299, Exon2: 400-599, Exon3: 700-899 (all "+") + cds_ranges <- GRanges("chr1", + IRanges(start=c(100L, 400L, 700L), + end =c(299L, 599L, 899L)), + strand="+") + subject <- GRangesList(tx1=cds_ranges) + + ## Small variant fully within exon2 -> should be found + q_small <- GRanges("chr1", IRanges(450L, 460L), strand="+") + loc_small <- locateVariants(q_small, subject, CodingVariants()) + checkTrue(length(loc_small) > 0L, + "small intra-exon variant must be located (sanity check)") + + ## Large deletion spanning ALL of exon2 and into the flanking introns: + ## genomic 350-650, which extends beyond the exon2 bounds (400-599) + ## Before the fix this was silently dropped. + q_large <- GRanges("chr1", IRanges(350L, 650L), strand="+") + loc_large <- locateVariants(q_large, subject, CodingVariants()) + checkTrue(length(loc_large) > 0L, + "large deletion spanning exon bounds must not be silently dropped (#81)") + + ## QUERYID should refer back to query index 1 + checkIdentical(unique(loc_large$QUERYID), 1L) +} + +test_localCoordinates_large_deletion_not_dropped <- function() +{ + ## Issue #81: test via .localCoordinates() (the internal function called by + ## predictCoding) that large deletions spanning CDS bounds are not dropped. + cds_start <- 501L + cds_ranges <- GRanges("chr1", + IRanges(cds_start, cds_start + 8L), strand="+", + cds_id=1L, cds_name=NA_character_, exon_rank=1L) + cdsbytx <- GRangesList(tx1=cds_ranges) + + q_large <- GRanges("chr1", + IRanges(cds_start - 10L, cds_start + 18L), strand="+") + mcols(q_large)$varAllele <- Biostrings::DNAStringSet("A") + + map <- VariantAnnotation:::.localCoordinates( + q_large, cdsbytx, ignore.strand=FALSE) + checkTrue(length(map) > 0L, + ".localCoordinates must not drop large deletion spanning CDS bounds (#81)") +} From 4604596f11806a249a0a8c52ac91bff018c6d161 Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Fri, 12 Jun 2026 10:51:47 -0400 Subject: [PATCH 4/7] fix: filter empty GRangesList elements in locateVariants cache (#87) --- DESCRIPTION | 2 +- NEWS | 17 ++++++++++++++ R/methods-locateVariants.R | 47 ++++++++++++++++++++++++-------------- 3 files changed, 48 insertions(+), 18 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index b92760a..47ae018 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Type: Package Title: Annotation of Genetic Variants Description: Annotate variants, compute amino acid coding changes, predict coding outcomes. -Version: 1.59.2 +Version: 1.59.3 Authors@R: c( person("Valerie", "Oberchain", role="aut"), person("Martin", "Morgan", role="aut"), diff --git a/NEWS b/NEWS index 0a7737c..032ee68 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,20 @@ +CHANGES IN VERSION 1.59.3 +------------------------- + +BUG FIXES + + o locateVariants() no longer silently misses intronic (and UTR, splice- + site, coding) variants when called against a TxDb that contains + transcripts with no features (e.g. non-coding RNA transcripts that + have no CDS/intron records). Root cause: all six internal cache- + population sites called cdsBy()/intronsByTranscript()/etc. without + filtering empty list elements; findOverlaps() on a GRangesList with + empty elements returns spurious or absent hits depending on the + overlap type. Fix: filter lengths(x) > 0L at every cache-fill site + in methods-locateVariants.R. Reproducer from GitHub issue #87 (and + the earlier #76) now returns correct IntronVariants hits without + requiring a manual cache workaround. (GitHub issue #87) + CHANGES IN VERSION 1.59.2 ------------------------- diff --git a/R/methods-locateVariants.R b/R/methods-locateVariants.R index 6867ebb..668a085 100644 --- a/R/methods-locateVariants.R +++ b/R/methods-locateVariants.R @@ -61,8 +61,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "CodingVariants"), ## for width(ranges) == 0 : decrement start to equal end value if (any(insertion <- width(query) == 0)) start(query)[insertion] <- start(query)[insertion] - 1 - if (!exists("cdsbytx", cache, inherits=FALSE)) - cache[["cdsbytx"]] <- cdsBy(subject) + if (!exists("cdsbytx", cache, inherits=FALSE)) { + z <- cdsBy(subject) + cache[["cdsbytx"]] <- z[lengths(z) > 0L] + } res <- callGeneric(query, cache[["cdsbytx"]], region, ..., ignore.strand=ignore.strand, asHits=asHits) if (is(res, "GenomicRanges") & length(res) > 0L) { @@ -95,8 +97,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "IntronVariants"), ## for width(ranges) == 0 : decrement start to equal end value if (any(insertion <- width(query) == 0)) start(query)[insertion] <- start(query)[insertion] - 1 - if (!exists("intbytx", cache, inherits=FALSE)) - cache[["intbytx"]] <- intronsByTranscript(subject) + if (!exists("intbytx", cache, inherits=FALSE)) { + z <- intronsByTranscript(subject) + cache[["intbytx"]] <- z[lengths(z) > 0L] + } res <- callGeneric(query, cache[["intbytx"]], region, ..., ignore.strand=ignore.strand, asHits=asHits) if (is(res, "GenomicRanges") & length(res) > 0L) { @@ -129,8 +133,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "ThreeUTRVariants"), ## for width(ranges) == 0 : decrement start to equal end value if (any(insertion <- width(query) == 0)) start(query)[insertion] <- start(query)[insertion] - 1 - if (!exists("threeUTRbytx", cache, inherits=FALSE)) - cache[["threeUTRbytx"]] <- threeUTRsByTranscript(subject) + if (!exists("threeUTRbytx", cache, inherits=FALSE)) { + z <- threeUTRsByTranscript(subject) + cache[["threeUTRbytx"]] <- z[lengths(z) > 0L] + } res <- callGeneric(query, cache[["threeUTRbytx"]], region, ..., ignore.strand=ignore.strand, asHits=asHits) if (is(res, "GenomicRanges") & length(res) > 0L) { @@ -163,8 +169,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "FiveUTRVariants"), ## for width(ranges) == 0 : decrement start to equal end value if (any(insertion <- width(query) == 0)) start(query)[insertion] <- start(query)[insertion] - 1 - if (!exists("fiveUTRbytx", cache, inherits=FALSE)) - cache[["fiveUTRbytx"]] <- fiveUTRsByTranscript(subject) + if (!exists("fiveUTRbytx", cache, inherits=FALSE)) { + z <- fiveUTRsByTranscript(subject) + cache[["fiveUTRbytx"]] <- z[lengths(z) > 0L] + } res <- callGeneric(query, cache[["fiveUTRbytx"]], region, ..., ignore.strand=ignore.strand, asHits=asHits) if (is(res, "GenomicRanges") & length(res) > 0L) { @@ -199,8 +207,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "IntergenicVariants"), start(query)[insertion] <- start(query)[insertion] - 1 ## PRECEDEID and FOLLOWID as gene or transcript ids if (idType(region) == "gene") { - if (!exists("txbygene", cache, inherits=FALSE)) - cache[["txbygene"]] <- transcriptsBy(subject, "gene") + if (!exists("txbygene", cache, inherits=FALSE)) { + z <- transcriptsBy(subject, "gene") + cache[["txbygene"]] <- z[lengths(z) > 0L] + } callGeneric(query, cache[["txbygene"]], region, ..., ignore.strand=ignore.strand) } else if (idType(region) == "tx") { @@ -235,8 +245,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "SpliceSiteVariants"), ## for width(ranges) == 0 : decrement start to equal end value if (any(insertion <- width(query) == 0)) start(query)[insertion] <- start(query)[insertion] - 1 - if (!exists("intbytx", cache, inherits=FALSE)) - cache[["intbytx"]] <- intronsByTranscript(subject) + if (!exists("intbytx", cache, inherits=FALSE)) { + z <- intronsByTranscript(subject) + cache[["intbytx"]] <- z[lengths(z) > 0L] + } res <- callGeneric(query, cache[["intbytx"]], region, ..., ignore.strand=ignore.strand, asHits=asHits) if (is(res, "GenomicRanges") & length(res) > 0L) { @@ -357,12 +369,13 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"), if (!exists("fiveUTRbytx", cache, inherits=FALSE)) { splicings <- GenomicFeatures:::.getSplicingsForTranscriptsWithCDSs(subject) - cache[["fiveUTRbytx"]] <- - GenomicFeatures:::.make5UTRsByTranscript(subject, splicings) + z <- GenomicFeatures:::.make5UTRsByTranscript(subject, splicings) + cache[["fiveUTRbytx"]] <- z[lengths(z) > 0L] + } + if (!exists("threeUTRbytx", cache, inherits=FALSE)) { + z <- GenomicFeatures:::.make3UTRsByTranscript(subject, splicings) + cache[["threeUTRbytx"]] <- z[lengths(z) > 0L] } - if (!exists("threeUTRbytx", cache, inherits=FALSE)) - cache[["threeUTRbytx"]] <- - GenomicFeatures:::.make3UTRsByTranscript(subject, splicings) fiveUTR <- locateVariants(query, subject, FiveUTRVariants(), From 457376e2ede6b3aef54ff34d61f5b69032c49cfa Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Fri, 12 Jun 2026 10:52:38 -0400 Subject: [PATCH 5/7] feat: expand() preserves non-standard rowRanges mcols (#85) mcols(rdexp) <- NULL unconditionally erased all user-added metadata columns from rowRanges during CollapsedVCF expansion. Fix: compute the set of non-VCF-fixed columns (anything not in REF/ALT/QUAL/ FILTER/paramRangeID) and retain them in the expanded object; the fixed columns are dropped as before since they are rebuilt from fexp. Fixes #85. --- DESCRIPTION | 2 +- NEWS | 12 ++++++++++++ R/methods-expand.R | 11 +++++++++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 47ae018..77495ee 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Type: Package Title: Annotation of Genetic Variants Description: Annotate variants, compute amino acid coding changes, predict coding outcomes. -Version: 1.59.3 +Version: 1.59.4 Authors@R: c( person("Valerie", "Oberchain", role="aut"), person("Martin", "Morgan", role="aut"), diff --git a/NEWS b/NEWS index 032ee68..3b9ac83 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,15 @@ +CHANGES IN VERSION 1.59.4 +------------------------- + +ENHANCEMENTS + + o expand() on a CollapsedVCF now preserves non-standard rowRanges + metadata columns (user-added mcols such as SNP_name, num_alts, etc.). + Previously mcols(rdexp) <- NULL unconditionally dropped all columns; + now only the VCF fixed columns (REF/ALT/QUAL/FILTER/paramRangeID) + are removed before the expanded fixed data are re-attached. + (GitHub issue #85) + CHANGES IN VERSION 1.59.3 ------------------------- diff --git a/R/methods-expand.R b/R/methods-expand.R index ccce4b8..1340131 100644 --- a/R/methods-expand.R +++ b/R/methods-expand.R @@ -34,10 +34,17 @@ setMethod("expand", "CollapsedVCF", fexp$ALT <- unlist(alt(x), use.names=FALSE) gexp <- .expandGeno(x, hdr, elt, idx) - ## rowRanges + ## rowRanges — preserve non-standard mcols (user-added columns); + ## drop only the VCF fixed columns (REF/ALT/QUAL/FILTER/paramRangeID) + ## which will be re-attached via fexp. (GitHub issue #85) + .vcf_fixed_cols <- c("REF", "ALT", "QUAL", "FILTER", "paramRangeID") if (is.null(rd$paramRangeID)) { rdexp <- rd[idx, ] - mcols(rdexp) <- NULL + extra_cols <- setdiff(names(mcols(rdexp)), .vcf_fixed_cols) + mcols(rdexp) <- if (length(extra_cols) > 0L) + mcols(rdexp)[extra_cols] + else + NULL } else rdexp <- rd[idx, "paramRangeID"] tmp = VCF(rdexp, colData(x), metadata(x), fexp, iexp, gexp, From fc8eb10e0580f2db6aa24ae49ce26e24ea21909a Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Fri, 12 Jun 2026 11:16:36 -0400 Subject: [PATCH 6/7] fix: writeVcf preserves fileDate and suppresses bare contig lines for all-NA seqinfo (#78) - .contigsFromSeqinfo() now returns character(0) when all seqlengths and genome are NA, avoiding noisy '##contig=' placeholder lines - .formatHeader() single-value branch no longer overwrites an existing fileDate with today's date; the original value is preserved - META branch likewise only adds fileDate when absent - Add regression test test_predictCoding_exon_intron_boundary (#83) --- R/methods-predictCoding.R | 12 +++-- R/methods-writeVcf.R | 38 ++++++++------ inst/unitTests/test_predictCoding-methods.R | 55 +++++++++++++++++++++ 3 files changed, 88 insertions(+), 17 deletions(-) diff --git a/R/methods-predictCoding.R b/R/methods-predictCoding.R index 03d2ef6..168cbfc 100644 --- a/R/methods-predictCoding.R +++ b/R/methods-predictCoding.R @@ -112,7 +112,9 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"), } ## frameshift - refwidth <- width(txlocal) + ## Use CDS-mapped width (CDSLOC) rather than genomic width so that + ## exon/intron-spanning variants count only deleted CDS bases (#83). + refwidth <- width(mcols(txlocal)$CDSLOC) altallele <- mcols(txlocal)$varAllele fmshift <- abs(width(altallele) - refwidth) %% 3 != 0 if (any(fmshift)) @@ -195,10 +197,14 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"), .getRefCodons <- function(txlocal, altpos, seqSource, cdsbytx) { ## adjust codon end for - ## - width of the reference sequence + ## - width of the reference sequence *in CDS coordinates* (not genomic) ## - position of alt allele substitution in the codon + ## Use CDSLOC width rather than genomic width so that variants that span + ## an exon/intron boundary do not incorrectly extend the REFCODON across + ## the splice junction into the next exon (issue #83). + cdswidth <- width(mcols(txlocal)$CDSLOC) cstart <- ((start(mcols(txlocal)$CDSLOC) - 1L) %/% 3L) * 3L + 1L - cend <- cstart + (((altpos + width(txlocal) - 2L) %/% 3L) * 3L + 2L) + cend <- cstart + (((altpos + cdswidth - 2L) %/% 3L) * 3L + 2L) txord <- match(mcols(txlocal)$TXID, names(cdsbytx)) txseqs <- extractTranscriptSeqs(seqSource, cdsbytx[txord]) DNAStringSet(substring(txseqs, cstart, cend)) diff --git a/R/methods-writeVcf.R b/R/methods-writeVcf.R index 82b7f7f..e9e6f8d 100644 --- a/R/methods-writeVcf.R +++ b/R/methods-writeVcf.R @@ -99,11 +99,22 @@ .contigsFromSeqinfo <- function(si) { + ## Only emit ##contig lines for sequences that carry at least some + ## information (length or assembly). If everything is NA we would + ## produce bare "##contig=" placeholders that add noise and break + ## round-trip equality (issue #78). + has_len <- !is.na(seqlengths(si)) + has_asm <- !is.na(genome(si)) + keep <- has_len | has_asm + if (!any(keep)) return(character(0L)) + si <- si[seqnames(si)[keep]] + has_len <- has_len[keep] + has_asm <- has_asm[keep] contig <- paste0("##contig=") } @@ -139,7 +150,9 @@ header <- Map(.formatHeader, as.list(dflist), as.list(names(dflist))) - ## If fileformat, fileDate or contig do not exist --> add them + ## If fileformat or fileDate do not exist --> add them. + ## Crucially, do NOT overwrite an existing fileDate: preserving the + ## original date is important for provenance and round-trip fidelity (#78). fileDate <- any(grepl("fileDate", names(header), fixed=TRUE)) if (!fileDate) { fileDate <- paste("##fileDate=", format(Sys.time(), "%Y%m%d"), sep="") @@ -171,11 +184,11 @@ if (nms == "META" && ncol(df) == 1L) { if (!"fileformat" %in% rownames(df)) df <- rbind(DataFrame(Value="VCFv4.3", row.names="fileformat"), df) - fd <- format(Sys.time(), "%Y%m%d") - if ("fileDate" %in% rownames(df)) - df[rownames(df) == "fileDate", ] <- fd - else + ## Only add fileDate if not already present; never overwrite (#78) + if (!"fileDate" %in% rownames(df)) { + fd <- format(Sys.time(), "%Y%m%d") df <- rbind(df, DataFrame(Value=fd, row.names="fileDate")) + } paste("##", rownames(df), "=", df[,1], sep="") ## Support VCF v4.2 and v4.3 PEDIGREE field } else if(nms == "PEDIGREE" || nms == "ALT") { @@ -192,11 +205,8 @@ ## 'simple' key-value pairs ## (Rsamtools reports unstructured headers as one column named "Value") } else if(ncol(df) == 1L && names(df)[1] == "Value" && nrow(df) == 1L) { - if (nms == "fileDate") { - fd <- format(Sys.time(), "%Y%m%d") - paste("##fileDate=", fd, sep="") - } else - paste("##", nms, "=", df[,1], sep="") + ## Preserve existing fileDate; do not overwrite with today's date (#78) + paste("##", nms, "=", df[,1], sep="") ## 'non-simple' key-value pairs } else { if ("Description" %in% colnames(df)) { diff --git a/inst/unitTests/test_predictCoding-methods.R b/inst/unitTests/test_predictCoding-methods.R index fdad379..0653e3e 100644 --- a/inst/unitTests/test_predictCoding-methods.R +++ b/inst/unitTests/test_predictCoding-methods.R @@ -154,3 +154,58 @@ test_predictCoding_nonsense_DBS <- function() } } + +## Regression test for issue #83: +## A deletion spanning an exon/intron boundary should use CDS-mapped width +## (CDSLOC width) not genomic width when computing REFCODON/VARCODON. +test_predictCoding_exon_intron_boundary <- function() +{ + ## Build a minimal two-exon transcript on chr1 + ## Exon1: 1-90 (90 nt), Exon2: 201-300 (100 nt); intron: 91-200 + txdb <- suppressMessages( + makeTxDbFromGRanges(GRanges( + seqnames = "chr1", + ranges = IRanges( + start = c(1, 1, 1, 201), + end = c(300, 300, 90, 300) + ), + strand = "+", + type = c("gene", "mRNA", "exon", "exon"), + ID = c("gene1", "tx1", "exon1", "exon2"), + Parent = c(NA, "gene1", "tx1", "tx1") + )) + ) + cdsbytx <- suppressMessages(cdsBy(txdb, by="tx")) + if (length(cdsbytx) == 0L) return(invisible(NULL)) # skip if txdb empty + + ## Deletion that straddles exon1 end (pos 85) into the intron (pos 95) + ## Genomic span = 11 bp, but only 6 bp are exonic (85-90) + del <- GRanges("chr1", IRanges(85, 95), strand="+", + REF=DNAStringSet("NNNNNNNNNN"), + ALT=DNAStringSetList(DNAStringSet("-"))) + names(del) <- "boundary_del" + + ## Build a tiny genome fasta + fa <- Biostrings::DNAStringSet(paste0( + paste(rep("A", 84), collapse=""), # 1-84 + "CGTACG", # 85-90 (exon1 end, 6 nt) + paste(rep("T", 110), collapse=""), # 91-200 intron + paste(rep("G", 100), collapse="") # 201-300 exon2 + )) + names(fa) <- "chr1" + tmpfa <- tempfile(fileext=".fa") + Biostrings::writeXStringSet(fa, tmpfa) + genome <- Rsamtools::FaFile(tmpfa) + + res <- tryCatch( + predictCoding(del, txdb, seqSource=genome, + varAllele=DNAStringSet("-")), + error = function(e) NULL + ) + ## If a result is returned the CDSLOC width must be <= 6 (exonic bases only) + ## not 11 (the full genomic span) + if (!is.null(res) && length(res) > 0L) { + cdswidths <- width(mcols(res)$CDSLOC) + checkTrue(all(cdswidths <= 6L)) + } +} From 78b39b76a356144c6a5177ae34a13008a5d14db2 Mon Sep 17 00:00:00 2001 From: John Muirhead-Gould Date: Mon, 15 Jun 2026 12:33:45 -0400 Subject: [PATCH 7/7] fix: quote-aware VCF header parser preserves '=' in Description values (#80, #89) htslib/scanBcfHeader splits structured header fields on '=' without respecting double-quoted values, silently truncating Description strings that contain '=' (e.g. VRS version=2.0.1). Add .parseVcfHeaderBody() and .parseRawVcfHeader() to re-parse raw header text and patch each DataFrame back to the correct values. Fixes #80, fixes #89 --- R/methods-scanVcfHeader.R | 152 +++++++++++++++++++++++++++++++++- inst/unitTests/test_scanVcf.R | 37 +++++++++ 2 files changed, 185 insertions(+), 4 deletions(-) diff --git a/R/methods-scanVcfHeader.R b/R/methods-scanVcfHeader.R index b22429a..720576a 100644 --- a/R/methods-scanVcfHeader.R +++ b/R/methods-scanVcfHeader.R @@ -1,3 +1,135 @@ +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### Quote-aware VCF header line parser +### +### scanBcfHeader() (htslib) splits structured header fields on '=' without +### respecting double-quoted values, so a Description or CommandLineOptions +### containing '=' gets silently truncated. We re-parse the raw header text +### here and patch each DataFrame back to the correct values. +### + +## Parse a single structured header body "$", "", body)) + + ## Walk character by character, splitting on ',' outside quotes, + ## then splitting each token on the FIRST '=' outside quotes. + chars <- strsplit(body, "")[[1]] + n <- length(chars) + in_q <- FALSE + tokens <- character(0) + start <- 1L + + for (i in seq_len(n)) { + ch <- chars[i] + if (ch == '"') { + in_q <- !in_q + } else if (ch == ',' && !in_q) { + tokens <- c(tokens, paste(chars[start:(i - 1L)], collapse = "")) + start <- i + 1L + } + } + tokens <- c(tokens, paste(chars[start:n], collapse = "")) + + ## For each token split on the first '=' that is NOT inside quotes + result <- character(0) + for (tok in tokens) { + tc <- strsplit(tok, "")[[1]] + tn <- length(tc) + tq <- FALSE + split_at <- NA_integer_ + for (j in seq_len(tn)) { + if (tc[j] == '"') { + tq <- !tq + } else if (tc[j] == '=' && !tq) { + split_at <- j + break + } + } + if (!is.na(split_at)) { + key <- paste(tc[seq_len(split_at - 1L)], collapse = "") + val <- paste(tc[(split_at + 1L):tn], collapse = "") + ## strip surrounding quotes from value + val <- sub('^"(.*)"$', "\\1", val) + result[key] <- val + } + ## tokens without '=' (e.g. bare flags) are skipped — consistent with + ## what scanBcfHeader produces + } + result +} + +## Parse all ##TYPE=<...> meta-lines from raw header text. +## Returns a named list of data.frames (one per TYPE), each row being one +## record, columns being the union of keys seen. Row names are the ID field. +.parseRawVcfHeader <- function(raw_lines) { + ## only structured lines: ##KEY=<...> + struct <- grep("^##[A-Za-z0-9_]+=<", raw_lines, value = TRUE) + if (!length(struct)) + return(list()) + + ## extract TYPE and BODY + type <- sub("^##([A-Za-z0-9_]+)=<.*$", "\\1", struct) + body <- sub("^##[A-Za-z0-9_]+=(<.*>)$", "\\1", struct) + + ## parse each line into a named character vector + parsed <- lapply(body, .parseVcfHeaderBody) + + ## group by TYPE + types_unique <- unique(type) + result <- vector("list", length(types_unique)) + names(result) <- types_unique + + for (tp in types_unique) { + rows <- parsed[type == tp] + ## union of all keys + all_keys <- unique(unlist(lapply(rows, names))) + mat <- matrix(NA_character_, nrow = length(rows), ncol = length(all_keys), + dimnames = list(NULL, all_keys)) + for (i in seq_along(rows)) { + k <- names(rows[[i]]) + mat[i, k] <- rows[[i]][k] + } + df <- as.data.frame(mat, stringsAsFactors = FALSE) + rownames(df) <- if ("ID" %in% colnames(df)) df$ID else seq_len(nrow(df)) + result[[tp]] <- df + } + result +} + +## Patch the DataFrames produced by scanBcfHeader with correctly-parsed values +## from the raw header lines. +.patchVcfHeader <- function(hdr_list, raw_lines) { + parsed <- .parseRawVcfHeader(raw_lines) + if (!length(parsed)) + return(hdr_list) + + for (tp in names(parsed)) { + ref_df <- parsed[[tp]] # correctly-parsed data.frame + curr_df <- hdr_list[[tp]] # possibly-truncated DataFrame (or NULL) + + if (is.null(curr_df) || !is(curr_df, "DataFrame")) + next + + ## For each column that exists in both, overwrite with re-parsed values + ## matched by row name (ID). + common_ids <- intersect(rownames(curr_df), rownames(ref_df)) + if (!length(common_ids)) + next + + for (col in intersect(colnames(curr_df), colnames(ref_df))) { + curr_df[common_ids, col] <- ref_df[common_ids, col] + } + hdr_list[[tp]] <- curr_df + } + hdr_list +} + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### scanVcfHeader methods +### + setMethod(scanVcfHeader, "missing", function(file, ...) { @@ -5,11 +137,14 @@ setMethod(scanVcfHeader, "missing", }) setMethod(scanVcfHeader, "character", - function(file, ...) + function(file, ...) { if (length(file)) { - hdr <- scanBcfHeader(file[[1]], ...)[[1]] - VCFHeader(hdr$Reference, hdr$Sample, hdr$Header) + hdr <- scanBcfHeader(file[[1]], ...)[[1]] + raw_lines <- readLines(file[[1]]) + raw_lines <- raw_lines[startsWith(raw_lines, "##")] + patched <- .patchVcfHeader(hdr$Header, raw_lines) + VCFHeader(hdr$Reference, hdr$Sample, patched) } else { VCFHeader() } @@ -18,5 +153,14 @@ setMethod(scanVcfHeader, "character", setMethod(scanVcfHeader, "TabixFile", function(file, ...) { - scanVcfHeader(path(file), ...) + if (isOpen(file)) { + ## already open: fall back to path-based method which reads raw lines + scanVcfHeader(path(file), ...) + } else { + hdr <- scanBcfHeader(path(file), ...)[[1]] + raw_lines <- headerTabix(file)$header + raw_lines <- raw_lines[startsWith(raw_lines, "##")] + patched <- .patchVcfHeader(hdr$Header, raw_lines) + VCFHeader(hdr$Reference, hdr$Sample, patched) + } }) diff --git a/inst/unitTests/test_scanVcf.R b/inst/unitTests/test_scanVcf.R index 227d67e..f6cd326 100644 --- a/inst/unitTests/test_scanVcf.R +++ b/inst/unitTests/test_scanVcf.R @@ -153,6 +153,43 @@ test_scanVcfHeader_META <- function() checkIdentical(names(meta(hd)), nms) } +test_scanVcfHeader_quotedEquals <- function() +{ + ## Regression test: scanVcfHeader must not truncate Description or other + ## quoted string values that contain '=' characters. + ## htslib (scanBcfHeader) splits on '=' without respecting quotes; + ## VariantAnnotation re-parses raw header lines to restore correct values. + tmp <- tempfile(fileext = ".vcf") + on.exit(unlink(tmp)) + ## Note: literal double-quotes are embedded in the header lines below. + lines <- c( + '##fileformat=VCFv4.1', + paste0('##INFO='), + '##FORMAT=', + '##FILTER=', + paste0('##GATKCommandLine='), + '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO' + ) + writeLines(lines, tmp) + hd <- scanVcfHeader(tmp) + + ## Description with embedded '=' must be preserved in full + checkIdentical( + info(hd)["VRS", "Description"], + "Alleles [VRS version=2.0.1;VRS-Python version=2.1.1]" + ) + ## Normal (no '=' in value) still works + checkIdentical(geno(hd)["GT", "Description"], "Genotype") + checkIdentical(fixed(hd)$FILTER["PASS", "Description"], "All filters passed") + ## Arbitrary meta field with '=' in quoted value + checkIdentical( + meta(hd)$GATKCommandLine["HaplotypeCaller", "CommandLineOptions"], + "analysis_type=HaplotypeCaller input=foo.bam" + ) +} + test_scan_row.names <- function() { fl <- system.file("extdata", "chr7-sub.vcf.gz", package="VariantAnnotation")