Skip to content
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ Type: Package
Title: Annotation of Genetic Variants
Description: Annotate variants, compute amino acid coding changes,
predict coding outcomes.
Version: 1.59.0
Version: 1.59.4
Authors@R: c(
person("Valerie", "Oberchain", role="aut"),
person("Martin", "Morgan", role="aut"),
Expand Down
41 changes: 41 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,44 @@
CHANGES IN VERSION 1.59.4
-------------------------

ENHANCEMENTS

o expand() on a CollapsedVCF now preserves non-standard rowRanges
metadata columns (user-added mcols such as SNP_name, num_alts, etc.).
Previously mcols(rdexp) <- NULL unconditionally dropped all columns;
now only the VCF fixed columns (REF/ALT/QUAL/FILTER/paramRangeID)
are removed before the expanded fixed data are re-attached.
(GitHub issue #85)

CHANGES IN VERSION 1.59.3
-------------------------

BUG FIXES

o locateVariants() no longer silently misses intronic (and UTR, splice-
site, coding) variants when called against a TxDb that contains
transcripts with no features (e.g. non-coding RNA transcripts that
have no CDS/intron records). Root cause: all six internal cache-
population sites called cdsBy()/intronsByTranscript()/etc. without
filtering empty list elements; findOverlaps() on a GRangesList with
empty elements returns spurious or absent hits depending on the
overlap type. Fix: filter lengths(x) > 0L at every cache-fill site
in methods-locateVariants.R. Reproducer from GitHub issue #87 (and
the earlier #76) now returns correct IntronVariants hits without
requiring a manual cache workaround. (GitHub issue #87)

CHANGES IN VERSION 1.59.2
-------------------------

BUG FIXES

o locateVariants() and predictCoding() no longer silently drop large
deletions (or other wide variants) whose genomic span extends beyond
transcript/CDS boundaries. Previously these were lost because
mapToTranscripts() uses findOverlaps(type="within") internally; the
fix clips variant ranges to the overlapping subject bounds before
mapping. (GitHub issue #81)

CHANGES IN VERSION 1.36.0
-------------------------

Expand Down
41 changes: 41 additions & 0 deletions R/AllUtilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,50 @@
### predictCoding()
###

.clipToSubject <- function(query, subject)
{
## For large deletions (and other wide variants) whose genomic span
## extends beyond transcript/CDS boundaries, mapToTranscripts() silently
## drops them because it uses findOverlaps(type="within") internally.
## To handle these, we clip each query range to the bounding box of any
## overlapping subject element, preserving strand and metadata.
## Variants that don't overlap any subject are returned unchanged (they
## will simply produce no hits in mapToTranscripts as before).
subj_flat <- unlist(subject, use.names=FALSE)
hits <- findOverlaps(query, subj_flat, type="any",
ignore.strand=TRUE, minoverlap=1L)
if (length(hits) == 0L)
return(query)

## For each query, compute the union span of all overlapping subject ranges
qh <- queryHits(hits)
sh <- subjectHits(hits)
subj_starts <- start(subj_flat)[sh]
subj_ends <- end(subj_flat)[sh]
clip_start <- tapply(subj_starts, qh, min)
clip_end <- tapply(subj_ends, qh, max)
clip_idx <- as.integer(names(clip_start)) ## unique query indices

clipped <- query
new_start <- pmax(start(clipped)[clip_idx], clip_start)
new_end <- pmin(end(clipped)[clip_idx], clip_end)
## only clip ranges that actually extend beyond the subject
needs_clip <- start(clipped)[clip_idx] < clip_start |
end(clipped)[clip_idx] > clip_end
if (any(needs_clip)) {
idx_to_clip <- clip_idx[needs_clip]
start(clipped)[idx_to_clip] <- new_start[needs_clip]
end(clipped)[idx_to_clip] <- new_end[needs_clip]
}
clipped
}

.localCoordinates <- function(from, to, ignore.strand, ...)
{
## 'to' is a GRangesList of cds by transcript
## Clip large variants (e.g. long deletions) to transcript bounds so
## mapToTranscripts() does not silently drop them (see Bioc issue #81).
from <- .clipToSubject(from, to)
map <- mapToTranscripts(unname(from), to, ignore.strand=ignore.strand, ...)
if (length(map) == 0) {
res <- GRanges()
Expand Down
11 changes: 9 additions & 2 deletions R/methods-expand.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,17 @@ setMethod("expand", "CollapsedVCF",
fexp$ALT <- unlist(alt(x), use.names=FALSE)
gexp <- .expandGeno(x, hdr, elt, idx)

## rowRanges
## rowRanges — preserve non-standard mcols (user-added columns);
## drop only the VCF fixed columns (REF/ALT/QUAL/FILTER/paramRangeID)
## which will be re-attached via fexp. (GitHub issue #85)
.vcf_fixed_cols <- c("REF", "ALT", "QUAL", "FILTER", "paramRangeID")
if (is.null(rd$paramRangeID)) {
rdexp <- rd[idx, ]
mcols(rdexp) <- NULL
extra_cols <- setdiff(names(mcols(rdexp)), .vcf_fixed_cols)
mcols(rdexp) <- if (length(extra_cols) > 0L)
mcols(rdexp)[extra_cols]
else
NULL
} else rdexp <- rd[idx, "paramRangeID"]

tmp = VCF(rdexp, colData(x), metadata(x), fexp, iexp, gexp,
Expand Down
53 changes: 34 additions & 19 deletions R/methods-locateVariants.R
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "CodingVariants"),
## for width(ranges) == 0 : decrement start to equal end value
if (any(insertion <- width(query) == 0))
start(query)[insertion] <- start(query)[insertion] - 1
if (!exists("cdsbytx", cache, inherits=FALSE))
cache[["cdsbytx"]] <- cdsBy(subject)
if (!exists("cdsbytx", cache, inherits=FALSE)) {
z <- cdsBy(subject)
cache[["cdsbytx"]] <- z[lengths(z) > 0L]
}
res <- callGeneric(query, cache[["cdsbytx"]], region, ...,
ignore.strand=ignore.strand, asHits=asHits)
if (is(res, "GenomicRanges") & length(res) > 0L) {
Expand Down Expand Up @@ -95,8 +97,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "IntronVariants"),
## for width(ranges) == 0 : decrement start to equal end value
if (any(insertion <- width(query) == 0))
start(query)[insertion] <- start(query)[insertion] - 1
if (!exists("intbytx", cache, inherits=FALSE))
cache[["intbytx"]] <- intronsByTranscript(subject)
if (!exists("intbytx", cache, inherits=FALSE)) {
z <- intronsByTranscript(subject)
cache[["intbytx"]] <- z[lengths(z) > 0L]
}
res <- callGeneric(query, cache[["intbytx"]], region, ...,
ignore.strand=ignore.strand, asHits=asHits)
if (is(res, "GenomicRanges") & length(res) > 0L) {
Expand Down Expand Up @@ -129,8 +133,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "ThreeUTRVariants"),
## for width(ranges) == 0 : decrement start to equal end value
if (any(insertion <- width(query) == 0))
start(query)[insertion] <- start(query)[insertion] - 1
if (!exists("threeUTRbytx", cache, inherits=FALSE))
cache[["threeUTRbytx"]] <- threeUTRsByTranscript(subject)
if (!exists("threeUTRbytx", cache, inherits=FALSE)) {
z <- threeUTRsByTranscript(subject)
cache[["threeUTRbytx"]] <- z[lengths(z) > 0L]
}
res <- callGeneric(query, cache[["threeUTRbytx"]], region, ...,
ignore.strand=ignore.strand, asHits=asHits)
if (is(res, "GenomicRanges") & length(res) > 0L) {
Expand Down Expand Up @@ -163,8 +169,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "FiveUTRVariants"),
## for width(ranges) == 0 : decrement start to equal end value
if (any(insertion <- width(query) == 0))
start(query)[insertion] <- start(query)[insertion] - 1
if (!exists("fiveUTRbytx", cache, inherits=FALSE))
cache[["fiveUTRbytx"]] <- fiveUTRsByTranscript(subject)
if (!exists("fiveUTRbytx", cache, inherits=FALSE)) {
z <- fiveUTRsByTranscript(subject)
cache[["fiveUTRbytx"]] <- z[lengths(z) > 0L]
}
res <- callGeneric(query, cache[["fiveUTRbytx"]], region, ...,
ignore.strand=ignore.strand, asHits=asHits)
if (is(res, "GenomicRanges") & length(res) > 0L) {
Expand Down Expand Up @@ -199,8 +207,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "IntergenicVariants"),
start(query)[insertion] <- start(query)[insertion] - 1
## PRECEDEID and FOLLOWID as gene or transcript ids
if (idType(region) == "gene") {
if (!exists("txbygene", cache, inherits=FALSE))
cache[["txbygene"]] <- transcriptsBy(subject, "gene")
if (!exists("txbygene", cache, inherits=FALSE)) {
z <- transcriptsBy(subject, "gene")
cache[["txbygene"]] <- z[lengths(z) > 0L]
}
callGeneric(query, cache[["txbygene"]], region, ...,
ignore.strand=ignore.strand)
} else if (idType(region) == "tx") {
Expand Down Expand Up @@ -235,8 +245,10 @@ setMethod("locateVariants", c("GRanges", "TxDb", "SpliceSiteVariants"),
## for width(ranges) == 0 : decrement start to equal end value
if (any(insertion <- width(query) == 0))
start(query)[insertion] <- start(query)[insertion] - 1
if (!exists("intbytx", cache, inherits=FALSE))
cache[["intbytx"]] <- intronsByTranscript(subject)
if (!exists("intbytx", cache, inherits=FALSE)) {
z <- intronsByTranscript(subject)
cache[["intbytx"]] <- z[lengths(z) > 0L]
}
res <- callGeneric(query, cache[["intbytx"]], region, ...,
ignore.strand=ignore.strand, asHits=asHits)
if (is(res, "GenomicRanges") & length(res) > 0L) {
Expand Down Expand Up @@ -357,12 +369,13 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
if (!exists("fiveUTRbytx", cache, inherits=FALSE)) {
splicings <-
GenomicFeatures:::.getSplicingsForTranscriptsWithCDSs(subject)
cache[["fiveUTRbytx"]] <-
GenomicFeatures:::.make5UTRsByTranscript(subject, splicings)
z <- GenomicFeatures:::.make5UTRsByTranscript(subject, splicings)
cache[["fiveUTRbytx"]] <- z[lengths(z) > 0L]
}
if (!exists("threeUTRbytx", cache, inherits=FALSE)) {
z <- GenomicFeatures:::.make3UTRsByTranscript(subject, splicings)
cache[["threeUTRbytx"]] <- z[lengths(z) > 0L]
}
if (!exists("threeUTRbytx", cache, inherits=FALSE))
cache[["threeUTRbytx"]] <-
GenomicFeatures:::.make3UTRsByTranscript(subject, splicings)

fiveUTR <-
locateVariants(query, subject, FiveUTRVariants(),
Expand Down Expand Up @@ -543,7 +556,9 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
return(findOverlaps(query, subject, type="within",
ignore.strand=ignore.strand))

map <- mapToTranscripts(unname(query), subject,
## Clip large variants to transcript bounds before mapping (issue #81)
query_clipped <- .clipToSubject(query, subject)
map <- mapToTranscripts(unname(query_clipped), subject,
ignore.strand=ignore.strand)
if (length(map) > 0) {
xHits <- map$xHits
Expand All @@ -565,7 +580,7 @@ setMethod("locateVariants", c("GRanges", "TxDb", "AllVariants"),
## Ranges identified by 'map' only are discarded.
if (vtype == "coding") {
usub <- unlist(subject) ## names needed for mapping
map2 <- mapToTranscripts(unname(query)[xHits], usub,
map2 <- mapToTranscripts(unname(query_clipped)[xHits], usub,
ignore.strand=ignore.strand)
cds <- mcols(usub)$cds_id[map2$transcriptsHits]
if (length(cds)) {
Expand Down
18 changes: 11 additions & 7 deletions R/methods-predictCoding.R
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,6 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
## variant location in cds region
mcols(query) <- append(mcols(query), DataFrame(varAllele=varAllele))
txlocal <- .localCoordinates(query, cdsbytx, ignore.strand=FALSE, ...)
if (length(txlocal) == 0)
return(txlocal)

## reverse complement "-" strand
valid <- rep(TRUE, length(txlocal))
Expand All @@ -114,7 +112,9 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
}

## frameshift
refwidth <- width(txlocal)
## Use CDS-mapped width (CDSLOC) rather than genomic width so that
## exon/intron-spanning variants count only deleted CDS bases (#83).
refwidth <- width(mcols(txlocal)$CDSLOC)
altallele <- mcols(txlocal)$varAllele
fmshift <- abs(width(altallele) - refwidth) %% 3 != 0
if (any(fmshift))
Expand Down Expand Up @@ -181,12 +181,12 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
consequence <- rep("synonymous", length(txlocal))
consequence[nonsynonymous] <- "nonsynonymous"
consequence[fmshift] <- "frameshift"
consequence[nonsynonymous & (as.character(varAA) %in% "*")] <- "nonsense"
consequence[nonsynonymous & grepl("\\*", as.character(varAA), fixed=TRUE)] <- "nonsense"
consequence[zwidth | noTrans] <- "not translated"
consequence <- factor(consequence)

mcols(txlocal) <- append(mcols(txlocal),
DataFrame(GENEID=NA_character_,
DataFrame(GENEID=rep(NA_character_, length(txlocal)),
CONSEQUENCE=consequence,
REFCODON=refCodon,
VARCODON=varCodon,
Expand All @@ -197,10 +197,14 @@ setMethod("predictCoding", c("VRanges", "TxDb", "ANY", "missing"),
.getRefCodons <- function(txlocal, altpos, seqSource, cdsbytx)
{
## adjust codon end for
## - width of the reference sequence
## - width of the reference sequence *in CDS coordinates* (not genomic)
## - position of alt allele substitution in the codon
## Use CDSLOC width rather than genomic width so that variants that span
## an exon/intron boundary do not incorrectly extend the REFCODON across
## the splice junction into the next exon (issue #83).
cdswidth <- width(mcols(txlocal)$CDSLOC)
cstart <- ((start(mcols(txlocal)$CDSLOC) - 1L) %/% 3L) * 3L + 1L
cend <- cstart + (((altpos + width(txlocal) - 2L) %/% 3L) * 3L + 2L)
cend <- cstart + (((altpos + cdswidth - 2L) %/% 3L) * 3L + 2L)
txord <- match(mcols(txlocal)$TXID, names(cdsbytx))
txseqs <- extractTranscriptSeqs(seqSource, cdsbytx[txord])
DNAStringSet(substring(txseqs, cstart, cend))
Expand Down
Loading