diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index c1326560..fcf4b1c5 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -43,7 +43,7 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::rcmdcheck + extra-packages: github::HighlanderLab/RcppTskit/RcppTskit, any::rcmdcheck needs: check - uses: r-lib/actions/check-r-package@v2 diff --git a/.github/workflows/document.yaml b/.github/workflows/document.yaml index 24934b79..e631c37b 100644 --- a/.github/workflows/document.yaml +++ b/.github/workflows/document.yaml @@ -29,7 +29,7 @@ jobs: - name: Install dependencies uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::roxygen2 + extra-packages: github::HighlanderLab/RcppTskit/RcppTskit, any::roxygen2 needs: roxygen2 - name: Document diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index 0b260216..55f8463b 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -30,7 +30,7 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::pkgdown, local::. + extra-packages: github::HighlanderLab/RcppTskit/RcppTskit, any::pkgdown, local::. needs: website - name: Build site diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index e38eef6a..cb602ef5 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -36,7 +36,7 @@ jobs: - uses: r-lib/actions/setup-r-dependencies@v2 with: - extra-packages: any::covr, any::xml2 + extra-packages: github::HighlanderLab/RcppTskit/RcppTskit, any::covr, any::xml2 needs: coverage - name: Test coverage diff --git a/.gitignore b/.gitignore index 22624c10..16700790 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,5 @@ src/Makevars src/Makevars.win vignettes/*.html vignettes/*.pdf +.idea/ +src/.idea/ diff --git a/R/Class-Pop.R b/R/Class-Pop.R index ebf398c3..e6df0a89 100644 --- a/R/Class-Pop.R +++ b/R/Class-Pop.R @@ -686,7 +686,7 @@ newPop = function(rawPop,ploidy=NULL,simParam=NULL,nThreads=NULL,...){ .newPop = function(rawPop, id=NULL, mother=NULL, father=NULL, iMother=NULL, iFather=NULL, isDH=NULL, femaleParentPop=NULL, maleParentPop=NULL, - hist=NULL, simParam=NULL, nThreads=NULL,...){ + hist=NULL, histGen=NULL, simParam=NULL, nThreads=NULL,...){ if(is.null(simParam)){ simParam = get("SP",envir=.GlobalEnv) } @@ -813,7 +813,7 @@ newPop = function(rawPop,ploidy=NULL,simParam=NULL,nThreads=NULL,...){ if(simParam$isTrackPed){ if(simParam$isTrackRec){ - simParam$addToRec(lastId,id,iMother,iFather,isDH,hist,output@ploidy) + simParam$addToRec(lastId,id,iMother,iFather,isDH,hist,histGen,output@ploidy) #Jinyang modified }else{ simParam$addToPed(lastId,id,iMother,iFather,isDH) } diff --git a/R/Class-SimParam.R b/R/Class-SimParam.R index 26a661a9..0740f53f 100644 --- a/R/Class-SimParam.R +++ b/R/Class-SimParam.R @@ -111,6 +111,8 @@ SimParam = R6Class( private$.pedigree = matrix(NA_integer_,nrow=0,ncol=3) private$.isTrackRec = FALSE private$.recHist = list() + private$.isTrackRecGen = FALSE + private$.recHistGen = list() # Jinyang added private$.varA = numeric() private$.varG = numeric() private$.varE = numeric() @@ -187,6 +189,25 @@ SimParam = R6Class( invisible(self) }, + #' @description Sets genetic-coordinate recombination tracking for the simulation. Jinyang added. + #' By default this is turned off. When turned on, it will also turn on pedigree tracking. + #' + #' @param isTrackRecGen should genetic-coordinate recombination tracking be on. + #' @param force should the check for a running simulation be ignored. + setTrackRecGen = function(isTrackRecGen, force=FALSE){ + stopifnot(is.logical(isTrackRecGen)) + if(!force){ + private$.isRunning() + } + private$.isTrackRecGen = isTrackRecGen + if(isTrackRecGen){ + private$.isTrackPed = TRUE + private$.isTrackRec = TRUE + } + invisible(self) + }, + + #' @description Resets the internal lastId, the pedigree #' and recombination tracking (if in use) to the #' supplied lastId. Be careful using this function because @@ -217,6 +238,10 @@ SimParam = R6Class( if(private$.isTrackRec){ private$.recHist = private$.recHist[0:lastId] } + # Jinyang added + if(private$.isTrackRecGen){ + private$.recHistGen = private$.recHistGen[0:lastId] + } invisible(self) }, @@ -2139,9 +2164,12 @@ SimParam = R6Class( #' @param father vector of father iids #' @param isDH indicator for DH lines #' @param hist new recombination history + #' @param histGen new recombination history (genetic coordinate) #' @param ploidy ploidy level addToRec = function(lastId,id,mother,father,isDH, - hist,ploidy){ + hist, + histGen=NULL, # Jinyang added + ploidy){ nNewInd = lastId-private$.lastId stopifnot(nNewInd>0) if(length(isDH)==1) isDH = rep(isDH,nNewInd) @@ -2172,12 +2200,24 @@ SimParam = R6Class( names(newRecHist) = id private$.recHist = c(private$.recHist, newRecHist) private$.lastHaplo = tmpLastHaplo + + # Jinyang added + if(private$.isTrackRecGen){ + #newRecHistGen = vector("list", nNewInd) + #names(newRecHistGen) = id + private$.recHistGen = c(private$.recHistGen, newRecHist) + } }else{ # Add hist to recombination history private$.hasHap = c(private$.hasHap, rep(FALSE, nNewInd)) private$.isFounder = c(private$.isFounder, rep(FALSE, nNewInd)) names(hist) = id private$.recHist = c(private$.recHist, hist) + # Jinyang added + if(private$.isTrackRecGen){ + names(histGen) = id + private$.recHistGen = c(private$.recHistGen, histGen) + } } private$.pedigree = rbind(private$.pedigree, tmp) private$.lastId = lastId @@ -2292,6 +2332,8 @@ SimParam = R6Class( .pedigree="matrix", .isTrackRec="logical", .recHist="list", + .isTrackRecGen = "logical", + .recHistGen = "list", #Jinyang added .varA="numeric", .varG="numeric", .varE="numeric", @@ -2735,6 +2777,25 @@ SimParam = R6Class( } }, + #' @field isTrackRecGen is recombination being tracked. Jinyang added. + isTrackRecGen = function(value){ + if(missing(value)){ + private$.isTrackRecGen + }else{ + stop("`$isTrackRecGen` is read only",call.=FALSE) + } + }, + + #' @field recHistGen list of historic recombination events. Jinyang added. + recHistGen = function(value){ + if(missing(value)){ + private$.recHistGen + }else{ + stop("`$recHistGen` is read only",call.=FALSE) + } + }, + + #' @field haplotypes list of computed IBD haplotypes haplotypes=function(value){ if(missing(value)){ diff --git a/R/RcppExports.R b/R/RcppExports.R index ed16b7c5..f59c797e 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -292,8 +292,8 @@ createIbdMat <- function(ibd, chr, nLoci, ploidy, nThreads) { .Call(`_AlphaSimR_createIbdMat`, ibd, chr, nLoci, ploidy, nThreads) } -cross <- function(motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads) { - .Call(`_AlphaSimR_cross`, motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads) +cross <- function(motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads, trackRecGen) { + .Call(`_AlphaSimR_cross`, motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads, trackRecGen) } createDH2 <- function(geno, nDH, genMap, v, p, trackRec, nThreads) { diff --git a/R/alphaSimR2Ts.R b/R/alphaSimR2Ts.R new file mode 100644 index 00000000..f0874957 --- /dev/null +++ b/R/alphaSimR2Ts.R @@ -0,0 +1,276 @@ +library(jsonlite) + +recHistMatToSegDf <- function(histMat, nLoci) { + + origin <- as.integer(histMat[, 1]) + starts <- as.integer(histMat[, 2]) + + ends <- c(starts[-1] - 1L, nLoci) + + data.frame( + origin = origin, + locusStart = starts, + locusEnd = ends, + stringsAsFactors = FALSE + ) +} + + +recHistToSegDfWithParents <- function(SP, offspringPop, nLociByChr) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHist[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + for (cc in seq_along(x)) { + nLoci <- nLociByChr[[cc]] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistMatToSegDf(haps[[h]], nLoci = nLoci) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "locusStart","locusEnd", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + +bridgeCollectSegFromSimOutput <- function(SP, simOutput) { + bridgeSegDfList <<- list() + + nLociByChr <- lapply(chrKeptPosBpList, length) + + for (k in 2:length(simOutput)) { + segDf <- recHistToSegDfWithParents(SP, simOutput[[k]], nLociByChr) + bridgeSegDfList[[length(bridgeSegDfList) + 1]] <<- segDf + } + + invisible(bridgeSegDfList) +} + + +segDfToEdgeDfUsingBridge <- function(segDf, chr_info) { + # segDF: childID, hap, chr, locusStart, locusEnd, origin + out <- segDf + out$left <- NA_real_ + out$right <- NA_real_ + + for (cc in sort(unique(out$chr))) { + #posBp <- bridgeEnv$chrKeptPosBpList[[cc]] + #if (is.null(posBp)) stop("bridgeEnv$chrKeptPosBpList[[", cc, "]] is NULL.") + posBp <- chrKeptPosBpList[[cc]] + + tsPath <- chr_info[[cc]]$ts_path + ts <- tskit$load(tsPath) + seqLen <- as.numeric(ts$sequence_length) + + idx <- which(out$chr == cc) + + for (i in idx) { + s <- out$locusStart[i] + e <- out$locusEnd[i] + out$left[i] <- if (s == 1) 0 else posBp[s] + out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen + } + } + out +} + +bridgeAllSegToEdgeDf <- function(chr_info) { + allSeg <- do.call(rbind, bridgeSegDfList) + + out <- allSeg + out$left <- NA + out$right <- NA + + for (cc in sort(unique(out$chr))) { + posBp <- chrKeptPosBpList[[cc]] + + tsPath <- chr_info[[cc]]$ts_path + tc <- tc_load(tsPath) + seqLen <- tc$sequence_length() + + idx <- which(out$chr == cc) + for (i in idx) { + s <- out$locusStart[i] + e <- out$locusEnd[i] + out$left[i] <- if (s == 1) 0 else posBp[s] + out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen + } + } + + out +} + +bridgeComputeIndTime <- function(pedigree) { + n <- nrow(pedigree) + indTime <- rep(NA, n) + + for (i in 1:n) { + m <- pedigree[i, "mother"] + f <- pedigree[i, "father"] + + if (m == 0 && f == 0) { + indTime[i] <- 0 + } else { + indTime[i] <- min(indTime[m], indTime[f]) - 1 + } + } + + indTime +} + + +bridgeWriteTrees <- function(chr_info, edgeDf, SP, out_dir = NULL, + out_basename = "AlphaSimR_extended") { + + indTime <- bridgeComputeIndTime(SP$pedigree) + + nodeIdMapByChr <<- vector("list", length(chr_info)) + indIdMapByChr <<- vector("list", length(chr_info)) + + for (cc in seq_along(chr_info)) { + + nodeIdMapByChr[[cc]] <<- list() + indIdMapByChr[[cc]] <<- list() + + ts <- ts_load(chr_info[[cc]]$ts_path) + tc <- ts$dump_tables() + + df <- edgeDf[edgeDf$chr == cc, , drop = FALSE] + if (nrow(df) == 0) next + + # get indIDs for sampled nodes + sampNodeId <- ts$samples() + sampIndRow <- integer(length(sampNodeId)) + for (i in seq_along(sampNodeId)) { + sampIndRow[i] <- tc$node_table_get_row(sampNodeId[i])$individual + } + if (any(sampIndRow < 0)) { + bad <- which(sampIndRow < 0)[1] + stop( + "Sample node", sampNodeId[bad], "has individual = -1. ", + "Cannot reuse founders' individuals. ", + ) + } + + nFounder <- length(sampNodeId) / ploidy + idx <- 1 + for (ind in 1:nFounder) { + indRow <- sampIndRow[idx] + indIdMapByChr[[cc]][[as.character(ind)]] <<- indRow + + for (h in 1:ploidy) { + nodeId <- as.integer(unlist(sampNodeId[[idx]]))[1] + key <- paste(ind, h, sep = "_") + nodeIdMapByChr[[cc]][[key]] <<- nodeId + # list(alphaSimR = list(id = key))) + idx <- idx + 1 + } + } + + # add indIDs for offSpring nodes + nextInd <- as.integer(tc$num_individuals()) + addNewIndividual <- function(alphaId) { + key <- as.character(alphaId) + if (!is.null(indIdMapByChr[[cc]][[key]])) return(indIdMapByChr[[cc]][[key]]) + + m <- SP$pedigree[alphaId, "mother"] + f <- SP$pedigree[alphaId, "father"] + + mRow <- addNewIndividual(m) + fRow <- addNewIndividual(f) + + newId <- nextInd + tc$individual_table_add_row( + #parents = list(as.integer(mRow), as.integer(fRow)), + parents = c(as.integer(mRow), as.integer(fRow)), + metadata = charToRaw(toJSON( + list(file_id=as.integer(newId)), + auto_unbox = TRUE))) + + indIdMapByChr[[cc]][[key]] <<- as.integer(newId) + + nextInd <<- nextInd + 1L + newId + } + + childIdsNeeded <- sort(as.integer(unique(df$childId))) + for (childId in childIdsNeeded) { + addNewIndividual(childId) + } + + # append child nodes + childKeys <- unique(paste(df$childId, df$hap, sep = "_")) + for (key in childKeys) { + if (is.null(nodeIdMapByChr[[cc]][[key]])) { + childId <- as.integer(sub("_.*$", "", key)) + indRow <- indIdMapByChr[[cc]][[as.character(childId)]] + + tc$node_table_add_row( + flags = 0L, + time = indTime[[childId]], + population = -1L, + individual = indRow, + metadata = as.character(toJSON( + list(alphaSimR = list(id = key)), + auto_unbox = TRUE, force = TRUE)) + ) + nodeIdMapByChr[[cc]][[key]] <<- as.integer(tc$num_nodes() - 1) + } + } + + # append edges + for (i in 1:nrow(df)) { + parentKey <- paste(df$parentId[i], df$parentHap[i], sep = "_") + childKey <- paste(df$childId[i], df$hap[i], sep = "_") + + if (is.null(nodeIdMapByChr[[cc]][[parentKey]])) { + stop("Missing parent node for key=", parentKey, + " on chr=", cc, ". Check founder mapping.") + } + + tc$edge_table_add_row( + left = df$left[i], + right = df$right[i], + parent = nodeIdMapByChr[[cc]][[parentKey]], + child = nodeIdMapByChr[[cc]][[childKey]] + ) + } + + tc$sort() + newTs <- tc$tree_sequence() + + outDirCc <- if (is.null(out_dir)) dirname(chr_info[[cc]]$ts_path) else out_dir + outPath <- file.path(outDirCc, paste0(out_basename, "_chr", cc - 1, ".trees")) + + newTs$dump(outPath) + cat("Wrote:", outPath, "\n") + } + + invisible(TRUE) +} diff --git a/R/alphaSimR2TsGen.R b/R/alphaSimR2TsGen.R new file mode 100644 index 00000000..d93ee2d6 --- /dev/null +++ b/R/alphaSimR2TsGen.R @@ -0,0 +1,121 @@ +library(jsonlite) + +morgan2bpRate <- function(m, x0, breaks, rates, side=c("left","right")) { + # turn breaks into Morgan + segLen <- diff(breaks) + mStart <- c(0, cumsum(rates * segLen)) + # position of the 1st SNP in Morgan + i0 <- findInterval(x0, breaks, rightmost.closed = TRUE) + i0 <- pmin(pmax(i0, 1), length(rates)) + mX0 <- mStart[i0] + rates[i0] * (x0 - breaks[i0]) + # recombination breakpoints in Morgan count from the 1st SNP + M <- m + mX0 + i <- findInterval(M, mStart, rightmost.closed = TRUE) + i <- pmin(pmax(i, 1), length(rates)) + # record zero-recombination-rate regions + mEnd <- mStart[-1] + plateau <- (rates[i] == 0) | (mEnd[i] == mStart[i]) + out <- numeric(length(M)) + + # non-zero-recombination-rate regions + ii <- which(!plateau) + if (length(ii) > 0) { + out[ii] <- breaks[i[ii]] + (M[ii] - mStart[i[ii]]) / rates[i[ii]] + } + + # zero-recombination-rate regions + jj <- which(plateau) + if (length(jj) > 0) { + out[jj] <- if (side == "left") breaks[i[jj]] else breaks[i[jj] + 1L] + } + + out + +} + + + +recHistGenMatToSegDf <- function(histMat, x0, breaks, rates, seqLen) { + + origin <- as.integer(histMat[, 1]) + mStart <- as.numeric(histMat[, 2]) + mNext <- c(mStart[-1], NA_real_) + + left <- morgan2bpRate(mStart, x0, breaks, rates, side="left") + + right <- numeric(length(mStart)) + if (length(mStart) > 1) { + right[1:(length(mStart)-1)] <- morgan2bpRate(mNext[1:(length(mStart)-1)], + x0, breaks, rates, side="right") + } + right[length(mStart)] <- seqLen + left[1] <- 0 + + keep <- right > left + + data.frame( + origin = origin[keep], + left = left[keep], + right = right[keep], + stringsAsFactors = FALSE + ) +} + +recHistGenToSegDfWithParents <- function(SP, offspringPop) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHistGen[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + for (cc in seq_along(x)) { + tc <- tc_load(chr_info[[cc]]$ts_path) + seqLen <- as.numeric(tc$sequence_length()) + breaks <- chr_info[[cc]]$breaks + rates <- chr_info[[cc]]$rates + + x0 <- chrKeptPosBpList[[cc]][1] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistGenMatToSegDf(haps[[h]], x0, breaks, rates, seqLen) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "left","right", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + +bridgeCollectSegGenFromSimOutput <- function(SP, simOutput) { + bridgeSegDfListGen <<- list() + + for (k in 2:length(simOutput)) { + segDf <- recHistGenToSegDfWithParents(SP, simOutput[[k]]) + bridgeSegDfListGen[[length(bridgeSegDfListGen) + 1]] <<- segDf + } + + invisible(bridgeSegDfListGen) +} diff --git a/R/crossing.R b/R/crossing.R index 32495504..00415b1e 100644 --- a/R/crossing.R +++ b/R/crossing.R @@ -96,7 +96,8 @@ makeCross = function(pop, crossPlan, nProgeny=1, simParam$femaleCentromere, simParam$maleCentromere, simParam$quadProb, - nThreads) + nThreads, + simParam$isTrackRecGen) dim(tmp$geno) = NULL # Account for matrix bug in RcppArmadillo @@ -112,7 +113,12 @@ makeCross = function(pop, crossPlan, nProgeny=1, }else{ hist = NULL } - + # Jinyang added + if(simParam$isTrackRecGen){ + histGen = tmp$recHistGen + } else { + histGen = NULL + } return(.newPop(rawPop=rPop, mother=pop@id[crossPlan[,1]], father=pop@id[crossPlan[,2]], @@ -121,6 +127,7 @@ makeCross = function(pop, crossPlan, nProgeny=1, femaleParentPop=pop, maleParentPop=pop, hist=hist, + histGen=histGen, # Jinyang added simParam=simParam, nThreads=nThreads)) } @@ -435,7 +442,8 @@ makeCross2 = function(females, males, crossPlan, nProgeny=1, simParam=NULL, simParam$femaleCentromere, simParam$maleCentromere, simParam$quadProb, - nThreads) + nThreads, + simParam$isTrackRecGen) # Jinyang added dim(tmp$geno) = NULL # Account for matrix bug in RcppArmadillo @@ -451,7 +459,13 @@ makeCross2 = function(females, males, crossPlan, nProgeny=1, simParam=NULL, }else{ hist = NULL } - + # Jinyang added + if(simParam$isTrackRecGen){ + histGen = tmp$recHistGen + } else { + histGen = NULL + } + return(.newPop(rawPop=rPop, mother=females@id[crossPlan[,1]], father=males@id[crossPlan[,2]], @@ -460,6 +474,7 @@ makeCross2 = function(females, males, crossPlan, nProgeny=1, simParam=NULL, femaleParentPop=females, maleParentPop=males, hist=hist, + histGen=histGen, # Jinyang added simParam=simParam, nThreads=nThreads)) } @@ -676,8 +691,9 @@ self = function(pop, nProgeny=1, parents=NULL, keepParents=TRUE, simParam$femaleCentromere, simParam$maleCentromere, simParam$quadProb, - nThreads) - + nThreads, + simParam$isTrackRecGen) # Jinyang added + dim(tmp$geno) = NULL # Account for matrix bug in RcppArmadillo rPop = new("RawPop", @@ -692,7 +708,12 @@ self = function(pop, nProgeny=1, parents=NULL, keepParents=TRUE, }else{ hist = NULL } - + # Jinyang added + if(simParam$isTrackRecGen){ + histGen = tmp$recHistGen + } else { + histGen = NULL + } if(keepParents){ return(.newPop(rawPop=rPop, mother=pop@mother[crossPlan[,1]], @@ -702,6 +723,7 @@ self = function(pop, nProgeny=1, parents=NULL, keepParents=TRUE, femaleParentPop=pop, maleParentPop=pop, hist=hist, + histGen=histGen, simParam=simParam, nThreads=nThreads)) }else{ @@ -713,6 +735,7 @@ self = function(pop, nProgeny=1, parents=NULL, keepParents=TRUE, femaleParentPop=pop, maleParentPop=pop, hist=hist, + histGen=histGen, simParam=simParam, nThreads=nThreads)) } diff --git a/R/makeFoundersFromTs.R b/R/makeFoundersFromTs.R new file mode 100644 index 00000000..34ec04b0 --- /dev/null +++ b/R/makeFoundersFromTs.R @@ -0,0 +1,265 @@ +sample_segregating_variants <- function(ts, segSites, seed) { + + # Sample segregating variants from the tree sequence. + # + # Parameters + # ========== + # ts: tskit.TreeSequence + # The tree sequence to sample from. + # segSites: int + # The number of segregating sites to sample. + # seed: int + # The random seed to use for sampling. + # + # Returns + # ======= + # list of int + # The positions of the sampled segregating sites. + # Set the random seed for reproducibility. + set.seed(seed) + num_samples <- as.integer(ts$num_samples()) + + # 2. Pre-allocate H matrix and P vector based on required sample size (segSites) + # We only need space for 'segSites' number of variants + H <- matrix(NA_integer_, nrow = num_samples, ncol = segSites) + P <- numeric(segSites) + + it <- ts$variants() + + # k tracks how many biallelic variants we have encountered so far + k <- 0 + # current_size tracks how many variants are currently in our reservoir + current_size <- 0 + # 3. Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + + # Filter for biallelic sites + if (length(unique(g)) == 2) { + k <- k + 1 + + if (current_size < segSites) { + # Case A: Reservoir is not full yet + current_size <- current_size + 1 + H[, current_size] <- g + P[current_size] <- v$position + } else { + # Case B: Reservoir is full, use Prob. entry: j/k + # sample.int(k, 1) returns a value from 1 to k + j <- sample.int(k, 1) + + if (j <= segSites) { + # Replace the existing variant at index j + H[, j] <- g + P[j] <- v$position + } + } + } + } + + # 4. Final check: if we found fewer biallelic sites than segSites, trim the output + if (k < segSites) { + if (k > 0) { + H <- H[, 1:k, drop = FALSE] + P <- P[1:k] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + } + + return(list(H = H, P = P)) +} + + +segregating_variants <- function(ts) { + # 1. Get dimensions for pre-allocation + max_sites <- as.integer(ts$num_sites()) + num_samples <- as.integer(ts$num_samples()) + + # 2. Pre-allocate H matrix (Rows: samples, Cols: sites) + # Using integer matrix to save memory (similar to np.int8) + H_full <- matrix(NA_integer_, nrow = num_samples, ncol = max_sites) + # Pre-allocate P vector for positions + P_full <- numeric(max_sites) + + it <- ts$variants() + count <- 0 + + # 3. Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + + # Filter for biallelic sites (exactly 2 unique alleles) + if (length(unique(g)) == 2) { + count <- count + 1 + # Fill the matrix column directly + H_full[, count] <- g + P_full[count] <- v$position + } + } + + # 4. Trim the results to the actual number of kept variants + if (count > 0) { + H <- H_full[, 1:count, drop = FALSE] + P <- P_full[1:count] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + + return(list(H = H, P = P)) +} + +segregating_variants_debug <- function(ts) { + # 1. Get dimensions for pre-allocation + max_sites <- ts$num_sites() + num_samples <- ts$num_samples() + + # DEBUG: Print initial metadata + message(paste("Expected max sites:", max_sites)) + message(paste("Expected num samples (from ts):", num_samples)) + + # 2. Pre-allocate H matrix + H_full <- matrix(NA_integer_, nrow = num_samples, ncol = max_sites) + P_full <- numeric(max_sites) + + it <- ts$variants() + count <- 0 + + # 3. Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + + # DEBUG: Check dimensions on the first iteration + if (count == 0) { + message(paste("Actual length of genotype vector (g):", length(g))) + message(paste("Matrix H_full has", nrow(H_full), "rows")) + + if (length(g) != nrow(H_full)) { + stop("DIMENSION MISMATCH: The genotype vector length does not match matrix rows!") + } + } + + # Filter for biallelic sites + if (length(unique(g)) == 2) { + count <- count + 1 + + # DEBUG: Check for column overflow + if (count > max_sites) { + stop(paste("INDEX OVERFLOW: count (", count, ") exceeded max_sites (", max_sites, ")")) + } + + # Fill the matrix column directly + H_full[, count] <- g + P_full[count] <- v$position + } + } + + # 4. Trim the results + if (count > 0) { + H <- H_full[, 1:count, drop = FALSE] + P <- P_full[1:count] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + + message(paste("Success! Final count of biallelic variants:", count)) + return(list(H = H, P = P)) +} + +# rec map used in msprime: +rateMap2cumMorgan <- function(x, breaks, rates) { + stopifnot(length(breaks) == length(rates) + 1) + + o <- order(breaks) + breaks <- breaks[o] + + # M_i = m(breaks[i]) + seg_len <- diff(breaks) + M_start <- c(0, cumsum(rates * seg_len)) # length = length(breaks) + + i <- findInterval(x, breaks, rightmost.closed = FALSE) + i <- pmin(pmax(i, 1), length(rates)) + + m <- M_start[i] + rates[i] * (x - breaks[i]) + return(m) +} + + +ts2chrData <- function(ts_path, breaks, rates, segSites, site_sampling_seed) { + ts = ts_load(ts_path) + num_pos <- ts$num_sites() + + if (!is.null(segSites)) { + + if (num_pos < segSites) { + stop("Insufficient sites (only ", num_pos, " sites in the tree sequence).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + out <- sample_segregating_variants(ts, segSites, site_sampling_seed) + + if (length(out[[2]]) < segSites) { + stop("Insufficient sites (only ", length(out[[2]]), " sites after filtering non-biallelic sites).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + } + else { + out <- segregating_variants(ts) + } + + H <- out[[1]] + pos <- out[[2]] + + ordPos <- order(pos) + + pos <- pos[ordPos] + + mpos <- rateMap2cumMorgan(pos, breaks, rates) + + # relative position, so the 1st element is 0 + mpos <- mpos - min(mpos) + + ordMap <- order(mpos) + mpos <- mpos[ordMap] + pos <- pos[ordMap] + H <- H[, ordMap, drop = FALSE] + + + list( + genMap = list(mpos), + haplotypes = list(H), + keptPosBp = pos + ) +} + +asMapPop <- function(chr_info, ploidy = 2, inbred = FALSE, segSites = NULL, site_sampling_seed = 42) { + ploidy <<- ploidy + chr_data <- lapply(chr_info, function(info) { + ts2chrData( + ts_path = info$ts_path, + breaks = info$breaks, + rates = info$rates, + segSites = info$segSites, + site_sampling_seed = site_sampling_seed + ) + }) + + # save pos in bp for tskit tables + chrKeptPosBpList <<- lapply(chr_data, `[[`, "keptPosBp") + + genMap <- do.call(c, lapply(chr_data, `[[`, "genMap")) + haplotypes <- do.call(c, lapply(chr_data, `[[`, "haplotypes")) + + newMapPop(genMap = genMap, haplotypes = haplotypes, inbred = inbred, ploidy = ploidy) +} diff --git a/dev/alphaSimR2Ts.R b/dev/alphaSimR2Ts.R new file mode 100644 index 00000000..e8b1f704 --- /dev/null +++ b/dev/alphaSimR2Ts.R @@ -0,0 +1,275 @@ +library(jsonlite) + +recHistMatToSegDf <- function(histMat, nLoci) { + + origin <- as.integer(histMat[, 1]) + starts <- as.integer(histMat[, 2]) + + ends <- c(starts[-1] - 1L, nLoci) + + data.frame( + origin = origin, + locusStart = starts, + locusEnd = ends, + stringsAsFactors = FALSE + ) +} + + +recHistToSegDfWithParents <- function(SP, offspringPop, nLociByChr) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHist[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + for (cc in seq_along(x)) { + nLoci <- nLociByChr[[cc]] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistMatToSegDf(haps[[h]], nLoci = nLoci) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "locusStart","locusEnd", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + +bridgeCollectSegFromSimOutput <- function(SP, simOutput) { + bridgeSegDfList <<- list() + + nLociByChr <- lapply(chrKeptPosBpList, length) + + for (k in 2:length(simOutput)) { + segDf <- recHistToSegDfWithParents(SP, simOutput[[k]], nLociByChr) + bridgeSegDfList[[length(bridgeSegDfList) + 1]] <<- segDf + } + + invisible(bridgeSegDfList) +} + + +segDfToEdgeDfUsingBridge <- function(segDf, chr_info) { + # segDF: childID, hap, chr, locusStart, locusEnd, origin + out <- segDf + out$left <- NA_real_ + out$right <- NA_real_ + + for (cc in sort(unique(out$chr))) { + #posBp <- bridgeEnv$chrKeptPosBpList[[cc]] + #if (is.null(posBp)) stop("bridgeEnv$chrKeptPosBpList[[", cc, "]] is NULL.") + posBp <- chrKeptPosBpList[[cc]] + + tsPath <- chr_info[[cc]]$ts_path + ts <- tskit$load(tsPath) + seqLen <- as.numeric(ts$sequence_length) + + idx <- which(out$chr == cc) + + for (i in idx) { + s <- out$locusStart[i] + e <- out$locusEnd[i] + out$left[i] <- if (s == 1) 0 else posBp[s] + out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen + } + } + out +} + +bridgeAllSegToEdgeDf <- function(chr_info) { + allSeg <- do.call(rbind, bridgeSegDfList) + + out <- allSeg + out$left <- NA + out$right <- NA + + for (cc in sort(unique(out$chr))) { + posBp <- chrKeptPosBpList[[cc]] + + tsPath <- chr_info[[cc]]$ts_path + tc <- tc_load(tsPath) + seqLen <- tc$sequence_length() + + idx <- which(out$chr == cc) + for (i in idx) { + s <- out$locusStart[i] + e <- out$locusEnd[i] + out$left[i] <- if (s == 1) 0 else posBp[s] + out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen + } + } + + out +} + +bridgeComputeIndTime <- function(pedigree) { + n <- nrow(pedigree) + indTime <- rep(NA, n) + + for (i in 1:n) { + m <- pedigree[i, "mother"] + f <- pedigree[i, "father"] + + if (m == 0 && f == 0) { + indTime[i] <- 0 + } else { + indTime[i] <- min(indTime[m], indTime[f]) - 1 + } + } + + indTime +} + + +bridgeWriteTrees <- function(chr_info, edgeDf, SP, out_dir = NULL, + out_basename = "AlphaSimR_extended") { + + indTime <- bridgeComputeIndTime(SP$pedigree) + + nodeIdMapByChr <<- vector("list", length(chr_info)) + indIdMapByChr <<- vector("list", length(chr_info)) + + for (cc in seq_along(chr_info)) { + + nodeIdMapByChr[[cc]] <<- list() + indIdMapByChr[[cc]] <<- list() + + ts <- ts_load(chr_info[[cc]]$ts_path) + tc <- ts$dump_tables() + + df <- edgeDf[edgeDf$chr == cc, , drop = FALSE] + if (nrow(df) == 0) next + + # get indIDs for sampled nodes + sampNodeId <- ts$samples() + sampIndRow <- integer(length(sampNodeId)) + for (i in seq_along(sampNodeId)) { + sampIndRow[i] <- tc$node_table_get_row(sampNodeId[i])$individual + } + if (any(sampIndRow < 0)) { + bad <- which(sampIndRow < 0)[1] + stop( + "Sample node", sampNodeId[bad], "has individual = -1. ", + "Cannot reuse founders' individuals. ", + ) + } + + nFounder <- length(sampNodeId) / ploidy + idx <- 1 + for (ind in 1:nFounder) { + indRow <- sampIndRow[idx] + indIdMapByChr[[cc]][[as.character(ind)]] <<- indRow + + for (h in 1:ploidy) { + nodeId <- as.integer(unlist(sampNodeId[[idx]]))[1] + key <- paste(ind, h, sep = "_") + nodeIdMapByChr[[cc]][[key]] <<- nodeId + # list(alphaSimR = list(id = key))) + idx <- idx + 1 + } + } + + # add indIDs for offSpring nodes + nextInd <- as.integer(tc$num_individuals()) + addNewIndividual <- function(alphaId) { + key <- as.character(alphaId) + if (!is.null(indIdMapByChr[[cc]][[key]])) return(indIdMapByChr[[cc]][[key]]) + + m <- SP$pedigree[alphaId, "mother"] + f <- SP$pedigree[alphaId, "father"] + + mRow <- addNewIndividual(m) + fRow <- addNewIndividual(f) + + newId <- nextInd + tc$individual_table_add_row( + parents = list(as.integer(mRow), as.integer(fRow)), + metadata = charToRaw(toJSON( + list(file_id=as.integer(newId)), + auto_unbox = TRUE))) + + indIdMapByChr[[cc]][[key]] <<- as.integer(newId) + + nextInd <<- nextInd + 1L + newId + } + + childIdsNeeded <- sort(as.integer(unique(df$childId))) + for (childId in childIdsNeeded) { + addNewIndividual(childId) + } + + # append child nodes + childKeys <- unique(paste(df$childId, df$hap, sep = "_")) + for (key in childKeys) { + if (is.null(nodeIdMapByChr[[cc]][[key]])) { + childId <- as.integer(sub("_.*$", "", key)) + indRow <- indIdMapByChr[[cc]][[as.character(childId)]] + + tc$node_table_add_row( + flags = 0L, + time = indTime[[childId]], + population = -1L, + individual = indRow, + metadata = as.character(toJSON( + list(alphaSimR = list(id = key)), + auto_unbox = TRUE, force = TRUE)) + ) + nodeIdMapByChr[[cc]][[key]] <<- as.integer(tc$num_nodes() - 1) + } + } + + # append edges + for (i in 1:nrow(df)) { + parentKey <- paste(df$parentId[i], df$parentHap[i], sep = "_") + childKey <- paste(df$childId[i], df$hap[i], sep = "_") + + if (is.null(nodeIdMapByChr[[cc]][[parentKey]])) { + stop("Missing parent node for key=", parentKey, + " on chr=", cc, ". Check founder mapping.") + } + + tc$edge_table_add_row( + left = df$left[i], + right = df$right[i], + parent = nodeIdMapByChr[[cc]][[parentKey]], + child = nodeIdMapByChr[[cc]][[childKey]] + ) + } + + tc$sort() + newTs <- tc$tree_sequence() + + outDirCc <- if (is.null(out_dir)) dirname(chr_info[[cc]]$ts_path) else out_dir + outPath <- file.path(outDirCc, paste0(out_basename, "_chr", cc - 1, ".trees")) + + newTs$dump(outPath) + cat("Wrote:", outPath, "\n") + } + + invisible(TRUE) +} diff --git a/dev/alphaSimR2TsGen.R b/dev/alphaSimR2TsGen.R new file mode 100644 index 00000000..d93ee2d6 --- /dev/null +++ b/dev/alphaSimR2TsGen.R @@ -0,0 +1,121 @@ +library(jsonlite) + +morgan2bpRate <- function(m, x0, breaks, rates, side=c("left","right")) { + # turn breaks into Morgan + segLen <- diff(breaks) + mStart <- c(0, cumsum(rates * segLen)) + # position of the 1st SNP in Morgan + i0 <- findInterval(x0, breaks, rightmost.closed = TRUE) + i0 <- pmin(pmax(i0, 1), length(rates)) + mX0 <- mStart[i0] + rates[i0] * (x0 - breaks[i0]) + # recombination breakpoints in Morgan count from the 1st SNP + M <- m + mX0 + i <- findInterval(M, mStart, rightmost.closed = TRUE) + i <- pmin(pmax(i, 1), length(rates)) + # record zero-recombination-rate regions + mEnd <- mStart[-1] + plateau <- (rates[i] == 0) | (mEnd[i] == mStart[i]) + out <- numeric(length(M)) + + # non-zero-recombination-rate regions + ii <- which(!plateau) + if (length(ii) > 0) { + out[ii] <- breaks[i[ii]] + (M[ii] - mStart[i[ii]]) / rates[i[ii]] + } + + # zero-recombination-rate regions + jj <- which(plateau) + if (length(jj) > 0) { + out[jj] <- if (side == "left") breaks[i[jj]] else breaks[i[jj] + 1L] + } + + out + +} + + + +recHistGenMatToSegDf <- function(histMat, x0, breaks, rates, seqLen) { + + origin <- as.integer(histMat[, 1]) + mStart <- as.numeric(histMat[, 2]) + mNext <- c(mStart[-1], NA_real_) + + left <- morgan2bpRate(mStart, x0, breaks, rates, side="left") + + right <- numeric(length(mStart)) + if (length(mStart) > 1) { + right[1:(length(mStart)-1)] <- morgan2bpRate(mNext[1:(length(mStart)-1)], + x0, breaks, rates, side="right") + } + right[length(mStart)] <- seqLen + left[1] <- 0 + + keep <- right > left + + data.frame( + origin = origin[keep], + left = left[keep], + right = right[keep], + stringsAsFactors = FALSE + ) +} + +recHistGenToSegDfWithParents <- function(SP, offspringPop) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHistGen[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + for (cc in seq_along(x)) { + tc <- tc_load(chr_info[[cc]]$ts_path) + seqLen <- as.numeric(tc$sequence_length()) + breaks <- chr_info[[cc]]$breaks + rates <- chr_info[[cc]]$rates + + x0 <- chrKeptPosBpList[[cc]][1] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistGenMatToSegDf(haps[[h]], x0, breaks, rates, seqLen) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "left","right", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + +bridgeCollectSegGenFromSimOutput <- function(SP, simOutput) { + bridgeSegDfListGen <<- list() + + for (k in 2:length(simOutput)) { + segDf <- recHistGenToSegDfWithParents(SP, simOutput[[k]]) + bridgeSegDfListGen[[length(bridgeSegDfListGen) + 1]] <<- segDf + } + + invisible(bridgeSegDfListGen) +} diff --git a/dev/alphaSimR2TsGenPy.R b/dev/alphaSimR2TsGenPy.R new file mode 100644 index 00000000..0a374f58 --- /dev/null +++ b/dev/alphaSimR2TsGenPy.R @@ -0,0 +1,125 @@ +library(reticulate) +library(jsonlite) + +use_virtualenv("~/r-reticulate-env", required = TRUE) +tskit <- import("tskit") + +morgan2bpRate <- function(m, x0, breaks, rates, side=c("left","right")) { + # turn breaks into Morgan + segLen <- diff(breaks) + mStart <- c(0, cumsum(rates * segLen)) + # position of the 1st SNP in Morgan + i0 <- findInterval(x0, breaks, rightmost.closed = TRUE) + i0 <- pmin(pmax(i0, 1), length(rates)) + mX0 <- mStart[i0] + rates[i0] * (x0 - breaks[i0]) + # recombination breakpoints in Morgan count from the 1st SNP + M <- m + mX0 + i <- findInterval(M, mStart, rightmost.closed = TRUE) + i <- pmin(pmax(i, 1), length(rates)) + # record zero-recombination-rate regions + mEnd <- mStart[-1] + plateau <- (rates[i] == 0) | (mEnd[i] == mStart[i]) + out <- numeric(length(M)) + + # non-zero-recombination-rate regions + ii <- which(!plateau) + if (length(ii) > 0) { + out[ii] <- breaks[i[ii]] + (M[ii] - mStart[i[ii]]) / rates[i[ii]] + } + + # zero-recombination-rate regions + jj <- which(plateau) + if (length(jj) > 0) { + out[jj] <- if (side == "left") breaks[i[jj]] else breaks[i[jj] + 1L] + } + + out + +} + +recHistGenMatToSegDfPy <- function(histMat, x0, breaks, rates, seqLen) { + + origin <- as.integer(histMat[, 1]) + mStart <- as.numeric(histMat[, 2]) + mNext <- c(mStart[-1], NA_real_) + + left <- morgan2bpRate(mStart, x0, breaks, rates, side="left") + + right <- numeric(length(mStart)) + if (length(mStart) > 1) { + right[1:(length(mStart)-1)] <- morgan2bpRate(mNext[1:(length(mStart)-1)], + x0, breaks, rates, side="right") + } + right[length(mStart)] <- seqLen + left[1] <- 0 + + keep <- right > left + + data.frame( + origin = origin[keep], + left = left[keep], + right = right[keep], + stringsAsFactors = FALSE + ) +} + + +recHistGenToSegDfWithParentsPy <- function(SP, offspringPop) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHistGen[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + for (cc in seq_along(x)) { + ts <- tskit$load(chr_info[[cc]]$ts_path) + seqLen <- as.numeric(ts$sequence_length) + breaks <- chr_info[[cc]]$breaks + rates <- chr_info[[cc]]$rates + + x0 <- chrKeptPosBpList[[cc]][1] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistGenMatToSegDfPy(haps[[h]], x0, breaks, rates, seqLen) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "left","right", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + + +bridgeCollectSegGenFromSimOutputPy <- function(SP, simOutput) { + bridgeSegDfListGen <<- list() + + for (k in 2:length(simOutput)) { + segDf <- recHistGenToSegDfWithParentsPy(SP, simOutput[[k]]) + bridgeSegDfListGen[[length(bridgeSegDfListGen) + 1]] <<- segDf + } + + invisible(bridgeSegDfListGen) +} diff --git a/dev/alphaSimR2TsPy.R b/dev/alphaSimR2TsPy.R new file mode 100644 index 00000000..a08d2848 --- /dev/null +++ b/dev/alphaSimR2TsPy.R @@ -0,0 +1,367 @@ +library(reticulate) +library(jsonlite) + +use_virtualenv("~/r-reticulate-env", required = TRUE) +tskit <- import("tskit") + +genLogRecord <- function(parentPop, offspringPop, simParam, genIndex) { + list( + genIndex = genIndex, + offspringIds = offspringPop@id, + ibd = pullIbdHaplo(offspringPop, simParam = simParam) + ) +} + +ibdToSegDf <- function(ibdMat) { + # chr_locus + cn <- colnames(ibdMat) + # ind_hap + rn <- rownames(ibdMat) + + chr <- as.integer(sub("_.*$", "", cn)) + locus <- as.integer(sub("^.*_", "", cn)) + + childId <- sub("_.*$", "", rn) + hap <- as.integer(sub("^.*_", "", rn)) + + out <- list() + k <- 1 + + num_hap <- length(hap) + uniqueChr <- sort(unique(chr)) + + for (r in seq_len(num_hap)) { + # each row in ibdMat = every child hap + v <- ibdMat[r, ] + for (cc in uniqueChr) { + # get index from chr to extract parent hap (vv) and position (ll) + idx <- which(chr == cc) + vv <- v[idx] + ll <- locus[idx] + + # find breakpoints + chg <- which(diff(vv) != 0) + starts <- c(1, chg + 1) + ends <- c(chg, length(vv)) + + out[[k]] <- data.frame( + childId = childId[r], + hap = hap[r], + chr = cc, + # extract position of ibd change + locusStart = ll[starts], + locusEnd = ll[ends], + origin = vv[starts], + stringsAsFactors = FALSE + ) + k <- k + 1 + } + } + do.call(rbind, out) +} + +recHistMatToSegDfPy <- function(histMat, nLoci) { + + origin <- as.integer(histMat[, 1]) + starts <- as.integer(histMat[, 2]) + + ends <- c(starts[-1] - 1L, nLoci) + + data.frame( + origin = origin, + locusStart = starts, + locusEnd = ends, + stringsAsFactors = FALSE + ) +} + + +recHistToSegDfWithParentsPy <- function(SP, offspringPop, nLociByChr) { + childIds <- offspringPop@id + ped <- SP$pedigree[childIds, , drop = FALSE] + + out <- list() + k <- 1 + + for (childId in childIds) { + x <- SP$recHist[[childId]] + + motherId <- ped[childId, "mother"] + fatherId <- ped[childId, "father"] + + for (cc in seq_along(x)) { + nLoci <- nLociByChr[[cc]] + + haps <- as.vector(x[[cc]]) + nHap <- length(haps) + + for (h in seq_len(nHap)) { + seg <- recHistMatToSegDfPy(haps[[h]], nLoci = nLoci) + + parentId <- if (h <= nHap/2) motherId else fatherId + + seg$childId <- childId + seg$chr <- cc + seg$hap <- h + seg$parentId <- parentId + + seg$parentHap <- seg$origin + seg$parentGlobalHapId <- (parentId - 1) * nHap + seg$parentHap + + out[[k]] <- seg[, c("childId", "hap", "chr", + "locusStart","locusEnd", + "parentId","parentHap","parentGlobalHapId")] + k <- k + 1 + } + } + } + + do.call(rbind, out) +} + +bridgeCollectSegFromSimOutputPy <- function(SP, simOutput) { + bridgeSegDfList <<- list() + + nLociByChr <- lapply(chrKeptPosBpList, length) + + for (k in 2:length(simOutput)) { + segDf <- recHistToSegDfWithParentsPy(SP, simOutput[[k]], nLociByChr) + bridgeSegDfList[[length(bridgeSegDfList) + 1]] <<- segDf + } + + invisible(bridgeSegDfList) +} + + +segDfToEdgeDfUsingBridge <- function(segDf, chr_info) { + # segDF: childID, hap, chr, locusStart, locusEnd, origin + out <- segDf + out$left <- NA_real_ + out$right <- NA_real_ + + for (cc in sort(unique(out$chr))) { + #posBp <- bridgeEnv$chrKeptPosBpList[[cc]] + #if (is.null(posBp)) stop("bridgeEnv$chrKeptPosBpList[[", cc, "]] is NULL.") + posBp <- chrKeptPosBpList[[cc]] + + tsPath <- chr_info[[cc]]$ts_path + ts <- tskit$load(tsPath) + seqLen <- as.numeric(ts$sequence_length) + + idx <- which(out$chr == cc) + + for (i in idx) { + s <- out$locusStart[i] + e <- out$locusEnd[i] + out$left[i] <- if (s == 1) 0 else posBp[s] + out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen + } + } + out +} + +bridgeAllSegToEdgeDfPy <- function(chr_info) { + allSeg <- do.call(rbind, bridgeSegDfList) + + out <- allSeg + out$left <- NA + out$right <- NA + + for (cc in sort(unique(out$chr))) { + posBp <- chrKeptPosBpList[[cc]] + + tsPath <- chr_info[[cc]]$ts_path + ts <- tskit$load(tsPath) + seqLen <- ts$sequence_length + + idx <- which(out$chr == cc) + for (i in idx) { + s <- out$locusStart[i] + e <- out$locusEnd[i] + out$left[i] <- if (s == 1) 0 else posBp[s] + out$right[i] <- if (e < length(posBp)) posBp[e + 1] else seqLen + } + } + + out +} + +bridgeComputeIndTimePy <- function(pedigree) { + n <- nrow(pedigree) + indTime <- rep(NA, n) + + for (i in 1:n) { + m <- pedigree[i, "mother"] + f <- pedigree[i, "father"] + + if (m == 0 && f == 0) { + indTime[i] <- 0 + } else { + indTime[i] <- min(indTime[m], indTime[f]) - 1 + } + } + + indTime +} + + +bridgeWriteTreesPy <- function(chr_info, edgeDf, SP, out_dir = NULL, + out_basename = "AlphaSimR_extended") { + nodeSchema <- tskit$MetadataSchema(list( + codec = "json", + type = "object", + properties = list( + alphaSimR = list( + type = "object", + properties = list( + id = list(type = "string", description = "AlphaSimR node id (childId_hap)") + ), + required = list("id"), + additionalProperties = FALSE + ) + ), + required = list("alphaSimR"), + additionalProperties = FALSE + )) + + indTime <- bridgeComputeIndTimePy(SP$pedigree) + + nodeIdMapByChr <<- vector("list", length(chr_info)) + indIdMapByChr <<- vector("list", length(chr_info)) + + for (cc in seq_along(chr_info)) { + #nodeIdMap <<- list() + nodeIdMapByChr[[cc]] <<- list() + indIdMapByChr[[cc]] <<- list() + + ts <- tskit$load(chr_info[[cc]]$ts_path) + tables <- ts$dump_tables() + reticulate::py_set_attr(tables$nodes, "metadata_schema", nodeSchema) + + # for metadata + n <- tables$nodes$num_rows + encoded <- vector("list", n) + for (i in 0:(n - 1)) { + md_i <- tables$nodes[i]$metadata + encoded[[i + 1]] <- tables$nodes$metadata_schema$encode_row(md_i) + } + # ---- + + + df <- edgeDf[edgeDf$chr == cc, , drop = FALSE] + if (nrow(df) == 0) next + + # get indIDs for sampled nodes + sampNodeId <- ts$samples() + sampIndRow <- integer(length(sampNodeId)) + for (i in seq_along(sampNodeId)) { + sampIndRow[i] <- tables$nodes[sampNodeId[i]]$individual + } + if (any(sampIndRow < 0)) { + bad <- which(sampIndRow < 0)[1] + stop( + "Sample node", sampNodeId[bad], "has individual = -1. ", + "Cannot reuse founders' individuals. ", + ) + } + + nFounder <- length(sampNodeId) / ploidy + idx <- 1 + for (ind in 1:nFounder) { + indRow <- sampIndRow[idx] + indIdMapByChr[[cc]][[as.character(ind)]] <<- indRow + + for (h in 1:ploidy) { + #nodeIdMap[[paste(ind, h, sep = "_")]] <<- as.integer(sampNodeId[[idx]]) + nodeId <- as.integer(unlist(sampNodeId[[idx]]))[1] + key <- paste(ind, h, sep = "_") + nodeIdMapByChr[[cc]][[key]] <<- nodeId + #tables$nodes$metadata[[nodeId + 1]] <- tskit$pack_bytes(list(alphaSimR = list(id = key))) + #tables$nodes[nodeId + 1] <- tables$nodes[nodeId + 1]$replace(metadata=list(alphaSimR = list(id = key))) + encoded[[nodeId + 1]] <- tables$nodes$metadata_schema$encode_row( + list(alphaSimR = list(id = key))) + idx <- idx + 1 + } + } + tables$nodes$packset_metadata(encoded) + + # add indIDs for offSpring nodes + nextInd <- as.integer(tables$individuals$num_rows) + addNewIndividual <- function(alphaId) { + key <- as.character(alphaId) + if (!is.null(indIdMapByChr[[cc]][[key]])) return(indIdMapByChr[[cc]][[key]]) + + m <- SP$pedigree[alphaId, "mother"] + f <- SP$pedigree[alphaId, "father"] + + mRow <- addNewIndividual(m) + fRow <- addNewIndividual(f) + + newId <- nextInd + tables$individuals$add_row( + parents = list(as.integer(mRow), as.integer(fRow)), + metadata = list(file_id=as.integer(newId))) + indIdMapByChr[[cc]][[key]] <<- as.integer(newId) + + nextInd <<- nextInd + 1L + newId + } + + childIdsNeeded <- sort(as.integer(unique(df$childId))) + for (childId in childIdsNeeded) { + addNewIndividual(childId) + } + + # append child nodes + childKeys <- unique(paste(df$childId, df$hap, sep = "_")) + for (key in childKeys) { + #if (is.null(nodeIdMap[[key]])) { + if (is.null(nodeIdMapByChr[[cc]][[key]])) { + childId <- as.integer(sub("_.*$", "", key)) + indRow <- indIdMapByChr[[cc]][[as.character(childId)]] + + tables$nodes$add_row( + flags = 0L, + time = indTime[[childId]], + population = -1L, + individual = indRow, + metadata = list(alphaSimR = list(id = key)) + ) + #nodeIdMap[[key]] <<- as.integer(tables$nodes$num_rows - 1) + nodeIdMapByChr[[cc]][[key]] <<- as.integer(tables$nodes$num_rows - 1) + } + } + + # append edges + for (i in 1:nrow(df)) { + parentKey <- paste(df$parentId[i], df$parentHap[i], sep = "_") + childKey <- paste(df$childId[i], df$hap[i], sep = "_") + + #if (is.null(nodeIdMap[[key]])) { + if (is.null(nodeIdMapByChr[[cc]][[parentKey]])) { + stop("Missing parent node for key=", parentKey, + " on chr=", cc, ". Check founder mapping.") + } + + tables$edges$add_row( + left = df$left[i], + right = df$right[i], + #parent = as.integer(nodeIdMap[[parentKey]]), + #child = as.integer(nodeIdMap[[childKey]]) + parent = nodeIdMapByChr[[cc]][[parentKey]], + child = nodeIdMapByChr[[cc]][[childKey]] + ) + } + + tables$sort() + newTs <- tables$tree_sequence() + + outDirCc <- if (is.null(out_dir)) dirname(chr_info[[cc]]$ts_path) else out_dir + outPath <- file.path(outDirCc, paste0(out_basename, "_chr", cc - 1, ".trees")) + + newTs$dump(outPath) + cat("Wrote:", outPath, "\n") + } + + invisible(TRUE) +} diff --git a/dev/makeFoundersFromTs.R b/dev/makeFoundersFromTs.R new file mode 100644 index 00000000..34ec04b0 --- /dev/null +++ b/dev/makeFoundersFromTs.R @@ -0,0 +1,265 @@ +sample_segregating_variants <- function(ts, segSites, seed) { + + # Sample segregating variants from the tree sequence. + # + # Parameters + # ========== + # ts: tskit.TreeSequence + # The tree sequence to sample from. + # segSites: int + # The number of segregating sites to sample. + # seed: int + # The random seed to use for sampling. + # + # Returns + # ======= + # list of int + # The positions of the sampled segregating sites. + # Set the random seed for reproducibility. + set.seed(seed) + num_samples <- as.integer(ts$num_samples()) + + # 2. Pre-allocate H matrix and P vector based on required sample size (segSites) + # We only need space for 'segSites' number of variants + H <- matrix(NA_integer_, nrow = num_samples, ncol = segSites) + P <- numeric(segSites) + + it <- ts$variants() + + # k tracks how many biallelic variants we have encountered so far + k <- 0 + # current_size tracks how many variants are currently in our reservoir + current_size <- 0 + # 3. Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + + # Filter for biallelic sites + if (length(unique(g)) == 2) { + k <- k + 1 + + if (current_size < segSites) { + # Case A: Reservoir is not full yet + current_size <- current_size + 1 + H[, current_size] <- g + P[current_size] <- v$position + } else { + # Case B: Reservoir is full, use Prob. entry: j/k + # sample.int(k, 1) returns a value from 1 to k + j <- sample.int(k, 1) + + if (j <= segSites) { + # Replace the existing variant at index j + H[, j] <- g + P[j] <- v$position + } + } + } + } + + # 4. Final check: if we found fewer biallelic sites than segSites, trim the output + if (k < segSites) { + if (k > 0) { + H <- H[, 1:k, drop = FALSE] + P <- P[1:k] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + } + + return(list(H = H, P = P)) +} + + +segregating_variants <- function(ts) { + # 1. Get dimensions for pre-allocation + max_sites <- as.integer(ts$num_sites()) + num_samples <- as.integer(ts$num_samples()) + + # 2. Pre-allocate H matrix (Rows: samples, Cols: sites) + # Using integer matrix to save memory (similar to np.int8) + H_full <- matrix(NA_integer_, nrow = num_samples, ncol = max_sites) + # Pre-allocate P vector for positions + P_full <- numeric(max_sites) + + it <- ts$variants() + count <- 0 + + # 3. Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + + # Filter for biallelic sites (exactly 2 unique alleles) + if (length(unique(g)) == 2) { + count <- count + 1 + # Fill the matrix column directly + H_full[, count] <- g + P_full[count] <- v$position + } + } + + # 4. Trim the results to the actual number of kept variants + if (count > 0) { + H <- H_full[, 1:count, drop = FALSE] + P <- P_full[1:count] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + + return(list(H = H, P = P)) +} + +segregating_variants_debug <- function(ts) { + # 1. Get dimensions for pre-allocation + max_sites <- ts$num_sites() + num_samples <- ts$num_samples() + + # DEBUG: Print initial metadata + message(paste("Expected max sites:", max_sites)) + message(paste("Expected num samples (from ts):", num_samples)) + + # 2. Pre-allocate H matrix + H_full <- matrix(NA_integer_, nrow = num_samples, ncol = max_sites) + P_full <- numeric(max_sites) + + it <- ts$variants() + count <- 0 + + # 3. Iterate through variants + repeat { + v <- it$next_variant() + if (is.null(v)) break + + g <- v$genotypes + + # DEBUG: Check dimensions on the first iteration + if (count == 0) { + message(paste("Actual length of genotype vector (g):", length(g))) + message(paste("Matrix H_full has", nrow(H_full), "rows")) + + if (length(g) != nrow(H_full)) { + stop("DIMENSION MISMATCH: The genotype vector length does not match matrix rows!") + } + } + + # Filter for biallelic sites + if (length(unique(g)) == 2) { + count <- count + 1 + + # DEBUG: Check for column overflow + if (count > max_sites) { + stop(paste("INDEX OVERFLOW: count (", count, ") exceeded max_sites (", max_sites, ")")) + } + + # Fill the matrix column directly + H_full[, count] <- g + P_full[count] <- v$position + } + } + + # 4. Trim the results + if (count > 0) { + H <- H_full[, 1:count, drop = FALSE] + P <- P_full[1:count] + } else { + H <- matrix(nrow = num_samples, ncol = 0) + P <- numeric(0) + } + + message(paste("Success! Final count of biallelic variants:", count)) + return(list(H = H, P = P)) +} + +# rec map used in msprime: +rateMap2cumMorgan <- function(x, breaks, rates) { + stopifnot(length(breaks) == length(rates) + 1) + + o <- order(breaks) + breaks <- breaks[o] + + # M_i = m(breaks[i]) + seg_len <- diff(breaks) + M_start <- c(0, cumsum(rates * seg_len)) # length = length(breaks) + + i <- findInterval(x, breaks, rightmost.closed = FALSE) + i <- pmin(pmax(i, 1), length(rates)) + + m <- M_start[i] + rates[i] * (x - breaks[i]) + return(m) +} + + +ts2chrData <- function(ts_path, breaks, rates, segSites, site_sampling_seed) { + ts = ts_load(ts_path) + num_pos <- ts$num_sites() + + if (!is.null(segSites)) { + + if (num_pos < segSites) { + stop("Insufficient sites (only ", num_pos, " sites in the tree sequence).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + out <- sample_segregating_variants(ts, segSites, site_sampling_seed) + + if (length(out[[2]]) < segSites) { + stop("Insufficient sites (only ", length(out[[2]]), " sites after filtering non-biallelic sites).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + } + else { + out <- segregating_variants(ts) + } + + H <- out[[1]] + pos <- out[[2]] + + ordPos <- order(pos) + + pos <- pos[ordPos] + + mpos <- rateMap2cumMorgan(pos, breaks, rates) + + # relative position, so the 1st element is 0 + mpos <- mpos - min(mpos) + + ordMap <- order(mpos) + mpos <- mpos[ordMap] + pos <- pos[ordMap] + H <- H[, ordMap, drop = FALSE] + + + list( + genMap = list(mpos), + haplotypes = list(H), + keptPosBp = pos + ) +} + +asMapPop <- function(chr_info, ploidy = 2, inbred = FALSE, segSites = NULL, site_sampling_seed = 42) { + ploidy <<- ploidy + chr_data <- lapply(chr_info, function(info) { + ts2chrData( + ts_path = info$ts_path, + breaks = info$breaks, + rates = info$rates, + segSites = info$segSites, + site_sampling_seed = site_sampling_seed + ) + }) + + # save pos in bp for tskit tables + chrKeptPosBpList <<- lapply(chr_data, `[[`, "keptPosBp") + + genMap <- do.call(c, lapply(chr_data, `[[`, "genMap")) + haplotypes <- do.call(c, lapply(chr_data, `[[`, "haplotypes")) + + newMapPop(genMap = genMap, haplotypes = haplotypes, inbred = inbred, ploidy = ploidy) +} diff --git a/dev/makeFoundersFromTsPy.R b/dev/makeFoundersFromTsPy.R new file mode 100644 index 00000000..de6f52cf --- /dev/null +++ b/dev/makeFoundersFromTsPy.R @@ -0,0 +1,152 @@ +#library(reticulate) +library(jsonlite) + +#use_virtualenv("~/r-reticulate-env", required = TRUE) +#msprime <- import("msprime") +#tskit <- import("tskit") + +reticulate::py_run_string(" +import numpy as np + +def sample_segregating_variants(ts, segSites, seed): + rng = np.random.default_rng(int(seed)) + + kept_pos = [] + kept_g = [] + k = 0 + + for v in ts.variants(): + g = v.genotypes + + if len(np.unique(g)) != 2: + continue + + # reservoir sampling + k += 1 + if len(kept_g) < segSites: + kept_g.append(g.copy()) + kept_pos.append(v.site.position) + else: + # Prob. entry: j/k + j = rng.integers(0, k) + if j < segSites: + kept_g[j] = g.copy() + kept_pos[j] = v.site.position + + H = np.stack(kept_g, axis=1).astype(np.int8) + P = np.array(kept_pos, dtype=float) + return H, P +") + +reticulate::py_run_string(" +import numpy as np + +def segregating_variants(ts): + + kept_pos = [] + kept_g = [] + + for v in ts.variants(): + g = v.genotypes + + if len(np.unique(g)) != 2: + continue + + kept_g.append(g.copy()) + kept_pos.append(v.site.position) + + H = np.stack(kept_g, axis=1).astype(np.int8) + P = np.array(kept_pos, dtype=float) + return H, P +") + + +# rec map used in msprime: +rateMap2cumMorgan <- function(x, breaks, rates) { + stopifnot(length(breaks) == length(rates) + 1) + + o <- order(breaks) + breaks <- breaks[o] + + # M_i = m(breaks[i]) + seg_len <- diff(breaks) + M_start <- c(0, cumsum(rates * seg_len)) # length = length(breaks) + + i <- findInterval(x, breaks, rightmost.closed = FALSE) + i <- pmin(pmax(i, 1), length(rates)) + + m <- M_start[i] + rates[i] * (x - breaks[i]) + return(m) +} + + +ts2chrDataPy <- function(ts_path, breaks, rates, segSites, site_sampling_seed) { + ts = tskit$load(ts_path) + + pos <- ts$tables$sites$position + + if (!is.null(segSites)) { + # stopifnot(length(pos) >= segSites) + if (length(pos) < segSites) { + stop("Insufficient sites (only ", length(pos), " sites in the tree sequence).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + out <- py$sample_segregating_variants(ts, segSites, seed=site_sampling_seed) + #if (length(out[[2]]) < segSites) { + # warning("Not enough sites kept after filtering non-biallelic sites.") + #} + if (length(out[[2]]) < segSites) { + stop("Insufficient sites (only ", length(out[[2]]), " sites after filtering non-biallelic sites).") + } + message(segSites, " variants sampled ", "(Random seed: ", site_sampling_seed, ")") + } + else { + out <- py$segregating_variants(ts) + } + + H <- out[[1]] + pos <- out[[2]] + + ordPos <- order(pos) + + pos <- pos[ordPos] + + mpos <- rateMap2cumMorgan(pos, breaks, rates) + + # relative position, so the 1st element is 0 + mpos <- mpos - min(mpos) + + ordMap <- order(mpos) + mpos <- mpos[ordMap] + pos <- pos[ordMap] + H <- H[, ordMap, drop = FALSE] + + + list( + genMap = list(mpos), + # haplotypes <- list(H) + haplotypes = list(H), + keptPosBp = pos + ) +} + +asMapPopPy <- function(chr_info, ploidy = 2, inbred = FALSE, segSites = NULL, site_sampling_seed = 42) { + ploidy <<- ploidy + chr_data <- lapply(chr_info, function(info) { + ts2chrDataPy( + ts_path = info$ts_path, + breaks = info$breaks, + rates = info$rates, + segSites = info$segSites, + site_sampling_seed = site_sampling_seed + ) + }) + + # save pos in bp for tskit tables + chrKeptPosBpList <<- lapply(chr_data, `[[`, "keptPosBp") + + genMap <- do.call(c, lapply(chr_data, `[[`, "genMap")) + haplotypes <- do.call(c, lapply(chr_data, `[[`, "haplotypes")) + + newMapPop(genMap = genMap, haplotypes = haplotypes, inbred = inbred, ploidy = ploidy) +} diff --git a/dev/multiple_chr.py b/dev/multiple_chr.py new file mode 100644 index 00000000..e6d9d91a --- /dev/null +++ b/dev/multiple_chr.py @@ -0,0 +1,54 @@ +import io +import msprime +from pathlib import Path + +output_path = "/Users/jliang2/Projects/test_TSK2ASR/data/simulations/normal" +Path(output_path).mkdir(parents=True, exist_ok=True) + +mut_rate = 1.25e-8 +mut_random_seed = 5678 + +ped_txt = """\ +# id parent0 parent1 time is_sample +0 2 3 0.0 1 +1 4 5 0.0 1 +2 6 7 1.0 0 +3 8 9 1.0 0 +4 6 7 1.0 0 +5 10 11 1.0 0 +6 . . 2.0 0 +7 . . 2.0 0 +8 . . 2.0 0 +9 . . 2.0 0 +10 . . 2.0 0 +11 . . 2.0 0 +""" + +Ls = [1000000, 2000000, 3000000] +#rs = [1e-8, 2e-8, 3e-8] +rate=[1e-7, 1e-8, 1e-7] + + +ts_chroms = [] +pedigree = msprime.parse_pedigree(io.StringIO(ped_txt), sequence_length=1) + +for i in range(len(Ls)): + pedigree.sequence_length = Ls[i] + rate_map = msprime.RateMap( + position=[0, round(Ls[i] / 3), 2 * round(Ls[i] / 3), Ls[i]], + rate=rate) + + ped_ts = msprime.sim_ancestry( + initial_state=pedigree, model="fixed_pedigree", + recombination_rate=rate_map, random_seed=i+1) + + ts_chroms.append( + msprime.sim_ancestry( + initial_state=ped_ts, population_size=1000, + recombination_rate=rate_map, model="dtwf", random_seed=i+100)) + +for i, ts in enumerate(ts_chroms): + print(f"chromosome {i} has length {ts.sequence_length} and {ts.num_trees} trees") + tree_seq_mut = msprime.sim_mutations(ts, rate=mut_rate, random_seed=mut_random_seed) + tree_seq_mut_tree_result = output_path + f"/msprime_chr{i}.trees" + tree_seq_mut.dump(tree_seq_mut_tree_result) diff --git a/dev/notes.md b/dev/notes.md new file mode 100644 index 00000000..17c2bb39 --- /dev/null +++ b/dev/notes.md @@ -0,0 +1,121 @@ +How I set up an R package using the tskit C API: + +1. Create a basic R package structure (in an existing R project folder) + +I removed other files and created a package directory with: +``` +AlphaSimRTmp/ + DESCRIPTION + R/ + src/ +``` + +(R will not recognise the directory as a package unless DESCRIPTION, R/, and src/ all exist) + +2. Write a minimal DESCRIPTION + +I added a minimal DESCRIPTION file. + +``` +Package: AlphaSimRTmp +Type: Package +Version: 0.0.1 +Imports: Rcpp +LinkingTo: Rcpp +``` + +3. Vendor tskit and kastore (only C API files) + +Tskit and karstore are from: https://github.com/tskit-dev/tskit/archive/refs/tags/1.0.0.zip +Inside src/, I created a deps/ directory and copied in only the C API parts: + +``` +src/deps/ + tskit/ + kastore/ + tskit.h +``` +tskit folder from: tskit-1.0.0/c/tskit +karstore folder from: tskit-1.0.0/c/subprojects/kastore +tskit.h from: tskit-1.0.0/c/tskit.h + +(I did not copy meson.build, examples, Python code and documentation etc.) + +4. Create a minimal C++ test file + +I added src/minimal.cpp with: + +#include + +two exported Rcpp functions: + +one to report the tskit version + +one smoke test that loads a .trees file using +tsk_table_collection_init → load → tsk_treeseq_init + +5. Create an initial NAMESPACE so Rcpp::compileAttributes() could run + +Before anything would compile, I created a minimal NAMESPACE in R: + +``` +writeLines(c( + "useDynLib(AlphaSimRTmp, .registration=TRUE)", + "importFrom(Rcpp, evalCpp)" +), "NAMESPACE") +``` + +6. Write Makevars to compile vendored C code + +In src/Makevars, I: + +added include paths for deps, deps/tskit, and deps/kastore; + +explicitly listed all tskit and kastore .c files; + +added custom rules to compile .c files in subdirectories; + +linked the resulting .o files into the package shared library. + +7. Generate Rcpp and roxygen outputs + +From the package root, in R: +``` +Rcpp::compileAttributes() +roxygen2::roxygenise() +``` +After this, R/RcppExports.R, src/RcppExports.cpp were created, but the minimal handwritten NAMESPACE was not replaced. + +8. Add zzz.R so roxygen2 can generate a correct NAMESPACE + +Before generating documentation, I created R/zzz.R with the following contents: +``` +#' @useDynLib AlphaSimRTmp, .registration = TRUE +#' @importFrom Rcpp evalCpp +NULL +``` + +This ensures that roxygen2 writes the required useDynLib() and importFrom(Rcpp, evalCpp) entries into NAMESPACE. + +9. I remove the handwritten NAMESPACE and generated it with running `roxygen2::roxygenise()` again. + +10. Clean install the package (not necessary; just because of bugs in lazy-load database caused by repeated installs) + +``` +rm -rf ~/Library/R/arm64/4.5/library/AlphaSimRTmp +rm -rf ~/Library/R/arm64/4.5/library/00LOCK-AlphaSimRTmp +R CMD INSTALL --preclean AlphaSimRTmp +``` + +11. Test + +In a fresh R session: +``` +> library(AlphaSimRTmp) +> tskit_version_test() +major minor patch + 1 3 0 +> +> tskit_smoke_load_free("...Projects/test_TSK2ASR/data/simulations/normal/msprime_chr1.trees") +[1] 1 +``` diff --git a/dev/notesRealBreakpoints.md b/dev/notesRealBreakpoints.md new file mode 100644 index 00000000..29eaa63f --- /dev/null +++ b/dev/notesRealBreakpoints.md @@ -0,0 +1,319 @@ +notesRealBreakpoints +================ +2026-02-18 + +## Test data + +Use `AlphaSimR_test/dev/multiple_chr.py` to simulate 2 chromosomes and 2 +dip individuals with pedigree. Or directly use `msprime_chr0.trees` and +`msprime_chr1.trees` in `AlphaSimR_test/dev/testData`. + +## Try this version + +In this version, a recHistGen object (similar to recHist) was added to +record the real recombination breakpoints. + +### Load tree sequences + + library(AlphaSimR) + use_virtualenv("~/r-reticulate-env", required = TRUE) + tskit <- import("tskit") + devtools::load_all() + + # two chromosomes + L1 <- 1e6 + L2 <- 2e6 + # here, use the same recombination map as used in msprime + chr_info <- list( + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + #breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr1.trees", + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) + ) + + founderGenomes1 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) + +### Run AlphaSimR + + set.seed(42) + SP = SimParam$new(founderGenomes1) + SP$setSexes("yes_sys") + SP$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + + SP$setTrackPed(TRUE) + # try the new function here, it automatically set setTrackRec also. + SP$setTrackRecGen(TRUE) + basePop = newPop(founderGenomes1) + + # the 2 objects are same now: + SP$recHistGen + SP$recHist + + basePop = setPheno(basePop, + h2 = 0.5) + + #--- n generations + nCycles<-2 + + # very simple container for each cycles sim output + simOutput<-list(basePop) + cycle<-1 + for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput[[cycle]], nInd=6, use = "gv") + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5) + # add new offspring to simOutput list + simOutput[[cycle+1]]<-offspringPop + } + +Now we can see the difference between recHist and recHistGen: + + RHG <- SP$recHistGen + RH <- SP$recHist + # ind 3; chr 2; hap 1. Maybe not the same output, please check RHG and RH to find a hap with recombination + rh <- RH[[3]][[2]][[1]] + rhg <- RHG[[3]][[2]][[1]] + gm <- SP$genMap[[2]] + +Col 1: original Hap; Col2: start from where (recHist: index of SNP; +recHistGen: positions in Morgan) + + > rh + [,1] [,2] + [1,] 2 1 + [2,] 1 112 + > rhg + [,1] [,2] + [1,] 2 0.00000000 + [2,] 1 0.09354741 + +So, if everything goes well, rh\[2,2\]-1 \< rhg\[2,2\] \< rh\[2,2\]. We +can check it with genMap (SNP index -\> SNP position in Morgan): + + > gm[[111]] + [1] 0.0923321 + > gm[[112]] + [1] 0.0939209 + +### Collect information for ts tables + + # for RecHist + # bridgeSegDfList store the indexes of SNPs after recombination events + bridgeCollectSegFromSimOutput(SP, simOutput) + # for RecHistGen + # bridgeSegDfListGen store the positions of where recombination happen + bridgeCollectSegGenFromSimOutput(SP, simOutput) + +For RecHist, the indexes of SNPs have to be turned into positions: + + edgeDf <- bridgeAllSegToEdgeDf(chr_info) + +### Write tree files and check + + bridgeWriteTrees(chr_info, edgeDf, SP) + +In python: + + import tskit + origin = tskit.load('.../AlphaSimR_test/dev/testData/msprime_chr0.trees') + marker_ts = tskit.load('.../AlphaSimR_test/dev/testData/AlphaSimR_extended_chr0.trees') + + # Statistics: + # chr1 + origin.num_trees + 298 + marker_ts.num_trees + 298 + # chr 1 is too short for new recombination events. But 40 new nodes (2 x 20 ind) added. + origin.num_nodes + 260 + marker_ts.num_nodes + 300 + + # chr2 + origin = tskit.load('.../AlphaSimR_test/dev/testData/msprime_chr1.trees') + marker_ts = tskit.load('.../AlphaSimR_test/dev/testData/AlphaSimR_extended_chr1.trees') + # now here are new recombination events: + origin.num_trees + 620 + marker_ts.num_trees + 623 + # and still 40 new nodes: + origin.num_nodes + 491 + marker_ts.num_nodes + 531 + +We can plot the pedigree by (information in individual table): + + from matplotlib import pyplot as plt + import networkx as nx + import tskit + def draw_pedigree(ped_ts): + G = nx.DiGraph() + for ind in ped_ts.individuals(): + time = ped_ts.node(ind.nodes[0]).time + pop = ped_ts.node(ind.nodes[0]).population + G.add_node(ind.id, time=time, population=pop) + for p in ind.parents: + if p != tskit.NULL: + G.add_edge(ind.id, p) + pos = nx.multipartite_layout(G, subset_key="time", align="horizontal") + colours = plt.rcParams['axes.prop_cycle'].by_key()['color'] + node_colours = [colours[node_attr["population"]] for node_attr in G.nodes.values()] + nx.draw_networkx(G, pos, with_labels=True, node_color=node_colours) + plt.show() + + draw_pedigree(origin) + +![](../man/figures/originInd.png) + +The new individuals added: + + draw_pedigree(marker_ts) + +![](../man/figures/addInd.png) + +For RecHistGen, the positions were stored in bridgeSegDfListGen and can +be directly used: + + bridgeWriteTrees(chr_info, do.call(rbind, bridgeSegDfListGen), SP) + +### More recombinations? + +Let’s use the same msprime .tree files, but set higher recombination +rates to see the difference between recHist and recHistGen when there +are double crossing over between 2 sampled SNPs. + + L1 <- 1e6 + L2 <- 2e6 + chr_info <- list( + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr0.trees", + #breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr1.trees", + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) + ) + + founderGenomes2 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) + set.seed(42) + SP2 = SimParam$new(founderGenomes2) + SP2$setSexes("yes_sys") + SP2$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + + SP2$setTrackPed(TRUE) + # try the new function here, it automatically set setTrackRec also. + SP2$setTrackRecGen(TRUE) + basePop2 = newPop(founderGenomes2, simParam = SP2) + basePop2 = setPheno(basePop2, + h2 = 0.5, + simParam = SP2) + + #--- n generations + nCycles<-2 + + # very simple container for each cycles sim output + simOutput2<-list(basePop2) + cycle<-1 + for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput2[[cycle]], nInd=6, use = "gv", simParam = SP2) + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5, simParam = SP2) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5, simParam = SP2) + # add new offspring to simOutput list + simOutput2[[cycle+1]]<-offspringPop + } + +check ind 3; chr 1; hap 1: + + RHG <- SP2$recHistGen + RH <- SP2$recHist + gm <- SP2$genMap[[1]] + + rh <- RH[[3]][[1]][[1]] + rhg <- RHG[[3]][[1]][[1]] + +Now we can see 10 more recombination events in recHistGen: + + > rh + [,1] [,2] + [1,] 2 1 + [2,] 1 8 + [3,] 2 36 + [4,] 1 44 + [5,] 2 47 + [6,] 1 50 + [7,] 2 52 + > rhg + [,1] [,2] + [1,] 2 0.000000 + [2,] 1 1.306539 + [3,] 2 2.243464 + [4,] 1 2.423857 + [5,] 2 2.608929 + [6,] 1 2.831431 + [7,] 2 4.993668 + [8,] 1 5.958832 + [9,] 2 6.238179 + [10,] 1 7.697307 + [11,] 2 7.882583 + [12,] 1 8.537422 + [13,] 2 9.214039 + [14,] 1 9.696815 + [15,] 2 10.348671 + [16,] 1 11.282477 + [17,] 2 12.080616 + +To see where the recombination events (between which SNPs) recorded in +recHistGen: + + > x <- rhg[,2] + > + > left <- findInterval(x, gm) + > right <- pmin(left + 1, length(gm)) + > + > out <- data.frame( + + x = x, + + left_i = left, + + left_v = gm[left], + + right_i = right, + + right_v = gm[right] + + ) + > + > out + x left_i left_v right_i right_v + 1 0.000000 1 0.00000 2 0.00609 + 2 1.306539 7 1.06375 8 1.36458 + 3 2.243464 10 1.77918 11 2.51330 + 4 2.423857 10 1.77918 11 2.51330 + 5 2.608929 12 2.57886 13 2.86640 + 6 2.831431 12 2.57886 13 2.86640 + 7 4.993668 35 4.64742 36 5.39801 + 8 5.958832 38 5.73197 39 6.43739 + 9 6.238179 38 5.73197 39 6.43739 + 10 7.697307 41 7.61275 42 8.31825 + 11 7.882583 41 7.61275 42 8.31825 + 12 8.537422 43 8.51587 44 8.67657 + 13 9.214039 46 8.85817 47 9.52313 + 14 9.696815 47 9.52313 48 10.44305 + 15 10.348671 47 9.52313 48 10.44305 + 16 11.282477 49 10.92943 50 11.29281 + 17 12.080616 51 11.94669 52 12.37887 + +The double crossing overs between 2 sampled SNPs (e.g. rows 3 & 4; 5 & +6; 8 & 9; 10 & 11; 14 & 15) were ignored by recHist but kept by +recHistGen. diff --git a/dev/notesRealBreakpoints.rmd b/dev/notesRealBreakpoints.rmd new file mode 100644 index 00000000..05ed8680 --- /dev/null +++ b/dev/notesRealBreakpoints.rmd @@ -0,0 +1,321 @@ +--- +title: "notesRealBreakpoints" +output: + github_document: + output_file: ../README.md +date: "2026-02-18" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## Test data + +Use `AlphaSimR_test/dev/multiple_chr.py` to simulate 2 chromosomes and 2 dip individuals with pedigree. Or directly use `msprime_chr0.trees` and `msprime_chr1.trees` in `AlphaSimR_test/dev/testData`. + +## Try this version + +In this version, a recHistGen object (similar to recHist) was added to record the real recombination breakpoints. + +### Load tree sequences + +``` +library(AlphaSimR) +use_virtualenv("~/r-reticulate-env", required = TRUE) +tskit <- import("tskit") +devtools::load_all() + +# two chromosomes +L1 <- 1e6 +L2 <- 2e6 +# here, use the same recombination map as used in msprime +chr_info <- list( + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + #breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr1.trees", + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) +) + +founderGenomes1 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) +``` + +### Run AlphaSimR + +``` +set.seed(42) +SP = SimParam$new(founderGenomes1) +SP$setSexes("yes_sys") +SP$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + +SP$setTrackPed(TRUE) +# try the new function here, it automatically set setTrackRec also. +SP$setTrackRecGen(TRUE) +basePop = newPop(founderGenomes1) + +# the 2 objects are same now: +SP$recHistGen +SP$recHist + +basePop = setPheno(basePop, + h2 = 0.5) + +#--- n generations +nCycles<-2 + +# very simple container for each cycles sim output +simOutput<-list(basePop) +cycle<-1 +for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput[[cycle]], nInd=6, use = "gv") + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5) + # add new offspring to simOutput list + simOutput[[cycle+1]]<-offspringPop +} +``` +Now we can see the difference between recHist and recHistGen: +``` +RHG <- SP$recHistGen +RH <- SP$recHist +# ind 3; chr 2; hap 1. Maybe not the same output, please check RHG and RH to find a hap with recombination +rh <- RH[[3]][[2]][[1]] +rhg <- RHG[[3]][[2]][[1]] +gm <- SP$genMap[[2]] +``` +Col 1: original Hap; Col2: start from where (recHist: index of SNP; recHistGen: positions in Morgan) +``` +> rh + [,1] [,2] +[1,] 2 1 +[2,] 1 112 +> rhg + [,1] [,2] +[1,] 2 0.00000000 +[2,] 1 0.09354741 +``` +So, if everything goes well, rh[2,2]-1 < rhg[2,2] < rh[2,2]. We can check it with genMap (SNP index -> SNP position in Morgan): +``` +> gm[[111]] +[1] 0.0923321 +> gm[[112]] +[1] 0.0939209 +``` + +### Collect information for ts tables + +``` +# for RecHist +# bridgeSegDfList store the indexes of SNPs after recombination events +bridgeCollectSegFromSimOutput(SP, simOutput) +# for RecHistGen +# bridgeSegDfListGen store the positions of where recombination happen +bridgeCollectSegGenFromSimOutput(SP, simOutput) +``` + +For RecHist, the indexes of SNPs have to be turned into positions: +``` +edgeDf <- bridgeAllSegToEdgeDf(chr_info) +``` +### Write tree files and check +``` +bridgeWriteTrees(chr_info, edgeDf, SP) +``` +In python: +``` +import tskit +origin = tskit.load('.../AlphaSimR_test/dev/testData/msprime_chr0.trees') +marker_ts = tskit.load('.../AlphaSimR_test/dev/testData/AlphaSimR_extended_chr0.trees') + +# Statistics: +# chr1 +origin.num_trees +298 +marker_ts.num_trees +298 +# chr 1 is too short for new recombination events. But 40 new nodes (2 x 20 ind) added. +origin.num_nodes +260 +marker_ts.num_nodes +300 + +# chr2 +origin = tskit.load('.../AlphaSimR_test/dev/testData/msprime_chr1.trees') +marker_ts = tskit.load('.../AlphaSimR_test/dev/testData/AlphaSimR_extended_chr1.trees') +# now here are new recombination events: +origin.num_trees +620 +marker_ts.num_trees +623 +# and still 40 new nodes: +origin.num_nodes +491 +marker_ts.num_nodes +531 +``` +We can plot the pedigree by (information in individual table): +``` +from matplotlib import pyplot as plt +import networkx as nx +import tskit +def draw_pedigree(ped_ts): + G = nx.DiGraph() + for ind in ped_ts.individuals(): + time = ped_ts.node(ind.nodes[0]).time + pop = ped_ts.node(ind.nodes[0]).population + G.add_node(ind.id, time=time, population=pop) + for p in ind.parents: + if p != tskit.NULL: + G.add_edge(ind.id, p) + pos = nx.multipartite_layout(G, subset_key="time", align="horizontal") + colours = plt.rcParams['axes.prop_cycle'].by_key()['color'] + node_colours = [colours[node_attr["population"]] for node_attr in G.nodes.values()] + nx.draw_networkx(G, pos, with_labels=True, node_color=node_colours) + plt.show() + +draw_pedigree(origin) +``` +![](../man/figures/originInd.png) + +The new individuals added: +``` +draw_pedigree(marker_ts) +``` +![](../man/figures/addInd.png) + +For RecHistGen, the positions were stored in bridgeSegDfListGen and can be directly used: +``` +bridgeWriteTrees(chr_info, do.call(rbind, bridgeSegDfListGen), SP) +``` +### More recombinations? +Let's use the same msprime .tree files, but set higher recombination rates to see the difference between recHist and recHistGen when there are double crossing over between 2 sampled SNPs. +``` +L1 <- 1e6 +L2 <- 2e6 +chr_info <- list( + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr0.trees", + #breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path=".../AlphaSimR_test/dev/testData/msprime_chr1.trees", + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) +) + +founderGenomes2 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) +set.seed(42) +SP2 = SimParam$new(founderGenomes2) +SP2$setSexes("yes_sys") +SP2$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + +SP2$setTrackPed(TRUE) +# try the new function here, it automatically set setTrackRec also. +SP2$setTrackRecGen(TRUE) +basePop2 = newPop(founderGenomes2, simParam = SP2) +basePop2 = setPheno(basePop2, + h2 = 0.5, + simParam = SP2) + +#--- n generations +nCycles<-2 + +# very simple container for each cycles sim output +simOutput2<-list(basePop2) +cycle<-1 +for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput2[[cycle]], nInd=6, use = "gv", simParam = SP2) + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5, simParam = SP2) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5, simParam = SP2) + # add new offspring to simOutput list + simOutput2[[cycle+1]]<-offspringPop +} +``` +check ind 3; chr 1; hap 1: +``` +RHG <- SP2$recHistGen +RH <- SP2$recHist +gm <- SP2$genMap[[1]] + +rh <- RH[[3]][[1]][[1]] +rhg <- RHG[[3]][[1]][[1]] +``` +Now we can see 10 more recombination events in recHistGen: +``` +> rh + [,1] [,2] +[1,] 2 1 +[2,] 1 8 +[3,] 2 36 +[4,] 1 44 +[5,] 2 47 +[6,] 1 50 +[7,] 2 52 +> rhg + [,1] [,2] + [1,] 2 0.000000 + [2,] 1 1.306539 + [3,] 2 2.243464 + [4,] 1 2.423857 + [5,] 2 2.608929 + [6,] 1 2.831431 + [7,] 2 4.993668 + [8,] 1 5.958832 + [9,] 2 6.238179 +[10,] 1 7.697307 +[11,] 2 7.882583 +[12,] 1 8.537422 +[13,] 2 9.214039 +[14,] 1 9.696815 +[15,] 2 10.348671 +[16,] 1 11.282477 +[17,] 2 12.080616 +``` +To see where the recombination events (between which SNPs) recorded in recHistGen: +``` +> x <- rhg[,2] +> +> left <- findInterval(x, gm) +> right <- pmin(left + 1, length(gm)) +> +> out <- data.frame( ++ x = x, ++ left_i = left, ++ left_v = gm[left], ++ right_i = right, ++ right_v = gm[right] ++ ) +> +> out + x left_i left_v right_i right_v +1 0.000000 1 0.00000 2 0.00609 +2 1.306539 7 1.06375 8 1.36458 +3 2.243464 10 1.77918 11 2.51330 +4 2.423857 10 1.77918 11 2.51330 +5 2.608929 12 2.57886 13 2.86640 +6 2.831431 12 2.57886 13 2.86640 +7 4.993668 35 4.64742 36 5.39801 +8 5.958832 38 5.73197 39 6.43739 +9 6.238179 38 5.73197 39 6.43739 +10 7.697307 41 7.61275 42 8.31825 +11 7.882583 41 7.61275 42 8.31825 +12 8.537422 43 8.51587 44 8.67657 +13 9.214039 46 8.85817 47 9.52313 +14 9.696815 47 9.52313 48 10.44305 +15 10.348671 47 9.52313 48 10.44305 +16 11.282477 49 10.92943 50 11.29281 +17 12.080616 51 11.94669 52 12.37887 +``` +The double crossing overs between 2 sampled SNPs (e.g. rows 3 & 4; 5 & 6; 8 & 9; 10 & 11; 14 & 15) were ignored by recHist but kept by recHistGen. diff --git a/dev/notesTestGrowTsTables.md b/dev/notesTestGrowTsTables.md new file mode 100644 index 00000000..05cd4f5d --- /dev/null +++ b/dev/notesTestGrowTsTables.md @@ -0,0 +1,191 @@ +``` +rm(list = ls()) +``` + +0. load dependencies (already in the R scripts, but if you have different setting plz just do this in your way) +``` +library(reticulate) +library(AlphaSimR) + +use_virtualenv("~/r-reticulate-env", required = TRUE) +msprime <- import("msprime") +tskit <- import("tskit") +``` +1. load functions +``` +devtools::load_all() +#--- or ---- +source("R/makeFoundersFromTs.R") +source("R/alphaSimR2Ts.R") +``` + +2. read the .trees files with 2 chromosomes and 2 dip individuals (from msprime) +note: chrKeptPosBpList added, so we know the position and index of sampled SNPs in the original .trees files (alphaSimR only record their index) +``` +L1 <- 1e6 +L2 <- 2e6 + +chr_info <- list( + list(ts_path="dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + list(ts_path="dev/testData/msprime_chr1.trees", + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) +) + +founderGenomes <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) +``` + +3. run alphaSimR to set the founder genomes and parameters +``` +set.seed(42) +SP = SimParam$new(founderGenomes) +SP$setSexes("yes_sys") +SP$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) +SP$setTrackPed(TRUE) +SP$setTrackRec(TRUE) +basePop = newPop(founderGenomes) +basePop = setPheno(basePop, + h2 = 0.5) +``` + +4. run addition 2 generations +``` +#--- n generations +nCycles<-2 + +# keep founderPop and offspringPop in SimOutput +simOutput<-list(basePop) +cycle<-1 +for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput[[cycle]], nInd=6, use = "gv") + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5) + # add new offspring to simOutput list + simOutput[[cycle+1]]<-offspringPop +} +``` + +5. Link recHist (from SP, based on sampled SNPs) with parent-child hap (in tskit positions, from chrKeptPosBpList) +``` +bridgeCollectSegFromSimOutput(SP, simOutput) +``` + +6. make an edge table from bridgeSegDfList +``` +edgeDf <- bridgeAllSegToEdgeDf(chr_info) +``` + +7. write tskit tables (nodes and edges) +note1: time of founder generation: 0; time of offspring: time of the youngest parent - 1 +note2: check nodeIdMapByChr for ids of alphaSimR and tskit +note3: n ploidy is from asMapPop, so variable number of ploidy along generations is not allowed +note4: be careful with the metadata in the future (different behaviors between Python and R even with Reticulate) +``` +bridgeWriteTrees(chr_info, edgeDf, SP) +``` + +8. We can see that there is no new recombination break points in chr1, let's play with chr2 + +``` +# in Python: +import tskit +ts0 = tskit.load('.../dev/testData/msprime_chr1.trees') +ts1 = tskit.load('.../dev/testData/_AlphaSimR_extended_chr1.trees') +``` + +From edgeDf, there's a breakpoint at 1549443 +``` +ts0_1549443 = ts0.at(1549442) +ts0_1549444 = ts0.at(1549443) +ts1_1549443 = ts1.at(1549442) +ts1_1549444 = ts1.at(1549443) +``` +Same tree in the original file: +``` +print(ts0_1549443.draw_text()) +``` +``` +Output: + 182 + ┏━┻━┓ + 32 ┃ + ┏┻━┓ ┃ +12 1621 +┏┻┓ ┃ ┃ +0 2 1 3 +``` + +``` +print(ts0_1549444.draw_text()) +``` +``` +Output: + 182 + ┏━┻━┓ + 32 ┃ + ┏┻━┓ ┃ +12 1621 +┏┻┓ ┃ ┃ +0 2 1 3 +``` +Different trees in the new file +``` +print(ts1_1549443.draw_text()) +``` +``` +Output: + 182 + ┏━━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ 32 + ┃ ┏━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━┓ + 21 16 12 + ┃ ┃ ┏━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━┓ + 3 1 2 0 + ┏━━━┳━━━┳━━━┳┻━━━━┳━━━━━━━┓ ┏━━━┳━━━┳━┻━━━━━━━━━┓ ┏━━━┳━┻━┳━━━┓ ┏━━━┳━━━┳━━━━━━━━━┳━━━┻━━━━━━━━━┳━━━━━━━━━━━┓ +491 493 505 497 499 501 494 504 508 492 495 503 507 509 496 506 510 498 500 502 + ┃ ┏━┻━┓ ┏━┻━┓ ┏━━━┳━━━╋━━━┳━━━┓ ┏━━━┳━┻━┳━━━┓ ┏━━━╋━━━┓ ┏━━━╋━━━┓ + 511 512 520 523 529 522 524 526 528 530 513 515 517 519 514 516 518 521 525 527 +``` +``` +print(ts1_1549444.draw_text()) +``` +``` +Output: + 182 + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + 32 ┃ + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ + 12 16 21 + ┏━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━┓ ┃ ┃ + 2 0 1 3 + ┏━━━┳━━━╋━━━┳━━━┓ ┏━━━┳━━━┳━━━━━━━━━┳━━━┻━━━━━━━━━┳━━━━━━━━━━━┓ ┏━━━┳━━━┳━┻━━━━━━━━━┓ ┏━━━┳━━━┳━━┻━━┳━━━━━━━┓ +491 495 503 507 509 496 506 510 498 500 502 494 504 508 492 493 505 497 499 501 + ┏━━━┳━┻━┳━━━┓ ┏━━━╋━━━┓ ┏━━━╋━━━┓ ┏━━━┳━━━╋━━━┳━━━┓ ┃ ┏━┻━┓ ┏━┻━┓ + 513 515 517 519 514 516 518 521 525 527 522 524 526 528 530 511 512 520 523 529 + +``` +difference: one of the parent nodes of node 491 (3_1 in alphaSimR) changed from 3 (2_2) to 2 (2_1), the same as edgeDf. + + +New nodes look like: +``` +ts1.tables.nodes[491] +``` +``` +Output: +NodeTableRow(flags=0, time=-1.0, population=-1, individual=-1, metadata={'alphaSimR': {'id': '3_1'}}) +``` +Founder nodes look like: +``` +ts1.tables.nodes[0] +``` +``` +Output: +NodeTableRow(flags=1, time=0.0, population=0, individual=0, metadata={'alphaSimR': {'id': '1_1'}}) +``` diff --git a/dev/reservoir_sampling.Rmd b/dev/reservoir_sampling.Rmd new file mode 100644 index 00000000..a0376b95 --- /dev/null +++ b/dev/reservoir_sampling.Rmd @@ -0,0 +1,51 @@ +#segSites = n +#qualifiedSites = N + +Algorithm: if we make segSites as a list with size $n$, for the first $n$ qualifiedSites, along the chromosome, we just put it in the list; for the others (e.g. index $i$), we generate a random number from 0 to $i$, if $i <= n$, we put it in the list, or we discard it. + +1. Probability of a site $i$ entry the list +$$ +P(entry) = +\begin{cases} +1, & i <= n, \\ +\frac{n}{i}, & i > n . +\end{cases} +$$ +2. Probability of a site $i$ in the list being replace at step $j$ $(in$ +$$ +P(not\ being\ replaced)=\prod_{j=i+1}^N \frac{j-1}{j}\\ += \frac{i}{i+1}*\frac{i+1}{i+2}*\frac{i+2}{i+3}...\frac{N-1}{N}\\ += \frac{i}{N} +$$ +3. Probability of a site $i$ in the list at the end $(j=N)$ +$$ +P(in\ the\ list)=P(entry) * P(not\ being\ replaced) +$$ +$$ +P(in\ the\ list) = +\begin{cases} +1*\frac{n}{N}, & i <= n, \\ +\frac{n}{i}*\frac{i}{N}, & i > n . +\end{cases}\\ +=\begin{cases} +\frac{n}{N}, & i <= n, \\ +\frac{n}{N}, & i > n . +\end{cases} +$$ +So, all the qualifiedSites along the chromosome have the same probability being sampled. diff --git a/dev/reservoir_sampling.html b/dev/reservoir_sampling.html new file mode 100644 index 00000000..a98e4da8 --- /dev/null +++ b/dev/reservoir_sampling.html @@ -0,0 +1,452 @@ + + + + + + + + + + + + + +reservoir_sampling.knit + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + +

#segSites = n #qualifiedSites = N

+

Algorithm: if we make segSites as a list with size \(n\), for the first \(n\) qualifiedSites, along the chromosome, +we just put it in the list; for the others (e.g. index \(i\)), we generate a random number from 0 to +\(i\), if \(i +<= n\), we put it in the list, or we discard it.

+
    +
  1. Probability of a site \(i\) entry +the list \[ +P(entry) = +\begin{cases} +1, & i <= n, \\ +\frac{n}{i}, & i > n . +\end{cases} +\]
  2. +
  3. Probability of a site \(i\) in the +list being replace at step \(j\) \((i<j<=N)\) \[ +P(being\ replaced)=P(j\ entry)*P(sample\ i\ from\ list) += \frac{n}{j}*\frac{1}{n}=\frac{1}{j} +\]
  4. +
  5. Probability of a site \(i\) in the +list NOT being replace at step \(j\) +\((i<j<=N)\) \[ +P(not\ being\ replaced)=1-\frac{1}{j}=\frac{j-1}{j} +\]
  6. +
  7. Probability of a site \(i\) in the +list NOT being replace at the end \((j=N)\) For \(i<=n\) \[ +P(not\ being\ replaced)=\prod_{j=n+1}^N \frac{j-1}{j}\\ += \frac{n}{n+1}*\frac{n+1}{n+2}*\frac{n+2}{n+3}...\frac{N-1}{N}\\ += \frac{n}{N} +\] For \(i>n\) \[ +P(not\ being\ replaced)=\prod_{j=i+1}^N \frac{j-1}{j}\\ += \frac{i}{i+1}*\frac{i+1}{i+2}*\frac{i+2}{i+3}...\frac{N-1}{N}\\ += \frac{i}{N} +\]
  8. +
  9. Probability of a site \(i\) in the +list at the end \((j=N)\) \[ +P(in\ the\ list)=P(entry) * P(not\ being\ replaced) +\] \[ +P(in\ the\ list) = +\begin{cases} +1*\frac{n}{N}, & i <= n, \\ +\frac{n}{i}*\frac{i}{N}, & i > n . +\end{cases}\\ +=\begin{cases} +\frac{n}{N}, & i <= n, \\ +\frac{n}{N}, & i > n . +\end{cases} +\] So, all the qualifiedSites along the chromosome have the same +probability being sampled.
  10. +
+ + + + +
+ + + + + + + + + + + + + + + diff --git a/dev/smokeStep1.R b/dev/smokeStep1.R new file mode 100644 index 00000000..f9233b55 --- /dev/null +++ b/dev/smokeStep1.R @@ -0,0 +1,22 @@ +## Step 1 smoke test: read site positions via C API + +library(AlphaSimRTmp) + +tsPath <- "/Users/jliang2/Projects/test_TSK2ASR/data/simulations/normal/msprime_chr1.trees" + +pos <- tsSitesPosition(tsPath) + +cat("numSites =", length(pos), "\n") +cat("firstPositions =", paste(head(pos, 10), collapse = ", "), "\n") +cat("lastPositions =", paste(tail(pos, 3), collapse = ", "), "\n") + +library(reticulate) +use_virtualenv("~/r-reticulate-env", required = TRUE) +tskit <- import("tskit") +ts <- tskit$load(tsPath) +posPy <- ts$tables$sites$position +posPy <- as.numeric(py_to_r(ts$tables$sites$position)) + +cat("numSites =", length(posPy), "\n") +cat("firstPositions =", paste(head(posPy, 10), collapse = ", "), "\n") +cat("lastPositions =", paste(tail(posPy, 3), collapse = ", "), "\n") diff --git a/dev/smokeStep2.R b/dev/smokeStep2.R new file mode 100644 index 00000000..02ad0889 --- /dev/null +++ b/dev/smokeStep2.R @@ -0,0 +1,158 @@ +library(AlphaSimR) +use_virtualenv("~/r-reticulate-env", required = TRUE) +tskit <- import("tskit") +devtools::load_all() + +# two chromosomes +L1 <- 1e6 +L2 <- 2e6 +chr_info <- list( + list(ts_path="/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + #breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path="/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/msprime_chr1.trees", + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) +) + +founderGenomes1 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) + +set.seed(42) +SP = SimParam$new(founderGenomes1) +SP$setSexes("yes_sys") +SP$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + +SP$setTrackPed(TRUE) +# try the new function here, it automatically set setTrackRec also. +SP$setTrackRecGen(TRUE) +SP$recHistGen +basePop = newPop(founderGenomes1) +# the 2 objects are same now: +SP$recHistGen +SP$recHist +basePop = setPheno(basePop, + h2 = 0.5) + +#--- n generations +nCycles<-2 + +# very simple container for each cycles sim output +simOutput<-list(basePop) +cycle<-1 +for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput[[cycle]], nInd=6, use = "gv") + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5) + # add new offspring to simOutput list + simOutput[[cycle+1]]<-offspringPop +} + +# see the difference between recHist and recHistGen +RHG <- SP$recHistGen +RH <- SP$recHist +# ind 3; chr 2; hap 1. Maybe not the same output, please check RHG and RH to find a hap with recombination +rh <- RH[[3]][[2]][[1]] +rhg <- RHG[[3]][[2]][[1]] +gm <- SP$genMap[[2]] +rh +rhg +gm[[111]] +gm[[112]] + + +# for RecHist +# bridgeSegDfList store the indexes of SNPs after recombination events +bridgeCollectSegFromSimOutput(SP, simOutput) +# for RecHistGen +# bridgeSegDfListGen store the positions of where recombination happen +bridgeCollectSegGenFromSimOutput(SP, simOutput) + +# for RecHist +edgeDf <- bridgeAllSegToEdgeDf(chr_info) +bridgeWriteTrees(chr_info, edgeDf, SP) +# load the tree in Python... +#origin = tskit.load('/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/msprime_chr0.trees') +#marker_ts = tskit.load('/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/AlphaSimR_extended_chr0.trees') +# check the number of trees, nodes, and individual + +# for RecHistGen +bridgeWriteTrees(chr_info, do.call(rbind, bridgeSegDfListGen), SP) +# real_break_ts = tskit.load('/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/AlphaSimR_extended_chr0.trees') + + +L1 <- 1e6 +L2 <- 2e6 +chr_info <- list( + list(ts_path="/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/msprime_chr0.trees", + #breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), + breaks=c(0, L1/2, L1), rates=c(1e-5, 2e-5), segSites=60), + list(ts_path="/Users/jliang2/R_scripts/AlphaSimR_test/dev/testData/msprime_chr1.trees", + #breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) + breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-4, 1e-5, 1e-4), segSites=155) +) + +founderGenomes2 <- asMapPop(chr_info = chr_info, inbred=FALSE, ploidy=2L) +set.seed(42) +SP2 = SimParam$new(founderGenomes2) +SP2$setSexes("yes_sys") +SP2$addTraitA(nQtlPerChr = 5, + mean = 500, + var = 450) + +SP2$setTrackPed(TRUE) +# try the new function here, it automatically set setTrackRec also. +SP2$setTrackRecGen(TRUE) +basePop2 = newPop(founderGenomes2, simParam = SP2) +basePop2 = setPheno(basePop2, + h2 = 0.5, + simParam = SP2) + +#--- n generations +nCycles<-2 + +# very simple container for each cycles sim output +simOutput2<-list(basePop2) +cycle<-1 +for(cycle in 1:nCycles){ + cat(paste0(" C",cycle)) + # choose the best from last cycle + chosenParents<- selectInd(pop=simOutput2[[cycle]], nInd=6, use = "gv", simParam = SP2) + # make crosses + offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5, simParam = SP2) + # phenotype new offspring + offspringPop<-setPheno(pop = offspringPop, h2 = 0.5, simParam = SP2) + # add new offspring to simOutput list + simOutput2[[cycle+1]]<-offspringPop +} + +RHG <- SP2$recHistGen +RH <- SP2$recHist +gm <- SP2$genMap[[1]] + +rh <- RH[[3]][[1]][[1]] +rhg <- RHG[[3]][[1]][[1]] +rh +rhg +x <- rhg[,2] + +left <- findInterval(x, gm) +right <- pmin(left + 1, length(gm)) + +out <- data.frame( + x = x, + left_i = left, + left_v = gm[left], + right_i = right, + right_v = gm[right] +) + +out + + + diff --git a/dev/testData/AlphaSimR_extended_chr0.trees b/dev/testData/AlphaSimR_extended_chr0.trees new file mode 100644 index 00000000..5ef63a20 Binary files /dev/null and b/dev/testData/AlphaSimR_extended_chr0.trees differ diff --git a/dev/testData/AlphaSimR_extended_chr1.trees b/dev/testData/AlphaSimR_extended_chr1.trees new file mode 100644 index 00000000..c7a57384 Binary files /dev/null and b/dev/testData/AlphaSimR_extended_chr1.trees differ diff --git a/dev/testData/msprime_chr0.trees b/dev/testData/msprime_chr0.trees new file mode 100644 index 00000000..6942f0fd Binary files /dev/null and b/dev/testData/msprime_chr0.trees differ diff --git a/dev/testData/msprime_chr1.trees b/dev/testData/msprime_chr1.trees new file mode 100644 index 00000000..b5477e21 Binary files /dev/null and b/dev/testData/msprime_chr1.trees differ diff --git a/dev/testSampling.R b/dev/testSampling.R new file mode 100644 index 00000000..46dde852 --- /dev/null +++ b/dev/testSampling.R @@ -0,0 +1,71 @@ +library(ggplot2) + +L1 <- 1e6 +chr_info <- list( + list(ts_path="dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8))) +founderGenomes <- asMapPop( + chr_info = chr_info, + inbred = FALSE, + ploidy = 2L + ) +all_pos <- chrKeptPosBpList +all_pos <- unlist(all_pos) +breaks <- seq(min(all_pos), max(all_pos), length.out = 11) +bg_counts <- cut(all_pos, breaks = breaks, include.lowest = TRUE) %>% + table() %>% + as.numeric() + +n_iterations <- 50 +n_bins <- 10 +segSites <- 77 + +chr_info <- list( + list(ts_path="dev/testData/msprime_chr0.trees", + breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=segSites)) + +all_bp_positions <- list() +all_gen_map_positions <- list() + +#set.seed(42) +random_seeds <- sample(1:1000000, size = n_iterations) + +for (i in 1:n_iterations) { + current_seed <- random_seeds[i] + + founderGenomes <- asMapPop( + chr_info = chr_info, + inbred = FALSE, + ploidy = 2L, + site_sampling_seed = current_seed + ) + + bp_pos <- unlist(chrKeptPosBpList) + all_bp_positions[[i]] <- bp_pos + + gen_map_pos <- unlist(founderGenomes@genMap) + all_gen_map_positions[[i]] <- gen_map_pos +} + + +df_bp <- data.frame(pos = unlist(all_bp_positions)) +df_gen <- data.frame(pos = unlist(all_gen_map_positions)) + +sampled_counts <- cut(df_bp$pos, breaks = breaks, include.lowest = TRUE) %>% + table() %>% + as.numeric() + +plot_data <- data.frame( + bin_mid = (breaks[-1] + breaks[-length(breaks)]) / 2, + sampling_ratio = (sampled_counts) / bg_counts +) + +p1 <- ggplot(plot_data, aes(x = bin_mid/1000, y = sampling_ratio)) + + geom_bar(stat = "identity", fill = "skyblue", color = "white") + + geom_hline(yintercept = mean(plot_data$sampling_ratio, na.rm = TRUE), + linetype = "dashed", color = "red") + + labs(title = paste(segSites, "segSites (", length(all_pos), "in total) over", n_iterations, "runs"), + x = "Position (kbp)", + y = "Frequency (Count) / Background") + + theme_minimal() +print(p1) diff --git a/dev/test_Rcpptskit_out.Rmd b/dev/test_Rcpptskit_out.Rmd new file mode 100644 index 00000000..ecc33dca --- /dev/null +++ b/dev/test_Rcpptskit_out.Rmd @@ -0,0 +1,125 @@ +--- +title: "test_Rcpptskit_out" +output: html_document +date: "2026-04-01" +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +## R Markdown + +This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see . + +When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this: + +```{r cars} +summary(cars) +``` + +## Including Plots + +You can also embed plots, for example: + +```{r pressure, echo=FALSE} +plot(pressure) +``` + +Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot. + +``` +> library(reticulate) +> use_virtualenv("~/r-reticulate-env", required = TRUE) +> tskit <- import("tskit") +> devtools::load_all() +ℹ Loading AlphaSimR +> L1 <- 1e6 +> L2 <- 2e6 +> # here, use the same recombination map as used in msprime +> chr_info <- list( ++ list(ts_path=".dev/testData/msprime_chr0.trees", ++ breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), ++ list(ts_path="dev/testData/msprime_chr1.trees", ++ breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) ++ ) +> founderGenomes1 <- asMapPopPy(chr_info = chr_info, inbred=FALSE, ploidy=2L) +Error in py_call_impl(callable, call_args$unnamed, call_args$named) : + FileNotFoundError: [Errno 2] No such file or directory: '.dev/testData/msprime_chr0.trees' +Run `reticulate::py_last_error()` for details. +Called from: py_call_impl(callable, call_args$unnamed, call_args$named) + +Browse[1]> +> L1 <- 1e6 +> L2 <- 2e6 +> # here, use the same recombination map as used in msprime +> chr_info <- list( ++ list(ts_path="dev/testData/msprime_chr0.trees", ++ breaks=c(0, L1/2, L1), rates=c(1e-8, 2e-8), segSites=60), ++ list(ts_path="dev/testData/msprime_chr1.trees", ++ breaks=c(0, L2/3, 2*L2/3, L2), rates=c(1e-7, 1e-8, 1e-7), segSites=155) ++ ) +> founderGenomes1 <- asMapPopPy(chr_info = chr_info, inbred=FALSE, ploidy=2L) +60 variants sampled (Random seed: 42) +60 variants sampled (Random seed: 42) +155 variants sampled (Random seed: 42) +155 variants sampled (Random seed: 42) +> set.seed(42) +> SP = SimParam$new(founderGenomes1) +> SP$setSexes("yes_sys") +> SP$addTraitA(nQtlPerChr = 5, ++ mean = 500, ++ var = 450) +> +> SP$setTrackPed(TRUE) +> # try the new function here, it automatically set setTrackRec also. +> SP$setTrackRecGen(TRUE) +> basePop = newPop(founderGenomes1) +> basePop = setPheno(basePop, ++ h2 = 0.5) +> +> #--- n generations +> nCycles<-2 +> +> # very simple container for each cycles sim output +> simOutput<-list(basePop) +> cycle<-1 +> for(cycle in 1:nCycles){ ++ cat(paste0(" C",cycle)) ++ # choose the best from last cycle ++ chosenParents<- selectInd(pop=simOutput[[cycle]], nInd=6, use = "gv") ++ # make crosses ++ offspringPop<-randCross(pop=chosenParents, nCrosses=2, nProgeny = 5) ++ # phenotype new offspring ++ offspringPop<-setPheno(pop = offspringPop, h2 = 0.5) ++ # add new offspring to simOutput list ++ simOutput[[cycle+1]]<-offspringPop ++ } + C1 C2 +Warning message: +In selectInd(pop = simOutput[[cycle]], nInd = 6, use = "gv") : + Suitable candidates smaller than nInd, returning 2 individuals + +> rm(tskit) +> tskit +Error: object 'tskit' not found + +> library(RcppTskit) +> bridgeCollectSegGenFromSimOutput(SP, simOutput) +> bridgeWriteTrees(chr_info, do.call(rbind, bridgeSegDfListGen), SP) +Wrote: dev/testData/AlphaSimR_extended_chr0.trees +Wrote: dev/testData/AlphaSimR_extended_chr1.trees +> source("dev/alphaSimR2TsPy.R") +> source("dev/alphaSimR2TsGenPy.R") +> bridgeCollectSegGenFromSimOutputPy(SP, simOutput) +> rm(bridgeSegDfListGen) +> rm(indIdMapByChr) +> rm(nodeIdMapByChr) +> rm(list = lsf.str()) +> source("dev/alphaSimR2TsPy.R") +> source("dev/alphaSimR2TsGenPy.R") +> bridgeCollectSegGenFromSimOutputPy(SP, simOutput) +> bridgeWriteTreesPy(chr_info, do.call(rbind, bridgeSegDfListGen), SP) +Wrote: dev/testData/AlphaSimR_extended_chr0.trees +Wrote: dev/testData/AlphaSimR_extended_chr1.trees +``` diff --git a/man/SimParam.Rd b/man/SimParam.Rd index c06b5b74..f63aa94e 100644 --- a/man/SimParam.Rd +++ b/man/SimParam.Rd @@ -403,6 +403,10 @@ genetic map} \item{\code{recHist}}{list of historic recombination events} +\item{\code{isTrackRecGen}}{is recombination being tracked. Jinyang added.} + +\item{\code{recHistGen}}{list of historic recombination events. Jinyang added.} + \item{\code{haplotypes}}{list of computed IBD haplotypes} \item{\code{varA}}{additive genetic variance in founderPop} @@ -426,6 +430,7 @@ relative to all active QTL} \item \href{#method-SimParam-initialize}{\code{SimParam$new()}} \item \href{#method-SimParam-setTrackPed}{\code{SimParam$setTrackPed()}} \item \href{#method-SimParam-setTrackRec}{\code{SimParam$setTrackRec()}} + \item \href{#method-SimParam-setTrackRecGen}{\code{SimParam$setTrackRecGen()}} \item \href{#method-SimParam-resetPed}{\code{SimParam$resetPed()}} \item \href{#method-SimParam-restrSegSites}{\code{SimParam$restrSegSites()}} \item \href{#method-SimParam-setSexes}{\code{SimParam$setSexes()}} @@ -569,7 +574,26 @@ SP$setTrackRec(TRUE) \if{html}{\out{}} } } +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SimParam-setTrackRecGen}{}}} +\subsection{Method \code{setTrackRecGen()}}{ +Sets genetic-coordinate recombination tracking for the simulation. Jinyang added. +By default this is turned off. When turned on, it will also turn on pedigree tracking. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SimParam$setTrackRecGen(isTrackRecGen, force = FALSE)}\if{html}{\out{
}} +} +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{isTrackRecGen}}{should genetic-coordinate recombination tracking be on.} + +\item{\code{force}}{should the check for a running simulation be ignored.} +} +\if{html}{\out{
}} +} +} \if{html}{\out{
}} \if{html}{\out{}} \if{latex}{\out{\hypertarget{method-SimParam-resetPed}{}}} @@ -1807,7 +1831,7 @@ be metacentric.} For internal use only. \subsection{Usage}{ \if{html}{\out{
}} - \preformatted{SimParam$addToRec(lastId, id, mother, father, isDH, hist, ploidy)} + \preformatted{SimParam$addToRec(lastId, id, mother, father, isDH, hist, histGen = NULL, ploidy)} \if{html}{\out{
}} } \subsection{Arguments}{ @@ -1819,6 +1843,7 @@ be metacentric.} \item{\code{father}}{vector of father iids} \item{\code{isDH}}{indicator for DH lines} \item{\code{hist}}{new recombination history} + \item{\code{histGen}}{new recombination history (genetic coordinate)} \item{\code{ploidy}}{ploidy level} } \if{html}{\out{}} diff --git a/man/figures/addInd.png b/man/figures/addInd.png new file mode 100644 index 00000000..57ba0b11 Binary files /dev/null and b/man/figures/addInd.png differ diff --git a/man/figures/originInd.png b/man/figures/originInd.png new file mode 100644 index 00000000..be6c1489 Binary files /dev/null and b/man/figures/originInd.png differ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 436411a2..56218fcb 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -661,8 +661,8 @@ BEGIN_RCPP END_RCPP } // cross -Rcpp::List cross(const arma::field >& motherGeno, arma::uvec mother, const arma::field >& fatherGeno, arma::uvec father, const arma::field& femaleMap, const arma::field& maleMap, bool trackRec, arma::uword motherPloidy, arma::uword fatherPloidy, double v, double p, const arma::vec& motherCentromere, const arma::vec& fatherCentromere, double quadProb, int nThreads); -RcppExport SEXP _AlphaSimR_cross(SEXP motherGenoSEXP, SEXP motherSEXP, SEXP fatherGenoSEXP, SEXP fatherSEXP, SEXP femaleMapSEXP, SEXP maleMapSEXP, SEXP trackRecSEXP, SEXP motherPloidySEXP, SEXP fatherPloidySEXP, SEXP vSEXP, SEXP pSEXP, SEXP motherCentromereSEXP, SEXP fatherCentromereSEXP, SEXP quadProbSEXP, SEXP nThreadsSEXP) { +Rcpp::List cross(const arma::field >& motherGeno, arma::uvec mother, const arma::field >& fatherGeno, arma::uvec father, const arma::field& femaleMap, const arma::field& maleMap, bool trackRec, arma::uword motherPloidy, arma::uword fatherPloidy, double v, double p, const arma::vec& motherCentromere, const arma::vec& fatherCentromere, double quadProb, int nThreads, /* modified by Jinyang */ bool trackRecGen); +RcppExport SEXP _AlphaSimR_cross(SEXP motherGenoSEXP, SEXP motherSEXP, SEXP fatherGenoSEXP, SEXP fatherSEXP, SEXP femaleMapSEXP, SEXP maleMapSEXP, SEXP trackRecSEXP, SEXP motherPloidySEXP, SEXP fatherPloidySEXP, SEXP vSEXP, SEXP pSEXP, SEXP motherCentromereSEXP, SEXP fatherCentromereSEXP, SEXP quadProbSEXP, SEXP nThreadsSEXP, SEXP trackRecGenSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; @@ -681,7 +681,8 @@ BEGIN_RCPP Rcpp::traits::input_parameter< const arma::vec& >::type fatherCentromere(fatherCentromereSEXP); Rcpp::traits::input_parameter< double >::type quadProb(quadProbSEXP); Rcpp::traits::input_parameter< int >::type nThreads(nThreadsSEXP); - rcpp_result_gen = Rcpp::wrap(cross(motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads)); + Rcpp::traits::input_parameter< /* modified by Jinyang */ bool >::type trackRecGen(trackRecGenSEXP); + rcpp_result_gen = Rcpp::wrap(cross(motherGeno, mother, fatherGeno, father, femaleMap, maleMap, trackRec, motherPloidy, fatherPloidy, v, p, motherCentromere, fatherCentromere, quadProb, nThreads, trackRecGen)); return rcpp_result_gen; END_RCPP } @@ -950,7 +951,7 @@ static const R_CallMethodDef CallEntries[] = { {"_AlphaSimR_getNonFounderIbd", (DL_FUNC) &_AlphaSimR_getNonFounderIbd, 3}, {"_AlphaSimR_getFounderIbd", (DL_FUNC) &_AlphaSimR_getFounderIbd, 2}, {"_AlphaSimR_createIbdMat", (DL_FUNC) &_AlphaSimR_createIbdMat, 5}, - {"_AlphaSimR_cross", (DL_FUNC) &_AlphaSimR_cross, 15}, + {"_AlphaSimR_cross", (DL_FUNC) &_AlphaSimR_cross, 16}, {"_AlphaSimR_createDH2", (DL_FUNC) &_AlphaSimR_createDH2, 7}, {"_AlphaSimR_createReducedGenome", (DL_FUNC) &_AlphaSimR_createReducedGenome, 10}, {"_AlphaSimR_popVarCpp", (DL_FUNC) &_AlphaSimR_popVarCpp, 1}, diff --git a/src/meiosis.cpp b/src/meiosis.cpp index a7f9ee30..7a36b305 100644 --- a/src/meiosis.cpp +++ b/src/meiosis.cpp @@ -72,6 +72,53 @@ arma::Mat RecHist::getHist(arma::uword ind, return hist(ind)(chr)(par); } +// Like RecHist, but store double positions (e.g., genetic coordinate) +class RecHistDbl{ +public: + arma::field< //individual + arma::field< //chromosome + arma::field< //ploidy + arma::Mat > > > hist; //(chr, posGen) + + void setSize(arma::uword nInd, + arma::uword nChr, + arma::uword ploidy); + + void addHist(arma::Mat& input, + arma::uword nInd, + arma::uword chrGroup, + arma::uword chrInd); + + arma::Mat getHist(arma::uword ind, + arma::uword chr, + arma::uword par); +}; + +void RecHistDbl::setSize(arma::uword nInd, + arma::uword nChr, + arma::uword ploidy=2){ + hist.set_size(nInd); + for(arma::uword i=0; i& input, + arma::uword nInd, + arma::uword chrGroup, + arma::uword chrInd){ + hist(nInd)(chrGroup)(chrInd) = input; +} + +arma::Mat RecHistDbl::getHist(arma::uword ind, + arma::uword chr, + arma::uword par){ + return hist(ind)(chr)(par); +} + // Samples the locations for chiasmata via a gamma process // end, the length of the interval used to sample // v, the interference parameter @@ -396,6 +443,55 @@ arma::Mat findBivalentCO(const arma::vec& genMap, double v, double p, return removeDoubleCO(output); } +// Continuous (genetic-coordinate) recombination history for a bivalent pair +// Output columns: (originChr, startPosGen) +// Row 0 is always (1, 0.0) +arma::Mat findBivalentCO_gen(const arma::vec& genMap, double v, double p, + alphasimrRng::rngEngine& rng){ + arma::uword readChr = 0; + double genLen = genMap(genMap.n_elem-1); + + // 1) Sample crossover positions on the genetic map + arma::vec posCO = sampleChiasmata(genLen, v, p, rng); + + // If no chiasmata were sampled: return a single record (all from chr1) + if(posCO.n_elem==0){ + arma::Mat output(1,2); + output(0,0) = 1.0; // origin chr1 + output(0,1) = 0.0; // start at beginning of chromosome (continuous) + return output; + } + + // 2)Thin crossovers + arma::vec thin = alphasimrRng::runifVec(posCO.n_elem, rng); + posCO = posCO(find(thin>0.5)); + + arma::uword nCO = posCO.n_elem; + + arma::Mat output(nCO+1, 2); + + // 3) Allocate output + output(0,0) = 1.0; + output(0,1) = 0.0; + + if(nCO==0){ + return output; + } + + posCO = sort(posCO); + + // 4) Alternate origin each crossover + for(arma::uword i=0; i& chr1, } } +// Simulates a gamete using the existing discrete (bin-based) model for geno, +// AND also returns a continuous (genetic-coordinate) recombination history. +// +// - hist: int matrix (originChr, startSite/bin) used for transferGeno +// - histGen: double matrix (originChr, startPosGen) keeping all breakpoints +void bivalent2(const arma::Col& chr1, + const arma::Col& chr2, + const arma::vec& genMap, + double v, + double p, + arma::Col& output, + arma::Mat& hist, + arma::Mat& histGen, + alphasimrRng::rngEngine& rng){ + + arma::uword startPos = 0; + arma::uword endPos; + arma::uword readChr = 0; + double genLen = genMap(genMap.n_elem - 1); + + // 1) Sample crossover positions once (shared) + arma::vec posCO = sampleChiasmata(genLen, v, p, rng); + + // 2) Thin crossovers (same rule as original) + if(posCO.n_elem > 0){ + arma::vec thin = alphasimrRng::runifVec(posCO.n_elem, rng); + posCO = posCO(find(thin > 0.5)); + } + + // Ensure increasing order for intervalSearch and for histGen + if(posCO.n_elem > 1){ + posCO = sort(posCO); + } + + arma::uword nCO = posCO.n_elem; + + // 3) Build continuous history (keep all breakpoints) + // Row 0 always starts from chr 1 at position 0.0 + histGen.set_size(nCO + 1, 2); + histGen(0,0) = 1.0; + histGen(0,1) = 0.0; + + readChr = 0; + for(arma::uword i = 0; i < nCO; ++i){ + readChr = (readChr + 1) % 2; + histGen(i + 1, 0) = double(readChr + 1); + histGen(i + 1, 1) = posCO(i); + } + + // 4) Build discrete history for transferGeno (may be simplified) + // Match original convention: row0 is (1,1); later start sites use endPos+2 + arma::Mat histRaw(nCO + 1, 2); + histRaw(0,0) = 1; + histRaw(0,1) = 1; + + if(nCO == 0){ + // No crossovers: single record is enough + hist = histRaw; + output = chr1; + return; + } + + readChr = 0; + startPos = 0; + for(arma::uword i = 0; i < nCO; ++i){ + readChr = (readChr + 1) % 2; + double x = posCO(i); + endPos = intervalSearch(genMap, x, startPos); + histRaw(i + 1, 0) = int(readChr + 1); + histRaw(i + 1, 1) = int(endPos + 2); + startPos = endPos; + } + + // Remove unobservable/redundant records for the discrete geno-transfer path only + hist = removeDoubleCO(histRaw); + + // 5) Use the discrete history to transfer genotype bits (unchanged logic) + int nBins = chr1.n_elem; + + if(hist.n_rows == 1){ + output = chr1; + return; + } + + for(arma::uword i = 0; i < (hist.n_rows - 1); ++i){ + switch(hist(i,0)){ + case 1: + transferGeno(chr1, output, hist(i,1), hist(i+1,1)); + break; + case 2: + transferGeno(chr2, output, hist(i,1), hist(i+1,1)); + break; + } + } + + switch(hist(hist.n_rows - 1, 0)){ + case 1: + transferGeno(chr1, output, hist(hist.n_rows - 1, 1), nBins*8 + 1); + break; + case 2: + transferGeno(chr2, output, hist(hist.n_rows - 1, 1), nBins*8 + 1); + break; + } +} + + +// Simulates a gamete using the existing discrete (bin-based) model for geno, +// AND also returns a continuous (genetic-coordinate) recombination history. +// +// - hist: original int matrix (originChr, startSite/bin) used for transferGeno +// - histGen: new double matrix (originChr, startPosGen) with true breakpoints +void bivalent2Old(const arma::Col& chr1, + const arma::Col& chr2, + const arma::vec& genMap, + double v, + double p, + arma::Col& output, + arma::Mat& hist, + // histGen added + arma::Mat& histGen, + alphasimrRng::rngEngine& rng){ + + hist = findBivalentCO(genMap, v, p, rng); + + // histGen added + histGen = findBivalentCO_gen(genMap, v, p, rng); + + if(hist.n_rows==1){ + output = chr1; + }else{ + int nBins = chr1.n_elem; + + for(arma::uword i=0; i<(hist.n_rows-1); ++i){ + switch(hist(i,0)){ + case 1: + transferGeno(chr1, output, hist(i,1), hist(i+1,1)); + break; + case 2: + transferGeno(chr2, output, hist(i,1), hist(i+1,1)); + } + } + + switch(hist(hist.n_rows-1,0)){ + case 1: + transferGeno(chr1, output, hist(hist.n_rows-1,1), nBins*8+1); + break; + case 2: + transferGeno(chr2, output, hist(hist.n_rows-1,1), nBins*8+1); + } + } +} + // Simulates a gamete using a count-location model for recombination // rng is the explicit dqrng stream used for crossover sampling and thinning. void quadrivalent(const arma::Col& chr1, @@ -1099,7 +1347,9 @@ Rcpp::List cross( const arma::vec& motherCentromere, const arma::vec& fatherCentromere, double quadProb, - int nThreads){ + int nThreads, + /* modified by Jinyang */ + bool trackRecGen){ mother -= 1; // R to C++ father -= 1; // R to C++ arma::uword ploidy = (motherPloidy+fatherPloidy)/2; @@ -1111,6 +1361,13 @@ Rcpp::List cross( if(trackRec){ hist.setSize(nInd,nChr,ploidy); } + + // modified by Jinyang + RecHistDbl histGen; + if(trackRecGen){ + histGen.setSize(nInd, nChr, ploidy); + } + if(nChr < static_cast(nThreads) ){ nThreads = nChr; } @@ -1122,6 +1379,8 @@ Rcpp::List cross( for(arma::uword chr=0; chr hist1, hist2; + // modified by Jinyang + arma::Mat histG1, histG2; arma::uvec xm(motherPloidy); // Indicator for mother chromosomes for(arma::uword i=0; i2){ if(alphasimrRng::runif(rng)>quadProb){ //Bivalent 1 - bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x)), - motherGeno(chr).slice(mother(ind)).col(xm(x+1)), - femaleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(motherGeno(chr).slice(mother(ind)).col(xm(x)), + motherGeno(chr).slice(mother(ind)).col(xm(x+1)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else {// ----modified by Jinyang + bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x)), + motherGeno(chr).slice(mother(ind)).col(xm(x+1)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1158,17 +1430,40 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xm(x+1))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xm(x))+1)); + histG1.col(0).replace(200.0, double(int(xm(x+1))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } + // ----modified by Jinyang ++progenyChr; //Bivalent 2 - bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x+2)), - motherGeno(chr).slice(mother(ind)).col(xm(x+3)), - femaleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(motherGeno(chr).slice(mother(ind)).col(xm(x+2)), + motherGeno(chr).slice(mother(ind)).col(xm(x+3)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else { + // ----modified by Jinyang + bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x+2)), + motherGeno(chr).slice(mother(ind)).col(xm(x+3)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1176,6 +1471,15 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xm(x+3))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xm(x+2))+1)); + histG1.col(0).replace(200.0, double(int(xm(x+3))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } + // ----modified by Jinyang ++progenyChr; }else{ //Quadrivalent @@ -1208,14 +1512,29 @@ Rcpp::List cross( } }else{ //Bivalent - bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x)), - motherGeno(chr).slice(mother(ind)).col(xm(x+1)), - femaleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(motherGeno(chr).slice(mother(ind)).col(xm(x)), + motherGeno(chr).slice(mother(ind)).col(xm(x+1)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else { + // ----modified by Jinyang + bivalent(motherGeno(chr).slice(mother(ind)).col(xm(x)), + motherGeno(chr).slice(mother(ind)).col(xm(x+1)), + femaleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } + tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1223,6 +1542,15 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xm(x+1))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xm(x))+1)); + histG1.col(0).replace(200.0, double(int(xm(x+1))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } + // ----modified by Jinyang ++progenyChr; } } @@ -1233,14 +1561,28 @@ Rcpp::List cross( if((fatherPloidy-x)>2){ if(alphasimrRng::runif(rng)>quadProb){ //Bivalent 1 - bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x)), - fatherGeno(chr).slice(father(ind)).col(xf(x+1)), - maleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(fatherGeno(chr).slice(father(ind)).col(xf(x)), + fatherGeno(chr).slice(father(ind)).col(xf(x+1)), + maleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else { + // ----modified by Jinyang + bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x)), + fatherGeno(chr).slice(father(ind)).col(xf(x+1)), + maleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1248,17 +1590,39 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xf(x+1))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xf(x))+1)); + histG1.col(0).replace(200.0, double(int(xf(x+1))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } ++progenyChr; //Bivalent 2 - bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x+2)), - fatherGeno(chr).slice(father(ind)).col(xf(x+3)), - maleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(fatherGeno(chr).slice(father(ind)).col(xf(x+2)), + fatherGeno(chr).slice(father(ind)).col(xf(x+3)), + maleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else { + // ----modified by Jinyang + bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x+2)), + fatherGeno(chr).slice(father(ind)).col(xf(x+3)), + maleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1266,6 +1630,14 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xf(x+3))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xf(x+2))+1)); + histG1.col(0).replace(200.0, double(int(xf(x+3))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } ++progenyChr; }else{ //Quadrivalent @@ -1298,14 +1670,28 @@ Rcpp::List cross( } }else{ //Bivalent - bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x)), - fatherGeno(chr).slice(father(ind)).col(xf(x+1)), - maleMap(chr), - v, - p, - gamete1, - hist1, - rng); + // modified by Jinyang ---- + if(trackRecGen){ + bivalent2(fatherGeno(chr).slice(father(ind)).col(xf(x)), + fatherGeno(chr).slice(father(ind)).col(xf(x+1)), + maleMap(chr), + v, + p, + gamete1, + hist1, + histG1, + rng); + } else { + // ----modified by Jinyang + bivalent(fatherGeno(chr).slice(father(ind)).col(xf(x)), + fatherGeno(chr).slice(father(ind)).col(xf(x+1)), + maleMap(chr), + v, + p, + gamete1, + hist1, + rng); + } tmpGeno.slice(ind).col(progenyChr) = gamete1; if(trackRec){ hist1.col(0) *= 100; //To avoid conflicts @@ -1313,15 +1699,31 @@ Rcpp::List cross( hist1.col(0).replace(200,int(xf(x+1))+1); hist.addHist(hist1,ind,chr,progenyChr); } + // modified by Jinyang ---- + if(trackRecGen){ + // mirror the same origin remapping on histG1 (double) + histG1.col(0) *= 100.0; + histG1.col(0).replace(100.0, double(int(xf(x))+1)); + histG1.col(0).replace(200.0, double(int(xf(x+1))+1)); + histGen.addHist(histG1, ind, chr, progenyChr); + } ++progenyChr; } } } //End individual loop geno(chr) = tmpGeno; } //End chromosome loop + + // modified by Jinyang ---- if(trackRec){ - return Rcpp::List::create(Rcpp::Named("geno")=geno, - Rcpp::Named("recHist")=hist.hist); + if(trackRecGen){ + return Rcpp::List::create(Rcpp::Named("geno")=geno, + Rcpp::Named("recHist")=hist.hist, + Rcpp::Named("recHistGen")=histGen.hist); + } else { + return Rcpp::List::create(Rcpp::Named("geno")=geno, + Rcpp::Named("recHist")=hist.hist); + } } return Rcpp::List::create(Rcpp::Named("geno")=geno); }