From b57c049008390238241516231225b03867b1e2ad Mon Sep 17 00:00:00 2001 From: Psy-Fer Date: Mon, 1 Jun 2026 16:05:34 +1000 Subject: [PATCH] STAR-compatible optional tags on unmapped records --- src/align/read_align.rs | 53 ++++++++++++++++++- src/io/bam.rs | 23 +++++--- src/io/sam.rs | 113 +++++++++++++++++++++++++++++++++++++--- src/lib.rs | 9 ++-- 4 files changed, 181 insertions(+), 17 deletions(-) diff --git a/src/align/read_align.rs b/src/align/read_align.rs index 876ec91..89ea66b 100644 --- a/src/align/read_align.rs +++ b/src/align/read_align.rs @@ -634,8 +634,16 @@ pub fn align_read( } let unmapped_reason = if transcripts.is_empty() { - // Transcripts were generated by DP but all filtered out - Some(UnmappedReason::TooShort) + let has_mismatch = filter_reasons.contains_key("mismatch_max") + || filter_reasons.contains_key("mismatch_rate"); + let has_other = filter_reasons + .keys() + .any(|k| *k != "mismatch_max" && *k != "mismatch_rate"); + if has_mismatch && !has_other { + Some(UnmappedReason::TooManyMismatches) + } else { + Some(UnmappedReason::TooShort) + } } else { None }; @@ -1619,6 +1627,47 @@ mod tests { assert!(result.is_ok()); } + #[test] + fn test_unmapped_reason_mismatch_classification() { + // Verify the filter_reasons logic used to derive unmapped_reason. + // mismatch-only → TooManyMismatches; anything else (or mixed) → TooShort. + let classify = |reasons: &[&str]| -> UnmappedReason { + let filter_reasons: std::collections::HashMap<&str, i32> = + reasons.iter().map(|k| (*k, 1)).collect(); + let has_mismatch = filter_reasons.contains_key("mismatch_max") + || filter_reasons.contains_key("mismatch_rate"); + let has_other = filter_reasons + .keys() + .any(|k| *k != "mismatch_max" && *k != "mismatch_rate"); + if has_mismatch && !has_other { + UnmappedReason::TooManyMismatches + } else { + UnmappedReason::TooShort + } + }; + + assert_eq!( + classify(&["mismatch_max"]), + UnmappedReason::TooManyMismatches + ); + assert_eq!( + classify(&["mismatch_rate"]), + UnmappedReason::TooManyMismatches + ); + assert_eq!( + classify(&["mismatch_max", "mismatch_rate"]), + UnmappedReason::TooManyMismatches + ); + // Mixed with score filter → TooShort + assert_eq!( + classify(&["mismatch_max", "score_min"]), + UnmappedReason::TooShort + ); + // Score-only → TooShort + assert_eq!(classify(&["score_min"]), UnmappedReason::TooShort); + assert_eq!(classify(&["match_min"]), UnmappedReason::TooShort); + } + #[test] fn test_transcript_multimap_limit() { let index = make_test_index(); diff --git a/src/io/bam.rs b/src/io/bam.rs index e18b123..e9c9183 100644 --- a/src/io/bam.rs +++ b/src/io/bam.rs @@ -503,7 +503,11 @@ mod tests { // Build record using SAM builder let record = crate::io::sam::SamWriter::build_unmapped_record( - read_name, &read_seq, &read_qual, None, + read_name, + &read_seq, + &read_qual, + ¶ms, + crate::stats::UnmappedReason::Other, ) .unwrap(); @@ -620,14 +624,16 @@ mod tests { "read1", &[0, 1, 2, 3], &[30, 30, 30, 30], - None, + ¶ms, + crate::stats::UnmappedReason::Other, ) .unwrap(), crate::io::sam::SamWriter::build_unmapped_record( "read2", &[0, 1, 2, 3], &[30, 30, 30, 30], - None, + ¶ms, + crate::stats::UnmappedReason::Other, ) .unwrap(), ]; @@ -674,9 +680,14 @@ mod tests { let temp_file = NamedTempFile::new().unwrap(); let mut writer = SortedBamWriter::create(temp_file.path(), &genome, ¶ms).unwrap(); // Add a record to trigger the limit - let rec = - crate::io::sam::SamWriter::build_unmapped_record("r1", &[0, 1, 2, 3], &[30; 4], None) - .unwrap(); + let rec = crate::io::sam::SamWriter::build_unmapped_record( + "r1", + &[0, 1, 2, 3], + &[30; 4], + ¶ms, + crate::stats::UnmappedReason::Other, + ) + .unwrap(); writer.write_batch(&[rec]).unwrap(); let result = writer.finish(); assert!(result.is_err(), "Should fail when RAM limit is exceeded"); diff --git a/src/io/sam.rs b/src/io/sam.rs index 7fda237..29716ea 100644 --- a/src/io/sam.rs +++ b/src/io/sam.rs @@ -7,6 +7,7 @@ use crate::io::fastq::{complement_base, decode_base}; use crate::junction::encode_motif; use crate::mapq::calculate_mapq; use crate::params::{Parameters, SamAttributes}; +use crate::stats::UnmappedReason; use bstr::BString; use noodles::sam; use noodles::sam::alignment::io::Write; @@ -47,6 +48,33 @@ impl BufferedSamRecords { } } +/// Insert STAR-compatible optional tags on unmapped records. +/// +/// NH, HI, AS, nM are gated on `outSAMattributes`; `uT:A:` is always emitted +/// (STAR emits it unconditionally so QC tools can parse unmapping categories). +fn insert_unmapped_tags(record: &mut RecordBuf, attrs: SamAttributes, reason: UnmappedReason) { + let data = record.data_mut(); + if attrs.contains(SamAttributes::NH) { + data.insert(Tag::ALIGNMENT_HIT_COUNT, Value::from(0i32)); + } + if attrs.contains(SamAttributes::HI) { + data.insert(Tag::HIT_INDEX, Value::from(0i32)); + } + if attrs.contains(SamAttributes::AS) { + data.insert(Tag::ALIGNMENT_SCORE, Value::from(0i32)); + } + if attrs.contains(SamAttributes::NM) { + data.insert(Tag::new(b'n', b'M'), Value::from(0i32)); + } + let ut = match reason { + UnmappedReason::Other => b'0', + UnmappedReason::TooShort => b'1', + UnmappedReason::TooManyMismatches => b'2', + UnmappedReason::TooManyLoci => b'3', + }; + data.insert(Tag::new(b'u', b'T'), Value::Character(ut)); +} + /// SAM file writer pub struct SamWriter { writer: sam::io::Writer>, @@ -167,12 +195,14 @@ impl SamWriter { /// * `read_name` - Read identifier /// * `read_seq` - Read sequence (encoded) /// * `read_qual` - Quality scores - /// * `rg_id` - Optional read group ID to emit as `RG:Z:` tag + /// * `params` - Parameters (used for attribute gating and RG tag) + /// * `unmapped_reason` - Why the read was not mapped (drives `uT:A:` tag) pub fn build_unmapped_record( read_name: &str, read_seq: &[u8], read_qual: &[u8], - rg_id: Option<&str>, + params: &Parameters, + unmapped_reason: UnmappedReason, ) -> Result { let mut record = RecordBuf::default(); @@ -189,7 +219,9 @@ impl SamWriter { *record.quality_scores_mut() = QualityScores::from(fastq_qual_to_phred(read_qual)); - maybe_insert_rg_tag(&mut record, rg_id); + let rg_id_owned = params.primary_rg_id()?; + maybe_insert_rg_tag(&mut record, rg_id_owned.as_deref()); + insert_unmapped_tags(&mut record, params.out_sam_attributes, unmapped_reason); Ok(record) } @@ -276,7 +308,13 @@ impl SamWriter { if paired_alignments.is_empty() { // Both mates unmapped return Self::build_paired_unmapped_records( - read_name, mate1_seq, mate1_qual, mate2_seq, mate2_qual, params, + read_name, + mate1_seq, + mate1_qual, + mate2_seq, + mate2_qual, + params, + UnmappedReason::Other, ); } @@ -529,6 +567,7 @@ impl SamWriter { *unmapped_rec.quality_scores_mut() = QualityScores::from(fastq_qual_to_phred(unmapped_qual)); maybe_insert_rg_tag(&mut unmapped_rec, rg_id); + insert_unmapped_tags(&mut unmapped_rec, attrs, UnmappedReason::Other); // Order: mate1 first, mate2 second if mate1_is_mapped { @@ -668,10 +707,12 @@ impl SamWriter { mate2_seq: &[u8], mate2_qual: &[u8], params: &Parameters, + unmapped_reason: UnmappedReason, ) -> Result, Error> { let mut records = Vec::with_capacity(2); let rg_id_owned = params.primary_rg_id()?; let rg_id = rg_id_owned.as_deref(); + let attrs = params.out_sam_attributes; // Mate1 record let mut rec1 = RecordBuf::default(); @@ -688,6 +729,7 @@ impl SamWriter { *rec1.sequence_mut() = Sequence::from(seq1_bytes); *rec1.quality_scores_mut() = QualityScores::from(fastq_qual_to_phred(mate1_qual)); maybe_insert_rg_tag(&mut rec1, rg_id); + insert_unmapped_tags(&mut rec1, attrs, unmapped_reason); records.push(rec1); // Mate2 record @@ -705,6 +747,7 @@ impl SamWriter { *rec2.sequence_mut() = Sequence::from(seq2_bytes); *rec2.quality_scores_mut() = QualityScores::from(fastq_qual_to_phred(mate2_qual)); maybe_insert_rg_tag(&mut rec2, rg_id); + insert_unmapped_tags(&mut rec2, attrs, unmapped_reason); records.push(rec2); Ok(records) @@ -1568,7 +1611,15 @@ mod tests { // FASTQ ASCII 'I' = 73 → Phred 40 let read_qual = b"III"; let read_seq = vec![0u8, 1, 2]; // ACG - let record = SamWriter::build_unmapped_record("read1", &read_seq, read_qual, None).unwrap(); + let params = Parameters::parse_from(["rustar-aligner", "--readFilesIn", "t.fq"]); + let record = SamWriter::build_unmapped_record( + "read1", + &read_seq, + read_qual, + ¶ms, + UnmappedReason::Other, + ) + .unwrap(); let stored: &[u8] = record.quality_scores().as_ref(); assert_eq!(stored, &[40u8, 40, 40]); @@ -1581,12 +1632,61 @@ mod tests { // Byte 32 is below the Phred+33 floor; should clamp to 0. let read_qual = &[32u8, 33, 34][..]; let read_seq = vec![0u8, 1, 2]; - let record = SamWriter::build_unmapped_record("read1", &read_seq, read_qual, None).unwrap(); + let params = Parameters::parse_from(["rustar-aligner", "--readFilesIn", "t.fq"]); + let record = SamWriter::build_unmapped_record( + "read1", + &read_seq, + read_qual, + ¶ms, + UnmappedReason::Other, + ) + .unwrap(); let stored: &[u8] = record.quality_scores().as_ref(); assert_eq!(stored, &[0u8, 0, 1]); } + #[test] + fn test_unmapped_record_tags_emitted() { + let params = Parameters::parse_from(["rustar-aligner", "--readFilesIn", "t.fq"]); + let read_seq = vec![0u8, 1, 2, 3]; + let read_qual = b"IIII"; + + for (reason, expected_ut) in [ + (UnmappedReason::Other, b'0'), + (UnmappedReason::TooShort, b'1'), + (UnmappedReason::TooManyMismatches, b'2'), + (UnmappedReason::TooManyLoci, b'3'), + ] { + let record = + SamWriter::build_unmapped_record("r", &read_seq, read_qual, ¶ms, reason) + .unwrap(); + let data = record.data(); + // NH/HI/AS present (Standard attrs enabled by default) + assert_eq!( + data.get(&Tag::ALIGNMENT_HIT_COUNT), + Some(&Value::from(0i32)), + "NH:i:0 missing for {reason:?}" + ); + assert_eq!( + data.get(&Tag::HIT_INDEX), + Some(&Value::from(0i32)), + "HI:i:0 missing for {reason:?}" + ); + assert_eq!( + data.get(&Tag::ALIGNMENT_SCORE), + Some(&Value::from(0i32)), + "AS:i:0 missing for {reason:?}" + ); + // uT:A: always emitted with correct value + assert_eq!( + data.get(&Tag::new(b'u', b'T')), + Some(&Value::Character(expected_ut)), + "uT:A: wrong for {reason:?}" + ); + } + } + #[test] fn test_transcript_to_record() { use cigar::op::{Kind, Op}; @@ -1760,6 +1860,7 @@ mod tests { &mate2_seq, &mate2_qual, ¶ms, + UnmappedReason::Other, ) .unwrap(); diff --git a/src/lib.rs b/src/lib.rs index 008594b..3748423 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -907,7 +907,6 @@ fn align_reads_single_end( let output_unmapped = params.out_sam_unmapped != params::OutSamUnmapped::None; let write_unmapped_fastq = params.out_reads_unmapped == params::OutReadsUnmapped::Fastx; let by_sjout = params.out_filter_type == OutFilterType::BySJout; - let rg_id_owned = params.primary_rg_id()?; // BySJout disk buffer: SAM records written to a temp file; only compact metadata kept in RAM. // For 100M reads this avoids ~60 GB of Vec in memory. @@ -982,7 +981,8 @@ fn align_reads_single_end( &read.name, &clipped_seq, &clipped_qual, - rg_id_owned.as_deref(), + params, + crate::stats::UnmappedReason::Other, )?; buffer.push(record); } @@ -1059,7 +1059,8 @@ fn align_reads_single_end( &read.name, &clipped_seq, &clipped_qual, - rg_id_owned.as_deref(), + params, + unmapped_reason.unwrap_or(crate::stats::UnmappedReason::Other), )?; buffer.push(record); } @@ -1422,6 +1423,7 @@ fn align_reads_paired_end( &m2_seq, &m2_qual, params, + crate::stats::UnmappedReason::Other, )?; for record in records { buffer.push(record); @@ -1578,6 +1580,7 @@ fn align_reads_paired_end( &m2_seq, &m2_qual, params, + unmapped_reason.unwrap_or(crate::stats::UnmappedReason::Other), )?; for record in records { buffer.push(record);