Skip to content

Commit e509e24

Browse files
committedMar 26, 2025
fix: Edge Case Handling
For single bp reads, there is an ambiguity in the SAM format ``` read001 0 ref 1 255 1M * 0 0 A * ``` The read above could have a QV value of `*`, or no QV values at all. Since htslib [sets a sentinel value of 255](https://github.com/samtools/htslib/blob/b14fffb4b5689696723634a20b85219be92f9d06/sam.c#L3029-L3030) in this case, it can lead to overflow in downstream operations.
1 parent e8611de commit e509e24

File tree

1 file changed

+37
-1
lines changed

1 file changed

+37
-1
lines changed
 

‎src/lib.rs

+37-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use std::io::prelude::*;
1111
use std::io::BufReader;
1212
use std::os::raw::{c_char, c_int};
1313
use std::path::Path;
14+
use std::slice;
1415
use std::sync::Arc;
1516

1617
pub struct StarReference {
@@ -254,7 +255,20 @@ impl StarAligner {
254255

255256
Self::prepare_fastq(&mut self.fastq1, name, read, qual);
256257
align_read_rust(self.aligner, self.fastq1.as_slice(), &mut self.aln_buf).unwrap();
257-
self.parse_sam_to_records(name)
258+
let mut records = self.parse_sam_to_records(name);
259+
// hts_lib parses reads of length 1 with a "*" quality score the same as
260+
// no quality score and sets it to a flag value of 255, handle this edge case by
261+
// restoring the quality value. Note aligning reads of length 1 not a great idea...
262+
if qual.len() == 1 && qual[0] == 42 {
263+
records.iter_mut().for_each(|r| {
264+
// Kind of nasty way to mutate data that isn't exposed.
265+
let data: &mut [u8] =
266+
unsafe { slice::from_raw_parts_mut(r.inner.data, r.inner.l_data as usize) };
267+
data[r.inner.core.l_qname as usize + r.cigar_len() * 4 + (r.seq_len() + 1) / 2..]
268+
[0] = 42 - 33;
269+
});
270+
}
271+
records
258272
}
259273

260274
/// Aligns a given read and return the resulting SAM string
@@ -443,6 +457,7 @@ fn align_read_pair_rust(
443457
#[cfg(test)]
444458
mod test {
445459
use super::*;
460+
use rust_htslib::bam::record::Aux;
446461
use rust_htslib::bam::Read;
447462

448463
/// References to some commonly used reference genomes for testing purposes
@@ -479,7 +494,28 @@ mod test {
479494
let recs = aligner.align_read(b"a", b"", b"");
480495
println!("{:?}", recs);
481496
let recs = aligner.align_read(b"b", b"A", b"?");
497+
let expected_qual: &[u8] = &[63 - 33];
482498
println!("{:?}", recs);
499+
assert_eq!(recs[0].qual(), expected_qual, "Quality value incorrect");
500+
let recs = aligner.align_read(b"Hello there", b"C", b"*");
501+
let record = &recs[0];
502+
let expected_qual: &[u8] = &[42 - 33];
503+
assert_eq!(
504+
record.qual(),
505+
expected_qual,
506+
"Quality value was not corrected."
507+
);
508+
assert_eq!(record.qname(), b"Hello there");
509+
let aux_tags: Vec<(&[u8], Aux)> = record.aux_iter().map(|z| z.unwrap()).collect();
510+
let expected_aux_tags = ["NH", "HI", "AS", "nM", "uT"];
511+
println!("Aux Tags {:?}", aux_tags);
512+
for (i, tag) in aux_tags.into_iter().enumerate() {
513+
assert_eq!(
514+
String::from_utf8(tag.0.into()).unwrap(),
515+
expected_aux_tags[i]
516+
);
517+
println!("TAG {:?}", String::from_utf8(tag.0.into()));
518+
}
483519
let (recs1, recs2) = aligner.align_read_pair(b"a", b"", b"", b"", b"");
484520
println!("{:?}, {:?}", recs1, recs2);
485521
let (recs1, recs2) = aligner.align_read_pair(b"b", b"A", b"?", b"", b"");

0 commit comments

Comments
 (0)