From 5c2030e0f63192b1b981cb63c89997110df0c08b Mon Sep 17 00:00:00 2001 From: tfenne Date: Sat, 18 Feb 2023 05:38:22 -0700 Subject: [PATCH 1/2] Attempt a smarter offset detection when "guides" are short and might be prone to multiple matches per read. --- src/commands/count.rs | 55 ++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/src/commands/count.rs b/src/commands/count.rs index 1af2f79..a34a60c 100644 --- a/src/commands/count.rs +++ b/src/commands/count.rs @@ -243,35 +243,56 @@ impl Count { min_fraction: f64, ) -> Result { let guide_length = library.guide_length; - let mut prefix_lengths = vec![0u64; 500]; + let mut prefix_lengths = vec![0f64; 500]; let mut count = 0u64; // Parse the first `sample_size` records to find exact match guides and // extract the sequence that precedes the guide parse_path(Some(fastq), |parser| { - parser - .each(|rec| { - let read_bases = rec.seq(); - let read_length = read_bases.len(); + let mut read_offsets = Vec::::with_capacity(5); - if read_length >= guide_length { - for trim in 0..=(read_length - guide_length) { - let bases = &read_bases[trim..trim + guide_length]; + parser.each(|rec| { + let read_bases = rec.seq(); + let read_length = read_bases.len(); - if lookup.contains_key(bases) { - prefix_lengths[trim] += 1; - } + if read_length >= guide_length { + for trim in 0..=(read_length - guide_length) { + let bases = &read_bases[trim..trim + guide_length]; + + if lookup.contains_key(bases) { + read_offsets.push(trim); } } + } + + // If the read only matched at a single offset, just use it; otherwise prefer + // match(es) that are perfect matches and allocation proportionally. + if read_offsets.len() == 1 { + prefix_lengths[read_offsets[0]] += 1.0; + } + else if read_offsets.len() > 1 { + let perfect_match_offsets = read_offsets.iter().copied().filter(|&off| { + let bases = &read_bases[off..off + guide_length]; + let guide = lookup.get(bases).unwrap(); + guide.bases.as_slice() == bases + }).collect_vec(); + + let preferred_offsets = if perfect_match_offsets.is_empty() { &read_offsets } else { &perfect_match_offsets }; + let addend = 1f64 / preferred_offsets.len() as f64; + for offset in preferred_offsets.iter().copied() { + prefix_lengths[offset] += addend; + } + } - count += 1; - count < sample_size - }) - .expect("Failed to parse."); + read_offsets.clear(); + count += 1; + count < sample_size + }) + .expect("Failed to parse."); }) .context(format!("Failed to read {:?}", fastq))?; - let total_matched: u64 = prefix_lengths.iter().sum(); + let total_matched: f64 = prefix_lengths.iter().sum(); let fraction_matched = total_matched as f64 / count as f64; info!( "In {:?} examined {} reads for guide start position and matched {} ({:.4}).", @@ -280,7 +301,7 @@ impl Count { // Tuple of offset -> count where count is > 0 let non_zeros = - prefix_lengths.iter().copied().enumerate().filter(|(_idx, n)| *n > 0).collect_vec(); + prefix_lengths.iter().copied().enumerate().filter(|(_idx, n)| *n > 0.0).collect_vec(); info!( "{} read offsets: {}", From 0d4bb8821e73330d8a57d3423bab55816f7b87f5 Mon Sep 17 00:00:00 2001 From: tfenne Date: Sat, 18 Feb 2023 05:56:59 -0700 Subject: [PATCH 2/2] Fixing lints. --- src/commands/count.rs | 75 ++++++++++++++++++++++++------------------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/src/commands/count.rs b/src/commands/count.rs index a34a60c..c966bb8 100644 --- a/src/commands/count.rs +++ b/src/commands/count.rs @@ -251,49 +251,58 @@ impl Count { parse_path(Some(fastq), |parser| { let mut read_offsets = Vec::::with_capacity(5); - parser.each(|rec| { - let read_bases = rec.seq(); - let read_length = read_bases.len(); + parser + .each(|rec| { + let read_bases = rec.seq(); + let read_length = read_bases.len(); - if read_length >= guide_length { - for trim in 0..=(read_length - guide_length) { - let bases = &read_bases[trim..trim + guide_length]; + if read_length >= guide_length { + for trim in 0..=(read_length - guide_length) { + let bases = &read_bases[trim..trim + guide_length]; - if lookup.contains_key(bases) { - read_offsets.push(trim); + if lookup.contains_key(bases) { + read_offsets.push(trim); + } } } - } - // If the read only matched at a single offset, just use it; otherwise prefer - // match(es) that are perfect matches and allocation proportionally. - if read_offsets.len() == 1 { - prefix_lengths[read_offsets[0]] += 1.0; - } - else if read_offsets.len() > 1 { - let perfect_match_offsets = read_offsets.iter().copied().filter(|&off| { - let bases = &read_bases[off..off + guide_length]; - let guide = lookup.get(bases).unwrap(); - guide.bases.as_slice() == bases - }).collect_vec(); - - let preferred_offsets = if perfect_match_offsets.is_empty() { &read_offsets } else { &perfect_match_offsets }; - let addend = 1f64 / preferred_offsets.len() as f64; - for offset in preferred_offsets.iter().copied() { - prefix_lengths[offset] += addend; + // If the read only matched at a single offset, just use it; otherwise prefer + // match(es) that are perfect matches and allocation proportionally. + #[allow(clippy::comparison_chain)] + if read_offsets.len() == 1 { + prefix_lengths[read_offsets[0]] += 1.0; + } else if read_offsets.len() > 1 { + let perfect_match_offsets = read_offsets + .iter() + .copied() + .filter(|&off| { + let bases = &read_bases[off..off + guide_length]; + let guide = lookup.get(bases).unwrap(); + guide.bases.as_slice() == bases + }) + .collect_vec(); + + let preferred_offsets = if perfect_match_offsets.is_empty() { + &read_offsets + } else { + &perfect_match_offsets + }; + let addend = 1.0 / preferred_offsets.len() as f64; + for offset in preferred_offsets.iter().copied() { + prefix_lengths[offset] += addend; + } } - } - read_offsets.clear(); - count += 1; - count < sample_size - }) - .expect("Failed to parse."); + read_offsets.clear(); + count += 1; + count < sample_size + }) + .expect("Failed to parse."); }) .context(format!("Failed to read {:?}", fastq))?; let total_matched: f64 = prefix_lengths.iter().sum(); - let fraction_matched = total_matched as f64 / count as f64; + let fraction_matched = total_matched / count as f64; info!( "In {:?} examined {} reads for guide start position and matched {} ({:.4}).", fastq, count, total_matched, fraction_matched @@ -312,7 +321,7 @@ impl Count { // Filter to just those trim lengths that have at least min_fraction of the data each let trims_to_return: Vec = non_zeros .into_iter() - .filter(|(_idx, n)| *n as f64 / total_matched as f64 >= min_fraction) + .filter(|(_idx, n)| *n / total_matched >= min_fraction) .map(|(idx, _n)| idx) .collect();