Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Attempt a smarter offset detection when "guides" are short and might … #14

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 36 additions & 6 deletions src/commands/count.rs
Original file line number Diff line number Diff line change
Expand Up @@ -243,12 +243,14 @@ impl Count {
min_fraction: f64,
) -> Result<PrefixInfo> {
let guide_length = library.guide_length;
let mut prefix_lengths = vec![0u64; 500];
let mut prefix_lengths = vec![0f64; 500];
let mut count = 0u64;

// Parse the first `sample_size` records to find exact match guides and
// extract the sequence that precedes the guide
parse_path(Some(fastq), |parser| {
let mut read_offsets = Vec::<usize>::with_capacity(5);

parser
.each(|rec| {
let read_bases = rec.seq();
Expand All @@ -259,28 +261,56 @@ impl Count {
let bases = &read_bases[trim..trim + guide_length];

if lookup.contains_key(bases) {
prefix_lengths[trim] += 1;
read_offsets.push(trim);
}
}
}

// If the read only matched at a single offset, just use it; otherwise prefer the
// offset(s) that are perfect matches and allocate the read's count evenly among them.
#[allow(clippy::comparison_chain)]
if read_offsets.len() == 1 {
prefix_lengths[read_offsets[0]] += 1.0;
} else if read_offsets.len() > 1 {
let perfect_match_offsets = read_offsets
.iter()
.copied()
.filter(|&off| {
let bases = &read_bases[off..off + guide_length];
let guide = lookup.get(bases).unwrap();
guide.bases.as_slice() == bases
})
.collect_vec();

let preferred_offsets = if perfect_match_offsets.is_empty() {
&read_offsets
} else {
&perfect_match_offsets
};
let addend = 1.0 / preferred_offsets.len() as f64;
for offset in preferred_offsets.iter().copied() {
prefix_lengths[offset] += addend;
}
}

read_offsets.clear();
count += 1;
count < sample_size
})
.expect("Failed to parse.");
})
.context(format!("Failed to read {:?}", fastq))?;

let total_matched: u64 = prefix_lengths.iter().sum();
let fraction_matched = total_matched as f64 / count as f64;
let total_matched: f64 = prefix_lengths.iter().sum();
let fraction_matched = total_matched / count as f64;
info!(
"In {:?} examined {} reads for guide start position and matched {} ({:.4}).",
fastq, count, total_matched, fraction_matched
);

// Tuple of offset -> count where count is > 0
let non_zeros =
prefix_lengths.iter().copied().enumerate().filter(|(_idx, n)| *n > 0).collect_vec();
prefix_lengths.iter().copied().enumerate().filter(|(_idx, n)| *n > 0.0).collect_vec();

info!(
"{} read offsets: {}",
Expand All @@ -291,7 +321,7 @@ impl Count {
// Filter to just those trim lengths that have at least min_fraction of the data each
let trims_to_return: Vec<usize> = non_zeros
.into_iter()
.filter(|(_idx, n)| *n as f64 / total_matched as f64 >= min_fraction)
.filter(|(_idx, n)| *n / total_matched >= min_fraction)
.map(|(idx, _n)| idx)
.collect();

Expand Down