Skip to content

Commit

Permalink
feat: fallback to fast string splitting
Browse files: browse the repository at this point in the history
  • Loading branch information
clintval committed Jan 3, 2024
1 parent 9d5a422 commit f806f57
Show file tree
Hide file tree
Showing 5 changed files with 6 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.collection.BetterBufferedIterator
import com.fulcrumgenomics.commons.util.{LazyLogging, NumericCounter, SimpleCounter}
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.umi.Umis.UmiSeparatorPattern
import com.fulcrumgenomics.util.Metric.{Count, Proportion}
import com.fulcrumgenomics.util._
import htsjdk.samtools.util.{Interval, IntervalList, Murmur3, OverlapDetector}
Expand Down Expand Up @@ -390,8 +389,8 @@ class CollectDuplexSeqMetrics
val umi1s = IndexedSeq.newBuilder[String] // ab UMI 1 (and ba UMI 2) sequences
val umi2s = IndexedSeq.newBuilder[String] // ab UMI 2 (and ba UMI 1) sequences

ab.iterator.map(r => UmiSeparatorPattern.split(r[String](this.umiTag), -1)).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
ba.iterator.map(r => UmiSeparatorPattern.split(r[String](this.umiTag), -1)).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }
ab.iterator.map(r => r[String](this.umiTag).split("-", -1)).foreach { case Array(u1, u2) => umi1s += u1; umi2s += u2 }
ba.iterator.map(r => r[String](this.umiTag).split("-", -1)).foreach { case Array(u1, u2) => umi1s += u2; umi2s += u1 }

val Seq(abConsensusUmi, baConsensusUmi) = Seq(umi1s, umi2s).map(_.result()).map{ umis =>
val consensus = this.consensusBuilder.callConsensus(umis)
Expand Down Expand Up @@ -531,7 +530,7 @@ class CollectDuplexSeqMetrics
val uniqueTotal = metrics.map(_.unique_observations).sum.toDouble

metrics.foreach { m =>
val Array(umi1, umi2) = UmiSeparatorPattern.split(m.umi, -1)
val Array(umi1, umi2) = m.umi.split("-", -1)
m.fraction_raw_observations = m.raw_observations / rawTotal
m.fraction_unique_observations = m.unique_observations / uniqueTotal
m.fraction_unique_observations_expected = singleUmiMetrics(umi1).fraction_unique_observations * singleUmiMetrics(umi2).fraction_unique_observations
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/com/fulcrumgenomics/umi/CorrectUmis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ class CorrectUmis
missingUmisRecords += 1
rejectOut.foreach(w => w += rec)
case Some(umi: String) =>
val sequences = umi.split('-')
val sequences = umi.split("-", -1)
if (sequences.exists(_.length != umiLength)) {
if (wrongLengthRecords == 0) {
logger.warning(s"Read (${rec.name}) detected with unexpected length UMI(s): ${sequences.mkString(" ")}.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ import com.fulcrumgenomics.commons.util.LazyLogging
import com.fulcrumgenomics.umi.DuplexConsensusCaller._
import com.fulcrumgenomics.umi.UmiConsensusCaller.ReadType.{ReadType, _}
import com.fulcrumgenomics.umi.UmiConsensusCaller.{SimpleRead, SourceRead}
import com.fulcrumgenomics.umi.Umis.UmiSeparatorPattern
import com.fulcrumgenomics.util.NumericTypes.PhredScore

/**
Expand Down Expand Up @@ -324,7 +323,7 @@ class DuplexConsensusCaller(override val readNamePrefix: String,
// UMI bases are present, `None` otherwise.
reads.flatMap(_.sam).flatMap { rec =>
rec.get[String](ConsensusTags.UmiBases).map { umi =>
if (rec.firstOfPair == firstOfPair) umi else UmiSeparatorPattern.split(umi, -1).reverse.mkString("-")
if (rec.firstOfPair == firstOfPair) umi else umi.split("-", -1).reverse.mkString("-")
}
}
}
Expand Down
3 changes: 1 addition & 2 deletions src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool}
import com.fulcrumgenomics.commons.util.{LazyLogging, NumericCounter, SimpleCounter}
import com.fulcrumgenomics.sopt.{arg, clp}
import com.fulcrumgenomics.umi.GroupReadsByUmi._
import com.fulcrumgenomics.umi.Umis.UmiSeparatorPattern
import com.fulcrumgenomics.util.Metric.{Count, Proportion}
import com.fulcrumgenomics.util.Sequences.countMismatches
import com.fulcrumgenomics.util._
Expand Down Expand Up @@ -832,7 +831,7 @@ class GroupReadsByUmi
val pos1 = if (r1.positiveStrand) r1.unclippedStart else r1.unclippedEnd
val pos2 = if (r2.positiveStrand) r2.unclippedStart else r2.unclippedEnd
val r1Lower = r1.refIndex < r2.refIndex || (r1.refIndex == r2.refIndex && (pos1 < pos2 || (pos1 == pos2 && r1.positiveStrand)))
val umis = UmiSeparatorPattern.split(umi, -1) // Split and ensure we return empty strings for missing UMIs.
val umis = umi.split("-", -1) // Split and ensure we return empty strings for missing UMIs.
require(umis.length == 2, s"Paired strategy used but umi did not contain 2 segments delimited by a '-': $umi")

if (r1Lower) paired.lowerReadUmiPrefix + ":" + umis(0) + "-" + paired.higherReadUmiPrefix + ":" + umis(1)
Expand Down
5 changes: 0 additions & 5 deletions src/main/scala/com/fulcrumgenomics/umi/Umis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,8 @@ package com.fulcrumgenomics.umi

import com.fulcrumgenomics.bam.api.SamRecord

import java.util.regex.Pattern

object Umis {

/** The separator pattern for concatenated UMIs. For example "ACGT-GTAA". */
val UmiSeparatorPattern: Pattern = Pattern.compile("-")

/** Copies the UMI sequence from the read name.
*
* The read name is split by the given name delimiter, and the last field is assumed to be the UMI sequence. The UMI
Expand Down

0 comments on commit f806f57

Please sign in to comment.