Skip to content

Commit

Permalink
feat: add --umi-prefix to CopyUmiFromReadName
Browse files Browse the repository at this point in the history
  • Loading branch information
msto committed Jan 19, 2024
1 parent afa634e commit c575dfc
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,16 @@ import com.fulcrumgenomics.util.{Io, ProgressLogger}
|
|If a read name contains multiple UMIs they may be delimited by either hyphens (`-`) or pluses (`+`). The
|resulting UMI in the `RX` tag will always be hyphen delimited.
|
|To obtain behavior similar to `umi_tools`' `--umi-separator=":r"`, specify the delimiter and
|prefix separately, i.e. `--umi-delimiter=":"` and `--umi-prefix="r"`.
""")
class CopyUmiFromReadName
( @arg(flag='i', doc="The input BAM file") input: PathToBam,
@arg(flag='o', doc="The output BAM file") output: PathToBam,
@arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false
@arg(doc="Remove the UMI from the read name") removeUmi: Boolean = false,
@arg(doc="Delimiter between the read name and UMI.") umiDelimiter: Char = ':',
@arg(doc="Any characters preceding the UMI sequence in the read name.") umiPrefix: Option[String] = None,
) extends FgBioTool with LazyLogging {

Io.assertReadable(input)
Expand All @@ -58,7 +63,7 @@ class CopyUmiFromReadName
val progress = new ProgressLogger(logger)
source.foreach { rec =>
progress.record(rec)
writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi)
writer += Umis.copyUmiFromReadName(rec=rec, removeUmi=removeUmi, delimiter=umiDelimiter, prefix=umiPrefix)
}
progress.logLast()
source.safelyClose()
Expand Down
11 changes: 6 additions & 5 deletions src/main/scala/com/fulcrumgenomics/umi/Umis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,9 @@ object Umis {
* @param delimiter the delimiter of fields within the read name
* @return the modified record
*/
def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, delimiter: Char = ':'): SamRecord = {
def copyUmiFromReadName(rec: SamRecord, removeUmi: Boolean = false, delimiter: Char = ':', prefix: Option[String] = None): SamRecord = {
// Extract and set the UMI
val umi = extractUmisFromReadName(rec.name, delimiter, strict=false)
val umi = extractUmisFromReadName(rec.name, delimiter, strict=false, prefix=prefix)
require(umi.nonEmpty, f"No valid UMI found in: ${rec.name}")
umi.foreach(u => rec(ConsensusTags.UmiBases) = u)

Expand All @@ -67,7 +67,7 @@ object Umis {
*
* If `strict` is false the last segment is returned so long as it appears to be a valid UMI.
*/
def extractUmisFromReadName(name: String, delimiter: Char = ':', strict: Boolean): Option[String] = {
def extractUmisFromReadName(name: String, delimiter: Char = ':', strict: Boolean, prefix: Option[String] = None): Option[String] = {
// If strict, check that the read name actually has eight parts, which is expected
val rawUmi = if (strict) {
val colons = name.count(_ == delimiter)
Expand All @@ -80,8 +80,9 @@ object Umis {
Some(name.substring(idx + 1, name.length))
}

val umi = rawUmi.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase)
val valid = umi.forall(u => u.forall(isValidUmiCharacter))
val umiSeq = rawUmi.map(seq => (if (prefix.isEmpty) seq else seq.stripPrefix(prefix.get)))
val umi = umiSeq.map(raw => (if (raw.indexOf('+') > 0) raw.replace('+', '-') else raw).toUpperCase)
val valid = umi.forall(u => u.forall(isValidUmiCharacter))

if (strict && !valid) throw new IllegalArgumentException(s"Invalid UMI '${umi.get}' extracted from name '${name}")
else if (!valid) None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues {
private case class Result(name: String, umi: String)

/** Runs CopyUmiFromReadName using the given read names returning the output read names and UMIs. */
private def run(names: Iterable[String], removeUmi: Boolean): IndexedSeq[Result] = {
private def run(names: Iterable[String], removeUmi: Boolean, umiPrefix: Option[String] = None): IndexedSeq[Result] = {
// build the reads
val builder = new SamBuilder()
names.foreach { name => builder.addFrag(name=name, unmapped=true) }

// run the tool
val out = makeTempFile("test.", ".bam")
val tool = new CopyUmiFromReadName(input=builder.toTempFile(), output=out, removeUmi=removeUmi)
val tool = new CopyUmiFromReadName(input=builder.toTempFile(), output=out, removeUmi=removeUmi, umiPrefix=umiPrefix)
executeFgbioTool(tool)

// slurp the results
Expand Down Expand Up @@ -69,4 +69,11 @@ class CopyUmiFromReadNameTest extends UnitSpec with OptionValues {
results.map(_.name) should contain theSameElementsInOrderAs Seq("1", "1:2", "1:2:3", "blah")
results.map(_.umi) should contain theSameElementsInOrderAs Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC")
}

it should "remove any additional separator characters preceding the UMI" in {
  // Each read name ends in a UMI segment carrying an "r" prefix; the last one holds a dual (hyphenated) UMI.
  val readNames     = Seq("1:rAAAA", "1:2:rCCCC", "1:2:3:rGGGG", "blah:rAAAA-CCCC")
  val expectedNames = Seq("1", "1:2", "1:2:3", "blah")
  val expectedUmis  = Seq("AAAA", "CCCC", "GGGG", "AAAA-CCCC")

  val observed = run(names=readNames, removeUmi=true, umiPrefix=Some("r"))

  // With removeUmi=true the whole trailing segment (prefix included) must be gone from the read name,
  // and the reported UMI must not contain the prefix.
  observed.map(_.name) should contain theSameElementsInOrderAs expectedNames
  observed.map(_.umi) should contain theSameElementsInOrderAs expectedUmis
}
}

0 comments on commit c575dfc

Please sign in to comment.