Skip to content

Commit

Permalink
feat: output umi grouping metrics (#1021)
Browse files Browse the repository at this point in the history
* feat: output umi grouping metrics

* refactor: variable names

* test: check metric values

* refactor: rename attributes

* test: check metric in additional tests
  • Loading branch information
znorgaard authored Jan 14, 2025
1 parent 1c96bd7 commit e4248df
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 7 deletions.
37 changes: 32 additions & 5 deletions src/main/scala/com/fulcrumgenomics/umi/GroupReadsByUmi.scala
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,20 @@ case class TagFamilySizeMetric(family_size: Int,
var fraction: Proportion = 0,
var fraction_gt_or_eq_family_size: Proportion = 0) extends Metric

/**
* Metrics produced by `GroupReadsByUmi` to describe reads passed through UMI grouping
* @param accepted_sam_records The number of SAM records accepted for grouping.
* @param discarded_non_pf The number of discarded non-PF SAM records.
* @param discarded_poor_alignment The number of SAM records discarded for poor alignment.
* @param discarded_ns_in_umi The number of SAM records discarded due to one or more Ns in the UMI.
* @param discarded_umis_to_short The number of SAM records discarded due to a shorter than expected UMI.
*/
case class UmiGroupingMetric(accepted_sam_records: Long,
discarded_non_pf: Long,
discarded_poor_alignment: Long,
discarded_ns_in_umi: Long,
discarded_umis_to_short: Long) extends Metric

/** The strategies implemented by [[GroupReadsByUmi]] to identify reads from the same source molecule.*/
sealed trait Strategy extends EnumEntry {
def newStrategy(edits: Int, threads: Int): UmiAssigner
Expand Down Expand Up @@ -568,6 +582,7 @@ class GroupReadsByUmi
(@arg(flag='i', doc="The input BAM file.") val input: PathToBam = Io.StdIn,
@arg(flag='o', doc="The output BAM file.") val output: PathToBam = Io.StdOut,
@arg(flag='f', doc="Optional output of tag family size counts.") val familySizeHistogram: Option[FilePath] = None,
@arg(flag='g', doc="Optional output of UMI grouping metrics.") val groupingMetrics: Option[FilePath] = None,
@arg(flag='t', doc="The tag containing the raw UMI.") val rawTag: String = ConsensusTags.UmiBases,
@arg(flag='T', doc="The output tag for UMI grouping.") val assignTag: String = ConsensusTags.MolecularId,
@arg(flag='d', doc="Turn on duplicate marking mode.") val markDuplicates: Boolean = false,
Expand Down Expand Up @@ -742,14 +757,26 @@ class GroupReadsByUmi
ms.tails.foreach { tail => tail.headOption.foreach(m => m.fraction_gt_or_eq_family_size = tail.map(_.fraction).sum) }
Metric.write(p, ms)
}

// Write out UMI grouping metrics
this.groupingMetrics.foreach { path =>
val groupingMetrics = UmiGroupingMetric(
accepted_sam_records = kept,
discarded_non_pf = filteredNonPf,
discarded_poor_alignment = filteredPoorAlignment,
discarded_ns_in_umi = filteredNsInUmi,
discarded_umis_to_short = filterUmisTooShort,
)
Metric.write(path, groupingMetrics)
}
}

private def logStats(): Unit = {
logger.info(f"Accepted $kept%,d reads for grouping.")
if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF reads.")
logger.info(f"Filtered out $filteredPoorAlignment%,d reads due to mapping issues.")
logger.info(f"Filtered out $filteredNsInUmi%,d reads that contained one or more Ns in their UMIs.")
this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d reads that contained UMIs that were too short.") }
logger.info(f"Accepted $kept%,d SAM records for grouping.")
if (filteredNonPf > 0) logger.info(f"Filtered out $filteredNonPf%,d non-PF SAM records.")
logger.info(f"Filtered out $filteredPoorAlignment%,d SAM records due to mapping issues.")
logger.info(f"Filtered out $filteredNsInUmi%,d SAM records that contained one or more Ns in their UMIs.")
this.minUmiLength.foreach { _ => logger.info(f"Filtered out $filterUmisTooShort%,d SAM records that contained UMIs that were too short.") }
}

/** Consumes the next group of templates with all matching end positions and returns them. */
Expand Down
14 changes: 12 additions & 2 deletions src/test/scala/com/fulcrumgenomics/umi/GroupReadsByUmiTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import com.fulcrumgenomics.cmdline.FgBioMain.FailureException
import com.fulcrumgenomics.testing.SamBuilder.{Minus, Plus}
import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec}
import com.fulcrumgenomics.umi.GroupReadsByUmi._
import com.fulcrumgenomics.util.Metric
import org.scalatest.{OptionValues, PrivateMethodTester}

import java.nio.file.Files
Expand Down Expand Up @@ -247,7 +248,8 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
val in = builder.toTempFile()
val out = Files.createTempFile("umi_grouped.", ".sam")
val hist = Files.createTempFile("umi_grouped.", ".histogram.txt")
val tool = new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=Some(30))
val metrics = Files.createTempFile("umi_grouped.", ".metrics.txt")
val tool = new GroupReadsByUmi(input=in, output=out, familySizeHistogram=Some(hist), groupingMetrics=Some(metrics), rawTag="RX", assignTag="MI", strategy=Strategy.Edit, edits=1, minMapQ=Some(30))
val logs = executeFgbioTool(tool)

val groups = readBamRecs(out).groupBy(_.name.charAt(0))
Expand All @@ -267,6 +269,10 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT

hist.toFile.exists() shouldBe true

// TODO: Consider creating more unit tests that vary the following metric fields
val expectedMetric = UmiGroupingMetric(accepted_sam_records = 10, discarded_non_pf = 0, discarded_poor_alignment = 2, discarded_ns_in_umi = 0, discarded_umis_to_short = 0)
Metric.read[UmiGroupingMetric](metrics) shouldEqual Seq(expectedMetric)

// Make sure that we skip sorting for TemplateCoordinate
val sortMessage = "Sorting the input to TemplateCoordinate order"
logs.exists(_.contains(sortMessage)) shouldBe (sortOrder != TemplateCoordinate)
Expand Down Expand Up @@ -424,8 +430,12 @@ class GroupReadsByUmiTest extends UnitSpec with OptionValues with PrivateMethodT
builder.addPair(name="a03", start1=100, start2=300, strand1=Plus, strand2=Minus, attrs=Map("RX" -> "ACT-ANN"))

val in = builder.toTempFile()
val metrics = Files.createTempFile("umi_grouped.", ".metrics.txt")
val out = Files.createTempFile("umi_grouped.", ".bam")
new GroupReadsByUmi(input=in, output=out, rawTag="RX", assignTag="MI", strategy=Strategy.Paired, edits=2).execute()
new GroupReadsByUmi(input=in, output=out, groupingMetrics=Some(metrics), rawTag="RX", assignTag="MI", strategy=Strategy.Paired, edits=2).execute()

val expectedMetric = UmiGroupingMetric(accepted_sam_records = 4, discarded_non_pf = 0, discarded_poor_alignment = 0, discarded_ns_in_umi = 2, discarded_umis_to_short = 0)
Metric.read[UmiGroupingMetric](metrics) shouldEqual Seq(expectedMetric)

readBamRecs(out).map(_.name).distinct shouldBe Seq("a01", "a02")
}
Expand Down

0 comments on commit e4248df

Please sign in to comment.