diff --git a/CMakeLists.txt b/CMakeLists.txt index 7bfdbc7c..faf76f54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ project(octopus) set(octopus_VERSION_MAJOR 0) set(octopus_VERSION_MINOR 7) -set(octopus_VERSION_PATCH 3) +set(octopus_VERSION_PATCH 4) set(octopus_VERSION_RELEASE "") # Generate list of compile commands. This helps debugging and doesn't have a downside. diff --git a/README.md b/README.md index 495bcd65..f9758a0a 100644 --- a/README.md +++ b/README.md @@ -350,7 +350,7 @@ Octopus will generate BAM files `realigned/NA12878.bam` and `realigned/NA24385.b ## Output format -Octopus outputs variants in the [VCF 4.3 format](https://github.com/samtools/hts-specs/blob/master/VCFv4.3.pdf). To represent complex multi-allelic loci, Octopus prefers a decompose alleles into multiple records and use the `*` allele to resoolve conflicts, for example: +Octopus outputs variants in the [VCF 4.3 format](https://github.com/samtools/hts-specs/blob/master/VCFv4.3.pdf). To represent complex multi-allelic loci, Octopus prefers to decompose alleles into multiple records and use the `*` allele to resolve conflicts, for example: ``` #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 @@ -366,14 +366,14 @@ in contrast to how such a site would usually be represented, either: 1 102738191 . ATTATTTATTTAT A . . . GT 1/0 ``` -which is inconsistent as the reference is deduced in each record, or: +which is inconsistent as existence of the reference allele is implied in each record, or: ``` #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878 1 102738191 . ATTATTTATTTAT ATTAT,A . . . GT 1/2 ``` -which is at least consistent, but rapidly becomes unmanageable as the length and number of overlapping variants increases. +which is consistent, but rapidly becomes unmanageable as the length and number of overlapping variants increases. ## Documentation diff --git a/lib/date/CMakeLists.txt b/lib/date/CMakeLists.txt index b063cfdc..1ac5abbd 100644 --- a/lib/date/CMakeLists.txt +++ b/lib/date/CMakeLists.txt @@ -3,7 +3,8 @@ set(DATE_SOURCES tz.h tz.cpp tz_private.h - ios.h) + ios.h + ptz.h) add_library(date-tz STATIC ${DATE_SOURCES}) diff --git a/lib/date/ptz.h b/lib/date/ptz.h new file mode 100644 index 00000000..dc8f0cf7 --- /dev/null +++ b/lib/date/ptz.h @@ -0,0 +1,788 @@ +#ifndef PTZ_H +#define PTZ_H + +// The MIT License (MIT) +// +// Copyright (c) 2017 Howard Hinnant +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// This header allows Posix-style time zones as specified for TZ here: +// http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap08.html#tag_08_03 +// +// Posix::time_zone can be constructed with a posix-style string and then used in +// a zoned_time like so: +// +// zoned_time zt{"EST5EDT,M3.2.0,M11.1.0", +// system_clock::now()}; +// or: +// +// Posix::time_zone tz{"EST5EDT,M3.2.0,M11.1.0"}; +// zoned_time zt{tz, system_clock::now()}; +// +// If the rule set is missing (everything starting with ','), then the rule is that the +// alternate offset is never enabled. +// +// Note, Posix-style time zones are not recommended for all of the reasons described here: +// https://stackoverflow.com/tags/timezone/info +// +// They are provided here as a non-trivial custom time zone example, and if you really +// have to have Posix time zones, you're welcome to use this one. + +#include "date/tz.h" +#include +#include +#include + +namespace Posix +{ + +namespace detail +{ + +#if HAS_STRING_VIEW + +using string_t = std::string_view; + +#else // !HAS_STRING_VIEW + +using string_t = std::string; + +#endif // !HAS_STRING_VIEW + +class rule; + +void throw_invalid(const string_t& s, unsigned i, const string_t& message); +unsigned read_date(const string_t& s, unsigned i, rule& r); +unsigned read_name(const string_t& s, unsigned i, std::string& name); +unsigned read_signed_time(const string_t& s, unsigned i, std::chrono::seconds& t); +unsigned read_unsigned_time(const string_t& s, unsigned i, std::chrono::seconds& t); +unsigned read_unsigned(const string_t& s, unsigned i, unsigned limit, unsigned& u, + const string_t& message = string_t{}); + +class rule +{ + enum {off, J, M, N}; + + date::month m_; + date::weekday wd_; + unsigned short n_ : 14; + unsigned short mode_ : 2; + std::chrono::duration time_ = std::chrono::hours{2}; + +public: + rule() : mode_(off) {} + + bool ok() const {return mode_ != off;} + date::local_seconds operator()(date::year y) const; + std::string to_string() const; + + friend std::ostream& operator<<(std::ostream& os, const rule& r); + friend unsigned read_date(const string_t& s, unsigned i, rule& r); + friend bool operator==(const rule& x, const rule& y); +}; + +inline +bool +operator==(const rule& x, const rule& y) +{ + if (x.mode_ != y.mode_) + return false; + switch (x.mode_) + { + case rule::J: + case rule::N: + return x.n_ == y.n_; + case rule::M: + return x.m_ == y.m_ && x.n_ == y.n_ && x.wd_ == y.wd_; + default: + return true; + } +} + +inline +bool +operator!=(const rule& x, const rule& y) +{ + return !(x == y); +} + +inline +date::local_seconds +rule::operator()(date::year y) const +{ + using date::local_days; + using date::January; + using date::days; + using date::last; + using sec = std::chrono::seconds; + date::local_seconds t; + switch (mode_) + { + case J: + t = local_days{y/January/0} + days{n_ + (y.is_leap() && n_ > 59)} + sec{time_}; + break; + case M: + t = (n_ == 5 ? local_days{y/m_/wd_[last]} : local_days{y/m_/wd_[n_]}) + sec{time_}; + break; + case N: + t = local_days{y/January/1} + days{n_} + sec{time_}; + break; + default: + assert(!"rule called with bad mode"); + } + return t; +} + +inline +std::string +rule::to_string() const +{ + using namespace std::chrono; + auto print_offset = [](seconds off) + { + std::string nm; + if (off != hours{2}) + { + date::hh_mm_ss offset{off}; + nm = '/'; + nm += std::to_string(offset.hours().count()); + if (offset.minutes() != minutes{0} || offset.seconds() != seconds{0}) + { + nm += ':'; + if (offset.minutes() < minutes{10}) + nm += '0'; + nm += std::to_string(offset.minutes().count()); + if (offset.seconds() != seconds{0}) + { + nm += ':'; + if (offset.seconds() < seconds{10}) + nm += '0'; + nm += std::to_string(offset.seconds().count()); + } + } + } + return nm; + }; + + std::string nm; + switch (mode_) + { + case rule::J: + nm = 'J'; + nm += std::to_string(n_); + break; + case rule::M: + nm = 'M'; + nm += std::to_string(static_cast(m_)); + nm += '.'; + nm += std::to_string(n_); + nm += '.'; + nm += std::to_string(wd_.c_encoding()); + break; + case rule::N: + nm = std::to_string(n_); + break; + default: + break; + } + nm += print_offset(time_); + return nm; +} + +inline +std::ostream& +operator<<(std::ostream& os, const rule& r) +{ + switch (r.mode_) + { + case rule::J: + os << 'J' << r.n_ << date::format(" %T", r.time_); + break; + case rule::M: + if (r.n_ == 5) + os << r.m_/r.wd_[date::last]; + else + os << r.m_/r.wd_[r.n_]; + os << date::format(" %T", r.time_); + break; + case rule::N: + os << r.n_ << date::format(" %T", r.time_); + break; + default: + break; + } + return os; +} + +} // namespace detail + +class time_zone +{ + std::string std_abbrev_; + std::string dst_abbrev_ = {}; + std::chrono::seconds offset_; + std::chrono::seconds save_ = std::chrono::hours{1}; + detail::rule start_rule_; + detail::rule end_rule_; + +public: + explicit time_zone(const detail::string_t& name); + + template + date::sys_info get_info(date::sys_time st) const; + template + date::local_info get_info(date::local_time tp) const; + + template + date::sys_time::type> + to_sys(date::local_time tp) const; + + template + date::sys_time::type> + to_sys(date::local_time tp, date::choose z) const; + + template + date::local_time::type> + to_local(date::sys_time tp) const; + + friend std::ostream& operator<<(std::ostream& os, const time_zone& z); + + const time_zone* operator->() const {return this;} + + std::string name() const; + + friend bool operator==(const time_zone& x, const time_zone& y); +}; + +inline +time_zone::time_zone(const detail::string_t& s) +{ + using detail::read_name; + using detail::read_signed_time; + using detail::throw_invalid; + auto i = read_name(s, 0, std_abbrev_); + i = read_signed_time(s, i, offset_); + offset_ = -offset_; + if (i != s.size()) + { + i = read_name(s, i, dst_abbrev_); + if (i != s.size()) + { + if (s[i] != ',') + { + i = read_signed_time(s, i, save_); + save_ = -save_ - offset_; + } + if (i != s.size()) + { + if (s[i] != ',') + throw_invalid(s, i, "Expecting end of string or ',' to start rule"); + ++i; + i = read_date(s, i, start_rule_); + if (i == s.size() || s[i] != ',') + throw_invalid(s, i, "Expecting ',' and then the ending rule"); + ++i; + i = read_date(s, i, end_rule_); + if (i != s.size()) + throw_invalid(s, i, "Found unexpected trailing characters"); + } + } + } +} + +template +date::sys_info +time_zone::get_info(date::sys_time st) const +{ + using date::sys_info; + using date::year_month_day; + using date::sys_seconds; + using date::sys_days; + using date::floor; + using date::ceil; + using date::days; + using date::years; + using date::year; + using date::January; + using date::December; + using date::last; + using std::chrono::minutes; + sys_info r{}; + r.offset = offset_; + if (start_rule_.ok()) + { + auto y = year_month_day{floor(st)}.year(); + auto start = sys_seconds{(start_rule_(y) - offset_).time_since_epoch()}; + auto end = sys_seconds{(end_rule_(y) - (offset_ + save_)).time_since_epoch()}; + if (start <= st && st < end) + { + r.begin = start; + r.end = end; + r.offset += save_; + r.save = ceil(save_); + r.abbrev = dst_abbrev_; + } + else if (st < start) + { + r.begin = sys_seconds{(end_rule_(y-years{1}) - + (offset_ + save_)).time_since_epoch()}; + r.end = start; + r.abbrev = std_abbrev_; + } + else // st >= end + { + r.begin = end; + r.end = sys_seconds{(start_rule_(y+years{1}) - offset_).time_since_epoch()}; + r.abbrev = std_abbrev_; + } + } + else // constant offset + { + r.begin = sys_days{year::min()/January/1}; + r.end = sys_days{year::max()/December/last}; + r.abbrev = std_abbrev_; + } + return r; +} + +template +date::local_info +time_zone::get_info(date::local_time tp) const +{ + using date::local_info; + using date::year_month_day; + using date::days; + using date::sys_days; + using date::sys_seconds; + using date::years; + using date::year; + using date::ceil; + using date::January; + using date::December; + using date::last; + using std::chrono::seconds; + using std::chrono::minutes; + local_info r{}; + using date::floor; + if (start_rule_.ok()) + { + auto y = year_month_day{floor(tp)}.year(); + auto start = sys_seconds{(start_rule_(y) - offset_).time_since_epoch()}; + auto end = sys_seconds{(end_rule_(y) - (offset_ + save_)).time_since_epoch()}; + auto utcs = sys_seconds{floor(tp - offset_).time_since_epoch()}; + auto utcd = sys_seconds{floor(tp - (offset_ + save_)).time_since_epoch()}; + if ((utcs < start) != (utcd < start)) + { + r.first.begin = sys_seconds{(end_rule_(y-years{1}) - + (offset_ + save_)).time_since_epoch()}; + r.first.end = start; + r.first.offset = offset_; + r.first.abbrev = std_abbrev_; + r.second.begin = start; + r.second.end = end; + r.second.abbrev = dst_abbrev_; + r.second.offset = offset_ + save_; + r.second.save = ceil(save_); + r.result = save_ > seconds{0} ? local_info::nonexistent + : local_info::ambiguous; + } + else if ((utcs < end) != (utcd < end)) + { + r.first.begin = start; + r.first.end = end; + r.first.offset = offset_ + save_; + r.first.save = ceil(save_); + r.first.abbrev = dst_abbrev_; + r.second.begin = end; + r.second.end = sys_seconds{(start_rule_(y+years{1}) - + offset_).time_since_epoch()}; + r.second.abbrev = std_abbrev_; + r.second.offset = offset_; + r.result = save_ > seconds{0} ? local_info::ambiguous + : local_info::nonexistent; + } + else if (utcs < start) + { + r.first.begin = sys_seconds{(end_rule_(y-years{1}) - + (offset_ + save_)).time_since_epoch()}; + r.first.end = start; + r.first.offset = offset_; + r.first.abbrev = std_abbrev_; + } + else if (utcs < end) + { + r.first.begin = start; + r.first.end = end; + r.first.offset = offset_ + save_; + r.first.save = ceil(save_); + r.first.abbrev = dst_abbrev_; + } + else + { + r.first.begin = end; + r.first.end = sys_seconds{(start_rule_(y+years{1}) - + offset_).time_since_epoch()}; + r.first.abbrev = std_abbrev_; + r.first.offset = offset_; + } + } + else // constant offset + { + r.first.begin = sys_days{year::min()/January/1}; + r.first.end = sys_days{year::max()/December/last}; + r.first.abbrev = std_abbrev_; + r.first.offset = offset_; + } + return r; +} + +template +date::sys_time::type> +time_zone::to_sys(date::local_time tp) const +{ + using date::local_info; + using date::sys_time; + using date::ambiguous_local_time; + auto i = get_info(tp); + if (i.result == local_info::nonexistent) + throw nonexistent_local_time(tp, i); + else if (i.result == local_info::ambiguous) + throw ambiguous_local_time(tp, i); + return sys_time{tp.time_since_epoch()} - i.first.offset; +} + +template +date::sys_time::type> +time_zone::to_sys(date::local_time tp, date::choose z) const +{ + using date::local_info; + using date::sys_time; + using date::choose; + auto i = get_info(tp); + if (i.result == local_info::nonexistent) + { + return i.first.end; + } + else if (i.result == local_info::ambiguous) + { + if (z == choose::latest) + return sys_time{tp.time_since_epoch()} - i.second.offset; + } + return sys_time{tp.time_since_epoch()} - i.first.offset; +} + +template +date::local_time::type> +time_zone::to_local(date::sys_time tp) const +{ + using date::local_time; + using std::chrono::seconds; + using LT = local_time::type>; + auto i = get_info(tp); + return LT{(tp + i.offset).time_since_epoch()}; +} + +inline +std::ostream& +operator<<(std::ostream& os, const time_zone& z) +{ + using date::operator<<; + os << '{'; + os << z.std_abbrev_ << ", " << z.dst_abbrev_ << date::format(", %T, ", z.offset_) + << date::format("%T, [", z.save_) << z.start_rule_ << ", " << z.end_rule_ << ")}"; + return os; +} + +inline +std::string +time_zone::name() const +{ + using namespace date; + using namespace std::chrono; + auto nm = std_abbrev_; + auto print_offset = [](seconds off) + { + std::string nm; + hh_mm_ss offset{-off}; + if (offset.is_negative()) + nm += '-'; + nm += std::to_string(offset.hours().count()); + if (offset.minutes() != minutes{0} || offset.seconds() != seconds{0}) + { + nm += ':'; + if (offset.minutes() < minutes{10}) + nm += '0'; + nm += std::to_string(offset.minutes().count()); + if (offset.seconds() != seconds{0}) + { + nm += ':'; + if (offset.seconds() < seconds{10}) + nm += '0'; + nm += std::to_string(offset.seconds().count()); + } + } + return nm; + }; + nm += print_offset(offset_); + if (!dst_abbrev_.empty()) + { + nm += dst_abbrev_; + if (save_ != hours{1}) + nm += print_offset(offset_+save_); + if (start_rule_.ok()) + { + nm += ','; + nm += start_rule_.to_string(); + nm += ','; + nm += end_rule_.to_string(); + } + } + return nm; +} + +inline +bool +operator==(const time_zone& x, const time_zone& y) +{ + return x.std_abbrev_ == y.std_abbrev_ && + x.dst_abbrev_ == y. dst_abbrev_ && + x.offset_ == y.offset_ && + x.save_ == y.save_ && + x.start_rule_ == y.start_rule_ && + x.end_rule_ == y.end_rule_; +} + +inline +bool +operator!=(const time_zone& x, const time_zone& y) +{ + return !(x == y); +} + +namespace detail +{ + +inline +void +throw_invalid(const string_t& s, unsigned i, const string_t& message) +{ + throw std::runtime_error(std::string("Invalid time_zone initializer.\n") + + std::string(message) + ":\n" + + std::string(s) + '\n' + + "\x1b[1;32m" + + std::string(i, '~') + '^' + + std::string(i < s.size() ? s.size()-i-1 : 0, '~') + + "\x1b[0m"); +} + +inline +unsigned +read_date(const string_t& s, unsigned i, rule& r) +{ + using date::month; + using date::weekday; + if (i == s.size()) + throw_invalid(s, i, "Expected rule but found end of string"); + if (s[i] == 'J') + { + ++i; + unsigned n; + i = read_unsigned(s, i, 3, n, "Expected to find the Julian day [1, 365]"); + r.mode_ = rule::J; + r.n_ = n; + } + else if (s[i] == 'M') + { + ++i; + unsigned m; + i = read_unsigned(s, i, 2, m, "Expected to find month [1, 12]"); + if (i == s.size() || s[i] != '.') + throw_invalid(s, i, "Expected '.' after month"); + ++i; + unsigned n; + i = read_unsigned(s, i, 1, n, "Expected to find week number [1, 5]"); + if (i == s.size() || s[i] != '.') + throw_invalid(s, i, "Expected '.' after weekday index"); + ++i; + unsigned wd; + i = read_unsigned(s, i, 1, wd, "Expected to find day of week [0, 6]"); + r.mode_ = rule::M; + r.m_ = month{m}; + r.wd_ = weekday{wd}; + r.n_ = n; + } + else if (std::isdigit(s[i])) + { + unsigned n; + i = read_unsigned(s, i, 3, n); + r.mode_ = rule::N; + r.n_ = n; + } + else + throw_invalid(s, i, "Expected 'J', 'M', or a digit to start rule"); + if (i != s.size() && s[i] == '/') + { + ++i; + std::chrono::seconds t; + i = read_unsigned_time(s, i, t); + r.time_ = t; + } + return i; +} + +inline +unsigned +read_name(const string_t& s, unsigned i, std::string& name) +{ + if (i == s.size()) + throw_invalid(s, i, "Expected a name but found end of string"); + if (s[i] == '<') + { + ++i; + while (true) + { + if (i == s.size()) + throw_invalid(s, i, + "Expected to find closing '>', but found end of string"); + if (s[i] == '>') + break; + name.push_back(s[i]); + ++i; + } + ++i; + } + else + { + while (i != s.size() && std::isalpha(s[i])) + { + name.push_back(s[i]); + ++i; + } + } + if (name.size() < 3) + throw_invalid(s, i, "Found name to be shorter than 3 characters"); + return i; +} + +inline +unsigned +read_signed_time(const string_t& s, unsigned i, + std::chrono::seconds& t) +{ + if (i == s.size()) + throw_invalid(s, i, "Expected to read signed time, but found end of string"); + bool negative = false; + if (s[i] == '-') + { + negative = true; + ++i; + } + else if (s[i] == '+') + ++i; + i = read_unsigned_time(s, i, t); + if (negative) + t = -t; + return i; +} + +inline +unsigned +read_unsigned_time(const string_t& s, unsigned i, std::chrono::seconds& t) +{ + using std::chrono::seconds; + using std::chrono::minutes; + using std::chrono::hours; + if (i == s.size()) + throw_invalid(s, i, "Expected to read unsigned time, but found end of string"); + unsigned x; + i = read_unsigned(s, i, 2, x, "Expected to find hours [0, 24]"); + t = hours{x}; + if (i != s.size() && s[i] == ':') + { + ++i; + i = read_unsigned(s, i, 2, x, "Expected to find minutes [0, 59]"); + t += minutes{x}; + if (i != s.size() && s[i] == ':') + { + ++i; + i = read_unsigned(s, i, 2, x, "Expected to find seconds [0, 59]"); + t += seconds{x}; + } + } + return i; +} + +inline +unsigned +read_unsigned(const string_t& s, unsigned i, unsigned limit, unsigned& u, + const string_t& message) +{ + if (i == s.size() || !std::isdigit(s[i])) + throw_invalid(s, i, message); + u = static_cast(s[i] - '0'); + unsigned count = 1; + for (++i; count < limit && i != s.size() && std::isdigit(s[i]); ++i, ++count) + u = u * 10 + static_cast(s[i] - '0'); + return i; +} + +} // namespace detail + +} // namespace Posix + +namespace date +{ + +template <> +struct zoned_traits +{ + +#if HAS_STRING_VIEW + + static + Posix::time_zone + locate_zone(std::string_view name) + { + return Posix::time_zone{name}; + } + +#else // !HAS_STRING_VIEW + + static + Posix::time_zone + locate_zone(const std::string& name) + { + return Posix::time_zone{name}; + } + + static + Posix::time_zone + locate_zone(const char* name) + { + return Posix::time_zone{name}; + } + +#endif // !HAS_STRING_VIEW + +}; + +} // namespace date + +#endif // PTZ_H diff --git a/resources/configs/PacBioCCS.config b/resources/configs/PacBioCCS.config index f7dc6674..a6cbe064 100644 --- a/resources/configs/PacBioCCS.config +++ b/resources/configs/PacBioCCS.config @@ -12,7 +12,7 @@ allow-reads-with-good-unplaced-or-unlocalized-supplementary-alignments=true # Setup variant discovery for noisy reads variant-discovery-mode=PACBIO -allow-pileup-candidates-from-likely-misaligned-reads=true +force-pileup-candidates=true max-region-to-assemble=2000 max-assemble-region-overlap=500 diff --git a/scripts/filter_assigned_reads.py b/scripts/filter_assigned_reads.py new file mode 100755 index 00000000..417576e9 --- /dev/null +++ b/scripts/filter_assigned_reads.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python3 + +import argparse +import pysam as ps +from pathlib import Path + +def is_assigned_read(read): + tags = dict(read.tags) + return "HP" in tags and len(tags["HP"]) == 1 + +def filter_assigned_reads(in_bam_filename, out_bam_filename, region=None): + in_bam = ps.AlignmentFile(in_bam_filename) + out_bam = ps.AlignmentFile(out_bam_filename, 'wb', template=in_bam) + contig, start, end = None, None, None + if region is not None: + if ':' in region: + contig, rest = region.split(':') + start, end = rest.split('-') + start, end = start.replace(',', ''), end.replace(',', '') + start, end = int(start), int(end) + else: + contig = region + for read in in_bam.fetch(contig, start, end): + if is_assigned_read(read): + out_bam.write(read) + out_bam.close() + ps.index(str(out_bam_filename)) + +def main(args): + if args.in_bam == args.out_bam: + print("--out-bam cannot be the same as --in-bam") + else: + filter_assigned_reads(args.in_bam, args.out_bam, args.region) + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('-I','--in-bam', + type=Path, + required=True, + help='Input Octopus realigned BAM') + parser.add_argument('-O','--out-bam', + type=Path, + required=True, + help='Output BAM') + parser.add_argument('-T','--region', + type=str, + required=False, + help='Region to filter') + parsed, unparsed = parser.parse_known_args() + main(parsed) diff --git a/src/config/config.cpp b/src/config/config.cpp index 9f026dc5..59401c46 100644 --- a/src/config/config.cpp +++ b/src/config/config.cpp @@ -119,7 +119,7 @@ const std::string BugReport {"https://github.com/luntergroup/octopus/issues"}; const std::vector Authors {"Daniel Cooke"}; -const std::string CopyrightNotice {"Copyright (c) 2015-2020 University of Oxford"}; +const std::string CopyrightNotice {"Copyright (c) 2015-2021 University of Oxford"}; const unsigned CommandLineWidth {72}; diff --git a/src/config/option_collation.cpp b/src/config/option_collation.cpp index 6ee35a8e..be173894 100644 --- a/src/config/option_collation.cpp +++ b/src/config/option_collation.cpp @@ -939,17 +939,23 @@ auto make_read_filterer(const OptionMap& options) if (options.at("no-adapter-contaminated-reads").as()) { result.add(make_unique()); } - if (options.at("no-reads-with-decoy-supplementary-alignments").as()) { + const auto max_decoy_supplementary_mq = as_unsigned("max-decoy-supplementary-alignment-mapping-quality", options); + if (max_decoy_supplementary_mq > 0) { + result.add(make_unique(max_decoy_supplementary_mq)); + } else { result.add(make_unique()); - } else if (!options.at("allow-reads-with-good-decoy-supplementary-alignments").as()) { - result.add(make_unique(min_mapping_quality)); } - if (options.at("no-reads-with-unplaced-or-unlocalized-supplementary-alignments").as()) { - result.add(make_unique()); + const auto max_unplaced_supplementary_mq = as_unsigned("max-unplaced-supplementary-alignment-mapping-quality", options); + if (max_unplaced_supplementary_mq > 0) { + result.add(make_unique(max_unplaced_supplementary_mq)); + } else { result.add(make_unique()); - } else if (!options.at("allow-reads-with-good-unplaced-or-unlocalized-supplementary-alignments").as()) { - result.add(make_unique(min_mapping_quality)); - result.add(make_unique(min_mapping_quality)); + } + const auto max_unlocalized_supplementary_mq = as_unsigned("max-unlocalized-supplementary-alignment-mapping-quality", options); + if (max_unlocalized_supplementary_mq > 0) { + result.add(make_unique(max_unlocalized_supplementary_mq)); + } else { + result.add(make_unique()); } result.shrink_to_fit(); return result; @@ -1220,7 +1226,7 @@ auto make_variant_generator_builder(const OptionMap& options, const boost::optio } scanner_options.match = get_candidate_variant_match_predicate(options); scanner_options.use_clipped_coverage_tracking = true; - if (!options.at("allow-pileup-candidates-from-likely-misaligned-reads").as()) { + if (!options.at("force-pileup-candidates").as()) { CigarScanner::Options::MisalignmentParameters misalign_params {}; misalign_params.max_expected_mutation_rate = get_max_expected_heterozygosity(options); misalign_params.snv_threshold = as_unsigned("min-pileup-base-quality", options); @@ -1237,6 +1243,8 @@ auto make_variant_generator_builder(const OptionMap& options, const boost::optio } if (repeat_candidate_variant_generator_enabled(options)) { RepeatScanner::Options repeat_scanner_options {}; + repeat_scanner_options.min_snvs = 1; + repeat_scanner_options.min_base_quality = 10; repeat_scanner_options.min_vaf = get_repeat_scanner_min_vaf(options); result.set_repeat_scanner(repeat_scanner_options); } diff --git a/src/config/option_parser.cpp b/src/config/option_parser.cpp index 1b93ce1c..1690a4fb 100644 --- a/src/config/option_parser.cpp +++ b/src/config/option_parser.cpp @@ -298,21 +298,17 @@ OptionMap parse_options(const int argc, const char** argv) po::bool_switch()->default_value(false), "Filter reads with possible adapter contamination") - ("no-reads-with-decoy-supplementary-alignments", - po::bool_switch()->default_value(false), - "Filter reads with supplementary alignments mapped to decoy contigs") - - ("allow-reads-with-good-decoy-supplementary-alignments", - po::bool_switch()->default_value(false), - "Do not filer reads with supplementary alignments mapped to decoy contigs with high mapping quality (--min-mapping-quality)") + ("max-decoy-supplementary-alignment-mapping-quality", + po::value()->default_value(5), + "Filter reads with supplementary alignments mapped to decoy contigs with mapping quality greater than this") - ("no-reads-with-unplaced-or-unlocalized-supplementary-alignments", - po::bool_switch()->default_value(false), - "Filter reads with supplementary alignments mapped to unplaced or unlocalized contigs") + ("max-unplaced-supplementary-alignment-mapping-quality", + po::value()->default_value(5), + "Filter reads with supplementary alignments mapped to unplaced contigs with mapping quality greater than this") - ("allow-reads-with-good-unplaced-or-unlocalized-supplementary-alignments", - po::bool_switch()->default_value(false), - "Do not filer reads with supplementary alignments mapped to unplaced or unlocalized contigs with high mapping quality (--min-mapping-quality)") + ("max-unlocalized-supplementary-alignment-mapping-quality", + po::value()->default_value(5), + "Filter reads with supplementary alignments mapped to unlocalized contigs with mapping quality greater than this") ("disable-downsampling", po::bool_switch()->default_value(false), @@ -378,9 +374,9 @@ OptionMap parse_options(const int argc, const char** argv) "Minimum number of reads that must support a variant if it is to be considered a candidate." " By default octopus will automatically determine this value") - ("allow-pileup-candidates-from-likely-misaligned-reads", + ("force-pileup-candidates", po::bool_switch()->default_value(false), - "Allow pileup candidate variants discovered from reads that are considered likely to be misaligned") + "Include pileup candidate variants discovered from reads that are considered likely to be misaligned") ("max-variant-size", po::value()->default_value(2000), diff --git a/src/core/callers/cancer_caller.cpp b/src/core/callers/cancer_caller.cpp index 08de2237..040d7855 100644 --- a/src/core/callers/cancer_caller.cpp +++ b/src/core/callers/cancer_caller.cpp @@ -425,7 +425,7 @@ void CancerCaller::generate_cancer_genotypes_with_clean_normal(Latents& latents, const auto max_allowed_cancer_genotypes = *parameters_.max_genotypes; if (!latents.cancer_genotypes_.empty()) { const auto max_old_cancer_genotype_bases = std::max(max_allowed_cancer_genotypes / num_haplotypes, std::size_t {1}); - const auto& cancer_genotype_posteriors = latents.somatic_model_inferences_.back().max_evidence_params.genotype_probabilities; + const auto& cancer_genotype_posteriors = latents.somatic_model_inferences_.back().weighted_genotype_posteriors; const auto old_cancer_genotype_bases = copy_greatest_probability_values(latents.cancer_genotypes_.back(), cancer_genotype_posteriors, max_old_cancer_genotype_bases); latents.cancer_genotypes_.push_back(extend_somatic(old_cancer_genotype_bases, latents.indexed_haplotypes_)); erase_duplicates(latents.cancer_genotypes_.back()); diff --git a/src/core/octopus.cpp b/src/core/octopus.cpp index 9fbf7907..961748b1 100644 --- a/src/core/octopus.cpp +++ b/src/core/octopus.cpp @@ -27,6 +27,7 @@ #include #include "date/tz.h" +#include "date/ptz.h" #include "config/common.hpp" #include "basics/genomic_region.hpp" @@ -92,13 +93,34 @@ bool apply_csr(const GenomeCallingComponents& components) noexcept using CallTypeSet = std::set; +template +inline +auto +make_zoned(const Posix::time_zone& tz, const date::sys_time& st) +{ + return date::zoned_time::type, + Posix::time_zone>{tz, st}; +} + std::string get_current_time_str() { using namespace date; using namespace std::chrono; - const auto time = make_zoned(current_zone(), system_clock::now()); std::ostringstream ss {}; - ss << time; + try { + const auto time = make_zoned(current_zone(), system_clock::now()); + ss << time; + } catch (const std::exception& e) { + // https://github.com/HowardHinnant/date/issues/310 + static bool warned {false}; + if (!warned) { + logging::WarningLogger warn_log {}; + warn_log << "Failed to find system time zone information. Assuming UTC."; + warned = true; + } + const auto time = make_zoned(Posix::time_zone{"UTC0"}, system_clock::now()); + ss << time; + } return ss.str(); } diff --git a/src/core/tools/hapgen/genome_walker.cpp b/src/core/tools/hapgen/genome_walker.cpp index bcb3ecc4..8599628e 100644 --- a/src/core/tools/hapgen/genome_walker.cpp +++ b/src/core/tools/hapgen/genome_walker.cpp @@ -96,12 +96,24 @@ bool is_interacting_indel(BidirIt first, BidirIt allele, BidirIt last, } } +template +bool splits_indel_pad_base(BidirIt first, BidirIt allele, BidirIt last) +{ + if (allele != first && (is_indel(*allele) || is_empty_region(*allele))) { + const auto pad_region = head_position(expand_lhs(mapped_region(*allele), 1)); + return has_overlapped(first, allele, pad_region); + } else { + return false; + } +} + template bool is_good_indicator_begin(BidirIt first_possible, BidirIt allele_itr, BidirIt last_possible) { return !(is_sandwich_allele(first_possible, allele_itr, last_possible) || is_indel_boundary(first_possible, allele_itr, last_possible) - || is_interacting_indel(first_possible, allele_itr, last_possible)); + || is_interacting_indel(first_possible, allele_itr, last_possible) + || splits_indel_pad_base(first_possible, allele_itr, last_possible)); } template @@ -202,7 +214,7 @@ GenomeWalker::walk(const GenomicRegion& previous_region, auto expanded_leftmost = mapped_region(*it); std::for_each(it, included_itr, [&] (const auto& allele) { const auto ref_dist = reference_distance(allele); - if (ref_dist > 1) { + if (ref_dist > 0) { const auto max_expansion = std::min(mapped_begin(allele), 2 * ref_dist); auto expanded_allele_begin = mapped_begin(allele) - max_expansion; if (expanded_allele_begin < mapped_begin(expanded_leftmost)) { diff --git a/src/core/tools/hapgen/haplotype_generator.cpp b/src/core/tools/hapgen/haplotype_generator.cpp index 4a7f2146..7366bd3f 100644 --- a/src/core/tools/hapgen/haplotype_generator.cpp +++ b/src/core/tools/hapgen/haplotype_generator.cpp @@ -575,23 +575,35 @@ std::vector get_max_ref_distances(const std::vector& re return result; } -std::vector get_joined_blocks(const std::vector& blocks, - const MappableFlatSet& alleles, - const HaplotypeTree& tree) +std::vector +get_joined_blocks(const std::vector& blocks, + const MappableFlatSet& alleles, + const HaplotypeTree& tree) { - std::vector expanded_blocks {}; + std::vector expanded_blocks {}, lhs_expanded_blocks {}; expanded_blocks.reserve(blocks.size()); + lhs_expanded_blocks.reserve(blocks.size()); for (const auto& block : blocks) { auto ref_dist = max_ref_distance(block, alleles, tree); - if (ref_dist > 0) expanded_blocks.push_back(expand(block, ref_dist)); + if (ref_dist > 0) { + expanded_blocks.push_back(expand(block, ref_dist)); + // lhs_expanded_blocks ensure indels are always called + // with alleles adjacent to them on their left, to avoid problems + // when the indel is REF padded during VCF encoding. + lhs_expanded_blocks.push_back(expand_lhs(block, 1)); + } } if (expanded_blocks.empty()) return blocks; std::sort(std::begin(expanded_blocks), std::end(expanded_blocks)); + std::sort(std::begin(lhs_expanded_blocks), std::end(lhs_expanded_blocks)); std::vector interacting_expanded_blocks {}; interacting_expanded_blocks.reserve(expanded_blocks.size()); std::copy_if(std::begin(expanded_blocks), std::end(expanded_blocks), std::back_inserter(interacting_expanded_blocks), [&] (const auto& block) { return count_overlapped(expanded_blocks, block) > 1; }); + std::copy_if(std::begin(lhs_expanded_blocks), std::end(lhs_expanded_blocks), std::back_inserter(interacting_expanded_blocks), + [&] (const auto& block) { return count_overlapped(blocks, block) > 1; }); if (interacting_expanded_blocks.empty()) return blocks; + std::sort(std::begin(interacting_expanded_blocks), std::end(interacting_expanded_blocks)); const auto join_regions = extract_covered_regions(interacting_expanded_blocks); std::vector result {}; result.reserve(blocks.size()); diff --git a/src/core/tools/vargen/repeat_scanner.cpp b/src/core/tools/vargen/repeat_scanner.cpp index a49bb2ee..2d6d0b69 100644 --- a/src/core/tools/vargen/repeat_scanner.cpp +++ b/src/core/tools/vargen/repeat_scanner.cpp @@ -236,11 +236,20 @@ struct AdjacentRepeatPair : public Mappable region {encompassing_region(this->lhs, this->rhs)} {} }; +auto generate_tandem_repeats(const ReferenceGenome& reference, const GenomicRegion& region, + const unsigned max_period, const unsigned min_tract_length) +{ + auto result = find_exact_tandem_repeats(reference, region, max_period); + const auto is_too_short = [=] (const auto& repeat) { return region_size(repeat) < min_tract_length; }; + result.erase(std::remove_if(std::begin(result), std::end(result), is_too_short), std::end(result)); + return result; +} + std::deque find_adjacent_tandem_repeats(const ReferenceGenome& reference, const GenomicRegion& region, - const unsigned max_period) + const unsigned max_period, const unsigned min_tract_length) { - const auto repeats = find_exact_tandem_repeats(reference, region, max_period); + const auto repeats = generate_tandem_repeats(reference, region, max_period, min_tract_length); std::deque result {}; if (repeats.size() > 1) { for (auto lhs_itr = std::cbegin(repeats); lhs_itr != std::prev(std::cend(repeats)); ++lhs_itr) { @@ -300,9 +309,14 @@ void RepeatScanner::generate(const GenomicRegion& region, std::vector& assert(!segment.empty()); const auto segment_region = encompassing_region(segment); const auto repeat_search_region = expand(segment_region, 100); - const auto segment_repeat_pairs = find_adjacent_tandem_repeats(reference_, repeat_search_region, options_.max_period); + const auto segment_repeat_pairs = find_adjacent_tandem_repeats(reference_, repeat_search_region, options_.max_period, options_.min_tract_length); for (const auto& mnv : segment) { for (const auto& repeat_pair : overlap_range(segment_repeat_pairs, mnv)) { + if (is_snv(mnv) && (repeat_pair.lhs.period() > 1 || repeat_pair.rhs.period() > 1 + || !(mnv.alt_allele().sequence() == repeat_pair.lhs.motif() || mnv.alt_allele().sequence() == repeat_pair.rhs.motif()))) { + // Only try to split SNV candidates sandwiched by homopolymers + continue; + } if (are_adjacent(repeat_pair.lhs, mnv) && contains(repeat_pair.rhs, mnv)) { // insertion of lhs repeat, deletion of rhs repeat const auto num_deleted_periods = count_whole_repeats(region_size(mnv), repeat_pair.rhs.period()); diff --git a/src/core/tools/vargen/repeat_scanner.hpp b/src/core/tools/vargen/repeat_scanner.hpp index 43ac808f..7385c0a4 100644 --- a/src/core/tools/vargen/repeat_scanner.hpp +++ b/src/core/tools/vargen/repeat_scanner.hpp @@ -34,6 +34,7 @@ class RepeatScanner : public VariantGenerator { unsigned min_snvs = 2; unsigned max_period = 6; + unsigned min_tract_length = 3; unsigned min_observations = 2; unsigned min_sample_observations = 2; boost::optional min_vaf = boost::none; diff --git a/src/core/tools/vcf_record_factory.cpp b/src/core/tools/vcf_record_factory.cpp index a1f27c89..a96ced70 100644 --- a/src/core/tools/vcf_record_factory.cpp +++ b/src/core/tools/vcf_record_factory.cpp @@ -240,6 +240,35 @@ void pad_indels(std::vector& calls, const std::vector& } } +class InconsistentPloidyError : public ProgramError +{ +public: + InconsistentPloidyError(SampleName sample, Genotype first, Genotype second) + : sample_ {std::move(sample)} + , first_ {std::move(first)} + , second_ {std::move(second)} + {} +private: + SampleName sample_; + Genotype first_, second_; + + std::string do_where() const override + { + return "VcfRecordFactory::make"; + } + std::string do_why() const override + { + std::ostringstream ss {}; + ss << "In sample " << sample_ << ", calls at " + << first_.mapped_region() + << " & " << second_.mapped_region() + << " were called in the same phase set but have different genotype ploidies" + << " (" << first_.ploidy() << " & " + << second_.ploidy() << ")"; + return ss.str(); + } +}; + std::vector VcfRecordFactory::make(std::vector&& calls) const { using std::begin; using std::end; using std::cbegin; using std::cend; using std::next; @@ -357,12 +386,16 @@ std::vector VcfRecordFactory::make(std::vector&& calls) std::vector> prev_represented {}; prev_represented.reserve(samples_.size()); for (const auto& sample : samples_) { - const auto ploidy = block_begin_itr->call->get_genotype_call(sample).genotype.ploidy(); + const auto& genotype = block_begin_itr->call->get_genotype_call(sample).genotype; + const auto ploidy = genotype.ploidy(); prev_represented.emplace_back(ploidy, nullptr); for (auto itr = block_begin_itr; itr != block_head_end_itr; ++itr) { const auto& gt = itr->call->get_genotype_call(sample).genotype; for (unsigned i {0}; i < gt.ploidy(); ++i) { if (itr->call->is_represented(gt[i])) { + if (prev_represented.back().size() < i) { + throw InconsistentPloidyError {sample, genotype, gt}; + } prev_represented.back()[i] = std::addressof(*itr->call); } } @@ -578,6 +611,11 @@ void set_allele_counts(const Call& call, const std::vector& samples, result.set_info("AN", p.second); } +auto phred_round(const double phreds) +{ + return phreds < 1 ? maths::round_sf(phreds, 2) : maths::round(phreds, 2); +} + VcfRecord VcfRecordFactory::make(std::unique_ptr call) const { auto result = VcfRecord::Builder {}; @@ -595,13 +633,13 @@ VcfRecord VcfRecordFactory::make(std::unique_ptr call) const result.set_ref(call->reference().sequence()); set_allele_counts(*call, samples_, alts, result); result.set_alt(std::move(alts)); - result.set_qual(std::min(max_qual, maths::round(call->quality().score(), 2))); + result.set_qual(std::min(max_qual, phred_round(call->quality().score()))); const auto call_reads = copy_overlapped(reads_, region); result.set_info("NS", count_samples_with_coverage(call_reads)); result.set_info("DP", sum_max_coverages(call_reads)); result.set_info("MQ", static_cast(rmq_mapping_quality(call_reads))); if (call->model_posterior()) { - result.set_info("MP", maths::round(call->model_posterior()->score(), 2)); + result.set_info("MP", phred_round(call->model_posterior()->score())); } if (!sites_only_) { if (call->all_phased()) { @@ -612,7 +650,7 @@ VcfRecord VcfRecordFactory::make(std::unique_ptr call) const for (const auto& sample : samples_) { const auto& genotype_call = call->get_genotype_call(sample); static const Phred max_genotype_quality {10'000}; - const auto gq = static_cast(std::round(std::min(max_genotype_quality, genotype_call.posterior).score())); + const auto gq = static_cast(phred_round(std::min(max_genotype_quality, genotype_call.posterior).score())); set_vcf_genotype(sample, genotype_call, result, is_refcall); result.set_format(sample, "GQ", std::to_string(gq)); result.set_format(sample, "DP", max_coverage(call_reads.at(sample))); @@ -739,13 +777,13 @@ VcfRecord VcfRecordFactory::make_segment(std::vector>&& ca result.set_alt(std::move(alt_alleles)); auto q = std::min_element(std::cbegin(calls), std::cend(calls), [] (const auto& lhs, const auto& rhs) { return lhs->quality() < rhs->quality(); }); - result.set_qual(std::min(max_qual, maths::round(q->get()->quality().score(), 2))); + result.set_qual(std::min(max_qual, phred_round(q->get()->quality().score()))); result.set_info("NS", count_samples_with_coverage(reads_, region)); result.set_info("DP", sum_max_coverages(reads_, region)); result.set_info("MQ", static_cast(rmq_mapping_quality(reads_, region))); const auto mp = get_model_posterior(calls); if (mp) { - result.set_info("MP", maths::round(*mp, 2)); + result.set_info("MP", phred_round(*mp)); } if (!sites_only_) { if (calls.front()->all_phased()) { @@ -757,7 +795,7 @@ VcfRecord VcfRecordFactory::make_segment(std::vector>&& ca for (const auto& sample : samples_) { const auto posterior = calls.front()->get_genotype_call(sample).posterior; static const Phred max_genotype_quality {10'000}; - const auto gq = static_cast(std::round(std::min(max_genotype_quality, posterior).score())); + const auto gq = static_cast(phred_round(std::min(max_genotype_quality, posterior).score())); auto& genotype_call = *sample_itr++; if (is_refcall) { std::replace(std::begin(genotype_call), std::end(genotype_call), diff --git a/src/readpipe/buffered_read_pipe.cpp b/src/readpipe/buffered_read_pipe.cpp index d9cd34bb..ead9ef49 100644 --- a/src/readpipe/buffered_read_pipe.cpp +++ b/src/readpipe/buffered_read_pipe.cpp @@ -118,7 +118,8 @@ GenomicRegion BufferedReadPipe::get_max_fetch_region(const GenomicRegion& reques if (hints_.empty() || hints_.count(request.contig_name()) == 0 || hints_.at(request.contig_name()).empty()) { return default_max_region; } else { - const auto contained_hints = contained_range(hints_.at(request.contig_name()), default_max_region); + const auto hint_search_region = right_overhang_region(default_max_region, request); + const auto contained_hints = contained_range(hints_.at(request.contig_name()), hint_search_region); if (empty(contained_hints)) { return request; } else {