Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Migrate interval tree from cgranges to superintervals #42

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions .github/workflows/pythonpackage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,6 @@ jobs:
PYTHON_VERSION: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
with:
submodules: 'true'

- name: Set up Python ${{matrix.PYTHON_VERSION}}
uses: actions/setup-python@v4
Expand Down
2 changes: 0 additions & 2 deletions .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,6 @@ jobs:

steps:
- uses: actions/checkout@v4
with:
submodules: "true"

# Used to host cibuildwheel
- uses: actions/setup-python@v4
Expand Down
4 changes: 0 additions & 4 deletions .gitmodules

This file was deleted.

5 changes: 2 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ conda activate pybedlite

# Getting Setup for Development Work

Clone the repository to your local machine. Note that pybedlite >= 0.0.4 includes [cgranges][cgranges-link] as a submodule, so you must use the `--recurse-submodules` option:
Clone the repository to your local machine.
```
git clone --recurse-submodules https://github.com/fulcrumgenomics/pybedlite.git
git clone https://github.com/fulcrumgenomics/pybedlite.git
```

[Poetry][poetry-link] is used to manage the python development environment.
Expand All @@ -85,7 +85,6 @@ export CFLAGS="-stdlib=libc++"

[poetry-link]: https://github.com/python-poetry/poetry
[conda-link]: https://docs.conda.io/en/latest/miniconda.html
[cgranges-link]: https://github.com/lh3/cgranges

## Checking the Build
### Run all checks with:
Expand Down
26 changes: 0 additions & 26 deletions build.py

This file was deleted.

1 change: 0 additions & 1 deletion cgranges
Submodule cgranges deleted from 2fb5a2
639 changes: 404 additions & 235 deletions poetry.lock

Large diffs are not rendered by default.

76 changes: 42 additions & 34 deletions pybedlite/overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,14 +67,13 @@
from typing import List
from typing import Optional
from typing import Protocol
from typing import Set
from typing import Type
from typing import TypeVar
from typing import Union

import attr
from superintervals import IntervalSet

import cgranges as cr
from pybedlite.bed_record import BedRecord
from pybedlite.bed_record import BedStrand
from pybedlite.bed_source import BedSource
Expand Down Expand Up @@ -269,7 +268,7 @@ class OverlapDetector(Generic[SpanType], Iterable[SpanType]):

def __init__(self, intervals: Optional[Iterable[SpanType]] = None) -> None:
# A mapping from the contig/chromosome name to the associated interval tree
self._refname_to_tree: Dict[str, cr.cgranges] = {} # type: ignore
self._refname_to_tree: Dict[str, IntervalSet] = {}
self._refname_to_indexed: Dict[str, bool] = {}
self._refname_to_intervals: Dict[str, List[SpanType]] = {}
if intervals is not None:
Expand All @@ -286,7 +285,7 @@ def add(self, interval: SpanType) -> None:
interval: the interval to add to this detector
"""
if interval.refname not in self._refname_to_tree:
self._refname_to_tree[interval.refname] = cr.cgranges() # type: ignore
self._refname_to_tree[interval.refname] = IntervalSet()
self._refname_to_indexed[interval.refname] = False
self._refname_to_intervals[interval.refname] = []

Expand All @@ -295,9 +294,10 @@ def add(self, interval: SpanType) -> None:
interval_idx: int = len(self._refname_to_intervals[interval.refname])
self._refname_to_intervals[interval.refname].append(interval)

# Add the interval to the tree
# Add the interval to the tree. Note that IntervalSet uses closed intervals whereas we are
# using half-open intervals, so add 1 to start
tree = self._refname_to_tree[interval.refname]
tree.add(interval.refname, interval.start, interval.end, interval_idx)
tree.add(interval.start + 1, interval.end, interval_idx)

# Flag this tree as needing to be indexed after adding a new interval, but defer
# indexing
Expand All @@ -322,18 +322,38 @@ def overlaps_any(self, interval: Span) -> bool:
True if and only if the given interval overlaps with any interval in this
detector.
"""
tree = self._refname_to_tree.get(interval.refname)
tree = self._refname_to_tree.get(interval.refname, None)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't .get() already return None as the default empty value?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes. I tend to prefer explicit Nones in this kind of use case and changed it without thinking much. I can revert if this is an issue.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries, I don't really have an opinion. Explicit is nice.

if tree is None:
return False
else:
if not self._refname_to_indexed[interval.refname]:
tree.index()
try:
next(iter(tree.overlap(interval.refname, interval.start, interval.end)))
except StopIteration:
return False
else:
return True
self._refname_to_indexed[interval.refname] = True
# IntervalSet uses closed intervals whereas we are using half-open intervals, so add 1
# to start
return tree.any_overlaps(interval.start + 1, interval.end)

def iter_overlaps(self, interval: Span) -> Iterator[SpanType]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This new method isn't much different in behavior than get_overlaps so I don't think we need both publicly available. Although this one looks lazy, it isn't.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Responding to @clintval and @msto in the same place since you made similar comments:

  • It's not intended to be lazy, just unopinionated about the container and sort order. If you just want to iterate over all the overlaps in any order, this saves processing and memory, as well as keeping any duplicates (as far as __hash__ is concernced).
  • In another project I had to write my own subclass with duplicate overlap-getting logic because I explicitly wanted duplicates and couldn't get them otherwise. But I can't exactly change the behavior of get_overlaps without breaking the API.
  • To me this design seemed like a compromise that kept all the overlap logic in one place while allowing a lighter-weight call for those who wanted it. But it's outside the remit of the originating issue, so I could punt and remove this if the consensus is that it's unwanted.

Copy link
Contributor

@msto msto Dec 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd usually favor including those as options in get_overlaps() instead of having a separate public method.

e.g.

def get_overlaps(
    self,
    interval: Span,
    sort_overlaps: bool = True,
    include_duplicates: bool = False,
) -> list[SpanType]:

I agree with Clint that returning an Iterator implies lazy evaluation. If the method can be lazy, I think our usual pattern is to return an iterator and allow the user to convert to list at the call site. Otherwise, the method should return list.

msto marked this conversation as resolved.
Show resolved Hide resolved
"""Yields any intervals in this detector that overlap the given interval

Args:
interval: the interval to check

Yields:
Intervals in this detector that overlap the given interval, in insertion order.
"""
tree = self._refname_to_tree.get(interval.refname, None)
if tree is not None:
if not self._refname_to_indexed[interval.refname]:
tree.index()
self._refname_to_indexed[interval.refname] = True
ref_intervals: List[SpanType] = self._refname_to_intervals[interval.refname]
# IntervalSet uses closed intervals whereas we are using half-open intervals, so add 1
# to start.
# Also IntervalSet yields indices in reverse insertion order, so yield intervals in
# reverse of indices list.
for index in reversed(tree.find_overlaps(interval.start + 1, interval.end)):
yield ref_intervals[index]
Comment on lines +355 to +356
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The call to reversed won't make this lazy despite its use in a generator.

Are you doing this to preserve original order?

IMHO order wouldn't matter to me.

Or if it did, I'd want the same order as is forced in get_overlaps() further down.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I put the call to reversed to yield them in insertion order. The IntervalSet returns a list in reverse-insertion order, and reversed iterates backwards through the list without copying, so basically no overhead. I don't feel strongly, the point of this method was to have a lightweight method for yielding all the intervals without concern for order.


def get_overlaps(self, interval: Span) -> List[SpanType]:
"""Returns any intervals in this detector that overlap the given interval.
Expand All @@ -351,27 +371,15 @@ def get_overlaps(self, interval: Span) -> List[SpanType]:
* The interval's strand, positive or negative (assumed to be positive if undefined)
* The interval's reference sequence name (lexicographically)
"""
tree = self._refname_to_tree.get(interval.refname)
if tree is None:
return []
else:
if not self._refname_to_indexed[interval.refname]:
tree.index()
ref_intervals: List[SpanType] = self._refname_to_intervals[interval.refname]
# NB: only return unique instances of intervals
intervals: Set[SpanType] = {
ref_intervals[index]
for _, _, index in tree.overlap(interval.refname, interval.start, interval.end)
}
return sorted(
intervals,
key=lambda intv: (
intv.start,
intv.end,
self._negative(intv),
intv.refname,
),
)
return sorted(
set(self.iter_overlaps(interval)),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious about the set here - how does IntervalSet handle duplicate intervals?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IntervalSet will only be storing indices into the intervals list that the OverlapDetector stores. So it won't even be aware of any exact duplicates (i.e. even the index is the same). As for intervals with duplicate start and stop positions, it stores them correctly.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will depend on the hash of the object you place in the OverlapDetector, since the overlap detector is generic now.

If you send in a dataclass with frozen=True, then all fields will be used as a part of the hash. However, if you make a custom interval-like object and define your own hash method, then that hash method will be used.

Both BedRecord and Interval hash on all their fields.

Mypy should disallow any custom interval-like object that does not have a __hash__() dunder method:

class Span(Hashable, Protocol):

I don't think a call to set or sorted should have been included in the original implementation (a user should have the choice of "unique-ifying" later, or sorting later) but because they are already a part of the implementation details, it makes sense to preserve behavior:

intervals: Set[SpanType] = {
ref_intervals[index]
for _, _, index in tree.overlap(interval.refname, interval.start, interval.end)
}
return sorted(
intervals,
key=lambda intv: (
intv.start,
intv.end,
self._negative(intv),
intv.refname,
),
)

key=lambda intv: (
intv.start,
intv.end,
self._negative(intv),
intv.refname,
),
)

@staticmethod
def _negative(interval: Span) -> bool:
Expand Down
15 changes: 15 additions & 0 deletions pybedlite/tests/test_overlap_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,3 +384,18 @@ def test_the_overlap_detector_can_be_built_from_a_bed_file(tmp_path: Path) -> No
detector: OverlapDetector[BedRecord] = OverlapDetector.from_bed(bed)
overlaps: List[BedRecord] = detector.get_overlaps(Interval("chr1", 1, 2))
assert overlaps == [BedRecord(chrom="chr1", start=1, end=2)]


def test_alternating_query_and_adding_intervals() -> None:
detector: OverlapDetector[Interval] = OverlapDetector()

query = Interval("1", 10, 15)
target1 = Interval("1", 10, 100, name="target1")
detector.add(target1)
# Test get_overlaps()
assert detector.get_overlaps(query) == [target1]

target2 = Interval("1", 11, 101, name="target2")
detector.add(target2)
# Test get_overlaps()
assert detector.get_overlaps(query) == [target1, target2]
9 changes: 3 additions & 6 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,13 @@ classifiers = [
"Topic :: Software Development :: Libraries :: Python Modules",
]
include = ["LICENSE"]
packages = [{ include = "pybedlite" }, { include = "cgranges" }]
packages = [{ include = "pybedlite" }]

[tool.poetry.dependencies]
python = "^3.8.0"
attrs = "^23.0.0"
sphinx = { version = "^7.0.0", optional = true }
superintervals = "0.2.2"

[tool.poetry.dev-dependencies]
pytest = "^7.0.0"
Expand All @@ -43,10 +44,6 @@ pytest-cov = "^4.0.0"
[tool.poetry.extras]
docs = ["sphinx"]

[tool.poetry.build]
script = "build.py"
generate-setup-file = true

[build-system]
clintval marked this conversation as resolved.
Show resolved Hide resolved
requires = ["poetry-core", "setuptools", "cython"]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
Loading