nclist-cpp
C++ implementation of nested containment lists
Loading...
Searching...
No Matches
overlaps_within.hpp
Go to the documentation of this file.
1#ifndef NCLIST_OVERLAPS_WITHIN_HPP
2#define NCLIST_OVERLAPS_WITHIN_HPP
3
4#include <vector>
5#include <algorithm>
6#include <optional>
7
8#include "build.hpp"
9
15namespace nclist {
16
24template<typename Index_>
29 struct State {
30 State() = default;
31 State(Index_ cat, Index_ cend) : child_at(cat), child_end(cend) {}
32 Index_ child_at = 0, child_end = 0;
33 };
34 std::vector<State> history;
38};
39
44template<typename Position_>
51 std::optional<Position_> max_gap; // can't default to -1 as Position_ might be unsigned.
52
57 Position_ min_overlap = 0;
58
63 bool quit_on_first = false;
64};
65
81template<typename Index_, typename Position_>
83 const Nclist<Index_, Position_>& subject,
84 Position_ query_start,
85 Position_ query_end,
88 std::vector<Index_>& matches)
89{
90 matches.clear();
91 if (subject.root_children == 0) {
92 return;
93 }
94
95 /****************************************
96 * # Default
97 *
98 * Our aim is to find overlaps to a subject interval `i` where `subject_starts[i] <= query_start` and `subject_ends[i] >= query_end`.
99 *
100 * At each node of the NCList, we search for the first child where the `subject_ends` is greater than or equal to `query_end`.
101 * Earlier "sibling" intervals must have earlier end positions that must be less than that of the query interval, as well as their children, and so can be skipped.
102 * We do so using a binary search (std::lower_bound) on `subject_ends` - recall that these are sorted for children of each node.
103 *
104 * We then iterate until the first interval `j` where `query_start < subject_starts[j]`, at which point we stop.
105 * None of `j`, the children of `j`, nor any sibling intervals after `j` can have a start position less than or equal to that of the query interval, so there's no point in traversing those nodes.
106 * All subject intervals encountered during this iteration are reported in `matches`, and their children are searched in the same manner.
107 *
108 * Unlike `overlaps_any()`, there is no ability to skip the binary search for the descendents of a node.
109 * Sure, the start positions of all descendents are no less than the node's subject interval's start position,
110 * but this doesn't say much about the comparison between the subject and query end positions.
111 *
112 * # Max gap
113 *
114 * Here, our aim is the same as in the default case, with the extra requirement that the difference in lengths of overlapping subject/query pairs is less than or equal to `max_gap`.
115 * This follows the same logic as in the default case with some adjustments:
116 *
117 * - We do not report a subject interval where the query width is greater than the subject's width by more than `max_gap`.
118 * However, we still traverse the children, as they are smaller and might satisfy the `max_gap` criterion.
119 *
120 * # Min overlap
121 *
122 * Here, we apply the extra restriction that the overlapping subinterval must have length greater than `min_overlap`.
123 * This is as simple as returning early if the query interval is of length less than `min_overlap`.
124 * Otherwise, any overlap that satisfies the default case will automatically have an overlapping subinterval of at least `min_overlap`.
125 *
126 ****************************************/
127
128 Position_ query_width = query_end - query_start;
129 if (params.min_overlap > 0 && query_width < params.min_overlap) {
130 return;
131 }
132
133 auto find_first_child = [&](Index_ children_start, Index_ children_end) -> Index_ {
134 auto ebegin = subject.ends.begin();
135 auto estart = ebegin + children_start;
136 auto eend = ebegin + children_end;
137 return std::lower_bound(estart, eend, query_end) - ebegin;
138 };
139
140 auto is_finished = [&](Position_ subject_start) -> bool {
141 return subject_start > query_start;
142 };
143
144 Index_ root_child_at = find_first_child(0, subject.root_children);
145
146 workspace.history.clear();
147 while (1) {
148 Index_ current_subject;
149 if (workspace.history.empty()) {
150 if (root_child_at == subject.root_children || is_finished(subject.starts[root_child_at])) {
151 break;
152 }
153 current_subject = root_child_at;
154 ++root_child_at;
155 } else {
156 auto& current_state = workspace.history.back();
157 if (current_state.child_at == current_state.child_end || is_finished(subject.starts[current_state.child_at])) {
158 workspace.history.pop_back();
159 continue;
160 }
161 current_subject = current_state.child_at;
162 ++(current_state.child_at); // do this before the emplace_back(), otherwise the history might get reallocated and the reference would be dangling.
163 }
164
165 const auto& current_node = subject.nodes[current_subject];
166
167 // If max_gap is violated, we don't bother to add the current subject interval,
168 // but the children could be okay so we proceed to the next level of the NClist.
169 bool add_self = true;
170 if (params.max_gap.has_value()) {
171 auto subject_start = subject.starts[current_subject];
172 auto subject_end = subject.ends[current_subject];
173 auto subject_width = subject_end - subject_start;
174 if (subject_width - query_width > *(params.max_gap)) {
175 add_self = false;
176 }
177 }
178
179 if (add_self) {
180 matches.push_back(current_node.id);
181 if (params.quit_on_first) {
182 return;
183 }
184 if (current_node.duplicates_start != current_node.duplicates_end) {
185 matches.insert(matches.end(), subject.duplicates.begin() + current_node.duplicates_start, subject.duplicates.begin() + current_node.duplicates_end);
186 }
187 }
188
189 if (current_node.children_start != current_node.children_end) {
190 Index_ start_pos = find_first_child(current_node.children_start, current_node.children_end);
191 if (start_pos != current_node.children_end) {
192 workspace.history.emplace_back(start_pos, current_node.children_end);
193 }
194 }
195 }
196}
197
198}
199
200#endif
Build a nested containment list.
Header-only library for nested containment lists.
Definition build.hpp:17
void overlaps_within(const Nclist< Index_, Position_ > &subject, Position_ query_start, Position_ query_end, const OverlapsWithinParameters< Position_ > &params, OverlapsWithinWorkspace< Index_ > &workspace, std::vector< Index_ > &matches)
Definition overlaps_within.hpp:82
Pre-built nested containment list.
Definition build.hpp:28
Parameters for overlaps_within().
Definition overlaps_within.hpp:45
bool quit_on_first
Definition overlaps_within.hpp:63
std::optional< Position_ > max_gap
Definition overlaps_within.hpp:51
Position_ min_overlap
Definition overlaps_within.hpp:57
Workspace for overlaps_within().
Definition overlaps_within.hpp:25