nclist-cpp
C++ implementation of nested containment lists
Loading...
Searching...
No Matches
overlaps_any.hpp
Go to the documentation of this file.
1#ifndef NCLIST_OVERLAPS_ANY_HPP
2#define NCLIST_OVERLAPS_ANY_HPP
3
4#include <vector>
5#include <algorithm>
6#include <optional>
7#include <limits>
8
9#include "build.hpp"
10#include "utils.hpp"
11
17namespace nclist {
18
26template<typename Index_>
31 struct State {
32 State() = default;
33 State(Index_ cat, Index_ cend, bool skip) : child_at(cat), child_end(cend), skip_search(skip) {}
34 Index_ child_at = 0, child_end = 0;
35 bool skip_search = false;
36 };
37 std::vector<State> history;
41};
42
47template<typename Position_>
56 std::optional<Position_> max_gap; // can't default to -1 as Position_ might be unsigned.
57
62 Position_ min_overlap = 0;
63
68 bool quit_on_first = false;
69};
70
86template<typename Index_, typename Position_>
88 const Nclist<Index_, Position_>& subject,
89 Position_ query_start,
90 Position_ query_end,
93 std::vector<Index_>& matches)
94{
95 matches.clear();
96 if (subject.root_children == 0) {
97 return;
98 }
99
100 enum class OverlapsAnyMode : char { BASIC, MIN_OVERLAP, MAX_GAP };
101 OverlapsAnyMode mode = OverlapsAnyMode::BASIC;
102 if (params.min_overlap > 0) {
103 mode = OverlapsAnyMode::MIN_OVERLAP;
104 } else if (params.max_gap.has_value()) {
105 mode = OverlapsAnyMode::MAX_GAP;
106 }
107
108 /****************************************
109 * # Default
110 *
111 * Our aim is to find overlaps to a subject interval `i` where `subject_starts[i] < query_end` and `query_start < subject_ends[i]`.
112 *
113 * At each node of the NCList, we search for the first child where the `subject_ends` is greater than `query_start` (as ends are non-inclusive).
114 * Earlier "sibling" intervals cannot overlap with the query interval, nor can their children.
115 * We do so using a binary search (std::upper_bound) on `subject_ends` - recall that these are sorted for children of each node.
116 *
117 * We then iterate until the first interval `j` where `query_end < subject_starts[j]`, at which point we stop.
118 * This is because `j`, the children of `j`, nor any sibling intervals after `j` can overlap with the query interval, so there's no point in traversing those nodes.
119 * Any subject interval encountered during this iteration is reported as an overlap, and we process its children in the same manner.
120 *
121 * For a modest efficiency boost, we consider the case where `query_start < subject_starts[i]` for node `i`.
122 * In such cases, `subject_ends` for all children of `i` must also satisfy `query_start < subject_ends[i]`, in which case the binary search can be skipped.
123 * Moreover, all descendent intervals of `i` must end after `query_start`, so the binary search can be skipped for the entire lineage of the NCList.
124 *
125 * # Max gap
126 *
127 * Here, our aim is to find subject intervals where `subject_starts[i] <= query_end + max_gap` and `query_start <= subject_ends[i] + max_gap`.
128 * (Yes, the `<=` is deliberate.)
129 * This follows much the same logic as in the default case with some adjustments:
130 *
131 * - We consider an "effective" query start as `effective_query_start = query_start - max_gap`.
132 * - We then perform a binary search to find the first `subject_ends` that is greater than or equal to `effective_query_start`.
133 * This is the earliest subject interval that could lie within `max_gap` of the query or could have children that could do so.
134 * The search uses `std::lower_bound()` due to the `<=` in the problem definition above.
135 * - Similarly, we check if `effective_query_start <= subject_starts[i]` to determine whether to skip the binary search.
136 * - We stop iteration once `subject_starts[j] - query_end > max_gap`, after which we know that all start positions of siblings/children must lie beyond `max_gap`.
137 *
138 * # Min overlap
139 *
140 * Here, we apply the extra restriction that the overlapping subinterval must have length greater than `min_overlap`.
141 * This follows much the same logic as in the default case with some adjustments:
142 *
143 * - We consider an "effective" query start as `effective_query_start = query_start + min_overlap`.
144 * This defines the earliest entry of `subject_ends` that still could provide an overlap of at least `min_overlap`.
145 * - We perform a binary search to find the first `subject_ends` that is greater than or equal to `effective_query_start`.
146 * This uses `std::lower_bound()` as the overlap-adjusted query start is inclusive with the ends.
147 * - Similarly, we check if `effective_query_start <= subject_starts[i]` to determine whether to skip the binary search.
148 * - We stop iteration once `query_end - subject_start < min_overlap`, after which we know that all start positions of siblings/children must have smaller overlaps.
149 * - If the length of the overlapping subinterval is less than `min_overlap`, we do not report the subject interval in `matches`.
150 * Morever, we skip traversal of that node's children, as the children must be smaller and will not satisfy `min_overlap` anyway.
151 *
152 * We return early if the length of the query itself is less than `min_overlap`, as no overlap will be satisfactory.
153 *
154 * Fortunately, `min_overlap` and `max_gap` are mutually exclusive parameters in `overlaps_any()`, so we don't have to worry about their interaction.
155 *
156 ****************************************/
157
158 if (mode == OverlapsAnyMode::MIN_OVERLAP) {
159 if (query_end - query_start < params.min_overlap) {
160 return;
161 }
162 }
163
164 Position_ effective_query_start = std::numeric_limits<Position_>::max(); // set it to something crazy so that something weird will happen if we use it in BASIC mode.
165 if (mode == OverlapsAnyMode::MAX_GAP) {
166 effective_query_start = safe_subtract_gap(query_start, *(params.max_gap));
167 } else if (mode == OverlapsAnyMode::MIN_OVERLAP) {
168 constexpr Position_ maxed = std::numeric_limits<Position_>::max();
169 if (maxed - params.min_overlap < query_start) {
170 // No point continuing as nothing will be found in the binary search.
171 return;
172 } else {
173 effective_query_start = query_start + params.min_overlap;
174 }
175 }
176
177 auto find_first_child = [&](Index_ children_start, Index_ children_end) -> Index_ {
178 auto ebegin = subject.ends.begin();
179 auto estart = ebegin + children_start;
180 auto eend = ebegin + children_end;
181 if (mode == OverlapsAnyMode::BASIC) {
182 return std::upper_bound(estart, eend, query_start) - ebegin;
183 } else {
184 return std::lower_bound(estart, eend, effective_query_start) - ebegin;
185 }
186 };
187
188 auto can_skip_search = [&](Position_ subject_start) -> bool {
189 if (mode == OverlapsAnyMode::BASIC) {
190 return subject_start > query_start;
191 } else {
192 return subject_start >= effective_query_start;
193 }
194 };
195
196 auto is_finished = [&](Position_ subject_start) -> bool {
197 if (mode == OverlapsAnyMode::BASIC) {
198 return subject_start >= query_end;
199 } else if (mode == OverlapsAnyMode::MAX_GAP) {
200 if (subject_start < query_end) {
201 return false;
202 }
203 return subject_start - query_end > *(params.max_gap);
204 } else { // i.e., mode == OverlapsAnyMode::MIN_OVERLAP
205 if (subject_start >= query_end) {
206 return true;
207 }
208 return query_end - subject_start < params.min_overlap;
209 }
210 };
211
212 Index_ root_child_at = 0;
213 bool root_skip_search = can_skip_search(subject.starts[0]);
214 if (!root_skip_search) {
215 root_child_at = find_first_child(0, subject.root_children);
216 }
217
218 workspace.history.clear();
219 while (1) {
220 Index_ current_subject;
221 bool skip_search;
222 if (workspace.history.empty()) {
223 if (root_child_at == subject.root_children || is_finished(subject.starts[root_child_at])) {
224 break;
225 }
226 current_subject = root_child_at;
227 skip_search = root_skip_search;
228 ++root_child_at;
229 } else {
230 auto& current_state = workspace.history.back();
231 if (current_state.child_at == current_state.child_end || is_finished(subject.starts[current_state.child_at])) {
232 workspace.history.pop_back();
233 continue;
234 }
235 current_subject = current_state.child_at;
236 skip_search = current_state.skip_search;
237 ++(current_state.child_at); // do this before the emplace_back(), otherwise the history might get reallocated and the reference would be dangling.
238 }
239
240 const auto& current_node = subject.nodes[current_subject];
241 if (mode == OverlapsAnyMode::MIN_OVERLAP) {
242 if (std::min(query_end, subject.ends[current_subject]) - std::max(query_start, subject.starts[current_subject]) < params.min_overlap) {
243 // No point continuing with the children, as all children will by definition have smaller overlaps and cannot satisfy `min_overlap`.
244 continue;
245 }
246 }
247
248 matches.push_back(current_node.id);
249 if (params.quit_on_first) {
250 return;
251 }
252 if (current_node.duplicates_start != current_node.duplicates_end) {
253 matches.insert(matches.end(), subject.duplicates.begin() + current_node.duplicates_start, subject.duplicates.begin() + current_node.duplicates_end);
254 }
255
256 if (current_node.children_start != current_node.children_end) {
257 if (skip_search) {
258 workspace.history.emplace_back(current_node.children_start, current_node.children_end, true);
259 } else {
260 Index_ start_pos = find_first_child(current_node.children_start, current_node.children_end);
261 if (start_pos != current_node.children_end) {
262 workspace.history.emplace_back(start_pos, current_node.children_end, can_skip_search(subject.starts[start_pos]));
263 }
264 }
265 }
266 }
267}
268
269}
270
271#endif
Build a nested containment list.
Header-only library for nested containment lists.
Definition build.hpp:17
void overlaps_any(const Nclist< Index_, Position_ > &subject, Position_ query_start, Position_ query_end, const OverlapsAnyParameters< Position_ > &params, OverlapsAnyWorkspace< Index_ > &workspace, std::vector< Index_ > &matches)
Definition overlaps_any.hpp:87
Pre-built nested containment list.
Definition build.hpp:28
Parameters for overlaps_any().
Definition overlaps_any.hpp:48
std::optional< Position_ > max_gap
Definition overlaps_any.hpp:56
bool quit_on_first
Definition overlaps_any.hpp:68
Position_ min_overlap
Definition overlaps_any.hpp:62
Workspace for overlaps_any().
Definition overlaps_any.hpp:27