nclist-cpp
C++ implementation of nested containment lists
Loading...
Searching...
No Matches
nearest.hpp
Go to the documentation of this file.
1#ifndef NCLIST_NEAREST_HPP
2#define NCLIST_NEAREST_HPP
3
4#include <vector>
5#include <algorithm>
6#include <optional>
7#include <limits>
8
9#include "build.hpp"
10#include "utils.hpp"
11
17namespace nclist {
18
26template<typename Index_>
31 struct State {
32 State() = default;
33 State(Index_ cat, Index_ cend, bool skip) : child_at(cat), child_end(cend), skip_search(skip) {}
34 Index_ child_at = 0, child_end = 0;
35 bool skip_search = false;
36 };
37 std::vector<State> history;
41};
42
47template<typename Position_>
53 bool quit_on_first = false;
54
61};
62
66template<typename Index_, typename Position_>
67void nearest_before(
68 const Nclist<Index_, Position_>& subject,
69 const Index_ root_index,
70 const Position_ end_position,
71 const bool quit_on_first,
72 std::vector<Index_>& matches)
73{
74 Index_ current_subject = root_index;
75 while (1) {
76 const auto& current_node = subject.nodes[current_subject];
77 matches.push_back(current_node.id);
78 if (quit_on_first) {
79 return;
80 }
81 if (current_node.duplicates_start != current_node.duplicates_end) {
82 matches.insert(matches.end(), subject.duplicates.begin() + current_node.duplicates_start, subject.duplicates.begin() + current_node.duplicates_end);
83 }
84 if (current_node.children_start == current_node.children_end) {
85 return;
86 }
87 current_subject = current_node.children_end - 1;
88 if (subject.ends[current_subject] != end_position) {
89 return;
90 }
91 }
92}
93
94template<typename Index_, typename Position_>
95void nearest_after(
96 const Nclist<Index_, Position_>& subject,
97 const Index_ root_index,
98 const Position_ start_position,
99 const bool quit_on_first,
100 std::vector<Index_>& matches)
101{
102 Index_ current_subject = root_index;
103 while (1) {
104 const auto& current_node = subject.nodes[current_subject];
105 matches.push_back(current_node.id);
106 if (quit_on_first) {
107 return;
108 }
109 if (current_node.duplicates_start != current_node.duplicates_end) {
110 matches.insert(matches.end(), subject.duplicates.begin() + current_node.duplicates_start, subject.duplicates.begin() + current_node.duplicates_end);
111 }
112 if (current_node.children_start == current_node.children_end) {
113 return;
114 }
115 current_subject = current_node.children_start;
116 if (subject.starts[current_subject] != start_position) {
117 return;
118 }
119 }
120}
121
122template<typename Index_, typename Position_>
123Index_ nearest_overlaps(
124 const Nclist<Index_, Position_>& subject,
125 const Position_ query_start,
126 const Position_ query_end,
127 const bool quit_on_first,
128 const bool adjacent_equals_overlap,
129 NearestWorkspace<Index_>& workspace,
130 std::vector<Index_>& matches)
131{
132 /****************************************
133 * # Default
134 *
135 * This section is the same as that of overlaps_any().
136 * Our aim is to find overlaps to a subject interval `i` where `subject_starts[i] < query_end` and `query_start < subject_ends[i]`.
137 *
138 * At each node of the NCList, we search for the first child where the `subject_ends` is greater than `query_start` (as ends are non-inclusive).
139 * Earlier "sibling" intervals cannot overlap with the query interval, nor can their children.
140 * We do so using a binary search (std::upper_bound) on `subject_ends` - recall that these are sorted for children of each node.
141 *
142 * We then iterate until the first interval `j` where `query_end <= subject_starts[j]`, at which point we stop.
143 * This is because `j`, the children of `j`, nor any sibling intervals after `j` can overlap with the query interval, so there's no point in traversing those nodes.
144 * Any subject interval encountered during this iteration is reported as an overlap, and we process its children in the same manner.
145 *
146 * For a modest efficiency boost, we consider the case where `query_start < subject_starts[i]` for node `i`.
147 * In such cases, `subject_ends` for all children of `i` must also satisfy `query_start < subject_ends[i]`, in which case the binary search can be skipped.
148 * Moreover, all descendent intervals of `i` must end after `query_start`, so the binary search can be skipped for the entire lineage of the NCList.
149 *
150 * # Adjacent == overlaps
151 *
152 * For historical reasons, we can also report intervals that are immediately adjacent to the query.
153 *
154 * At each node of the NCList, we check whether there is a child where `subject_ends == query_start`, i.e., the subject immediately precedes the query.
155 * This is quite easily accomplished as we already have the first child where `subject_ends > query_start`, so we just backtrack by 1 if available.
156 * We then recurse within the preceding-adjacent child to see if there are any descendents that are also immediately adjacent, see `nearest_before()`.
157 *
158 * Similarly, to find subjects that are immediately following the query, we check whether there is a child where `query_end == subject_starts`.
159 * We already do this check as part of the termination of the iterations, so it's not much extra work involved.
160 * We then recurse within the following-adjacent child to see if there are any descendents that are also immediately adjacent, see `nearest_after()`.
161 *
162 * If `query_start < subject_starts[i]` for node `i`, we can skip the check for preceding subjects.
163 * There are no subjects before the query at that level of the tree.
164 */
165
166 const auto find_first_child = [&](const Index_ children_start, const Index_ children_end) -> Index_ {
167 const auto ebegin = subject.ends.begin();
168 const auto estart = ebegin + children_start;
169 const auto eend = ebegin + children_end;
170 return std::upper_bound(estart, eend, query_start) - ebegin;
171 };
172
173 const auto can_skip_search = [&](const Position_ subject_start) -> bool {
174 return subject_start > query_start;
175 };
176
177 const auto is_finished = [&](const Position_ subject_start) -> bool {
178 return subject_start >= query_end;
179 };
180
181 Index_ root_child_at = 0;
182 const bool root_skip_search = can_skip_search(subject.starts[0]);
183 if (!root_skip_search) {
184 root_child_at = find_first_child(0, subject.root_children);
185 if (adjacent_equals_overlap && root_child_at) {
186 const Index_ previous_child = root_child_at - 1;
187 if (query_start == subject.ends[previous_child]) {
188 nearest_before(subject, previous_child, query_start, quit_on_first, matches);
189 if (quit_on_first && !matches.empty()) {
190 return root_child_at;
191 }
192 }
193 }
194 }
195
196 workspace.history.clear();
197 while (1) {
198 Index_ current_subject;
199 bool skip_search;
200
201 if (workspace.history.empty()) {
202 if (root_child_at == subject.root_children) {
203 break;
204 } else {
205 const Position_ next_start = subject.starts[root_child_at];
206 if (is_finished(next_start)) {
207 if (adjacent_equals_overlap && next_start == query_end) {
208 nearest_after(subject, root_child_at, query_end, quit_on_first, matches);
209 // No need to return early if `quit_on_first = true`, we're already breaking out of the loop anyway.
210 }
211 break;
212 }
213 }
214 current_subject = root_child_at;
215 skip_search = root_skip_search;
216 ++root_child_at;
217
218 } else {
219 auto& current_state = workspace.history.back();
220 if (current_state.child_at == current_state.child_end) {
221 workspace.history.pop_back();
222 continue;
223 } else {
224 const Position_ next_start = subject.starts[current_state.child_at];
225 if (is_finished(next_start)) {
226 if (adjacent_equals_overlap && next_start == query_end) {
227 nearest_after(subject, current_state.child_at, query_end, false, matches);
228 // `quit_on_first` must be false because, if it were true, we would have returned in a previous iteration;
229 // we can't get to this point in the function without overlapping a parent interval.
230 }
231 workspace.history.pop_back();
232 continue;
233 }
234 }
235 current_subject = current_state.child_at;
236 skip_search = current_state.skip_search;
237 ++(current_state.child_at); // do this before the emplace_back(), otherwise the history might get reallocated and the reference would be dangling.
238 }
239
240 const auto& current_node = subject.nodes[current_subject];
241
242 matches.push_back(current_node.id);
243 if (quit_on_first) {
244 break;
245 }
246 if (current_node.duplicates_start != current_node.duplicates_end) {
247 matches.insert(matches.end(), subject.duplicates.begin() + current_node.duplicates_start, subject.duplicates.begin() + current_node.duplicates_end);
248 }
249
250 if (current_node.children_start != current_node.children_end) {
251 if (skip_search) {
252 workspace.history.emplace_back(current_node.children_start, current_node.children_end, true);
253 } else {
254 const Index_ start_pos = find_first_child(current_node.children_start, current_node.children_end);
255 if (adjacent_equals_overlap && start_pos > current_node.children_start) {
256 const Index_ previous_child = start_pos - 1;
257 if (query_start == subject.ends[previous_child]) {
258 nearest_before(subject, previous_child, query_start, false, matches);
259 // `quit_on_first` must be false because, if it were true, we would have returned already;
260 // we can't get to this point in the function without overlapping a parent interval.
261 }
262 }
263 if (start_pos != current_node.children_end) {
264 workspace.history.emplace_back(start_pos, current_node.children_end, can_skip_search(subject.starts[start_pos]));
265 }
266 }
267 }
268 }
269
270 return root_child_at;
271}
298template<typename Index_, typename Position_>
300 const Nclist<Index_, Position_>& subject,
301 const Position_ query_start,
302 const Position_ query_end,
303 const NearestParameters<Position_>& params,
304 NearestWorkspace<Index_>& workspace,
305 std::vector<Index_>& matches)
306{
307 matches.clear();
308 if (subject.root_children == 0) {
309 return;
310 }
311
312 const auto root_index = nearest_overlaps(
313 subject,
314 query_start,
315 query_end,
316 params.quit_on_first,
318 workspace,
319 matches
320 );
321 if (!matches.empty()) {
322 return;
323 }
324
325 /****************************************
326 * If there are no overlaps, checking the distance to the first subject interval before or after the query.
327 *
328 * `root_index` is initially defined in `nearest_overlaps()` as the upper bound of `query_start` on the subject ends of the root node of the NCList.
329 * If there are no overlaps, this definition won`t change and the same value will be returned to `nearest()`.
330 * At this point, `root_index` defines the first subject interval on the root node that starts after the query.
331 * Conversely, `root_index - 1` is the last subject interval on the root node that ends before the query.
332 *
333 * Thus, to find the nearest following interval, only the `root_index`-th interval on the root node needs to be considered as it will have the start closest to `query_end`.
334 * All later intervals on the root node will have later starts, otherwise they would be nested within the `root_index`-th interval.
335 * We still need to check the descendents of the `root_index`-th interval but we only need to care about the first child at each level of the lineage.
336 * In particular, we check whether it has the same start coordinate as its parent;
337 * if not, we know that the rest of that lineage will start after the `root_index`-th interval and we terminate the search.
338 *
339 * Similarly, to find the nearest preceding interval, only the `root_index - 1`-th interval on the root node needs to be considered as it will have the end closest to `query_start`.
340 * All earlier intervals on the root node will have earlier ends, otherwise they would be nested within the `root_index - 1`-th interval.
341 * We still need to check the descendents of the `root_index - 1`-th interval but we only need to care about the last child at each level of the lineage.
342 * In particular, we check whether it has the same end coordinate as its parent;
343 * if not, we know that the rest of that lineage will end before the `root_index - 1`-th interval and we terminate the search.
344 *
345 * Note that this function behaves a little strangely when a zero-width query is supplied and there is an identical zero-width subject interval.
346 * Specifically, the latter may be treated as preceding or following the former, depending on whether they are nested in other subject intervals.
347 * Fortunately, this doesn`t have any bearing on the final outcome as it will still be the nearest interval to the query in either case.
348 *
349 * If `adjacent_equals_overlap = true`, the same logic applies as we still don't have any overlaps if we get to this part of the function.
350 * The only difference is that our gaps are now guaranteed to be non-zero if no adjacent/overlapping intervals were found in `nearest_overlaps()`.
351 *
352 ****************************************/
353
354 std::optional<Position_> to_previous, to_next;
355 if (root_index) {
356 to_previous = query_start - subject.ends[root_index - 1];
357 }
358 if (root_index < subject.root_children) {
359 to_next = subject.starts[root_index] - query_end;
360 }
361
362 if (to_previous.has_value() && (!to_next.has_value() || *to_previous <= *to_next)) {
363 const Index_ previous_child = root_index - 1;
364 nearest_before(subject, previous_child, subject.ends[previous_child], params.quit_on_first, matches);
365 if (!matches.empty() && params.quit_on_first) {
366 return;
367 }
368 }
369 if (to_next.has_value() && (!to_previous.has_value() || *to_next <= *to_previous)) {
370 nearest_after(subject, root_index, subject.starts[root_index], params.quit_on_first, matches);
371 if (!matches.empty() && params.quit_on_first) {
372 return;
373 }
374 }
375}
376
377}
378
379#endif
Build a nested containment list.
Header-only library for nested containment lists.
Definition build.hpp:17
void nearest(const Nclist< Index_, Position_ > &subject, const Position_ query_start, const Position_ query_end, const NearestParameters< Position_ > &params, NearestWorkspace< Index_ > &workspace, std::vector< Index_ > &matches)
Definition nearest.hpp:299
Pre-built nested containment list.
Definition build.hpp:28
Parameters for nearest().
Definition nearest.hpp:48
bool adjacent_equals_overlap
Definition nearest.hpp:60
bool quit_on_first
Definition nearest.hpp:53
Workspace for nearest().
Definition nearest.hpp:27