nclist-cpp
C++ implementation of nested containment lists
Loading...
Searching...
No Matches
overlaps_end.hpp
Go to the documentation of this file.
1#ifndef NCLIST_OVERLAPS_END_HPP
2#define NCLIST_OVERLAPS_END_HPP
3
4#include <vector>
5#include <algorithm>
6#include <optional>
7#include <limits>
8
9#include "build.hpp"
10#include "utils.hpp"
11
17namespace nclist {
18
26template<typename Index_>
31 struct State {
32 State() = default;
33 State(Index_ cat, Index_ cend) : child_at(cat), child_end(cend) {}
34 Index_ child_at = 0, child_end = 0;
35 };
36 std::vector<State> history;
40};
41
46template<typename Position_>
52 Position_ max_gap = 0;
53
58 Position_ min_overlap = 0;
59
64 bool quit_on_first = false;
65};
66
82template<typename Index_, typename Position_>
84 const Nclist<Index_, Position_>& subject,
85 Position_ query_start,
86 Position_ query_end,
89 std::vector<Index_>& matches)
90{
91 matches.clear();
92 if (subject.root_children == 0) {
93 return;
94 }
95
96 /****************************************
97 * # Default
98 *
99 * Our aim is to find overlaps to a subject interval `i` where `subject_ends[i] == query_end`.
100 *
101 * At each node of the NCList, we search for the first child where the `subject_ends` is greater than or equal to `query_end`.
102 * Earlier "sibling" intervals must have earlier end positions that cannot be equal to that of the query interval - nor can their children.
103 * We do so using a binary search (std::lower_bound) on `subject_ends` - recall that these are sorted for children of each node.
104 *
105 * We then iterate until the first interval `j` where `query_end < subject_starts[j]`, at which point we stop.
106 * None of `j`, the children of `j`, nor any sibling intervals after `j` can have an end position equal to the query interval, so there's no point in traversing those nodes.
107 * Any subject interval encountered during this iteration with an equal end position to the query is reported in `matches`.
108 * Regardless of whether the end position is equal, we repeat the search on the children of every subject intervals encountered during iteration, as one of them might have an equal end position.
109 *
110 * Unlike `overlaps_any()`, there is no ability to skip the binary search for the descendents of a node.
111 * Sure, the start positions of all descendents are no less than the node's subject interval's start position,
112 * but this doesn't say much about the comparison between the subject and query end positions.
113 *
114 * # Max gap
115 *
116 * Here, our aim is to find subject intervals where `abs(query_end - subject_ends[i]) <= max_gap`.
117 * This follows much the same logic as in the default case with some adjustments:
118 *
119 * - We consider an "effective" query end as `effective_query_end = query_end - max_gap`.
120 * - We then perform a binary search to find the first `subject_ends` that is greater than or equal to `effective_query_end`.
121 * This is the earliest subject interval that has an endpoint (or has children with an endpoint) that lies within `max_gap` of the query end.
122 * - We stop iteration once `subject_starts[j] - query_end > max_gap`, after which we know that all endpoints of siblings/children must lie beyond `max_gap`.
123 * - Any subject interval encountered during this iteration with an end position that satisfies `max_gap` is reported in `matches`.
124 *
125 * # Min overlap
126 *
127 * Here, we apply the extra restriction that the overlapping subinterval must have length greater than `min_overlap`.
128 * This follows the same logic as the default case, with some modifications:
129 *
130 * - We stop iteration once `query_end - subject_starts[j] < min_overlap`, after which we know that all children cannot achieve `min_overlap`.
131 * - If the length of the overlapping subinterval is less than `min_overlap`, we do not report the subject interval in `matches`.
132 * Additionally, we skip traversal of that node's children, as the children must be smaller and will not satisfy `min_overlap` anyway.
133 *
134 * We return early if the length of the query itself is less than `min_overlap`, as no overlap will be satisfactory.
135 *
136 * Note that we do not use an effective query end for the binary search.
137 * The comparison between query/subject ends doesn't say anything about the length of the overlap subinterval.
138 *
139 * These modifications are mostly orthogonal to those required when `max_gap > 0`.
140 * We stop iterations when either of the stopping criteria for `max_gap` or `min_overlap` are satisfied.
141 *
142 ****************************************/
143
144 if (params.min_overlap > 0) {
145 if (query_end - query_start < params.min_overlap) {
146 return;
147 }
148 }
149
150 Position_ effective_query_end = query_end;
151 if (params.max_gap > 0) {
152 effective_query_end = safe_subtract_gap(query_end, params.max_gap);
153 }
154
155 auto find_first_child = [&](Index_ children_start, Index_ children_end) -> Index_ {
156 auto ebegin = subject.ends.begin();
157 auto estart = ebegin + children_start;
158 auto eend = ebegin + children_end;
159 return std::lower_bound(estart, eend, effective_query_end) - ebegin;
160 };
161
162 auto is_finished = [&](Position_ subject_start) -> bool {
163 if (subject_start > query_end) {
164 if (params.max_gap == 0) {
165 return true;
166 }
167 if (params.min_overlap > 0) {
168 return true;
169 }
170 if (subject_start - query_end > params.max_gap) {
171 return true;
172 }
173 } else {
174 if (params.min_overlap > 0) {
175 if (query_end - subject_start < params.min_overlap) {
176 return true;
177 }
178 }
179 }
180 return false;
181 };
182
183 Index_ root_child_at = find_first_child(0, subject.root_children);
184
185 workspace.history.clear();
186 while (1) {
187 Index_ current_subject;
188 if (workspace.history.empty()) {
189 if (root_child_at == subject.root_children || is_finished(subject.starts[root_child_at])) {
190 break;
191 }
192 current_subject = root_child_at;
193 ++root_child_at;
194 } else {
195 auto& current_state = workspace.history.back();
196 if (current_state.child_at == current_state.child_end || is_finished(subject.starts[current_state.child_at])) {
197 workspace.history.pop_back();
198 continue;
199 }
200 current_subject = current_state.child_at;
201 ++(current_state.child_at); // do this before the emplace_back(), otherwise the history might get reallocated and the reference would be dangling.
202 }
203
204 const auto& current_node = subject.nodes[current_subject];
205 auto subject_start = subject.starts[current_subject];
206 auto subject_end = subject.ends[current_subject];
207
208 if (params.min_overlap > 0) {
209 auto common_end = std::min(subject_end, query_end);
210 auto common_start = std::max(subject_start, query_start);
211 if (common_end <= common_start || common_end - common_start < params.min_overlap) {
212 // No point processing the children if the minimum overlap isn't satisified.
213 continue;
214 }
215 }
216
217 // Even if the current subject interval isn't a match, its children might still be okay, so we have to keep going.
218 bool okay;
219 if (params.max_gap == 0) {
220 okay = (subject_end == query_end);
221 } else {
222 okay = !diff_above_gap(query_end, subject_end, params.max_gap);
223 }
224
225 if (okay) {
226 matches.push_back(current_node.id);
227 if (params.quit_on_first) {
228 return;
229 }
230 if (current_node.duplicates_start != current_node.duplicates_end) {
231 matches.insert(matches.end(), subject.duplicates.begin() + current_node.duplicates_start, subject.duplicates.begin() + current_node.duplicates_end);
232 }
233 }
234
235 if (current_node.children_start != current_node.children_end) {
236 Index_ start_pos = find_first_child(current_node.children_start, current_node.children_end);
237 if (start_pos != current_node.children_end) {
238 workspace.history.emplace_back(start_pos, current_node.children_end);
239 }
240 }
241 }
242}
243
244}
245
246#endif
Build a nested containment list.
Header-only library for nested containment lists.
Definition build.hpp:17
void overlaps_end(const Nclist< Index_, Position_ > &subject, Position_ query_start, Position_ query_end, const OverlapsEndParameters< Position_ > &params, OverlapsEndWorkspace< Index_ > &workspace, std::vector< Index_ > &matches)
Definition overlaps_end.hpp:83
Pre-built nested containment list.
Definition build.hpp:28
Parameters for overlaps_end().
Definition overlaps_end.hpp:47
bool quit_on_first
Definition overlaps_end.hpp:64
Position_ min_overlap
Definition overlaps_end.hpp:58
Position_ max_gap
Definition overlaps_end.hpp:52
Workspace for overlaps_end().
Definition overlaps_end.hpp:27