nclist-cpp
C++ implementation of nested containment lists
Loading...
Searching...
No Matches
overlaps_equal.hpp
Go to the documentation of this file.
1#ifndef NCLIST_OVERLAPS_EQUAL_HPP
2#define NCLIST_OVERLAPS_EQUAL_HPP
3
4#include <vector>
5#include <algorithm>
6
7#include "build.hpp"
8#include "utils.hpp"
9
15namespace nclist {
16
24template<typename Index_>
29 struct State {
30 State() = default;
31 State(Index_ cat, Index_ cend) : child_at(cat), child_end(cend) {}
32 Index_ child_at = 0, child_end = 0;
33 };
34 std::vector<State> history;
38};
39
44template<typename Position_>
50 Position_ max_gap = 0;
51
56 Position_ min_overlap = 0;
57
62 bool quit_on_first = false;
63};
64
65
83template<typename Index_, typename Position_>
85 const Nclist<Index_, Position_>& subject,
86 Position_ query_start,
87 Position_ query_end,
90 std::vector<Index_>& matches)
91{
92 matches.clear();
93 if (subject.root_children == 0) {
94 return;
95 }
96
97 /****************************************
98 * # Default
99 *
100 * Our aim is to find overlaps to a subject interval `i` where `subject_starts[i] == query_start` and `subject_ends[i] == query_end`.
101 *
102 * At each node of the NCList, we search for the first child where the `subject_ends` is greater or equal to `query_end`.
103 * Earlier "sibling" intervals must have earlier end positions that cannot be equal to that of the query interval - nor can their children.
104 * We do so using a binary search (std::lower_bound) on `subject_ends` - recall that these are sorted for children of each node.
105 *
106 * We then iterate until the first interval `j` where `query_start < subject_starts[j]`, at which point we stop.
107 * None of `j`, the children of `j`, nor any sibling intervals after `j` can have an start position equal to the query interval, so there's no point in traversing those nodes.
108 * Any subject interval encountered during this iteration with equal start/end positions to the query is reported in `matches`.
109 * Note that we can return early if we find a matching interval, as exactly one node will contain all duplicate intervals with the same start/end positions, so there's no point continuing.
110 * Otherwise, we repeat the search on the children of all subject intervals encountered during iteration, as one of them might have equal start/end positions.
111 *
112 * Unlike `overlaps_any()`, there is no ability to skip the binary search for the descendents of a node.
113 * Sure, the start positions of all descendents are no less than the node's subject interval's start position,
114 * but this doesn't say much about the comparison between the subject and query end positions.
115 *
116 * # Max gap
117 *
118 * Here, our aim is to find subject intervals where `abs(query_end - subject_ends[i]) <= max_gap` and `abs(query_start - subject_starts[i]) <= max_gap`.
119 * This follows much the same logic as in the default case with some adjustments:
120 *
121 * - We consider an "effective" query end as `effective_query_end = query_end - max_gap`.
122 * - We then perform a binary search to find the first `subject_ends` that is greater than or equal to `effective_query_start`.
123 * - We stop iteration once `subject_starts[j] - query_start > max_gap`, after which we know that all children's starts must lie beyond `max_gap`.
124 * - Any subject interval encountered during this iteration with start/end positions that satisfies `max_gap` is reported in `matches`.
125 * - We do not return early if we find a matching interval, as its children or siblings can still match within `max_gap`.
126 *
127 * # Min overlap
128 *
129 * Here, we apply the extra restriction that the overlapping subinterval must have length greater than `min_overlap`.
130 * This follows the same logic as the default case, with some modifications:
131 *
132 * - We stop iteration once `query_end - subject_starts[j] < min_overlap`, after which we know that all children cannot achieve `min_overlap`.
133 * - We apply an extra check on each subject interval to determine if it achieves `min_overlap` before reporting it in `matches`.
134 * - If the length of the overlapping subinterval is less than `min_overlap`, we do not report the subject interval in `matches`.
135 * Morever, we skip traversal of that node's children, as the children must be smaller and will not satisfy `min_overlap` anyway.
136 *
137 * We return early if the length of the query itself is less than `min_overlap`, as no overlap will be satisfactory.
138 *
139 * Note that we do not use an effective query end for the binary search.
140 * The comparison between query/subject ends doesn't say anything about the length of the overlap subinterval.
141 *
142 * These modifications are mostly orthogonal to those required when `max_gap > 0`, so can be combined without much effort.
143 * We stop iterations when either of the stopping criteria for `max_gap` or `min_overlap` are satisfied.
144 *
145 ****************************************/
146
147 if (params.min_overlap > 0) {
148 if (query_end - query_start < params.min_overlap) {
149 return;
150 }
151 }
152
153 Position_ effective_query_end = query_end;
154 if (params.max_gap > 0) {
155 effective_query_end = safe_subtract_gap(query_end, params.max_gap);
156 }
157
158 auto find_first_child = [&](Index_ children_start, Index_ children_end) -> Index_ {
159 auto ebegin = subject.ends.begin();
160 auto estart = ebegin + children_start;
161 auto eend = ebegin + children_end;
162 return std::lower_bound(estart, eend, effective_query_end) - ebegin;
163 };
164
165 auto is_finished = [&](Position_ subject_start) -> bool {
166 if (subject_start > query_start) {
167 if (params.max_gap > 0) {
168 if (subject_start - query_start > params.max_gap) {
169 return true;
170 }
171 } else {
172 return true;
173 }
174
175 if (params.min_overlap > 0) {
176 if (subject_start >= query_end || query_end - subject_start < params.min_overlap) {
177 return true;
178 }
179 }
180 } else {
181 if (params.min_overlap > 0) {
182 // if query_start >= subject_start, then query_end >=
183 // subject_start as well, so the LHS will be non-negative.
184 if (query_end - subject_start < params.min_overlap) {
185 return true;
186 }
187 }
188 }
189
190 return false;
191 };
192
193 Index_ root_child_at = find_first_child(0, subject.root_children);
194
195 workspace.history.clear();
196 while (1) {
197 Index_ current_subject;
198 if (workspace.history.empty()) {
199 if (root_child_at == subject.root_children || is_finished(subject.starts[root_child_at])) {
200 break;
201 }
202 current_subject = root_child_at;
203 ++root_child_at;
204 } else {
205 auto& current_state = workspace.history.back();
206 if (current_state.child_at == current_state.child_end || is_finished(subject.starts[current_state.child_at])) {
207 workspace.history.pop_back();
208 continue;
209 }
210 current_subject = current_state.child_at;
211 ++(current_state.child_at); // do this before the emplace_back(), otherwise the history might get reallocated and the reference would be dangling.
212 }
213
214 const auto& current_node = subject.nodes[current_subject];
215 auto subject_start = subject.starts[current_subject];
216 auto subject_end = subject.ends[current_subject];
217
218 if (params.min_overlap > 0) {
219 auto common_end = std::min(subject_end, query_end);
220 auto common_start = std::max(subject_start, query_start);
221 if (common_end <= common_start || common_end - common_start < params.min_overlap) {
222 // No point processing the children if the minimum overlap isn't satisified.
223 continue;
224 }
225 }
226
227 // Even if the current subject interval isn't a match, its children might still be okay, so we have to keep going.
228 bool okay = true;
229 if (params.max_gap > 0) {
230 okay = !diff_above_gap(query_start, subject_start, params.max_gap) && !diff_above_gap(query_end, subject_end, params.max_gap);
231 } else {
232 okay = (subject_start == query_start && subject_end == query_end);
233 }
234
235 if (okay) {
236 matches.push_back(current_node.id);
237 if (params.quit_on_first) {
238 return;
239 }
240 if (current_node.duplicates_start != current_node.duplicates_end) {
241 matches.insert(matches.end(), subject.duplicates.begin() + current_node.duplicates_start, subject.duplicates.begin() + current_node.duplicates_end);
242 }
243 if (params.max_gap == 0) { // no need to continue traversal, there should only be one node that is exactly equal.
244 return;
245 }
246 }
247
248 if (current_node.children_start != current_node.children_end) {
249 Index_ start_pos = find_first_child(current_node.children_start, current_node.children_end);
250 if (start_pos != current_node.children_end) {
251 workspace.history.emplace_back(start_pos, current_node.children_end);
252 }
253 }
254 }
255}
256
257}
258
259#endif
Build a nested containment list.
Header-only library for nested containment lists.
Definition build.hpp:17
void overlaps_equal(const Nclist< Index_, Position_ > &subject, Position_ query_start, Position_ query_end, const OverlapsEqualParameters< Position_ > &params, OverlapsEqualWorkspace< Index_ > &workspace, std::vector< Index_ > &matches)
Definition overlaps_equal.hpp:84
Pre-built nested containment list.
Definition build.hpp:28
Parameters for overlaps_equal().
Definition overlaps_equal.hpp:45
Position_ max_gap
Definition overlaps_equal.hpp:50
Position_ min_overlap
Definition overlaps_equal.hpp:56
bool quit_on_first
Definition overlaps_equal.hpp:62
Workspace for overlaps_equal().
Definition overlaps_equal.hpp:25