nclist-cpp
C++ implementation of nested containment lists
Loading...
Searching...
No Matches
overlaps_start.hpp
Go to the documentation of this file.
1#ifndef NCLIST_OVERLAPS_START_HPP
2#define NCLIST_OVERLAPS_START_HPP
3
4#include <vector>
5#include <algorithm>
6#include <optional>
7#include <limits>
8
9#include "build.hpp"
10#include "utils.hpp"
11
17namespace nclist {
18
26template<typename Index_>
31 struct State {
32 State() = default;
33 State(Index_ cat, Index_ cend, bool skip) : child_at(cat), child_end(cend), skip_search(skip) {}
34 Index_ child_at = 0, child_end = 0;
35 bool skip_search = false;
36 };
37 std::vector<State> history;
41};
42
47template<typename Position_>
53 Position_ max_gap = 0;
54
59 Position_ min_overlap = 0;
60
65 bool quit_on_first = false;
66};
67
83template<typename Index_, typename Position_>
85 const Nclist<Index_, Position_>& subject,
86 Position_ query_start,
87 Position_ query_end,
90 std::vector<Index_>& matches)
91{
92 matches.clear();
93 if (subject.root_children == 0) {
94 return;
95 }
96
97 /****************************************
98 * # Default
99 *
100 * Our aim is to find overlaps to a subject interval `i` where `subject_start[i] == query_start`.
101 *
102 * At each node of the NCList, we search for the first child where the `subject_ends` is greater than or equal to `query_start`.
103 * Earlier "sibling" intervals must have earlier start positions that cannot be equal to that of the query interval - nor can their children.
104 * We do so using a binary search (std::lower_bound) on `subject_ends` - recall that these are sorted for children of each node.
105 *
106 * We then iterate until the first interval `j` where `query_start < subject_starts[j]`, at which point we stop.
107 * None of `j`, the children of `j`, nor any sibling intervals after `j` can have a start position equal to the query interval, so there's no point in traversing those nodes.
108 * Any subject interval encountered during this iteration with an equal start position to the query is reported in `matches`.
109 * Regardless of whether the start position is equal, we repeat the search on the children of all subject intervals encountered during iteration, as one of them might have an equal start position.
110 *
111 * For a modest efficiency boost, we consider the case where `query_start <= subject_starts[i]` for node `i`.
112 * In such cases, `subject_ends` for all children of `i` must also satisfy `query_start <= subject_ends[i]`, in which case the binary search can be skipped.
113 * Moreover, all descendent intervals of `i` must end after `query_start`, so the binary search can be skipped for the entire lineage of the NCList.
114 *
115 * # Max gap
116 *
117 * Here, our aim is to find subject intervals where `abs(query_start - subject_starts[i]) <= max_gap`.
118 * This follows much the same logic as in the default case with some adjustments:
119 *
120 * - We consider an "effective" query start as `effective_query_start = query_start - max_gap`.
121 * - We then perform a binary search to find the first `subject_ends` that is greater than or equal to `effective_query_start`.
122 * This is the earlier subject interval that has a start position (or has children with a start position) within `max_gap` of the query start.
123 * - Similarly, we check if `effective_query_start <= subject_starts[i]` to determine whether to skip the binary search.
124 * - We stop iteration once `subject_starts[j] - query_start > max_gap`, after which we know that all start positions of siblings/children must lie beyond `max_gap`.
125 * - Any subject interval encountered during this iteration with a start position that satisfies `max_gap` is reported in `matches`.
126 *
127 * # Min overlap
128 *
129 * Here, we apply the extra restriction that the overlapping subinterval must have length greater than `min_overlap`.
130 * This follows the same logic as the default case, with some modifications:
131 *
132 * - We consider an "effective" query start as `effective_query_start = query_start + min_overlap`.
133 * This defines the earliest entry of `subject_ends` that still could provide an overlap of at least `min_overlap`.
134 * - We perform a binary search to find the first `subject_ends` that is greater than or equal to `effective_query_start`.
135 * - Similarly, we check if `effective_query_start <= subject_starts[i]` to determine whether to skip the binary search.
136 * - We stop iteration once `query_end - subject_starts[j] < min_overlap`, after which we know that all children cannot achieve `min_overlap`.
137 * - If the length of the overlapping subinterval is less than `min_overlap`, we do not report the subject interval in `matches`.
138 * Additionally, we skip traversal of that node's children, as the children must be smaller and will not satisfy `min_overlap` anyway.
139 *
140 * We return early if the length of the query itself is less than `min_overlap`, as no overlap will be satisfactory.
141 *
142 * Note that we do not use an effective query end for the binary search.
143 * The comparison between query/subject ends doesn't say anything about the length of the overlap subinterval.
144 *
145 * These modifications are mostly orthogonal to those required when `max_gap > 0`, so can be combined without much effort.
146 * We stop iterations when either of the stopping criteria for `max_gap` or `min_overlap` are satisfied.
147 * For the effective query start, the definition from `min_overlaps` will take precedence as it is more stringent, i.e., restricts more of the search space.
148 *
149 ****************************************/
150
151 if (params.min_overlap > 0) {
152 if (query_end - query_start < params.min_overlap) {
153 return;
154 }
155 }
156
157 Position_ effective_query_start = query_start;
158 if (params.min_overlap > 0) {
159 constexpr Position_ maxed = std::numeric_limits<Position_>::max();
160 if (maxed - params.min_overlap < query_start) {
161 return; // No point continuing as nothing will be found in the binary search.
162 }
163 effective_query_start = query_start + params.min_overlap;
164 } else if (params.max_gap > 0) {
165 effective_query_start = safe_subtract_gap(query_start, params.max_gap);
166 }
167
168 auto find_first_child = [&](Index_ children_start, Index_ children_end) -> Index_ {
169 auto ebegin = subject.ends.begin();
170 auto estart = ebegin + children_start;
171 auto eend = ebegin + children_end;
172 return std::lower_bound(estart, eend, effective_query_start) - ebegin;
173 };
174
175 auto skip_binary_search = [&](Position_ subject_start) -> bool {
176 return subject_start >= effective_query_start;
177 };
178
179 auto is_finished = [&](Position_ subject_start) -> bool {
180 if (subject_start > query_start) {
181 if (params.max_gap == 0) {
182 return true;
183 }
184 if (subject_start - query_start > params.max_gap) {
185 return true;
186 }
187 if (params.min_overlap > 0) {
188 if (subject_start >= query_end || query_end - subject_start < params.min_overlap) {
189 return true;
190 }
191 }
192 } else {
193 if (params.min_overlap > 0) {
194 // if query_start >= subject_start, then query_end >=
195 // subject_start as well, so the LHS will be non-negative.
196 if (query_end - subject_start < params.min_overlap) {
197 return true;
198 }
199 }
200 }
201
202 return false;
203 };
204
205 Index_ root_child_at = 0;
206 bool root_skip_search = skip_binary_search(subject.starts[0]);
207 if (!root_skip_search) {
208 root_child_at = find_first_child(0, subject.root_children);
209 }
210
211 workspace.history.clear();
212 while (1) {
213 Index_ current_subject;
214 bool skip_search;
215 if (workspace.history.empty()) {
216 if (root_child_at == subject.root_children || is_finished(subject.starts[root_child_at])) {
217 break;
218 }
219 current_subject = root_child_at;
220 skip_search = root_skip_search;
221 ++root_child_at;
222 } else {
223 auto& current_state = workspace.history.back();
224 if (current_state.child_at == current_state.child_end || is_finished(subject.starts[current_state.child_at])) {
225 workspace.history.pop_back();
226 continue;
227 }
228 current_subject = current_state.child_at;
229 skip_search = current_state.skip_search;
230 ++(current_state.child_at); // do this before the emplace_back(), otherwise the history might get reallocated and the reference would be dangling.
231 }
232
233 const auto& current_node = subject.nodes[current_subject];
234 auto subject_start = subject.starts[current_subject];
235 auto subject_end = subject.ends[current_subject];
236
237 if (params.min_overlap > 0) {
238 auto common_end = std::min(subject_end, query_end);
239 auto common_start = std::max(subject_start, query_start);
240 if (common_end <= common_start || common_end - common_start < params.min_overlap) {
241 // No point processing the children if the minimum overlap isn't satisified.
242 continue;
243 }
244 }
245
246 // Even if the current subject interval isn't a match, its children might still be okay, so we have to keep going.
247 bool okay;
248 if (params.max_gap > 0) {
249 okay = !diff_above_gap(query_start, subject_start, params.max_gap);
250 } else {
251 okay = (subject_start == query_start);
252 }
253 if (okay) {
254 matches.push_back(current_node.id);
255 if (params.quit_on_first) {
256 return;
257 }
258 if (current_node.duplicates_start != current_node.duplicates_end) {
259 matches.insert(matches.end(), subject.duplicates.begin() + current_node.duplicates_start, subject.duplicates.begin() + current_node.duplicates_end);
260 }
261 }
262
263 if (current_node.children_start != current_node.children_end) {
264 if (skip_search) {
265 workspace.history.emplace_back(current_node.children_start, current_node.children_end, true);
266 } else {
267 Index_ start_pos = find_first_child(current_node.children_start, current_node.children_end);
268 if (start_pos != current_node.children_end) {
269 workspace.history.emplace_back(start_pos, current_node.children_end, skip_binary_search(subject.starts[start_pos]));
270 }
271 }
272 }
273 }
274}
275
276}
277
278#endif
Build a nested containment list.
Header-only library for nested containment lists.
Definition build.hpp:17
void overlaps_start(const Nclist< Index_, Position_ > &subject, Position_ query_start, Position_ query_end, const OverlapsStartParameters< Position_ > &params, OverlapsStartWorkspace< Index_ > &workspace, std::vector< Index_ > &matches)
Definition overlaps_start.hpp:84
Pre-built nested containment list.
Definition build.hpp:28
Parameters for overlaps_start().
Definition overlaps_start.hpp:48
Position_ min_overlap
Definition overlaps_start.hpp:59
bool quit_on_first
Definition overlaps_start.hpp:65
Position_ max_gap
Definition overlaps_start.hpp:53
Workspace for overlaps_start().
Definition overlaps_start.hpp:27