nclist-cpp
C++ implementation of nested containment lists
Loading...
Searching...
No Matches
overlaps_extend.hpp
Go to the documentation of this file.
1#ifndef NCLIST_OVERLAPS_EXTEND_HPP
2#define NCLIST_OVERLAPS_EXTEND_HPP
3
4#include <vector>
5#include <algorithm>
6#include <optional>
7#include <limits>
8
9#include "build.hpp"
10
16namespace nclist {
17
25template<typename Index_>
30 struct State {
31 State() = default;
32 State(Index_ cat, Index_ cend, bool skip) : child_at(cat), child_end(cend), skip_search(skip) {}
33 Index_ child_at = 0, child_end = 0;
34 bool skip_search = false;
35 };
36 std::vector<State> history;
40};
41
46template<typename Position_>
53 std::optional<Position_> max_gap; // can't default to -1 as Position_ might be unsigned.
54
59 Position_ min_overlap = 0;
60
65 bool quit_on_first = false;
66};
67
68
84template<typename Index_, typename Position_>
86 const Nclist<Index_, Position_>& subject,
87 Position_ query_start,
88 Position_ query_end,
91 std::vector<Index_>& matches)
92{
93 matches.clear();
94 if (subject.root_children == 0) {
95 return;
96 }
97
98 /****************************************
99 * # Default
100 *
101 * Our aim is to find overlaps to a subject interval `i` where `subject_ends[i] <= query_end` and `query_start <= subject_starts[i]`.
102 *
103 * At each node of the NCList, we search for the first child where the `subject_ends` is greater than or equal to `query_start`.
104 * Earlier "sibling" intervals and their children must have start positions that are earlier than the query interval, and thus cannot satisfy the constraint above.
105 * We do so using a binary search (std::lower_bound) on `subject_ends` - recall that these are sorted for children of each node.
106 *
107 * We then iterate until the first interval `j` where `query_end < subject_starts[j]`, at which point we stop.
108 * None of `j`, the children of `j`, nor any sibling intervals after `j` can have an end position before `query_end`, so there's no point in traversing those nodes.
109 * Any subject interval encountered during this iteration that is enclosed by the query is reported in `matches`.
110 * Regardless of whether the query encloses the subject, we repeat the search on the children of all subject intervals encountered during iteration, as one of them might be enclosed.
111 *
112 * For a modest efficiency boost, we consider the case where `query_start <= subject_starts[i]` for node `i`.
113 * In such cases, `subject_ends` for all children of `i` must also satisfy `query_start <= subject_ends[i]`, in which case the binary search can be skipped.
114 * Moreover, all descendent intervals of `i` must end after `query_start`, so the binary search can be skipped for the entire lineage of the NCList.
115 *
116 * # Max gap
117 *
118 * Here, our aim is the same as in the default case, with the extra requirement that the difference in lengths of overlapping subject/query pairs is less than or equal to `max_gap`.
119 * This follows the same logic as in the default case with some adjustments:
120 *
121 * - We do not traverse the children of a subject interval where the query width is greater than the subject's width by more than `max_gap`.
122 * Children will only ever be smaller so there's no point traversing them.
123 *
124 * # Min overlap
125 *
126 * Here, we apply the extra restriction that the overlapping subinterval must have length greater than `min_overlap`.
127 * This follows the same logic as the default case, with some modifications:
128 *
129 * - We consider an "effective" query start as `effective_query_start = query_start + min_overlap`.
130 * This defines the earliest entry of `subject_ends` that still could provide an overlap of at least `min_overlap`.
131 * - We perform a binary search to find the first `subject_ends` that is greater than or equal to `effective_query_start`.
132 * - Similarly, we check if `effective_query_start <= subject_starts[i]` to determine whether to skip the binary search.
133 * - We stop iteration once `query_end - subject_starts[j] < min_overlap`, after which we know that all children cannot achieve `min_overlap`.
134 * - If the length of the subject interval is less than `min_overlap`, we skip that node and all its children, as none of them will satisfy `min_overlap`.
135 *
136 * We return early if the length of the query itself is less than `min_overlap`, as no overlap will be satisfactory.
137 *
138 * Note that we do not use an effective query end for the binary search.
139 * The comparison between query/subject ends doesn't say anything about the length of the overlap subinterval.
140 *
141 * These modifications are orthogonal to those required when `max_gap > 0`, and so can be combined without much effort.
142 *
143 ****************************************/
144
145 Position_ query_width = query_end - query_start;
146 if (params.min_overlap > 0 && query_width < params.min_overlap) {
147 return;
148 }
149
150 Position_ effective_query_start = query_start;
151 if (params.min_overlap > 0) {
152 constexpr Position_ maxed = std::numeric_limits<Position_>::max();
153 if (maxed - params.min_overlap < query_start) {
154 return; // no point continuing as nothing can be greater than or equal to the overlap-adjusted start.
155 } else {
156 effective_query_start = query_start + params.min_overlap;
157 }
158 }
159
160 auto find_first_child = [&](Index_ children_start, Index_ children_end) -> Index_ {
161 auto ebegin = subject.ends.begin();
162 auto estart = ebegin + children_start;
163 auto eend = ebegin + children_end;
164 return std::lower_bound(estart, eend, effective_query_start) - ebegin;
165 };
166
167 auto can_skip_search = [&](Position_ subject_start) -> bool {
168 return subject_start >= effective_query_start;
169 };
170
171 auto is_finished = [&](Position_ subject_start) -> bool {
172 if (params.min_overlap > 0) {
173 if (subject_start >= query_end) {
174 return true;
175 }
176 return query_end - subject_start < params.min_overlap;
177 } else {
178 return subject_start > query_end;
179 }
180 };
181
182 Index_ root_child_at = 0;
183 bool root_skip_search = can_skip_search(subject.starts[0]);
184 if (!root_skip_search) {
185 root_child_at = find_first_child(0, subject.root_children);
186 }
187
188 workspace.history.clear();
189 while (1) {
190 Index_ current_subject;
191 bool current_skip_search;
192 if (workspace.history.empty()) {
193 if (root_child_at == subject.root_children || is_finished(subject.starts[root_child_at])) {
194 break;
195 }
196 current_subject = root_child_at;
197 current_skip_search = root_skip_search;
198 ++root_child_at;
199 } else {
200 auto& current_state = workspace.history.back();
201 if (current_state.child_at == current_state.child_end || is_finished(subject.starts[current_state.child_at])) {
202 workspace.history.pop_back();
203 continue;
204 }
205 current_subject = current_state.child_at;
206 current_skip_search = current_state.skip_search;
207 ++(current_state.child_at); // do this before the emplace_back(), otherwise the history might get reallocated and the reference would be dangling.
208 }
209
210 const auto& current_node = subject.nodes[current_subject];
211 auto subject_start = subject.starts[current_subject];
212 auto subject_end = subject.ends[current_subject];
213 auto subject_width = subject_end - subject_start;
214
215 if (params.min_overlap > 0) {
216 if (subject_width < params.min_overlap) {
217 continue;
218 }
219 }
220 if (params.max_gap.has_value()) {
221 if (query_width - subject_width > *(params.max_gap)) {
222 continue;
223 }
224 }
225
226 if (query_start <= subject_start && query_end >= subject_end) {
227 matches.push_back(current_node.id);
228 if (params.quit_on_first) {
229 return;
230 }
231 if (current_node.duplicates_start != current_node.duplicates_end) {
232 matches.insert(matches.end(), subject.duplicates.begin() + current_node.duplicates_start, subject.duplicates.begin() + current_node.duplicates_end);
233 }
234 }
235
236 if (current_node.children_start != current_node.children_end) {
237 if (current_skip_search) {
238 workspace.history.emplace_back(current_node.children_start, current_node.children_end, true);
239 } else {
240 Index_ start_pos = find_first_child(current_node.children_start, current_node.children_end);
241 if (start_pos != current_node.children_end) {
242 workspace.history.emplace_back(start_pos, current_node.children_end, can_skip_search(subject.starts[start_pos]));
243 }
244 }
245 }
246 }
247}
248
249}
250
251#endif
Build a nested containment list.
Header-only library for nested containment lists.
Definition build.hpp:17
void overlaps_extend(const Nclist< Index_, Position_ > &subject, Position_ query_start, Position_ query_end, const OverlapsExtendParameters< Position_ > &params, OverlapsExtendWorkspace< Index_ > &workspace, std::vector< Index_ > &matches)
Definition overlaps_extend.hpp:85
Pre-built nested containment list.
Definition build.hpp:28
Parameters for overlaps_extend().
Definition overlaps_extend.hpp:47
bool quit_on_first
Definition overlaps_extend.hpp:65
std::optional< Position_ > max_gap
Definition overlaps_extend.hpp:53
Position_ min_overlap
Definition overlaps_extend.hpp:59
Workspace for overlaps_extend().
Definition overlaps_extend.hpp:26