WeightedLowess
A C++ library for LOWESS with various weighting schemes
Loading...
Searching...
No Matches
window.hpp
Go to the documentation of this file.
1#ifndef WEIGHTEDLOWESS_WINDOW_HPP
2#define WEIGHTEDLOWESS_WINDOW_HPP
3
4#include <vector>
5#include <algorithm>
6#include <numeric>
7#include <stdexcept>
8
9#include "sanisizer/sanisizer.hpp"
10
11#include "Options.hpp"
12#include "parallelize.hpp"
13#include "utils.hpp"
14
20namespace WeightedLowess {
21
25namespace internal {
26
27/*
28 * Determining the `delta`. For a anchor point with x-coordinate `x`, we skip all
29 * points in `[x, x + delta]` before finding the next anchor point.
30 * We try to choose a `delta` that satisfies the constraints on the number
31 * of anchor points in `num_anchors`. A naive approach would be to simply divide the
32 * range of `x` by `num_anchors - 1`. However, this may place anchor points inside
33 * large gaps on the x-axis intervals where there are no actual observations.
34 *
35 * Instead, we try to distribute the anchor points so that they don't fall
36 * inside such large gaps. We do so by looking at the largest gaps and seeing
37 * what happens if we were to shift the anchor points to avoid such gaps. If we
38 * jump across a gap, though, we need to "use up" a anchor point to restart
39 * the sequence of anchor points on the other side of the gap. This requires some
40 * iteration to find the compromise that minimizes the 'delta' (and thus the
41 * degree of approximation in the final lowess calculation).
42 */
43template<typename Data_>
44Data_ derive_delta(const std::size_t num_anchors, const std::size_t num_points, const Data_* x) {
45 const auto points_m1 = num_points - 1;
46 auto diffs = sanisizer::create<std::vector<Data_> >(points_m1);
47 for (decltype(I(points_m1)) i = 0; i < points_m1; ++i) {
48 diffs[i] = x[i + 1] - x[i];
49 }
50
51 std::sort(diffs.begin(), diffs.end());
52 for (decltype(I(points_m1)) i = 1; i < points_m1; ++i) {
53 diffs[i] += diffs[i-1];
54 }
55
56 Data_ lowest_delta = diffs.back();
57 if (num_anchors > 1) {
58 auto max_skips = sanisizer::min(num_anchors - 1, points_m1);
59 for (decltype(I(max_skips)) nskips = 0; nskips < max_skips; ++nskips) {
60 Data_ candidate_delta = diffs[points_m1 - nskips - 1] / (num_anchors - nskips);
61 lowest_delta = std::min(candidate_delta, lowest_delta);
62 }
63 }
64
65 return lowest_delta;
66}
67
68/*
69 * Finding the anchor points, given the deltas. As previously mentioned, for a
70 * anchor point with x-coordinate `x`, we skip all points in `[x, x + delta]`
71 * before finding the next anchor point.
72 *
73 * We start at the first point (so it is always an anchor) and we do this
74 * skipping up to but not including the last point; the last point itself is
75 * always included as an anchor to ensure we have exactness at the ends.
76 */
77template<typename Data_>
78void find_anchors(const std::size_t num_points, const Data_* x, Data_ delta, std::vector<std::size_t>& anchors) {
79 anchors.clear();
80 anchors.push_back(0);
81
82 std::size_t last_pt = 0;
83 const std::size_t points_m1 = num_points - 1;
84 for (decltype(I(points_m1)) pt = 1; pt < points_m1; ++pt) {
85 if (x[pt] - x[last_pt] > delta) {
86 anchors.push_back(pt);
87 last_pt = pt;
88 }
89 }
90
91 anchors.push_back(points_m1);
92 return;
93}
94
95template<typename Data_>
96struct Window {
97 std::size_t left, right;
98 Data_ distance;
99};
100
101/* This function identifies the start and end index in the span for each chosen sampling
102 * point. It returns two arrays via reference containing said indices. It also returns
103 * an array containing the maximum distance between points at each span.
104 *
105 * We don't use the update-based algorithm in Cleveland's paper, as it ceases to be
106 * numerically stable once you throw in floating-point weights. It's not particularly
107 * amenable to updating through cycles of addition and subtraction. At any rate, the
108 * algorithm as a whole remains quadratic (as weights must be recomputed) so there's no
109 * damage to scalability.
110 */
111template<typename Data_>
112std::vector<Window<Data_> > find_limits(
113 const std::vector<std::size_t>& anchors,
114 const Data_ span_weight,
115 const std::size_t num_points,
116 const Data_* x,
117 const Data_* weights,
118 const Data_ min_width,
119 int nthreads = 1)
120{
121 const auto nanchors = anchors.size();
122 auto limits = sanisizer::create<std::vector<Window<Data_> > >(nanchors);
123 const auto half_min_width = min_width / 2;
124 const auto points_m1 = num_points - 1;
125
126 parallelize(nthreads, nanchors, [&](const int, const decltype(I(nanchors)) start, const decltype(I(nanchors)) length) {
127 for (decltype(I(start)) s = start, end = start + length; s < end; ++s) {
128 const auto curpt = anchors[s];
129 const auto curx = x[curpt];
130 auto left = curpt, right = curpt;
131 Data_ curw = (weights == NULL ? 1 : weights[curpt]);
132
133 // First expanding in both directions, choosing the one that
134 // minimizes the increase in the window size.
135 if (curpt > 0 && curpt < points_m1) {
136 auto next_ldist = curx - x[left - 1];
137 auto next_rdist = x[right + 1] - curx;
138
139 while (curw < span_weight) {
140 if (next_ldist < next_rdist) {
141 --left;
142 curw += (weights == NULL ? 1 : weights[left]);
143 if (left == 0) {
144 break;
145 }
146 next_ldist = curx - x[left - 1];
147
148 } else if (next_ldist > next_rdist) {
149 ++right;
150 curw += (weights == NULL ? 1 : weights[right]);
151 if (right == points_m1) {
152 break;
153 }
154 next_rdist = x[right + 1] - curx;
155
156 } else {
157 // In the very rare case that distances are equal, we do a
158 // simultaneous jump to ensure that both points are
159 // included. Otherwise one of them is skipped if we break.
160 --left;
161 ++right;
162 curw += (weights == NULL ? 2 : weights[left] + weights[right]);
163 if (left == 0 || right == points_m1) {
164 break;
165 }
166 next_ldist = curx - x[left - 1];
167 next_rdist = x[right + 1] - curx;
168 }
169 }
170 }
171
172 // If we still need it, we expand in only one direction.
173 while (left > 0 && curw < span_weight) {
174 --left;
175 curw += (weights == NULL ? 1 : weights[left]);
176 }
177
178 while (right < points_m1 && curw < span_weight) {
179 ++right;
180 curw += (weights == NULL ? 1 : weights[right]);
181 }
182
183 /* Once we've found the span, we stretch it out to include all ties. */
184 while (left > 0 && x[left] == x[left - 1]) {
185 --left;
186 }
187
188 while (right < points_m1 && x[right] == x[right + 1]) {
189 ++right;
190 }
191
192 /* Forcibly extending the span if it fails the min width. We use
193 * the existing 'left' and 'right' to truncate the search space.
194 */
195 auto mdist = std::max(curx - x[left], x[right] - curx);
196 if (mdist < half_min_width) {
197 left = std::lower_bound(x, x + left, curx - half_min_width) - x;
198
199 /* 'right' still refers to a point inside the window, and we
200 * already know that the window is too small, so we shift it
201 * forward by one to start searching outside. However,
202 * upper_bound gives us the first element that is _outside_ the
203 * window, so we need to subtract one to get to the last
204 * element _inside_ the window.
205 */
206 right = std::upper_bound(x + right + 1, x + num_points, curx + half_min_width) - x;
207 --right;
208
209 mdist = std::max(curx - x[left], x[right] - curx);
210 }
211
212 limits[s].left = left;
213 limits[s].right = right;
214 limits[s].distance = mdist;
215 }
216 });
217
218 return limits;
219}
220
221}
233template<typename Data_>
238 std::vector<std::size_t> anchors;
239 const Data_* freq_weights = NULL;
240 Data_ total_weight = 0;
241 std::vector<internal::Window<Data_> > limits;
245};
246
266template<typename Data_>
267PrecomputedWindows<Data_> define_windows(const std::size_t num_points, const Data_* x, const Options<Data_>& opt) {
269
270 if (num_points) {
271 if (!std::is_sorted(x, x + num_points)) {
272 throw std::runtime_error("'x' should be sorted");
273 }
274
275 // Finding the anchors.
276 auto& anchors = output.anchors;
277 if (opt.delta == 0 || (opt.delta < 0 && opt.anchors >= num_points)) {
278 sanisizer::resize(anchors, num_points);
279 std::iota(anchors.begin(), anchors.end(), 0);
280 } else if (opt.delta < 0) {
281 Data_ eff_delta = internal::derive_delta(opt.anchors, num_points, x);
282 internal::find_anchors(num_points, x, eff_delta, anchors);
283 } else {
284 internal::find_anchors(num_points, x, opt.delta, anchors);
285 }
286
287 /* Computing the span weight that each window must achieve. */
288 output.freq_weights = (opt.frequency_weights ? opt.weights : NULL);
289 output.total_weight = (output.freq_weights != NULL ? std::accumulate(output.freq_weights, output.freq_weights + num_points, static_cast<Data_>(0)) : num_points);
290 const Data_ span_weight = (opt.span_as_proportion ? opt.span * output.total_weight : opt.span);
291 output.limits = internal::find_limits(anchors, span_weight, num_points, x, output.freq_weights, opt.minimum_width, opt.num_threads);
292 }
293
294 return output;
295}
296
297}
298
299#endif
Options for compute().
Namespace for LOWESS functions.
Definition compute.hpp:18
PrecomputedWindows< Data_ > define_windows(const std::size_t num_points, const Data_ *x, const Options< Data_ > &opt)
Definition window.hpp:267
void parallelize(const int num_workers, const Task_ num_tasks, Run_ run_task_range)
Definition parallelize.hpp:28
Definitions for parallelization.
Options for compute().
Definition Options.hpp:17
Data_ * weights
Definition Options.hpp:78
bool frequency_weights
Definition Options.hpp:85
Data_ delta
Definition Options.hpp:71
std::size_t anchors
Definition Options.hpp:55
Data_ span
Definition Options.hpp:28
Data_ minimum_width
Definition Options.hpp:42
bool span_as_proportion
Definition Options.hpp:35
int num_threads
Definition Options.hpp:92
Precomputed windows for LOWESS smoothing.
Definition window.hpp:234