WeightedLowess
A C++ library for LOWESS with various weighting schemes
Loading...
Searching...
No Matches
window.hpp
Go to the documentation of this file.
1#ifndef WEIGHTEDLOWESS_WINDOW_HPP
2#define WEIGHTEDLOWESS_WINDOW_HPP
3
4#include <vector>
5#include <algorithm>
6#include <numeric>
7#include <stdexcept>
8
9#include "Options.hpp"
10#include "parallelize.hpp"
11
17namespace WeightedLowess {
18
22namespace internal {
23
24/*
25 * Determining the `delta`. For a anchor point with x-coordinate `x`, we skip all
26 * points in `[x, x + delta]` before finding the next anchor point.
27 * We try to choose a `delta` that satisfies the constraints on the number
28 * of anchor points in `num_anchors`. A naive approach would be to simply divide the
29 * range of `x` by `num_anchors - 1`. However, this may place anchor points inside
30 * large gaps on the x-axis intervals where there are no actual observations.
31 *
32 * Instead, we try to distribute the anchor points so that they don't fall
33 * inside such large gaps. We do so by looking at the largest gaps and seeing
34 * what happens if we were to shift the anchor points to avoid such gaps. If we
35 * jump across a gap, though, we need to "use up" a anchor point to restart
36 * the sequence of anchor points on the other side of the gap. This requires some
37 * iteration to find the compromise that minimizes the 'delta' (and thus the
38 * degree of approximation in the final lowess calculation).
39 */
40template<typename Data_>
41Data_ derive_delta(size_t num_anchors, size_t num_points, const Data_* x) {
42 size_t points_m1 = num_points - 1;
43 std::vector<Data_> diffs(points_m1);
44 for (size_t i = 0; i < points_m1; ++i) {
45 diffs[i] = x[i + 1] - x[i];
46 }
47
48 std::sort(diffs.begin(), diffs.end());
49 for (size_t i = 1; i < points_m1; ++i) {
50 diffs[i] += diffs[i-1];
51 }
52
53 Data_ lowest_delta = diffs.back();
54 if (num_anchors > 1) {
55 size_t max_skips = std::min(points_m1, num_anchors - 1);
56 for (size_t nskips = 0; nskips < max_skips; ++nskips) {
57 Data_ candidate_delta = diffs[points_m1 - nskips - 1] / (num_anchors - nskips);
58 lowest_delta = std::min(candidate_delta, lowest_delta);
59 }
60 }
61
62 return lowest_delta;
63}
64
65/*
66 * Finding the anchor points, given the deltas. As previously mentioned, for a
67 * anchor point with x-coordinate `x`, we skip all points in `[x, x + delta]`
68 * before finding the next anchor point.
69 *
70 * We start at the first point (so it is always an anchor) and we do this
71 * skipping up to but not including the last point; the last point itself is
72 * always included as an anchor to ensure we have exactness at the ends.
73 */
74template<typename Data_>
75void find_anchors(size_t num_points, const Data_* x, Data_ delta, std::vector<size_t>& anchors) {
76 anchors.clear();
77 anchors.push_back(0);
78
79 size_t last_pt = 0;
80 size_t points_m1 = num_points - 1;
81 for (size_t pt = 1; pt < points_m1; ++pt) {
82 if (x[pt] - x[last_pt] > delta) {
83 anchors.push_back(pt);
84 last_pt = pt;
85 }
86 }
87
88 anchors.push_back(points_m1);
89 return;
90}
91
92template<typename Data_>
93struct Window {
94 size_t left, right;
95 Data_ distance;
96};
97
98/* This function identifies the start and end index in the span for each chosen sampling
99 * point. It returns two arrays via reference containing said indices. It also returns
100 * an array containing the maximum distance between points at each span.
101 *
102 * We don't use the update-based algorithm in Cleveland's paper, as it ceases to be
103 * numerically stable once you throw in floating-point weights. It's not particularly
104 * amenable to updating through cycles of addition and subtraction. At any rate, the
105 * algorithm as a whole remains quadratic (as weights must be recomputed) so there's no
106 * damage to scalability.
107 */
108template<typename Data_>
109std::vector<Window<Data_> > find_limits(
110 const std::vector<size_t>& anchors,
111 Data_ span_weight,
112 size_t num_points,
113 const Data_* x,
114 const Data_* weights,
115 Data_ min_width,
116 [[maybe_unused]] int nthreads = 1)
117{
118 const size_t nanchors = anchors.size();
119 std::vector<Window<Data_> > limits(nanchors);
120 auto half_min_width = min_width / 2;
121 auto points_m1 = num_points - 1;
122
123 parallelize(nthreads, nanchors, [&](int, size_t start, size_t length) {
124 for (size_t s = start, end = start + length; s < end; ++s) {
125 auto curpt = anchors[s], left = curpt, right = curpt;
126 auto curx = x[curpt];
127 Data_ curw = (weights == NULL ? 1 : weights[curpt]);
128
129 // First expanding in both directions, choosing the one that
130 // minimizes the increase in the window size.
131 if (curpt > 0 && curpt < points_m1) {
132 auto next_ldist = curx - x[left - 1];
133 auto next_rdist = x[right + 1] - curx;
134
135 while (curw < span_weight) {
136 if (next_ldist < next_rdist) {
137 --left;
138 curw += (weights == NULL ? 1 : weights[left]);
139 if (left == 0) {
140 break;
141 }
142 next_ldist = curx - x[left - 1];
143
144 } else if (next_ldist > next_rdist) {
145 ++right;
146 curw += (weights == NULL ? 1 : weights[right]);
147 if (right == points_m1) {
148 break;
149 }
150 next_rdist = x[right + 1] - curx;
151
152 } else {
153 // In the very rare case that distances are equal, we do a
154 // simultaneous jump to ensure that both points are
155 // included. Otherwise one of them is skipped if we break.
156 --left;
157 ++right;
158 curw += (weights == NULL ? 2 : weights[left] + weights[right]);
159 if (left == 0 || right == points_m1) {
160 break;
161 }
162 next_ldist = curx - x[left - 1];
163 next_rdist = x[right + 1] - curx;
164 }
165 }
166 }
167
168 // If we still need it, we expand in only one direction.
169 while (left > 0 && curw < span_weight) {
170 --left;
171 curw += (weights == NULL ? 1 : weights[left]);
172 }
173
174 while (right < points_m1 && curw < span_weight) {
175 ++right;
176 curw += (weights == NULL ? 1 : weights[right]);
177 }
178
179 /* Once we've found the span, we stretch it out to include all ties. */
180 while (left > 0 && x[left] == x[left - 1]) {
181 --left;
182 }
183
184 while (right < points_m1 && x[right] == x[right + 1]) {
185 ++right;
186 }
187
188 /* Forcibly extending the span if it fails the min width. We use
189 * the existing 'left' and 'right' to truncate the search space.
190 */
191 auto mdist = std::max(curx - x[left], x[right] - curx);
192 if (mdist < half_min_width) {
193 left = std::lower_bound(x, x + left, curx - half_min_width) - x;
194
195 /* 'right' still refers to a point inside the window, and we
196 * already know that the window is too small, so we shift it
197 * forward by one to start searching outside. However,
198 * upper_bound gives us the first element that is _outside_ the
199 * window, so we need to subtract one to get to the last
200 * element _inside_ the window.
201 */
202 right = std::upper_bound(x + right + 1, x + num_points, curx + half_min_width) - x;
203 --right;
204
205 mdist = std::max(curx - x[left], x[right] - curx);
206 }
207
208 limits[s].left = left;
209 limits[s].right = right;
210 limits[s].distance = mdist;
211 }
212 });
213
214 return limits;
215}
216
217}
229template<typename Data_>
234 std::vector<size_t> anchors;
235 const Data_* freq_weights = NULL;
236 Data_ total_weight = 0;
237 std::vector<internal::Window<Data_> > limits;
241};
242
262template<typename Data_>
263PrecomputedWindows<Data_> define_windows(size_t num_points, const Data_* x, const Options<Data_>& opt) {
265
266 if (num_points) {
267 if (!std::is_sorted(x, x + num_points)) {
268 throw std::runtime_error("'x' should be sorted");
269 }
270
271 // Finding the anchors.
272 auto& anchors = output.anchors;
273 if (opt.delta == 0 || (opt.delta < 0 && opt.anchors >= num_points)) {
274 anchors.resize(num_points);
275 std::iota(anchors.begin(), anchors.end(), 0);
276 } else if (opt.delta < 0) {
277 Data_ eff_delta = internal::derive_delta(opt.anchors, num_points, x);
278 internal::find_anchors(num_points, x, eff_delta, anchors);
279 } else {
280 internal::find_anchors(num_points, x, opt.delta, anchors);
281 }
282
283 /* Computing the span weight that each window must achieve. */
284 output.freq_weights = (opt.frequency_weights ? opt.weights : NULL);
285 output.total_weight = (output.freq_weights != NULL ? std::accumulate(output.freq_weights, output.freq_weights + num_points, static_cast<Data_>(0)) : num_points);
286 Data_ span_weight = (opt.span_as_proportion ? opt.span * output.total_weight : opt.span);
287 output.limits = internal::find_limits(anchors, span_weight, num_points, x, output.freq_weights, opt.minimum_width, opt.num_threads);
288 }
289
290 return output;
291}
292
293}
294
295#endif
Options for compute().
Namespace for LOWESS functions.
Definition compute.hpp:14
PrecomputedWindows< Data_ > define_windows(size_t num_points, const Data_ *x, const Options< Data_ > &opt)
Definition window.hpp:263
void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range)
Definition parallelize.hpp:28
Definitions for parallelization.
Options for compute().
Definition Options.hpp:17
Data_ * weights
Definition Options.hpp:78
bool frequency_weights
Definition Options.hpp:85
Data_ delta
Definition Options.hpp:71
Data_ span
Definition Options.hpp:28
Data_ minimum_width
Definition Options.hpp:42
size_t anchors
Definition Options.hpp:55
bool span_as_proportion
Definition Options.hpp:35
int num_threads
Definition Options.hpp:92
Precomputed windows for LOWESS smoothing.
Definition window.hpp:230