WeightedLowess
A C++ library for LOWESS with various weighting schemes
Loading...
Searching...
No Matches
window.hpp
Go to the documentation of this file.
1#ifndef WEIGHTEDLOWESS_WINDOW_HPP
2#define WEIGHTEDLOWESS_WINDOW_HPP
3
4#include <vector>
5#include <algorithm>
6#include <numeric>
7#include <stdexcept>
8#include <cassert>
9#include <optional>
10
11#include "sanisizer/sanisizer.hpp"
12
13#include "Options.hpp"
14#include "parallelize.hpp"
15#include "utils.hpp"
16
22namespace WeightedLowess {
23
27namespace internal {
28
29/*
30 * Determining the `delta`. For a anchor point with x-coordinate `x`, we skip all
31 * points in `[x, x + delta]` before finding the next anchor point.
32 * We try to choose a `delta` that satisfies the constraints on the number
33 * of anchor points in `num_anchors`. A naive approach would be to simply divide the
34 * range of `x` by `num_anchors - 1`. However, this may place anchor points inside
35 * large gaps on the x-axis intervals where there are no actual observations.
36 *
37 * Instead, we try to distribute the anchor points so that they don't fall
38 * inside such large gaps. We do so by looking at the largest gaps and seeing
39 * what happens if we were to shift the anchor points to avoid such gaps. If we
40 * jump across a gap, though, we need to "use up" a anchor point to restart
41 * the sequence of anchor points on the other side of the gap. This requires some
42 * iteration to find the compromise that minimizes the 'delta' (and thus the
43 * degree of approximation in the final lowess calculation).
44 */
45template<typename Data_>
46Data_ derive_delta(const std::size_t num_anchors, const std::size_t num_points, const Data_* const x) {
47 assert(num_points > 0);
48
49 const auto points_m1 = num_points - 1;
50 auto diffs = sanisizer::create<std::vector<Data_> >(points_m1);
51 for (I<decltype(points_m1)> i = 0; i < points_m1; ++i) {
52 diffs[i] = x[i + 1] - x[i];
53 }
54
55 std::sort(diffs.begin(), diffs.end());
56 for (I<decltype(points_m1)> i = 1; i < points_m1; ++i) {
57 diffs[i] += diffs[i-1];
58 }
59
60 Data_ lowest_delta = diffs.back();
61 if (num_anchors > 1) {
62 auto max_skips = sanisizer::min(num_anchors - 1, points_m1);
63 for (I<decltype(max_skips)> nskips = 0; nskips < max_skips; ++nskips) {
64 Data_ candidate_delta = diffs[points_m1 - nskips - 1] / (num_anchors - nskips);
65 lowest_delta = std::min(candidate_delta, lowest_delta);
66 }
67 }
68
69 return lowest_delta;
70}
71
72/*
73 * Finding the anchor points, given the deltas. As previously mentioned, for a
74 * anchor point with x-coordinate `x`, we skip all points in `[x, x + delta]`
75 * before finding the next anchor point.
76 *
77 * We start at the first point (so it is always an anchor) and we do this
78 * skipping up to but not including the last point; the last point itself is
79 * always included as an anchor to ensure we have exactness at the ends.
80 */
81template<typename Data_>
82void find_anchors(const std::size_t num_points, const Data_* x, Data_ delta, std::vector<std::size_t>& anchors) {
83 assert(num_points > 0);
84 anchors.clear();
85 anchors.push_back(0);
86
87 std::size_t last_pt = 0;
88 const std::size_t points_m1 = num_points - 1;
89 for (I<decltype(points_m1)> pt = 1; pt < points_m1; ++pt) {
90 if (x[pt] - x[last_pt] > delta) {
91 anchors.push_back(pt);
92 last_pt = pt;
93 }
94 }
95
96 anchors.push_back(points_m1);
97 return;
98}
99
100template<typename Data_>
101struct Window {
102 std::size_t left, right;
103 Data_ distance;
104};
105
106/* This function identifies the start and end index in the span for each chosen sampling
107 * point. It returns two arrays via reference containing said indices. It also returns
108 * an array containing the maximum distance between points at each span.
109 *
110 * We don't use the update-based algorithm in Cleveland's paper, as it ceases to be
111 * numerically stable once you throw in floating-point weights. It's not particularly
112 * amenable to updating through cycles of addition and subtraction. At any rate, the
113 * algorithm as a whole remains quadratic (as weights must be recomputed) so there's no
114 * damage to scalability.
115 */
116template<typename Data_>
117std::vector<Window<Data_> > find_limits(
118 const std::vector<std::size_t>& anchors,
119 const Data_ span_weight,
120 const std::size_t num_points,
121 const Data_* const x,
122 const Data_* const weights,
123 const Data_ min_width,
124 int nthreads = 1)
125{
126 const auto nanchors = anchors.size();
127 auto limits = sanisizer::create<std::vector<Window<Data_> > >(nanchors);
128 const auto half_min_width = min_width / 2;
129
130 assert(num_points > 0);
131 const auto points_m1 = num_points - 1;
132
133 parallelize(nthreads, nanchors, [&](const int, const I<decltype(nanchors)> start, const I<decltype(nanchors)> length) {
134 for (I<decltype(start)> s = start, end = start + length; s < end; ++s) {
135 const auto curpt = anchors[s];
136 const auto curx = x[curpt];
137 auto left = curpt, right = curpt;
138 Data_ curw = (weights == NULL ? 1 : weights[curpt]);
139
140 // First expanding in both directions, choosing the one that
141 // minimizes the increase in the window size.
142 if (curpt > 0 && curpt < points_m1) {
143 auto next_ldist = curx - x[left - 1];
144 auto next_rdist = x[right + 1] - curx;
145
146 while (curw < span_weight) {
147 if (next_ldist < next_rdist) {
148 --left;
149 curw += (weights == NULL ? 1 : weights[left]);
150 if (left == 0) {
151 break;
152 }
153 next_ldist = curx - x[left - 1];
154
155 } else if (next_ldist > next_rdist) {
156 ++right;
157 curw += (weights == NULL ? 1 : weights[right]);
158 if (right == points_m1) {
159 break;
160 }
161 next_rdist = x[right + 1] - curx;
162
163 } else {
164 // In the very rare case that distances are equal, we do a
165 // simultaneous jump to ensure that both points are
166 // included. Otherwise one of them is skipped if we break.
167 --left;
168 ++right;
169 curw += (weights == NULL ? 2 : weights[left] + weights[right]);
170 if (left == 0 || right == points_m1) {
171 break;
172 }
173 next_ldist = curx - x[left - 1];
174 next_rdist = x[right + 1] - curx;
175 }
176 }
177 }
178
179 // If we still need it, we expand in only one direction.
180 while (left > 0 && curw < span_weight) {
181 --left;
182 curw += (weights == NULL ? 1 : weights[left]);
183 }
184
185 while (right < points_m1 && curw < span_weight) {
186 ++right;
187 curw += (weights == NULL ? 1 : weights[right]);
188 }
189
190 /* Once we've found the span, we stretch it out to include all ties. */
191 while (left > 0 && x[left] == x[left - 1]) {
192 --left;
193 }
194
195 while (right < points_m1 && x[right] == x[right + 1]) {
196 ++right;
197 }
198
199 /* Forcibly extending the span if it fails the min width. We use
200 * the existing 'left' and 'right' to truncate the search space.
201 */
202 auto mdist = std::max(curx - x[left], x[right] - curx);
203 if (mdist < half_min_width) {
204 left = std::lower_bound(x, x + left, curx - half_min_width) - x;
205
206 /* 'right' still refers to a point inside the window, and we
207 * already know that the window is too small, so we shift it
208 * forward by one to start searching outside. However,
209 * upper_bound gives us the first element that is _outside_ the
210 * window, so we need to subtract one to get to the last
211 * element _inside_ the window.
212 */
213 right = std::upper_bound(x + right + 1, x + num_points, curx + half_min_width) - x;
214 --right;
215
216 mdist = std::max(curx - x[left], x[right] - curx);
217 }
218
219 limits[s].left = left;
220 limits[s].right = right;
221 limits[s].distance = mdist;
222 }
223 });
224
225 return limits;
226}
227
228}
240template<typename Data_>
245 std::vector<std::size_t> anchors;
246 const Data_* freq_weights = NULL;
247 Data_ total_weight = 0;
248 std::vector<internal::Window<Data_> > limits;
252};
253
273template<typename Data_>
274PrecomputedWindows<Data_> define_windows(const std::size_t num_points, const Data_* const x, const Options<Data_>& opt) {
276 if (num_points == 0) {
277 return output;
278 }
279
280 if (!std::is_sorted(x, x + num_points)) {
281 throw std::runtime_error("'x' should be sorted");
282 }
283
284 // For back-compatibility for negative sentinel values.
285 std::optional<Data_> delta = opt.delta;
286 if (delta.has_value() && *delta < 0) {
287 delta.reset();
288 }
289
290 // Finding the anchors.
291 auto& anchors = output.anchors;
292 if (delta.has_value()) {
293 if (*delta == 0) {
294 sanisizer::resize(anchors, num_points);
295 std::iota(anchors.begin(), anchors.end(), static_cast<std::size_t>(0));
296 } else {
297 internal::find_anchors(num_points, x, *delta, anchors);
298 }
299 } else {
300 if (opt.anchors >= num_points) {
301 sanisizer::resize(anchors, num_points);
302 std::iota(anchors.begin(), anchors.end(), static_cast<std::size_t>(0));
303 } else {
304 Data_ eff_delta = internal::derive_delta(opt.anchors, num_points, x);
305 internal::find_anchors(num_points, x, eff_delta, anchors);
306 }
307 }
308
309 // Computing the span weight that each window must achieve.
310 output.freq_weights = (opt.frequency_weights ? opt.weights : NULL);
311 output.total_weight = (output.freq_weights != NULL ? std::accumulate(output.freq_weights, output.freq_weights + num_points, static_cast<Data_>(0)) : num_points);
312 const Data_ span_weight = (opt.span_as_proportion ? opt.span * output.total_weight : opt.span);
313 output.limits = internal::find_limits(anchors, span_weight, num_points, x, output.freq_weights, opt.minimum_width, opt.num_threads);
314
315 return output;
316}
317
318}
319
320#endif
Options for compute().
Namespace for LOWESS functions.
Definition compute.hpp:18
PrecomputedWindows< Data_ > define_windows(const std::size_t num_points, const Data_ *const x, const Options< Data_ > &opt)
Definition window.hpp:274
int parallelize(const int num_workers, const Task_ num_tasks, Run_ run_task_range)
Definition parallelize.hpp:32
Definitions for parallelization.
Options for compute().
Definition Options.hpp:20
const Data_ * weights
Definition Options.hpp:83
bool frequency_weights
Definition Options.hpp:90
std::size_t anchors
Definition Options.hpp:60
Data_ span
Definition Options.hpp:31
Data_ minimum_width
Definition Options.hpp:45
bool span_as_proportion
Definition Options.hpp:38
std::optional< Data_ > delta
Definition Options.hpp:76
int num_threads
Definition Options.hpp:97
Precomputed windows for LOWESS smoothing.
Definition window.hpp:241