partisub
Subsampling within partitions
Loading...
Searching...
No Matches
partisub.hpp
Go to the documentation of this file.
1#ifndef PARTISUB_HPP
2#define PARTISUB_HPP
3
#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <type_traits>
#include <vector>

#include "sanisizer/sanisizer.hpp"
#include "aarand/aarand.hpp"
12
22namespace partisub {
23
/**
 * @brief Options for `compute()`.
 */
struct Options {
    /**
     * Whether to guarantee that every partition with at least one observation
     * contributes at least one observation to the subsample, even when its
     * proportional share of the target would round down to zero.
     */
    bool force_non_empty = true;

    /**
     * Seed for the Mersenne Twister random number generator used during sampling.
     */
    unsigned long long seed = 12345;
};
39
69template<typename Index_, typename Partition_>
70void compute(const Index_ num_obs, const Partition_* partition, const Index_ target, const Options& options, std::vector<Index_>& output) {
71 if (target >= num_obs) {
72 sanisizer::resize(output, num_obs);
73 std::iota(output.begin(), output.end(), static_cast<Index_>(0));
74 return;
75 }
76
77 // num_obs >= 0 at this point otherwise target >= num_obs would be true.
78 const Partition_ num_partitions = sanisizer::sum<Partition_>(*std::max_element(partition, partition + num_obs), 1);
79
80 auto partition_count = sanisizer::create<std::vector<Index_> >(num_partitions);
81 for (Index_ o = 0; o < num_obs; ++o) {
82 const auto part = partition[o];
83 partition_count[part] += 1;
84 }
85
86 std::mt19937_64 rng(options.seed);
87
88 // We compute the number of observations to take from each partition.
89 // This is mostly straightforward as it should just be a ratio between the target and full number of observations.
90 // However, this leaves us with some fractional observations in some partitions.
91 // To make use of the fractional part, we perform weighted sampling to distribute observations across those partitions.
92 //
93 // We use the magical Efraimidis and Spirakis algorithm to do a one-pass weighted sampling without replacement based on the fractional parts.
94 // See Algorithm A-Res at https://en.wikipedia.org/wiki/Reservoir_sampling
95 // and also https://stackoverflow.com/questions/15113650/faster-weighted-sampling-without-replacement
96 auto to_sample = sanisizer::create<std::vector<Index_> >(num_partitions);
97 {
98 std::vector<std::pair<double, Partition_> > probabilities;
99 probabilities.reserve(num_partitions);
100
101 const double ratio = static_cast<double>(target) / static_cast<double>(num_obs);
102 for (Partition_ p = 0; p < num_partitions; ++p) {
103 const double expected = static_cast<double>(partition_count[p]) * ratio;
104
105 if (expected == 0) {
106 ;
107 } else if (expected < 1 && options.force_non_empty) {
108 to_sample[p] = 1;
109 } else {
110 const double minimum = std::floor(expected);
111 to_sample[p] = minimum;
112 if (expected > minimum) {
113 probabilities.emplace_back(expected - minimum, p);
114 }
115 }
116 }
117
118 const Index_ already_used = std::accumulate(to_sample.begin(), to_sample.end(), static_cast<Index_>(0));
119
120 if (already_used < target) {
121 // The calculation of the weird random variate here is where the magic happens.
122 //
123 // The probability of 'unif()^(1/a)' being greater than 'unif()^(1/b)' is 'a/(a+b)'.
124 // So, if we sort by the random variate and only keep the larger values, we enforce this pairwise difference in selection probability.
125 // (In practice, we log-transform these random variates so that higher weights lead to lower values, for numeric precision.
126 // Infinities from log-transformed zeros are fine here as they sort as expected.)
127 //
128 // To convince ourselves, we can go do some painful integrations to compute the probability of orderings that lead to a particular combination being selected.
129 // This probability is equal to that of a naive weighted selection algorithm,
130 // where the probability of selection of a particular value is equal to the ratio of the weight of the sum of remaining weights in the denominator.
131 for (auto& prob : probabilities) {
132 prob.first = - std::log(aarand::standard_uniform(rng)) / prob.first;
133 }
134
135 const Index_ leftovers = target - already_used;
136 std::nth_element(probabilities.begin(), probabilities.begin() + leftovers, probabilities.end());
137
138 for (Index_ i = 0; i < leftovers; ++i) {
139 ++to_sample[probabilities[i].second];
140 }
141 }
142 }
143
144 // Alright, actually doing the sampling with replacement now.
145 output.clear();
146 for (Index_ i = 0; i < num_obs; ++i) {
147 const auto part = partition[i];
148 auto& needed = to_sample[part];
149 if (needed == 0) {
150 continue;
151 }
152
153 auto& available = partition_count[part];
154 if (available <= needed || aarand::standard_uniform(rng) * static_cast<double>(available) <= static_cast<double>(needed)) {
155 output.push_back(i);
156 needed -= 1;
157 }
158
159 available -= 1;
160 }
161}
162
178template<typename Index_, typename Partition_>
179std::vector<Index_> compute(const Index_ num_obs, const Partition_* partition, const Index_ target, const Options& options) {
180 std::vector<Index_> output;
181 compute(num_obs, partition, target, options, output);
182 return output;
183}
184
185}
186
187#endif
Subsampling within partitions.
void compute(const Index_ num_obs, const Partition_ *partition, const Index_ target, const Options &options, std::vector< Index_ > &output)
Definition partisub.hpp:70
Options for compute().
Definition partisub.hpp:27
bool force_non_empty
Definition partisub.hpp:32
unsigned long long seed
Definition partisub.hpp:37