partisub
Subsampling within partitions
Loading...
Searching...
No Matches
partisub.hpp
Go to the documentation of this file.
1#ifndef PARTISUB_HPP
2#define PARTISUB_HPP
3
#include <algorithm>
#include <cmath>
#include <numeric>
#include <random>
#include <type_traits>
#include <vector>

#include "sanisizer/sanisizer.hpp"
#include "aarand/aarand.hpp"
12
22namespace partisub {
23
/**
 * @brief Options for `compute()`.
 */
struct Options {
    /**
     * Whether to guarantee that every partition with at least one observation
     * contributes at least one observation to the subsample, even when its
     * proportional share of the target would round down to zero.
     */
    bool force_non_empty = true;

    /**
     * Seed for the Mersenne Twister random number generator used during sampling.
     */
    unsigned long long seed = 12345;
};
39
69template<typename Index_, typename Partition_>
70void compute(const Index_ num_obs, const Partition_* partition, const Index_ target, const Options& options, std::vector<Index_>& output) {
71 if (target >= num_obs) {
72 sanisizer::resize(output, num_obs);
73 std::iota(output.begin(), output.end(), static_cast<Index_>(0));
74 return;
75 }
76
77 // num_obs >= 0 at this point otherwise target >= num_obs would be true.
78 const Partition_ num_partitions = sanisizer::sum<Partition_>(*std::max_element(partition, partition + num_obs), 1);
79
80 auto partition_count = sanisizer::create<std::vector<Index_> >(num_partitions);
81 for (Index_ o = 0; o < num_obs; ++o) {
82 const auto part = partition[o];
83 partition_count[part] += 1;
84 }
85
86 std::mt19937_64 rng(options.seed);
87
88 // We compute the number of observations to take from each partition.
89 // This is mostly straightforward as it should just be a ratio between the target and full number of observations.
90 // However, this leaves us with some fractional observations in some partitions.
91 // To make use of the fractional part, we perform weighted sampling to distribute observations across those partitions.
92 //
93 // We use the magical Efraimidis and Spirakis algorithm to do a one-pass weighted sampling without replacement based on the fractional parts.
94 // See Algorithm A-Res at https://en.wikipedia.org/wiki/Reservoir_sampling
95 // and also https://stackoverflow.com/questions/15113650/faster-weighted-sampling-without-replacement
96 auto to_sample = sanisizer::create<std::vector<Index_> >(num_partitions);
97 {
98 std::vector<std::pair<double, Partition_> > probabilities;
99 probabilities.reserve(num_partitions);
100
101 const double ratio = static_cast<double>(target) / static_cast<double>(num_obs);
102 for (Partition_ p = 0; p < num_partitions; ++p) {
103 const double expected = static_cast<double>(partition_count[p]) * ratio;
104
105 if (expected == 0) {
106 ;
107 } else if (expected < 1 && options.force_non_empty) {
108 to_sample[p] = 1;
109 } else {
110 const double minimum = std::floor(expected);
111 to_sample[p] = minimum;
112 if (expected > minimum) {
113 probabilities.emplace_back(expected - minimum, p);
114 }
115 }
116 }
117
118 const Index_ already_used = std::accumulate(to_sample.begin(), to_sample.end(), static_cast<Index_>(0));
119
120 if (already_used < target) {
121 // The calculation of the weird random variate here is where the magic happens.
122 //
123 // The probability of 'unif()^(1/a)' being greater than 'unif()^(1/b)' is 'a/(a+b)'.
124 // So, if we sort by the random variate and only keep the larger values, we enforce this pairwise difference in selection probability.
125 // (In practice, we log-transform these random variates so that higher weights lead to lower values, for numeric precision.
126 // Infinities from log-transformed zeros are fine here as they sort as expected.)
127 //
128 // To convince ourselves, we can go do some painful integrations to compute the probability of orderings that lead to a particular combination being selected.
129 // This probability is equal to that of a naive weighted selection algorithm,
130 // where the probability of selection of a particular value is equal to the ratio of the weight of the sum of remaining weights in the denominator.
131 for (auto& prob : probabilities) {
132 prob.first = - std::log(aarand::standard_uniform(rng)) / prob.first;
133 }
134
135 const Index_ leftovers = target - already_used;
136 std::nth_element(probabilities.begin(), probabilities.begin() + leftovers, probabilities.end());
137
138 for (Index_ i = 0; i < leftovers; ++i) {
139 ++to_sample[probabilities[i].second];
140 }
141 }
142 }
143
144 // Alright, actually doing the sampling with replacement now.
145 output.clear();
146 for (Index_ i = 0; i < num_obs; ++i) {
147 const auto part = partition[i];
148 auto& needed = to_sample[part];
149 if (needed == 0) {
150 continue;
151 }
152
153 auto& available = partition_count[part];
154 if (available <= needed || aarand::standard_uniform(rng) * static_cast<double>(available) <= static_cast<double>(needed)) {
155 output.push_back(i);
156 needed -= 1;
157 }
158
159 available -= 1;
160 }
161}
162
178template<typename Index_, typename Partition_>
179std::vector<Index_> compute(const Index_ num_obs, const Partition_* partition, const Index_ target, const Options& options) {
180 std::vector<Index_> output;
181 compute(num_obs, partition, target, options, output);
182 return output;
183}
184
185}
186
187#endif
Subsampling within partitions.
void compute(const Index_ num_obs, const Partition_ *partition, const Index_ target, const Options &options, std::vector< Index_ > &output)
Definition partisub.hpp:70
Options for compute().
Definition partisub.hpp:27
bool force_non_empty
Definition partisub.hpp:32
unsigned long long seed
Definition partisub.hpp:37