mnncorrect
Batch correction with mutual nearest neighbors
mnncorrect.hpp
Go to the documentation of this file.
#ifndef MNNCORRECT_HPP
#define MNNCORRECT_HPP

#include <algorithm>
#include <vector>
#include <numeric>
#include <stdexcept>
#include <cstddef>
#include <memory> // for std::shared_ptr and std::make_shared.

#include "knncolle/knncolle.hpp"

#include "AutomaticOrder.hpp"
#include "restore_input_order.hpp"
#include "utils.hpp"

/**
 * @file mnncorrect.hpp
 * @brief Batch correction with mutual nearest neighbors.
 */

namespace mnncorrect {

/**
 * @brief Options for `compute()`.
 */
template<typename Index_, typename Float_, class Matrix_ = knncolle::Matrix<Index_, Float_> >
struct Options {
    /**
     * Number of neighbors to use in the nearest-neighbor searches,
     * e.g., when identifying mutual nearest neighbor (MNN) pairs.
     */
    int num_neighbors = 15;

    /**
     * Number of steps of the recursive neighbor search used to compute the centers of mass.
     */
    int num_steps = 1;

    /**
     * Algorithm for building the nearest-neighbor search indices.
     * If NULL, an exact vantage point tree search with Euclidean distances is used.
     */
    std::shared_ptr<knncolle::Builder<Index_, Float_, Float_, Matrix_> > builder;

    /**
     * Policy for choosing the order in which batches are merged.
     */
    MergePolicy merge_policy = MergePolicy::RSS;

    /**
     * Number of threads to use.
     */
    int num_threads = 1;
};
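
// For illustration only: a hypothetical sketch of configuring these options
// before calling compute(); the types and values here are made up.
//
//     mnncorrect::Options<int, double> opt;
//     opt.num_neighbors = 30; // trade speed for stability.
//     opt.num_threads = 4;    // parallelize the neighbor searches.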

/**
 * @cond
 */
namespace internal {

template<typename Index_, typename Float_, class Matrix_>
void compute(std::size_t num_dim, const std::vector<Index_>& num_obs, const std::vector<const Float_*>& batches, Float_* output, const Options<Index_, Float_, Matrix_>& options) {
    auto builder = options.builder;
    if (!builder) {
        // Default to an exact search with Euclidean distances.
        typedef knncolle::EuclideanDistance<Float_, Float_> Euclidean;
        builder.reset(new knncolle::VptreeBuilder<Index_, Float_, Float_, Matrix_, Euclidean>(std::make_shared<Euclidean>()));
    }

    AutomaticOrder<Index_, Float_, Matrix_> runner(
        num_dim,
        num_obs,
        batches,
        output,
        *builder,
        options.num_neighbors,
        options.num_steps,
        options.merge_policy,
        options.num_threads
    );

    runner.merge();
}

}
/**
 * @endcond
 */

/**
 * Correct for batch effects using mutual nearest neighbors,
 * given a separate coordinate array for each batch
 * (each observation occupying `num_dim` consecutive values).
 */
template<typename Index_, typename Float_, class Matrix_>
void compute(std::size_t num_dim, const std::vector<Index_>& num_obs, const std::vector<const Float_*>& batches, Float_* output, const Options<Index_, Float_, Matrix_>& options) {
    internal::compute(num_dim, num_obs, batches, output, options);
}
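
// A hypothetical sketch of calling the overload above with two batches in
// separate arrays; all sizes are made up, and the arrays would be filled with
// real embeddings in practice.
//
//     std::size_t num_dim = 10;
//     std::vector<double> b1(100 * num_dim), b2(200 * num_dim);
//     std::vector<const double*> batches{ b1.data(), b2.data() };
//     std::vector<int> num_obs{ 100, 200 };
//     std::vector<double> output(300 * num_dim);
//     mnncorrect::compute(num_dim, num_obs, batches, output.data(), mnncorrect::Options<int, double>());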

/**
 * Overload of `compute()` that accepts a single contiguous array of coordinates,
 * with the batches stored one after another in the same order as `num_obs`.
 */
template<typename Index_, typename Float_, class Matrix_>
void compute(std::size_t num_dim, const std::vector<Index_>& num_obs, const Float_* input, Float_* output, const Options<Index_, Float_, Matrix_>& options) {
    std::vector<const Float_*> batches;
    batches.reserve(num_obs.size());
    for (auto n : num_obs) {
        batches.push_back(input);
        input += static_cast<std::size_t>(n) * num_dim; // cast to size_t to avoid overflow.
    }
    compute(num_dim, num_obs, batches, output, options);
}
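
// A hypothetical sketch for the contiguous overload above, reusing the names
// from the previous sketch; the two batches are concatenated in one array.
//
//     std::vector<double> combined(300 * num_dim); // batch of 100, then batch of 200.
//     mnncorrect::compute(num_dim, num_obs, combined.data(), output.data(), mnncorrect::Options<int, double>());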

/**
 * Overload of `compute()` that accepts a batch assignment vector.
 * Observations are reordered into contiguous batches before correction,
 * and corrected coordinates are reported in `output` in the original input order.
 */
template<typename Index_, typename Float_, typename Batch_, class Matrix_>
void compute(std::size_t num_dim, Index_ num_obs, const Float_* input, const Batch_* batch, Float_* output, const Options<Index_, Float_, Matrix_>& options) {
    const BatchIndex nbatches = (num_obs ? static_cast<BatchIndex>(*std::max_element(batch, batch + num_obs)) + 1 : 0);
    std::vector<Index_> sizes(nbatches);
    for (Index_ o = 0; o < num_obs; ++o) {
        ++sizes[batch[o]];
    }

    // Avoid allocating a temporary buffer if the batches are already contiguous.
    bool already_sorted = true;
    for (Index_ o = 1; o < num_obs; ++o) {
        if (batch[o] < batch[o-1]) {
            already_sorted = false;
            break;
        }
    }
    if (already_sorted) {
        compute(num_dim, sizes, input, output, options);
        return;
    }

    std::size_t accumulated = 0; // use size_t to avoid overflow during later multiplication.
    std::vector<std::size_t> offsets(nbatches);
    for (BatchIndex b = 0; b < nbatches; ++b) {
        offsets[b] = accumulated;
        accumulated += sizes[b];
    }

    // Copying everything into a temporary buffer, ordered by batch.
    std::vector<Float_> tmp(num_dim * static_cast<std::size_t>(num_obs)); // cast to size_t to avoid overflow.
    std::vector<const Float_*> ptrs(nbatches);
    for (BatchIndex b = 0; b < nbatches; ++b) {
        ptrs[b] = tmp.data() + offsets[b] * num_dim; // already size_t, so no cast is needed.
    }

    for (Index_ o = 0; o < num_obs; ++o) {
        auto current = input + static_cast<std::size_t>(o) * num_dim; // cast to size_t to avoid overflow.
        auto& offset = offsets[batch[o]];
        auto destination = tmp.data() + num_dim * offset; // already size_t, so no cast is needed.
        std::copy_n(current, num_dim, destination);
        ++offset;
    }

    internal::compute(num_dim, sizes, ptrs, output, options);
    internal::restore_input_order(num_dim, sizes, batch, output);
}

}

#endif
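
For reference, a minimal end-to-end sketch of the batch-assignment overload above. The data, sizes and option values are hypothetical, and the include path assumes the header is installed under an mnncorrect/ directory:

#include "mnncorrect/mnncorrect.hpp"

#include <cstddef>
#include <vector>

int main() {
    std::size_t num_dim = 10;
    int num_obs = 500;

    // Each observation occupies 'num_dim' consecutive values;
    // this would be filled with real embeddings in practice.
    std::vector<double> input(num_obs * num_dim);

    // Hypothetical batch assignments in [0, 2), in arbitrary order.
    std::vector<int> batch(num_obs);
    for (int o = 0; o < num_obs; ++o) {
        batch[o] = o % 2;
    }

    mnncorrect::Options<int, double> opt;
    opt.num_neighbors = 20;

    // Corrected coordinates are reported in the original input order.
    std::vector<double> output(input.size());
    mnncorrect::compute(num_dim, num_obs, input.data(), batch.data(), output.data(), opt);
    return 0;
}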