mumosa/mumosa_8hpp_source.html

#ifndef MUMOSA_HPP

#define MUMOSA_HPP


#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
#include <numeric>
#include <stdexcept>
#include <utility>
#include <vector>

#include "knncolle/knncolle.hpp"
#include "tatami_stats/tatami_stats.hpp"


namespace mumosa {


/**
 * @brief Options for `compute_distance()`.
 */
struct Options {
    /**
     * Number of neighbors requested from the nearest-neighbor search for each cell.
     * This is passed through `knncolle::cap_k()` together with the number of observations before the search is run.
     */
    int num_neighbors{20};

    /**
     * Number of workers, forwarded as the first argument of `knncolle::parallelize()`.
     */
    int num_threads{1};
};


/**
 * Summarize an array of per-cell distances.
 *
 * @tparam Index_ Integer type for the number of cells.
 * @tparam Distance_ Floating-point type for the distances.
 *
 * @param num_cells Number of entries in `distances`.
 * @param distances Pointer to an array of length `num_cells`.
 * It is handed to `tatami_stats::medians::direct()` as a mutable pointer,
 * so presumably its contents may be reordered on return -- TODO confirm against tatami_stats.
 *
 * @return Pair whose first element is the median of `distances` (NaNs are not skipped)
 * and whose second element is the square root of the sum of squared distances.
 * Note that the sum is not divided by `num_cells`.
 */
template<typename Index_, typename Distance_>
std::pair<Distance_, Distance_> compute_distance(Index_ num_cells, Distance_* distances) {
    // The median is taken first, before the array is traversed for the sum of squares.
    const Distance_ median = tatami_stats::medians::direct(distances, num_cells, /* skip_nan = */ false);

    Distance_ sum_squares = 0;
    for (Index_ c = 0; c < num_cells; ++c) {
        const auto current = distances[c];
        sum_squares += current * current;
    }

    const Distance_ root = std::sqrt(sum_squares);
    return std::make_pair(median, root);
}


/**
 * Compute distance summaries for one embedding from a prebuilt nearest-neighbor index.
 *
 * For every observation in `prebuilt`, a neighbor search is run and the last reported distance is recorded.
 * This is presumably the distance to the furthest of the requested neighbors, assuming the search
 * reports neighbors in order of increasing distance -- TODO confirm against knncolle.
 * The recorded distances are then summarized by the array overload of `compute_distance()`.
 *
 * @tparam Index_ Integer type for the observation indices.
 * @tparam Input_ Numeric type of the data used to build the index.
 * @tparam Distance_ Floating-point type for the distances.
 *
 * @param prebuilt Prebuilt nearest-neighbor index for the embedding.
 * @param options Further options; `num_neighbors` and `num_threads` are used here.
 *
 * @return Pair of distance summaries, as returned by the array overload of `compute_distance()`.
 */
template<typename Index_, typename Input_, typename Distance_>
std::pair<Distance_, Distance_> compute_distance(const knncolle::Prebuilt<Index_, Input_, Distance_>& prebuilt, const Options& options) {
    const Index_ nobs = prebuilt.num_observations();
    const auto capped_k = knncolle::cap_k(options.num_neighbors, nobs);

    // Stored as Distance_ (not a hard-coded double) so that the summary is computed
    // in the same type that the index reports and that this function returns.
    // Entries are value-initialized to zero, which is what remains for any
    // observation whose search yields no neighbors.
    std::vector<Distance_> dist(nobs);

    knncolle::parallelize(options.num_threads, nobs, [&](int, Index_ start, Index_ length) -> void {
        // Each task range gets its own searcher and its own scratch buffer.
        auto searcher = prebuilt.initialize();
        std::vector<Distance_> distances;
        for (Index_ i = start, end = start + length; i < end; ++i) {
            // Neighbor indices are not needed, only the distances.
            searcher->search(i, capped_k, nullptr, &distances);
            if (!distances.empty()) {
                dist[i] = distances.back();
            }
        }
    });

    return compute_distance(nobs, dist.data());
}


/**
 * Compute distance summaries for one embedding, building the nearest-neighbor index on the fly.
 *
 * The raw array is wrapped in a `knncolle::SimpleMatrix`, an index is built from it with `builder`,
 * and the result is passed to the `knncolle::Prebuilt` overload of `compute_distance()`.
 *
 * @tparam Index_ Integer type for the observation indices.
 * @tparam Input_ Numeric type of the embedding data.
 * @tparam Distance_ Floating-point type for the distances.
 * @tparam Matrix_ Matrix interface accepted by `builder`.
 *
 * @param num_dim Number of dimensions in the embedding.
 * @param num_cells Number of cells in the embedding.
 * @param data Pointer to the embedding values, forwarded unchanged to `knncolle::SimpleMatrix`.
 * @param builder Algorithm used to build the nearest-neighbor index.
 * @param options Further options.
 *
 * @return Pair of distance summaries, as returned by the `knncolle::Prebuilt` overload.
 */
template<typename Index_, typename Input_, typename Distance_, class Matrix_ = knncolle::Matrix<Index_, Input_> >
std::pair<Distance_, Distance_> compute_distance(
    std::size_t num_dim,
    Index_ num_cells,
    const Input_* data,
    const knncolle::Builder<Index_, Input_, Distance_, Matrix_>& builder,
    const Options& options)
{
    const knncolle::SimpleMatrix wrapped(num_dim, num_cells, data);
    const auto index = builder.build_unique(wrapped);
    return compute_distance(*index, options);
}


/**
 * Compute the scaling factor that brings a target embedding in line with a reference embedding.
 *
 * Each pair holds two distance summaries for an embedding; `first` is preferred and
 * `second` serves as the fallback when either `first` value is zero.
 *
 * @tparam Distance_ Floating-point type for the distances.
 *
 * @param ref Distance summaries for the reference embedding.
 * @param target Distance summaries for the target embedding.
 *
 * @return The ratio `ref.first / target.first` if both `first` values are non-zero.
 * Otherwise: infinity if `target.second` is zero, zero if `ref.second` is zero,
 * and `ref.second / target.second` in all remaining cases.
 */
template<typename Distance_>
Distance_ compute_scale(const std::pair<Distance_, Distance_>& ref, const std::pair<Distance_, Distance_>& target) {
    // Common case: both leading summaries are usable, so their ratio is the answer.
    const bool firsts_usable = (target.first != 0 && ref.first != 0);
    if (firsts_usable) {
        return ref.first / target.first;
    }

    // Fallback to the second summary; a zero denominator is reported as infinity.
    if (target.second == 0) {
        return std::numeric_limits<Distance_>::infinity();
    }
    if (ref.second == 0) {
        return 0;
    }
    return ref.second / target.second;
}


/**
 * Compute scaling factors for a collection of embeddings, relative to a reference chosen from among them.
 *
 * The reference is the first entry whose second summary is non-zero.
 * It receives a scaling factor of 1, and every other entry is scaled against it
 * with the pairwise overload of `compute_scale()`.
 *
 * @tparam Distance_ Floating-point type for the distances.
 *
 * @param distances Distance summaries, one pair per embedding.
 *
 * @return Vector of the same length as `distances`, containing one scaling factor per embedding.
 * If no entry has a non-zero second summary, all factors are zero.
 */
template<typename Distance_>
std::vector<Distance_> compute_scale(const std::vector<std::pair<Distance_, Distance_> >& distances) {
    std::vector<Distance_> scaling(distances.size());

    // Locate the reference: the earliest entry with a non-zero second summary.
    const auto chosen = std::find_if(distances.begin(), distances.end(), [](const std::pair<Distance_, Distance_>& stats) -> bool {
        return stats.second != 0;
    });

    // No usable reference means that every second summary is zero; the
    // scaling factors are left at zero as their values do not matter.
    if (chosen == distances.end()) {
        return scaling;
    }

    const auto& reference = *chosen;
    auto out = scaling.begin();
    for (auto it = distances.begin(); it != distances.end(); ++it, ++out) {
        if (it == chosen) {
            *out = 1;
        } else {
            *out = compute_scale(reference, *it);
        }
    }

    return scaling;
}


/**
 * Scale each embedding by its factor and interleave all embeddings into a single combined array.
 *
 * For cell `c`, embedding `e` and dimension `d`, the value `embeddings[e][c * num_dims[e] + d] * scaling[e]`
 * is written to `output[c * ntotal + offset_e + d]`, where `ntotal` is the sum of `num_dims` and
 * `offset_e` is the sum of `num_dims` over all embeddings before `e`.
 * Embeddings with an infinite scaling factor are written as all-zero instead.
 *
 * @tparam Index_ Integer type for the number of cells.
 * @tparam Input_ Numeric type of the input embeddings.
 * @tparam Scale_ Floating-point type of the scaling factors.
 * @tparam Output_ Numeric type of the combined output.
 *
 * @param num_dims Number of dimensions in each embedding.
 * @param num_cells Number of cells, shared by all embeddings.
 * @param embeddings Pointers to the embeddings; entry `e` must refer to at least `num_dims[e] * num_cells` values.
 * @param scaling Scaling factor for each embedding.
 * @param[out] output Pointer to an array with room for `ntotal * num_cells` values.
 *
 * @throws std::runtime_error if `num_dims`, `embeddings` and `scaling` differ in length.
 */
template<typename Index_, typename Input_, typename Scale_, typename Output_>
void combine_scaled_embeddings(const std::vector<std::size_t>& num_dims, Index_ num_cells, const std::vector<Input_*>& embeddings, const std::vector<Scale_>& scaling, Output_* output) {
    auto nembed = num_dims.size();
    if (embeddings.size() != nembed || scaling.size() != nembed) {
        throw std::runtime_error("'num_dims', 'embeddings' and 'scaling' should have the same length");
    }

    // Total number of dimensions per cell in the combined output, i.e., the output stride between cells.
    const std::size_t ntotal = std::accumulate(num_dims.begin(), num_dims.end(), static_cast<std::size_t>(0));
    std::size_t offset = 0;

    for (decltype(nembed) e = 0; e < nembed; ++e) {
        const auto curdim = num_dims[e];
        const auto inptr = embeddings[e];
        const auto s = scaling[e];

        // We use offsets to avoid forming invalid pointers with strided pointers.
        std::size_t out_position = offset;

        if (std::isinf(s)) {
            // If the scaling factor is infinite, it implies that the current
            // embedding is all-zero, so we just fill with zeros, and move on.
            // The input is never read here, so no input offset is tracked.
            for (Index_ c = 0; c < num_cells; ++c, out_position += ntotal) {
                std::fill_n(output + out_position, curdim, 0);
            }
        } else {
            std::size_t in_position = 0;
            for (Index_ c = 0; c < num_cells; ++c, in_position += curdim, out_position += ntotal) {
                for (std::size_t d = 0; d < curdim; ++d) {
                    output[out_position + d] = inptr[in_position + d] * s;
                }
            }
        }

        // Next embedding starts after this one's dimensions within each cell.
        offset += curdim;
    }
}


}


#endif

knncolle::Builder

knncolle::Builder::build_unique
std::unique_ptr< Prebuilt< Index_, Data_, Distance_ > > build_unique(const Matrix_ &data) const

knncolle::Prebuilt

knncolle::Prebuilt::num_observations
virtual Index_ num_observations() const=0

knncolle::SimpleMatrix

knncolle.hpp

knncolle::parallelize
void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range)

knncolle::cap_k
int cap_k(int k, Index_ num_observations)

mumosa
Scale multi-modal embeddings to adjust for differences in variance.

mumosa::compute_scale
Distance_ compute_scale(const std::pair< Distance_, Distance_ > &ref, const std::pair< Distance_, Distance_ > &target)
Definition mumosa.hpp:150

mumosa::combine_scaled_embeddings
void combine_scaled_embeddings(const std::vector< std::size_t > &num_dims, Index_ num_cells, const std::vector< Input_ * > &embeddings, const std::vector< Scale_ > &scaling, Output_ *output)
Definition mumosa.hpp:226

mumosa::compute_distance
std::pair< Distance_, Distance_ > compute_distance(Index_ num_cells, Distance_ *distances)
Definition mumosa.hpp:57

mumosa::Options
Options for compute_distance().
Definition mumosa.hpp:28

mumosa::Options::num_threads
int num_threads
Definition mumosa.hpp:39

mumosa::Options::num_neighbors
int num_neighbors
Definition mumosa.hpp:33