mumosa
Multi-modal analyses of single-cell data
Loading...
Searching...
No Matches
mumosa.hpp
Go to the documentation of this file.
1#ifndef MUMOSA_HPP
2#define MUMOSA_HPP
3
4#include <vector>
5#include <stdexcept>
6#include <cmath>
7#include <algorithm>
8#include <limits>
9#include <cstddef>
10
11#include "knncolle/knncolle.hpp"
12#include "tatami_stats/tatami_stats.hpp"
13
23namespace mumosa {
24
28struct Options {
33 int num_neighbors = 20;
34
39 int num_threads = 1;
40};
41
56template<typename Index_, typename Distance_>
57std::pair<Distance_, Distance_> compute_distance(Index_ num_cells, Distance_* distances) {
58 Distance_ med = tatami_stats::medians::direct(distances, num_cells, /* skip_nan = */ false);
59 Distance_ rmsd = 0;
60 for (Index_ i = 0; i < num_cells; ++i) {
61 auto d = distances[i];
62 rmsd += d * d;
63 }
64 rmsd = std::sqrt(rmsd);
65 return std::make_pair(med, rmsd);
66}
67
81template<typename Index_, typename Input_, typename Distance_>
82std::pair<Distance_, Distance_> compute_distance(const knncolle::Prebuilt<Index_, Input_, Distance_>& prebuilt, const Options& options) {
83 Index_ nobs = prebuilt.num_observations();
84 auto capped_k = knncolle::cap_k(options.num_neighbors, nobs);
85 std::vector<double> dist(nobs);
86
87 knncolle::parallelize(options.num_threads, nobs, [&](int, Index_ start, Index_ length) -> void {
88 auto searcher = prebuilt.initialize();
89 std::vector<Distance_> distances;
90 for (Index_ i = start, end = start + length; i < end; ++i) {
91 searcher->search(i, capped_k, NULL, &distances);
92 if (distances.size()) {
93 dist[i] = distances.back();
94 }
95 }
96 });
97
98 return compute_distance(nobs, dist.data());
99}
100
119template<typename Index_, typename Input_, typename Distance_, class Matrix_ = knncolle::Matrix<Index_, Input_> >
120std::pair<Distance_, Distance_> compute_distance(
121 std::size_t num_dim,
122 Index_ num_cells,
123 const Input_* data,
125 const Options& options)
126{
127 auto prebuilt = builder.build_unique(knncolle::SimpleMatrix(num_dim, num_cells, data));
128 return compute_distance(*prebuilt, options);
129}
130
149template<typename Distance_>
150Distance_ compute_scale(const std::pair<Distance_, Distance_>& ref, const std::pair<Distance_, Distance_>& target) {
151 if (target.first == 0 || ref.first == 0) {
152 if (target.second == 0) {
153 return std::numeric_limits<Distance_>::infinity();
154 } else if (ref.second == 0) {
155 return 0;
156 } else {
157 return ref.second / target.second;
158 }
159 } else {
160 return ref.first / target.first;
161 }
162}
163
177template<typename Distance_>
178std::vector<Distance_> compute_scale(const std::vector<std::pair<Distance_, Distance_> >& distances) {
179 std::vector<Distance_> output(distances.size());
180
181 // Use the first entry with a non-zero RMSD as the reference.
182 bool found_ref = false;
183 auto ndist = distances.size();
184 decltype(ndist) ref = 0;
185 for (decltype(ndist) e = 0; e < ndist; ++e) {
186 if (distances[e].second) {
187 found_ref = true;
188 ref = e;
189 break;
190 }
191 }
192
193 // If all of them have a zero RMSD, then all scalings are zero, because it doesn't matter.
194 if (found_ref) {
195 const auto& dref = distances[ref];
196 for (decltype(ndist) e = 0; e < ndist; ++e) {
197 output[e] = (e == ref ? 1 : compute_scale(dref, distances[e]));
198 }
199 }
200
201 return output;
202}
203
225template<typename Index_, typename Input_, typename Scale_, typename Output_>
226void combine_scaled_embeddings(const std::vector<std::size_t>& num_dims, Index_ num_cells, const std::vector<Input_*>& embeddings, const std::vector<Scale_>& scaling, Output_* output) {
227 auto nembed = num_dims.size();
228 if (embeddings.size() != nembed || scaling.size() != nembed) {
229 throw std::runtime_error("'num_dims', 'embeddings' and 'scale' should have the same length");
230 }
231
232 std::size_t ntotal = std::accumulate(num_dims.begin(), num_dims.end(), static_cast<std::size_t>(0));
233 std::size_t offset = 0;
234
235 for (decltype(nembed) e = 0; e < nembed; ++e) {
236 auto curdim = num_dims[e];
237 auto inptr = embeddings[e];
238 auto s = scaling[e];
239
240 // We use offsets to avoid forming invalid pointers with strided pointers.
241 std::size_t in_position = 0;
242 std::size_t out_position = offset;
243
244 if (std::isinf(s)) {
245 // If the scaling factor is infinite, it implies that the current
246 // embedding is all-zero, so we just fill with zeros, and move on.
247 for (Index_ c = 0; c < num_cells; ++c, in_position += curdim, out_position += ntotal) {
248 std::fill_n(output + out_position, curdim, 0);
249 }
250 } else {
251 for (Index_ c = 0; c < num_cells; ++c, in_position += curdim, out_position += ntotal) {
252 for (std::size_t d = 0; d < curdim; ++d) {
253 output[out_position + d] = inptr[in_position + d] * s;
254 }
255 }
256 }
257
258 offset += curdim;
259 }
260}
261
262}
263
264#endif
std::unique_ptr< Prebuilt< Index_, Data_, Distance_ > > build_unique(const Matrix_ &data) const
virtual Index_ num_observations() const=0
void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range)
int cap_k(int k, Index_ num_observations)
Scale multi-modal embeddings to adjust for differences in variance.
Distance_ compute_scale(const std::pair< Distance_, Distance_ > &ref, const std::pair< Distance_, Distance_ > &target)
Definition mumosa.hpp:150
void combine_scaled_embeddings(const std::vector< std::size_t > &num_dims, Index_ num_cells, const std::vector< Input_ * > &embeddings, const std::vector< Scale_ > &scaling, Output_ *output)
Definition mumosa.hpp:226
std::pair< Distance_, Distance_ > compute_distance(Index_ num_cells, Distance_ *distances)
Definition mumosa.hpp:57
Options for compute_distance().
Definition mumosa.hpp:28
int num_threads
Definition mumosa.hpp:39
int num_neighbors
Definition mumosa.hpp:33