mumosa
Multi-modal analyses of single-cell data
Loading...
Searching...
No Matches
mumosa.hpp
Go to the documentation of this file.
1#ifndef MUMOSA_HPP
2#define MUMOSA_HPP
3
4#include <vector>
5#include <stdexcept>
6#include <cmath>
7#include <algorithm>
8#include <limits>
9
10#include "knncolle/knncolle.hpp"
11#include "tatami_stats/tatami_stats.hpp"
12
22namespace mumosa {
23
27struct Options {
32 int num_neighbors = 20;
33
38 int num_threads = 1;
39};
40
55template<typename Index_, typename Float_>
56std::pair<Float_, Float_> compute_distance(Index_ num_cells, Float_* distances) {
57 Float_ med = tatami_stats::medians::direct(distances, num_cells, /* skip_nan = */ false);
58 Float_ rmsd = 0;
59 for (Index_ i = 0; i < num_cells; ++i) {
60 auto d = distances[i];
61 rmsd += d * d;
62 }
63 rmsd = std::sqrt(rmsd);
64 return std::make_pair(med, rmsd);
65}
66
79template<typename Dim_, typename Index_, typename Float_>
80std::pair<Float_, Float_> compute_distance(const knncolle::Prebuilt<Dim_, Index_, Float_>& prebuilt, const Options& options) {
81 size_t nobs = prebuilt.num_observations();
82 std::vector<double> dist(nobs);
83
84 knncolle::parallelize(options.num_threads, nobs, [&](size_t, size_t start, size_t length) -> void {
85 auto searcher = prebuilt.initialize();
86 std::vector<Float_> distances;
87 for (size_t i = start, end = start + length; i < end; ++i) {
88 searcher->search(i, options.num_neighbors, NULL, &distances);
89 if (distances.size()) {
90 dist[i] = distances.back();
91 }
92 }
93 });
94
95 return compute_distance(nobs, dist.data());
96}
97
114template<typename Dim_, typename Index_, typename Float_>
115std::pair<Float_, Float_> compute_distance(
116 Dim_ num_dim,
117 Index_ num_cells,
118 const Float_* data,
120 const Options& options)
121{
122 auto prebuilt = builder.build_unique(knncolle::SimpleMatrix(num_dim, num_cells, data));
123 return compute_distance(*prebuilt, options);
124}
125
144template<typename Float_>
145Float_ compute_scale(const std::pair<Float_, Float_>& ref, const std::pair<Float_, Float_>& target) {
146 if (target.first == 0 || ref.first == 0) {
147 if (target.second == 0) {
148 return std::numeric_limits<Float_>::infinity();
149 } else if (ref.second == 0) {
150 return 0;
151 } else {
152 return ref.second / target.second;
153 }
154 } else {
155 return ref.first / target.first;
156 }
157}
158
172template<typename Float_>
173std::vector<Float_> compute_scale(const std::vector<std::pair<Float_, Float_> >& distances) {
174 std::vector<Float_> output(distances.size());
175
176 // Use the first entry with a non-zero RMSD as the reference.
177 bool found_ref = false;
178 size_t ref = 0;
179 for (size_t e = 0; e < distances.size(); ++e) {
180 if (distances[e].second) {
181 found_ref = true;
182 ref = e;
183 break;
184 }
185 }
186
187 // If all of them have a zero RMSD, then all scalings are zero, because it doesn't matter.
188 if (found_ref) {
189 const auto& dref = distances[ref];
190 for (size_t e = 0; e < distances.size(); ++e) {
191 output[e] = (e == ref ? 1 : compute_scale(dref, distances[e]));
192 }
193 }
194
195 return output;
196}
197
220template<typename Dim_, typename Index_, typename Input_, typename Scale_, typename Output_>
221void combine_scaled_embeddings(const std::vector<Dim_>& num_dims, Index_ num_cells, const std::vector<Input_*>& embeddings, const std::vector<Scale_>& scaling, Output_* output) {
222 size_t nembed = num_dims.size();
223 if (embeddings.size() != nembed || scaling.size() != nembed) {
224 throw std::runtime_error("'num_dims', 'embeddings' and 'scale' should have the same length");
225 }
226
227 size_t ntotal = std::accumulate(num_dims.begin(), num_dims.end(), static_cast<size_t>(0));
228 size_t offset = 0;
229
230 for (size_t e = 0; e < nembed; ++e) {
231 Dim_ curdim = num_dims[e];
232 auto inptr = embeddings[e];
233 auto s = scaling[e];
234
235 // We use offsets to avoid forming invalid pointers with strided pointers.
236 size_t in_position = 0;
237 size_t out_position = offset;
238
239 if (std::isinf(s)) {
240 // If the scaling factor is infinite, it implies that the current
241 // embedding is all-zero, so we just fill with zeros, and move on.
242 for (Index_ c = 0; c < num_cells; ++c, in_position += curdim, out_position += ntotal) {
243 std::fill_n(output + out_position, curdim, 0);
244 }
245 } else {
246 for (Index_ c = 0; c < num_cells; ++c, in_position += curdim, out_position += ntotal) {
247 for (Dim_ d = 0; d < curdim; ++d) {
248 output[out_position + d] = inptr[in_position + d] * s;
249 }
250 }
251 }
252
253 offset += curdim;
254 }
255}
256
257}
258
259#endif
virtual Index_ num_observations() const=0
void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range)
Scale multi-modal embeddings to adjust for differences in variance.
std::pair< Float_, Float_ > compute_distance(Index_ num_cells, Float_ *distances)
Definition mumosa.hpp:56
void combine_scaled_embeddings(const std::vector< Dim_ > &num_dims, Index_ num_cells, const std::vector< Input_ * > &embeddings, const std::vector< Scale_ > &scaling, Output_ *output)
Definition mumosa.hpp:221
Float_ compute_scale(const std::pair< Float_, Float_ > &ref, const std::pair< Float_, Float_ > &target)
Definition mumosa.hpp:145
Options for compute_distance().
Definition mumosa.hpp:27
int num_threads
Definition mumosa.hpp:38
int num_neighbors
Definition mumosa.hpp:32