scran_markers
Marker detection for single-cell data
Loading...
Searching...
No Matches
summarize_comparisons.hpp
Go to the documentation of this file.
1#ifndef SCRAN_MARKERS_SUMMARIZE_COMPARISONS_HPP
2#define SCRAN_MARKERS_SUMMARIZE_COMPARISONS_HPP
3
4#include <algorithm>
5#include <numeric>
6#include <vector>
7#include <cmath>
8#include <cstddef>
9#include <optional>
10#include <cassert>
11
12#include "tatami_stats/tatami_stats.hpp"
13#include "sanisizer/sanisizer.hpp"
15
16#include "utils.hpp"
17
23namespace scran_markers {
24
31template<typename Stat_ = double, typename Rank_ = int>
38 Stat_* min = NULL;
39
45 Stat_* mean = NULL;
46
52 Stat_* median = NULL;
53
59 Stat_* max = NULL;
60
74 std::optional<std::vector<Stat_*> > quantiles;
75
81 Rank_* min_rank = NULL;
82};
83
90template<typename Stat_ = double, typename Rank_ = int>
96 std::vector<Stat_> min;
97
102 std::vector<Stat_> mean;
103
108 std::vector<Stat_> median;
109
114 std::vector<Stat_> max;
115
122 std::optional<std::vector<std::vector<Stat_> > > quantiles;
123
128 std::vector<Rank_> min_rank;
129};
130
134namespace internal {
135
136inline void validate_quantiles(const std::optional<std::vector<double> >& probs) {
137 if (!probs.has_value()) {
138 return;
139 }
140
141 const auto val = probs->front();
142 if (val < 0 || val > 1) {
143 throw std::runtime_error("quantile probabilities should be in [0, 1]");
144 }
145
146 const auto nprobs = probs->size();
147 for (I<decltype(nprobs)> i = 1; i < nprobs; ++i) {
148 const auto val = (*probs)[i];
149 if (val < 0 || val > 1) {
150 throw std::runtime_error("quantile probabilities should be in [0, 1]");
151 }
152 if (val < (*probs)[i - 1]) {
153 throw std::runtime_error("quantile probabilities should be sorted");
154 }
155 }
156}
157
158template<typename Stat_, class Iterator_>
159class MultipleQuantiles {
160public:
161 MultipleQuantiles(const std::vector<double>& probs, const std::size_t max_len) :
162 my_stacks(sanisizer::cast<I<decltype(my_stacks.size())> >(max_len)),
163 my_probs(&probs)
164 {
165 sanisizer::can_ptrdiff<Iterator_>(max_len);
166 }
167
168 struct Details {
169 std::vector<std::size_t> index;
170 std::vector<double> lower_fraction;
171 std::vector<double> upper_fraction;
172 };
173
174private:
175 std::vector<std::optional<Details> > my_stacks;
176 const std::vector<double>* my_probs; // avoid unnecessary copies of the quantile probabilities.
177
178private:
179 Details& initialize_stack(const std::size_t len) {
180 // len is guaranteed to be > 1 from summarize_comparisons().
181 assert(len > 1);
182 auto& raw_stack = my_stacks[len - 1];
183 if (raw_stack.has_value()) {
184 return *raw_stack;
185 }
186
187 raw_stack.emplace();
188 auto& stack = *raw_stack;
189 const auto nprobs = my_probs->size();
190 stack.index.reserve(nprobs);
191 stack.lower_fraction.reserve(nprobs);
192 stack.upper_fraction.reserve(nprobs);
193
194 for (const auto prob : *my_probs) {
195 const double frac = static_cast<double>(len - 1) * static_cast<double>(prob);
196 const double base = std::floor(frac);
197 stack.index.push_back(base); // cast is known-safe if can_ptrdiff passes and 0 <= quantile <= 1.
198 stack.upper_fraction.push_back(frac - base);
199 stack.lower_fraction.push_back(static_cast<double>(1) - stack.upper_fraction.back());
200 }
201
202 return stack;
203 }
204
205public:
206 template<class OutputFun_>
207 void compute(const std::size_t len, Iterator_ begin, Iterator_ end, OutputFun_ output) {
208 // len is assumed to be > 1 and equal to 'end - begin'.
209 // We just accept it as an argument to avoid recomputing it.
210 assert(len > 1);
211 assert(len == static_cast<std::size_t>(end - begin));
212 auto& stack = initialize_stack(len);
213
214 // Here, the assumption is that we're computing quantiles by increasing probability.
215 // We keep track of how much we've already sorted so that we don't have to call
216 // nth_element for a given index if we already computed for a previous probability.
217 // This allows us to avoid near-redundant sorts for similar probabilities.
218 std::size_t sorted_up_to = 0;
219
220 const auto nprobs = stack.index.size();
221 for (I<decltype(nprobs)> p = 0; p < nprobs; ++p) {
222 const auto curindex = stack.index[p];
223 const auto curlower = stack.lower_fraction[p];
224 const auto curupper = stack.upper_fraction[p];
225
226 const auto target = begin + curindex;
227 if (curindex >= sorted_up_to) { // avoid re-searching for the nth element if we already found what we wanted for a lower probability.
228 std::nth_element(begin + sorted_up_to, target, end);
229 sorted_up_to = curindex + 1;
230 }
231 const auto lower_val = *target;
232
233 if (curupper != 0) {
234 const auto curindex_p1 = curindex + 1;
235 const auto target_p1 = begin + curindex_p1;
236 if (curindex_p1 >= sorted_up_to) {
237 // Basically mimics nth_element(target_p1, target_p1, end).
238 std::swap(*target_p1, *std::min_element(target_p1, end));
239 sorted_up_to = curindex_p1 + 1;
240 }
241 const auto upper_val = *target_p1;
242 output(p, lower_val * curlower + upper_val * curupper);
243 } else {
244 output(p, lower_val);
245 }
246 }
247 }
248};
249
250template<typename Stat_>
251using MaybeMultipleQuantiles = std::optional<MultipleQuantiles<Stat_, Stat_*> >;
252
253template<typename Stat_>
254MaybeMultipleQuantiles<Stat_> setup_multiple_quantiles(const std::optional<std::vector<double> >& requested, const std::size_t ngroups) {
255 MaybeMultipleQuantiles<Stat_> output;
256 if (requested.has_value()) {
257 output.emplace(*requested, ngroups);
258 }
259 return output;
260}
261
262template<typename Stat_, typename Gene_, typename Rank_>
263void summarize_comparisons(
264 const std::size_t ngroups,
265 const Stat_* const effects,
266 const std::size_t group,
267 const Gene_ gene,
268 const SummaryBuffers<Stat_, Rank_>& output,
269 MaybeMultipleQuantiles<Stat_>& quantile_calculators,
270 std::vector<Stat_>& buffer
271) {
272 // Ignoring the self comparison and pruning out NaNs.
273 std::size_t ncomps = 0;
274 for (I<decltype(ngroups)> r = 0; r < ngroups; ++r) {
275 if (r == group || std::isnan(effects[r])) {
276 continue;
277 }
278 buffer[ncomps] = effects[r];
279 ++ncomps;
280 }
281
282 if (ncomps <= 1) {
283 Stat_ val = (ncomps == 0 ? std::numeric_limits<Stat_>::quiet_NaN() : buffer[0]);
284 if (output.min) {
285 output.min[gene] = val;
286 }
287 if (output.mean) {
288 output.mean[gene] = val;
289 }
290 if (output.max) {
291 output.max[gene] = val;
292 }
293 if (output.median) {
294 output.median[gene] = val;
295 }
296 if (output.quantiles.has_value()) {
297 for (const auto& quan : *(output.quantiles)) {
298 quan[gene] = val;
299 }
300 }
301
302 } else {
303 const auto ebegin = buffer.data(), elast = ebegin + ncomps;
304 if (output.min) {
305 output.min[gene] = *std::min_element(ebegin, elast);
306 }
307 if (output.mean) {
308 output.mean[gene] = std::accumulate(ebegin, elast, static_cast<Stat_>(0)) / ncomps;
309 }
310 if (output.max) {
311 output.max[gene] = *std::max_element(ebegin, elast);
312 }
313 // This following calculations mutate the buffer, so we put this last to avoid surprises.
314 if (output.median) {
315 output.median[gene] = tatami_stats::medians::direct(ebegin, ncomps, /* skip_nan = */ false);
316 }
317 if (output.quantiles.has_value()) {
318 quantile_calculators->compute(
319 ncomps,
320 ebegin,
321 elast,
322 [&](const std::size_t i, const Stat_ value) -> void {
323 (*output.quantiles)[i][gene] = value;
324 }
325 );
326 }
327 }
328}
329
330template<typename Gene_, typename Stat_, typename Rank_>
331void summarize_comparisons(
332 const Gene_ ngenes,
333 const std::size_t ngroups,
334 const Stat_* const effects,
335 const std::optional<std::vector<double> >& compute_quantiles,
336 const std::vector<SummaryBuffers<Stat_, Rank_> >& output,
337 const int threads
338) {
339 tatami::parallelize([&](const int, const Gene_ start, const Gene_ length) -> void {
340 auto summary_qcalcs = setup_multiple_quantiles<Stat_>(compute_quantiles, ngroups);
341 auto buffer = sanisizer::create<std::vector<Stat_> >(ngroups);
342
343 for (Gene_ gene = start, end = start + length; gene < end; ++gene) {
344 for (I<decltype(ngroups)> l = 0; l < ngroups; ++l) {
345 const auto current_effects = effects + sanisizer::nd_offset<std::size_t>(0, ngroups, l, ngroups, gene);
346 summarize_comparisons(ngroups, current_effects, l, gene, output[l], summary_qcalcs, buffer);
347 }
348 }
349 }, ngenes, threads);
350}
351
352template<typename Stat_, typename Gene_>
353Gene_ fill_and_sort_rank_buffer(const Stat_* const effects, const std::size_t stride, std::vector<std::pair<Stat_, Gene_> >& buffer) {
354 Gene_ counter = 0;
355 for (Gene_ i = 0, end = buffer.size(); i < end; ++i) {
356 const auto cureffect = effects[sanisizer::product_unsafe<std::size_t>(i, stride)];
357 if (!std::isnan(cureffect)) {
358 auto& current = buffer[counter];
359 current.first = cureffect;
360 current.second = i;
361 ++counter;
362 }
363 }
364
365 std::sort(
366 buffer.begin(),
367 buffer.begin() + counter,
368 [&](const std::pair<Stat_, Gene_>& left, const std::pair<Stat_, Gene_>& right) -> bool {
369 // Sort by decreasing first element, then break ties by increasing second element.
370 if (left.first == right.first) {
371 return left.second < right.second;
372 } else {
373 return left.first > right.first;
374 }
375 }
376 );
377
378 return counter;
379}
380
381template<typename Stat_, typename Gene_, typename Rank_>
382void compute_min_rank_pairwise(
383 const Gene_ ngenes,
384 const std::size_t ngroups,
385 const Stat_* const effects,
386 const std::vector<SummaryBuffers<Stat_, Rank_> >& output,
387 const bool preserve_ties,
388 const int threads
389) {
390 const auto ngroups2 = sanisizer::product_unsafe<std::size_t>(ngroups, ngroups);
391 const auto maxrank_placeholder = sanisizer::cast<Rank_>(ngenes); // using the maximum possible rank (i.e., 'ngenes') as the default.
392
393 tatami::parallelize([&](const int, const std::size_t start, const std::size_t length) -> void {
394 auto buffer = sanisizer::create<std::vector<std::pair<Stat_, Gene_> > >(ngenes);
395 for (I<decltype(start)> g = start, end = start + length; g < end; ++g) {
396 const auto target = output[g].min_rank;
397 if (target == NULL) {
398 continue;
399 }
400 std::fill_n(target, ngenes, maxrank_placeholder);
401
402 for (I<decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {
403 if (g == g2) {
404 continue;
405 }
406 const auto offset = sanisizer::nd_offset<std::size_t>(g2, ngroups, g);
407 const auto used = fill_and_sort_rank_buffer(effects + offset, ngroups2, buffer);
408
409 if (!preserve_ties) {
410 Rank_ counter = 1;
411 for (Gene_ i = 0; i < used; ++i) {
412 auto& current = target[buffer[i].second];
413 if (counter < current) {
414 current = counter;
415 }
416 ++counter;
417 }
418 } else {
419 Rank_ counter = 1;
420 Gene_ i = 0;
421 while (i < used) {
422 const auto original = i;
423 const auto val = buffer[i].first;
424
425 auto& current = target[buffer[i].second];
426 if (counter < current) {
427 current = counter;
428 }
429
430 while (++i < used && buffer[i].first == val) {
431 auto& current = target[buffer[i].second];
432 if (counter < current) {
433 current = counter;
434 }
435 }
436
437 counter += i - original;
438 }
439 }
440 }
441 }
442 }, ngroups, threads);
443}
444
445template<typename Gene_, typename Stat_, typename Rank_>
446SummaryBuffers<Stat_, Rank_> fill_summary_results(
447 const Gene_ ngenes,
448 SummaryResults<Stat_, Rank_>& out,
449 const bool compute_min,
450 const bool compute_mean,
451 const bool compute_median,
452 const bool compute_max,
453 const std::optional<std::vector<double> >& compute_quantiles,
454 const bool compute_min_rank
455) {
456 SummaryBuffers<Stat_, Rank_> ptr;
457 const auto out_len = sanisizer::cast<typename std::vector<Stat_>::size_type>(ngenes);
458
459 if (compute_min) {
460 out.min.resize(out_len
461#ifdef SCRAN_MARKERS_TEST_INIT
462 , SCRAN_MARKERS_TEST_INIT
463#endif
464 );
465 ptr.min = out.min.data();
466 }
467
468 if (compute_mean) {
469 out.mean.resize(out_len
470#ifdef SCRAN_MARKERS_TEST_INIT
471 , SCRAN_MARKERS_TEST_INIT
472#endif
473 );
474 ptr.mean = out.mean.data();
475 }
476
477 if (compute_median) {
478 out.median.resize(out_len
479#ifdef SCRAN_MARKERS_TEST_INIT
480 , SCRAN_MARKERS_TEST_INIT
481#endif
482 );
483 ptr.median = out.median.data();
484 }
485
486 if (compute_max) {
487 out.max.resize(out_len
488#ifdef SCRAN_MARKERS_TEST_INIT
489 , SCRAN_MARKERS_TEST_INIT
490#endif
491 );
492 ptr.max = out.max.data();
493 }
494
495 if (compute_quantiles.has_value()) {
496 out.quantiles.emplace();
497 ptr.quantiles.emplace();
498 out.quantiles->reserve(compute_quantiles->size());
499 ptr.quantiles->reserve(compute_quantiles->size());
500 for ([[maybe_unused]] const auto quan : *compute_quantiles) {
501 out.quantiles->emplace_back(out_len
502#ifdef SCRAN_MARKERS_TEST_INIT
503 , SCRAN_MARKERS_TEST_INIT
504#endif
505 );
506 ptr.quantiles->push_back(out.quantiles->back().data());
507 }
508 }
509
510 if (compute_min_rank) {
511 out.min_rank.resize(out_len
512#ifdef SCRAN_MARKERS_TEST_INIT
513 , SCRAN_MARKERS_TEST_INIT
514#endif
515 );
516 ptr.min_rank = out.min_rank.data();
517 }
518
519 return ptr;
520}
521
522template<typename Gene_, typename Stat_, typename Rank_>
523std::vector<SummaryBuffers<Stat_, Rank_> > fill_summary_results(
524 Gene_ ngenes,
525 const std::size_t ngroups,
526 std::vector<SummaryResults<Stat_, Rank_> >& outputs,
527 const bool compute_min,
528 const bool compute_mean,
529 const bool compute_median,
530 const bool compute_max,
531 const std::optional<std::vector<double> >& compute_quantiles,
532 const bool compute_min_rank
533) {
534 sanisizer::resize(outputs, ngroups);
535 std::vector<SummaryBuffers<Stat_, Rank_> > ptrs;
536 ptrs.reserve(ngroups);
537 for (I<decltype(ngroups)> g = 0; g < ngroups; ++g) {
538 ptrs.emplace_back(
539 fill_summary_results(
540 ngenes,
541 outputs[g],
542 compute_min,
543 compute_mean,
544 compute_median,
545 compute_max,
546 compute_quantiles,
547 compute_min_rank
548 )
549 );
550 }
551 return ptrs;
552}
553
554}
559}
560
561#endif
Marker detection for single-cell data.
Definition score_markers_pairwise.hpp:26
void parallelize(Function_ fun, const Index_ tasks, const int threads)
Pointers to arrays to hold the summary statistics.
Definition summarize_comparisons.hpp:32
Stat_ * mean
Definition summarize_comparisons.hpp:45
Stat_ * median
Definition summarize_comparisons.hpp:52
Stat_ * min
Definition summarize_comparisons.hpp:38
Stat_ * max
Definition summarize_comparisons.hpp:59
Rank_ * min_rank
Definition summarize_comparisons.hpp:81
std::optional< std::vector< Stat_ * > > quantiles
Definition summarize_comparisons.hpp:74
Container for the summary statistics.
Definition summarize_comparisons.hpp:91
std::vector< Stat_ > mean
Definition summarize_comparisons.hpp:102
std::vector< Stat_ > median
Definition summarize_comparisons.hpp:108
std::optional< std::vector< std::vector< Stat_ > > > quantiles
Definition summarize_comparisons.hpp:122
std::vector< Stat_ > min
Definition summarize_comparisons.hpp:96
std::vector< Rank_ > min_rank
Definition summarize_comparisons.hpp:128
std::vector< Stat_ > max
Definition summarize_comparisons.hpp:114