scran_markers
Marker detection for single-cell data
Loading...
Searching...
No Matches
score_markers_best.hpp
Go to the documentation of this file.
1#ifndef SCRAN_MARKERS_SCORE_MARKERS_BEST_HPP
2#define SCRAN_MARKERS_SCORE_MARKERS_BEST_HPP
3
4#include <vector>
5#include <cstddef>
6
8#include "tatami/tatami.hpp"
9#include "tatami_stats/tatami_stats.hpp"
10#include "sanisizer/sanisizer.hpp"
11#include "topicks/topicks.hpp"
12
13#include "scan_matrix.hpp"
14#include "average_group_stats.hpp"
15#include "block_averages.hpp"
16#include "create_combinations.hpp"
17#include "cohens_d.hpp"
18#include "simple_diff.hpp"
19#include "utils.hpp"
20
26namespace scran_markers {
27
37 double threshold = 0;
38
43 int num_threads = 1;
44
48 bool compute_group_mean = true;
49
54
58 bool compute_cohens_d = true;
59
63 bool compute_auc = true;
64
68 bool compute_delta_mean = true;
69
74
79 bool largest_cohens_d = true;
80
85 bool largest_auc = true;
86
91 bool largest_delta_mean = true;
92
98
104 std::optional<double> threshold_cohens_d = 0;
105
111 std::optional<double> threshold_auc = 0.5;
112
118 std::optional<double> threshold_delta_mean = 0;
119
125 std::optional<double> threshold_delta_detected = 0;
126
130 bool keep_ties = false;
131
137 BlockAveragePolicy block_average_policy = BlockAveragePolicy::MEAN;
138
151 scran_blocks::WeightPolicy block_weight_policy = scran_blocks::WeightPolicy::VARIABLE;
152
159
164 double block_quantile = 0.5;
165};
166
172template<typename Stat_, typename Index_>
178 std::vector<std::vector<Stat_> > mean;
179
184 std::vector<std::vector<Stat_> > detected;
185
196 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > cohens_d;
197
208 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > auc;
209
220 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > delta_mean;
221
232 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > delta_detected;
233};
234
238namespace internal {
239
240template<typename Stat_, typename Index_>
241using PairwiseTopQueues = std::vector<std::vector<topicks::TopQueue<Stat_, Index_> > >;
242
243template<typename Stat_, typename Index_>
244void allocate_best_top_queues(
245 PairwiseTopQueues<Stat_, Index_>& pqueues,
246 const std::size_t ngroups,
247 const Index_ top,
248 const bool larger,
249 const bool keep_ties,
250 const std::optional<Stat_>& bound
251) {
253 opt.check_nan = true;
254 opt.keep_ties = keep_ties;
255 if (bound.has_value()) {
256 opt.bound = *bound;
257 }
258
259 sanisizer::resize(pqueues, ngroups);
260 for (auto& x : pqueues) {
261 x.reserve(ngroups);
262 for (I<decltype(ngroups)> g = 0; g < ngroups; ++g) {
263 x.emplace_back(top, larger, opt);
264 }
265 }
266}
267
268template<typename Stat_, typename Index_>
269void add_best_top_queues(
270 PairwiseTopQueues<Stat_, Index_>& pqueues,
271 const Index_ gene,
272 std::size_t ngroups,
273 const std::vector<Stat_>& effects
274) {
275 for (I<decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {
276 for (I<decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {
277 const auto val = effects[sanisizer::nd_offset<std::size_t>(g2, ngroups, g1)];
278 if (g1 != g2) {
279 pqueues[g1][g2].emplace(val, gene);
280 }
281 }
282 }
283}
284
285template<typename Stat_, typename Index_>
286void report_best_top_queues(
287 std::vector<PairwiseTopQueues<Stat_, Index_> >& pqueues,
288 std::size_t ngroups,
289 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > >& output
290) {
291 // We know it fits into an 'int' as this is what we got originally.
292 const int num_threads = pqueues.size();
293
294 // Consolidating all of the thread-specific queues into a single queue.
295 auto& true_pqueue = pqueues.front(); // we better have at least one thread.
296 for (int t = 1; t < num_threads; ++t) {
297 for (I<decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {
298 for (I<decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {
299 auto& current_in = pqueues[t][g1][g2];
300 auto& current_out = true_pqueue[g1][g2];
301 while (!current_in.empty()) {
302 current_out.push(current_in.top());
303 current_in.pop();
304 }
305 }
306 }
307 }
308
309 // Now spilling them out into a single vector.
310 sanisizer::resize(output, ngroups);
311 for (I<decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {
312 sanisizer::resize(output[g1], ngroups);
313 for (I<decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {
314 if (g1 == g2) {
315 continue;
316 }
317 auto& current_in = true_pqueue[g1][g2];
318 auto& current_out = output[g1][g2];
319 while (!current_in.empty()) {
320 const auto& best = current_in.top();
321 current_out.emplace_back(best.second, best.first);
322 current_in.pop();
323 }
324 std::reverse(current_out.begin(), current_out.end()); // earliest element should have the strongest effect sizes.
325 }
326 }
327}
328
329template<typename Index_, typename Stat_>
330void find_best_simple_best_effects(
331 const Index_ ngenes,
332 const std::size_t ngroups,
333 const std::size_t nblocks,
334 const std::size_t ncombos,
335 const std::vector<Stat_>& combo_means,
336 const std::vector<Stat_>& combo_vars,
337 const std::vector<Stat_>& combo_detected,
338 const BlockAverageInfo<Stat_>& average_info,
339 const Index_ top,
340 const ScoreMarkersBestOptions& options,
341 ScoreMarkersBestResults<Stat_, Index_>& output
342) {
343 std::optional<std::vector<Stat_> > total_weights_per_group;
344 const Stat_* total_weights_ptr = NULL;
345 if (average_info.use_mean()) {
346 if (options.compute_group_mean || options.compute_group_detected) {
347 if (nblocks > 1) {
348 total_weights_per_group = compute_total_weight_per_group(ngroups, nblocks, average_info.combo_weights().data());
349 total_weights_ptr = total_weights_per_group->data();
350 } else {
351 total_weights_ptr = average_info.combo_weights().data();
352 }
353 }
354 }
355
356 std::vector<Stat_*> mptrs;
357 if (options.compute_group_mean) {
358 mptrs.reserve(ngroups);
359 sanisizer::resize(output.mean, ngroups);
360 for (auto& x : output.mean) {
361 sanisizer::resize(x, ngenes);
362 mptrs.push_back(x.data());
363 }
364 }
365
366 std::vector<Stat_*> dptrs;
367 if (options.compute_group_detected) {
368 dptrs.reserve(ngroups);
369 sanisizer::resize(output.detected, ngroups);
370 for (auto& x : output.detected) {
371 sanisizer::resize(x, ngenes);
372 dptrs.push_back(x.data());
373 }
374 }
375
376 std::optional<PrecomputedPairwiseWeights<Stat_> > preweights;
377 if (average_info.use_mean()) {
378 if (options.compute_cohens_d || options.compute_delta_mean || options.compute_delta_detected) {
379 preweights.emplace(ngroups, nblocks, average_info.combo_weights().data());
380 }
381 }
382
383 // Setting up the output queues.
384 std::vector<PairwiseTopQueues<Stat_, Index_> > cohens_d_queues, delta_detected_queues, delta_mean_queues;
385 if (options.compute_cohens_d) {
386 sanisizer::resize(cohens_d_queues, options.num_threads);
387 }
388 if (options.compute_delta_mean) {
389 sanisizer::resize(delta_mean_queues, options.num_threads);
390 }
391 if (options.compute_delta_detected) {
392 sanisizer::resize(delta_detected_queues, options.num_threads);
393 }
394
395 const auto ngroups2 = sanisizer::product<typename std::vector<Stat_>::size_type>(ngroups, ngroups);
396
397 tatami::parallelize([&](const int t, const Index_ start, const Index_ length) -> void {
398 if (options.compute_cohens_d) {
399 allocate_best_top_queues(cohens_d_queues[t], ngroups, top, options.largest_cohens_d, options.keep_ties, options.threshold_cohens_d);
400 }
401 if (options.compute_delta_mean) {
402 allocate_best_top_queues(delta_mean_queues[t], ngroups, top, options.largest_delta_mean, options.keep_ties, options.threshold_delta_mean);
403 }
404 if (options.compute_delta_detected) {
405 allocate_best_top_queues(delta_detected_queues[t], ngroups, top, options.largest_delta_detected, options.keep_ties, options.threshold_delta_detected);
406 }
407 std::vector<Stat_> buffer;
408 if (options.compute_cohens_d || options.compute_delta_mean || options.compute_delta_detected) {
409 buffer.resize(ngroups2);
410 }
411
412 std::optional<std::vector<Stat_> > qbuffer, qrevbuffer;
413 std::optional<scran_blocks::SingleQuantileVariable<Stat_, typename std::vector<Stat_>::iterator> > qcalc;
414 if (!average_info.use_mean()) {
415 qbuffer.emplace();
416 qrevbuffer.emplace();
417 qcalc.emplace(nblocks, average_info.quantile());
418 }
419
420 for (Index_ gene = start, end = start + length; gene < end; ++gene) {
421 auto in_offset = sanisizer::product_unsafe<std::size_t>(gene, ncombos);
422
423 if (options.compute_group_mean) {
424 const auto tmp_means = combo_means.data() + in_offset;
425 if (average_info.use_mean()) {
426 average_group_stats_blockmean(gene, ngroups, nblocks, tmp_means, average_info.combo_weights().data(), total_weights_ptr, mptrs);
427 } else {
428 average_group_stats_blockquantile(gene, ngroups, nblocks, tmp_means, *qbuffer, *qcalc, mptrs);
429 }
430 }
431
432 if (options.compute_group_detected) {
433 const auto tmp_detected = combo_detected.data() + in_offset;
434 if (average_info.use_mean()) {
435 average_group_stats_blockmean(gene, ngroups, nblocks, tmp_detected, average_info.combo_weights().data(), total_weights_ptr, dptrs);
436 } else {
437 average_group_stats_blockquantile(gene, ngroups, nblocks, tmp_detected, *qbuffer, *qcalc, dptrs);
438 }
439 }
440
441 // Computing the effect sizes.
442 if (options.compute_cohens_d) {
443 const auto tmp_means = combo_means.data() + in_offset;
444 const auto tmp_variances = combo_vars.data() + in_offset;
445 if (average_info.use_mean()) {
446 compute_pairwise_cohens_d_blockmean(tmp_means, tmp_variances, ngroups, nblocks, options.threshold, *preweights, buffer.data());
447 } else {
448 compute_pairwise_cohens_d_blockquantile(tmp_means, tmp_variances, ngroups, nblocks, options.threshold, *qbuffer, *qrevbuffer, *qcalc, buffer.data());
449 }
450 add_best_top_queues(cohens_d_queues[t], gene, ngroups, buffer);
451 }
452
453 if (options.compute_delta_mean) {
454 const auto tmp_means = combo_means.data() + in_offset;
455 if (average_info.use_mean()) {
456 compute_pairwise_simple_diff_blockmean(tmp_means, ngroups, nblocks, *preweights, buffer.data());
457 } else {
458 compute_pairwise_simple_diff_blockquantile(tmp_means, ngroups, nblocks, *qbuffer, *qcalc, buffer.data());
459 }
460 add_best_top_queues(delta_mean_queues[t], gene, ngroups, buffer);
461 }
462
463 if (options.compute_delta_detected) {
464 const auto tmp_detected = combo_detected.data() + in_offset;
465 if (average_info.use_mean()) {
466 compute_pairwise_simple_diff_blockmean(tmp_detected, ngroups, nblocks, *preweights, buffer.data());
467 } else {
468 compute_pairwise_simple_diff_blockquantile(tmp_detected, ngroups, nblocks, *qbuffer, *qcalc, buffer.data());
469 }
470 add_best_top_queues(delta_detected_queues[t], gene, ngroups, buffer);
471 }
472 }
473 }, ngenes, options.num_threads);
474
475 // Now figuring out which of these are the top dogs.
476 if (options.compute_cohens_d) {
477 report_best_top_queues(cohens_d_queues, ngroups, output.cohens_d);
478 }
479
480 if (options.compute_delta_mean) {
481 report_best_top_queues(delta_mean_queues, ngroups, output.delta_mean);
482 }
483
484 if (options.compute_delta_detected) {
485 report_best_top_queues(delta_detected_queues, ngroups, output.delta_detected);
486 }
487}
488
489template<
490 bool single_block_,
491 typename Stat_,
492 typename Value_,
493 typename Index_,
494 typename Group_,
495 typename Block_
496>
497ScoreMarkersBestResults<Stat_, Index_> score_markers_best(
498 const tatami::Matrix<Value_, Index_>& matrix,
499 const std::size_t ngroups,
500 const Group_* const group,
501 const std::size_t nblocks,
502 const Block_* const block,
503 const std::size_t ncombos,
504 const std::size_t* const combo,
505 const std::vector<Index_>& combo_sizes,
506 const Index_ top,
507 const ScoreMarkersBestOptions& options
508) {
509 const auto ngenes = matrix.nrow();
510 const auto payload_size = sanisizer::product<typename std::vector<Stat_>::size_type>(ngenes, ncombos);
511 std::vector<Stat_> combo_means, combo_vars, combo_detected;
512 if (options.compute_group_mean || options.compute_cohens_d || options.compute_delta_mean) {
513 combo_means.resize(payload_size);
514 }
515 if (options.compute_cohens_d) {
516 combo_vars.resize(payload_size);
517 }
518 if (options.compute_group_detected || options.compute_delta_detected) {
519 combo_detected.resize(payload_size);
520 }
521
522 // For a single block, this usually doesn't really matter, but we do it for consistency with the multi-block case,
523 // and to account for variable weighting where non-zero block sizes get zero weight.
524 BlockAverageInfo<Stat_> average_info;
525 if (options.block_average_policy == BlockAveragePolicy::MEAN) {
526 average_info = BlockAverageInfo<Stat_>(
528 combo_sizes,
529 options.block_weight_policy,
530 options.variable_block_weight_parameters
531 )
532 );
533 } else {
534 average_info = BlockAverageInfo<Stat_>(options.block_quantile);
535 }
536
537 ScoreMarkersBestResults<Stat_, Index_> output;
538
539 if (options.compute_auc) {
540 auto auc_queues = sanisizer::create<std::vector<PairwiseTopQueues<Stat_, Index_> > >(options.num_threads);
541
542 struct AucResultWorkspace {
543 AucResultWorkspace(const std::size_t ngroups, PairwiseTopQueues<Stat_, Index_>& pqueue) :
544 pairwise_buffer(sanisizer::product<typename std::vector<Stat_>::size_type>(ngroups, ngroups)),
545 queue_ptr(&pqueue)
546 {};
547 std::vector<Stat_> pairwise_buffer;
548 PairwiseTopQueues<Stat_, Index_>* queue_ptr;
549 };
550
551 scan_matrix_by_row_custom_auc<single_block_>(
552 matrix,
553 ngroups,
554 group,
555 nblocks,
556 block,
557 ncombos,
558 combo,
559 combo_sizes,
560 average_info,
561 combo_means,
562 combo_vars,
563 combo_detected,
564 /* do_auc = */ true,
565 /* auc_result_initialize = */ [&](int t) -> AucResultWorkspace {
566 allocate_best_top_queues(auc_queues[t], ngroups, top, options.largest_auc, options.keep_ties, options.threshold_auc);
567 return AucResultWorkspace(ngroups, auc_queues[t]);
568 },
569 /* auc_result_process = */ [&](const Index_ gene, AucScanWorkspace<Value_, Group_, Stat_, Index_>& auc_work, AucResultWorkspace& res_work) -> void {
570 process_auc_for_rows(auc_work, ngroups, nblocks, options.threshold, res_work.pairwise_buffer.data());
571 add_best_top_queues(*(res_work.queue_ptr), gene, ngroups, res_work.pairwise_buffer);
572 },
573 options.num_threads
574 );
575
576 report_best_top_queues(auc_queues, ngroups, output.auc);
577
578 } else if (matrix.prefer_rows()) {
579 scan_matrix_by_row_full_auc<single_block_>(
580 matrix,
581 ngroups,
582 group,
583 nblocks,
584 block,
585 ncombos,
586 combo,
587 combo_sizes,
588 average_info,
589 combo_means,
590 combo_vars,
591 combo_detected,
592 static_cast<Stat_*>(NULL),
593 options.threshold,
594 options.num_threads
595 );
596
597 } else {
598 scan_matrix_by_column(
599 matrix,
600 [&]{
601 if constexpr(single_block_) {
602 return ngroups;
603 } else {
604 return ncombos;
605 }
606 }(),
607 [&]{
608 if constexpr(single_block_) {
609 return group;
610 } else {
611 return combo;
612 }
613 }(),
614 combo_sizes,
615 combo_means,
616 combo_vars,
617 combo_detected,
618 options.num_threads
619 );
620 }
621
622 find_best_simple_best_effects(
623 matrix.nrow(),
624 ngroups,
625 nblocks,
626 ncombos,
627 combo_means,
628 combo_vars,
629 combo_detected,
630 average_info,
631 top,
632 options,
633 output
634 );
635
636 return output;
637}
638
639}
666template<typename Stat_, typename Value_, typename Index_, typename Group_>
668 const tatami::Matrix<Value_, Index_>& matrix,
669 const Group_* const group,
670 const Index_ top,
671 const ScoreMarkersBestOptions& options
672) {
673 const Index_ NC = matrix.ncol();
674 const auto group_sizes = tatami_stats::tabulate_groups(group, NC);
675 const auto ngroups = sanisizer::cast<std::size_t>(group_sizes.size());
676
677 return internal::score_markers_best<true, Stat_>(
678 matrix,
679 ngroups,
680 group,
681 1,
682 static_cast<int*>(NULL),
683 ngroups,
684 static_cast<std::size_t*>(NULL),
685 group_sizes,
686 top,
687 options
688 );
689}
690
717template<typename Stat_, typename Value_, typename Index_, typename Group_, typename Block_>
719 const tatami::Matrix<Value_, Index_>& matrix,
720 const Group_* const group,
721 const Block_* const block,
722 const Index_ top,
723 const ScoreMarkersBestOptions& options
724) {
725 const Index_ NC = matrix.ncol();
726 const auto ngroups = tatami_stats::total_groups(group, NC);
727 const auto nblocks = tatami_stats::total_groups(block, NC);
728
729 const auto combinations = internal::create_combinations(ngroups, group, block, NC);
730 const auto combo_sizes = internal::tabulate_combinations<Index_>(ngroups, nblocks, combinations);
731 const auto ncombos = combo_sizes.size();
732
733 return internal::score_markers_best<false, Stat_>(
734 matrix,
735 sanisizer::cast<std::size_t>(ngroups),
736 group,
737 sanisizer::cast<std::size_t>(nblocks),
738 block,
739 sanisizer::cast<std::size_t>(ncombos),
740 combinations.data(),
741 combo_sizes,
742 top,
743 options
744 );
745}
746
747}
748
749#endif
Averaging statistics over blocks.
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
void compute_weights(const std::size_t num_blocks, const Size_ *const sizes, const WeightPolicy policy, const VariableWeightParameters &variable, Weight_ *const weights)
Marker detection for single-cell data.
Definition score_markers_pairwise.hpp:26
BlockAveragePolicy
Definition block_averages.hpp:27
ScoreMarkersBestResults< Stat_, Index_ > score_markers_best_blocked(const tatami::Matrix< Value_, Index_ > &matrix, const Group_ *const group, const Block_ *const block, const Index_ top, const ScoreMarkersBestOptions &options)
Definition score_markers_best.hpp:718
ScoreMarkersBestResults< Stat_, Index_ > score_markers_best(const tatami::Matrix< Value_, Index_ > &matrix, const Group_ *const group, const Index_ top, const ScoreMarkersBestOptions &options)
Definition score_markers_best.hpp:667
void parallelize(Function_ fun, const Index_ tasks, const int threads)
Options for score_markers_best() and friends.
Definition score_markers_best.hpp:31
bool compute_cohens_d
Definition score_markers_best.hpp:58
std::optional< double > threshold_delta_mean
Definition score_markers_best.hpp:118
double threshold
Definition score_markers_best.hpp:37
bool compute_auc
Definition score_markers_best.hpp:63
bool largest_cohens_d
Definition score_markers_best.hpp:79
BlockAveragePolicy block_average_policy
Definition score_markers_best.hpp:137
bool compute_delta_mean
Definition score_markers_best.hpp:68
bool largest_delta_mean
Definition score_markers_best.hpp:91
bool compute_group_detected
Definition score_markers_best.hpp:53
bool compute_group_mean
Definition score_markers_best.hpp:48
double block_quantile
Definition score_markers_best.hpp:164
std::optional< double > threshold_cohens_d
Definition score_markers_best.hpp:104
std::optional< double > threshold_delta_detected
Definition score_markers_best.hpp:125
bool largest_delta_detected
Definition score_markers_best.hpp:97
bool largest_auc
Definition score_markers_best.hpp:85
scran_blocks::WeightPolicy block_weight_policy
Definition score_markers_best.hpp:151
std::optional< double > threshold_auc
Definition score_markers_best.hpp:111
int num_threads
Definition score_markers_best.hpp:43
scran_blocks::VariableWeightParameters variable_block_weight_parameters
Definition score_markers_best.hpp:158
bool compute_delta_detected
Definition score_markers_best.hpp:73
bool keep_ties
Definition score_markers_best.hpp:130
Results for score_markers_best() and friends.
Definition score_markers_best.hpp:173
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > auc
Definition score_markers_best.hpp:208
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > delta_mean
Definition score_markers_best.hpp:220
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > cohens_d
Definition score_markers_best.hpp:196
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > delta_detected
Definition score_markers_best.hpp:232
std::vector< std::vector< Stat_ > > mean
Definition score_markers_best.hpp:178
std::vector< std::vector< Stat_ > > detected
Definition score_markers_best.hpp:184
std::optional< Stat_ > bound