scran_markers
Marker detection for single-cell data
Loading...
Searching...
No Matches
score_markers_best.hpp
Go to the documentation of this file.
1#ifndef SCRAN_MARKERS_SCORE_MARKERS_BEST_HPP
2#define SCRAN_MARKERS_SCORE_MARKERS_BEST_HPP
3
4#include <vector>
5#include <cstddef>
6
8#include "tatami/tatami.hpp"
9#include "tatami_stats/tatami_stats.hpp"
10#include "sanisizer/sanisizer.hpp"
11#include "topicks/topicks.hpp"
12
13#include "scan_matrix.hpp"
14#include "average_group_stats.hpp"
15#include "PrecomputedPairwiseWeights.hpp"
16#include "create_combinations.hpp"
17#include "cohens_d.hpp"
18#include "simple_diff.hpp"
19#include "utils.hpp"
20
26namespace scran_markers {
27
37 double threshold = 0;
38
43 int num_threads = 1;
44
48 bool compute_group_mean = true;
49
54
58 bool compute_cohens_d = true;
59
63 bool compute_auc = true;
64
68 bool compute_delta_mean = true;
69
74
79 bool largest_cohens_d = true;
80
85 bool largest_auc = true;
86
91 bool largest_delta_mean = true;
92
98
104 std::optional<double> threshold_cohens_d = 0;
105
111 std::optional<double> threshold_auc = 0.5;
112
118 std::optional<double> threshold_delta_mean = 0;
119
125 std::optional<double> threshold_delta_detected = 0;
126
130 bool keep_ties = false;
131
142 scran_blocks::WeightPolicy block_weight_policy = scran_blocks::WeightPolicy::VARIABLE;
143
149};
150
156template<typename Stat_, typename Index_>
162 std::vector<std::vector<Stat_> > mean;
163
168 std::vector<std::vector<Stat_> > detected;
169
180 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > cohens_d;
181
192 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > auc;
193
204 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > delta_mean;
205
216 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > delta_detected;
217};
218
222namespace internal {
223
224template<typename Stat_, typename Index_>
225using PairwiseTopQueues = std::vector<std::vector<topicks::TopQueue<Stat_, Index_> > >;
226
227template<typename Stat_, typename Index_>
228void allocate_best_top_queues(
229 PairwiseTopQueues<Stat_, Index_>& pqueues,
230 const std::size_t ngroups,
231 const int top,
232 const bool larger,
233 const bool keep_ties,
234 const std::optional<Stat_>& bound
235) {
237 opt.check_nan = true;
238 opt.keep_ties = keep_ties;
239 if (bound.has_value()) {
240 opt.bound = *bound;
241 }
242
243 sanisizer::resize(pqueues, ngroups);
244 for (auto& x : pqueues) {
245 x.reserve(ngroups);
246 for (decltype(I(ngroups)) g = 0; g < ngroups; ++g) {
247 x.emplace_back(top, larger, opt);
248 }
249 }
250}
251
252template<typename Stat_, typename Index_>
253void add_best_top_queues(
254 PairwiseTopQueues<Stat_, Index_>& pqueues,
255 const Index_ gene,
256 std::size_t ngroups,
257 const std::vector<Stat_>& effects
258) {
259 for (decltype(I(ngroups)) g1 = 0; g1 < ngroups; ++g1) {
260 for (decltype(I(ngroups)) g2 = 0; g2 < ngroups; ++g2) {
261 const auto val = effects[sanisizer::nd_offset<std::size_t>(g2, ngroups, g1)];
262 if (g1 != g2) {
263 pqueues[g1][g2].emplace(val, gene);
264 }
265 }
266 }
267}
268
269template<typename Stat_, typename Index_>
270void report_best_top_queues(
271 std::vector<PairwiseTopQueues<Stat_, Index_> >& pqueues,
272 std::size_t ngroups,
273 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > >& output
274) {
275 // We know it fits int an 'int' as this is what we got originally.
276 const int num_threads = pqueues.size();
277
278 // Consolidating all of the thread-specific queues into a single queue.
279 auto& true_pqueue = pqueues.front(); // we better have at least one thread.
280 for (int t = 1; t < num_threads; ++t) {
281 for (decltype(I(ngroups)) g1 = 0; g1 < ngroups; ++g1) {
282 for (decltype(I(ngroups)) g2 = 0; g2 < ngroups; ++g2) {
283 auto& current_in = pqueues[t][g1][g2];
284 auto& current_out = true_pqueue[g1][g2];
285 while (!current_in.empty()) {
286 current_out.push(current_in.top());
287 current_in.pop();
288 }
289 }
290 }
291 }
292
293 // Now spilling them out into a single vector.
294 sanisizer::resize(output, ngroups);
295 for (decltype(I(ngroups)) g1 = 0; g1 < ngroups; ++g1) {
296 sanisizer::resize(output[g1], ngroups);
297 for (decltype(I(ngroups)) g2 = 0; g2 < ngroups; ++g2) {
298 if (g1 == g2) {
299 continue;
300 }
301 auto& current_in = true_pqueue[g1][g2];
302 auto& current_out = output[g1][g2];
303 while (!current_in.empty()) {
304 const auto& best = current_in.top();
305 current_out.emplace_back(best.second, best.first);
306 current_in.pop();
307 }
308 std::reverse(current_out.begin(), current_out.end()); // earliest element should have the strongest effect sizes.
309 }
310 }
311}
312
313template<typename Index_, typename Stat_>
314void find_best_simple_best_effects(
315 const Index_ ngenes,
316 const std::size_t ngroups,
317 const std::size_t nblocks,
318 const std::size_t ncombos,
319 const std::vector<Stat_>& combo_weights,
320 std::vector<Stat_>& combo_means,
321 std::vector<Stat_>& combo_vars,
322 std::vector<Stat_>& combo_detected,
323 ScoreMarkersBestResults<Stat_, Index_>& output,
324 int top,
325 const ScoreMarkersBestOptions& options
326) {
327 std::vector<Stat_> total_weights_per_group;
328 const Stat_* total_weights_ptr = combo_weights.data();
329 if (nblocks > 1) {
330 total_weights_per_group = compute_total_weight_per_group(ngroups, nblocks, combo_weights.data());
331 total_weights_ptr = total_weights_per_group.data();
332 }
333 PrecomputedPairwiseWeights<Stat_> preweights(ngroups, nblocks, combo_weights.data());
334
335 std::vector<Stat_*> mptrs;
336 if (options.compute_group_mean) {
337 mptrs.reserve(ngroups);
338 sanisizer::resize(output.mean, ngroups);
339 for (auto& x : output.mean) {
340 sanisizer::resize(x, ngenes);
341 mptrs.push_back(x.data());
342 }
343 }
344
345 std::vector<Stat_*> dptrs;
346 if (options.compute_group_detected) {
347 dptrs.reserve(ngroups);
348 sanisizer::resize(output.detected, ngroups);
349 for (auto& x : output.detected) {
350 sanisizer::resize(x, ngenes);
351 dptrs.push_back(x.data());
352 }
353 }
354
355 // Setting up the output queues.
356 std::vector<PairwiseTopQueues<Stat_, Index_> > cohens_d_queues, delta_detected_queues, delta_mean_queues;
357 if (options.compute_cohens_d) {
358 sanisizer::resize(cohens_d_queues, options.num_threads);
359 }
360 if (options.compute_delta_mean) {
361 sanisizer::resize(delta_mean_queues, options.num_threads);
362 }
363 if (options.compute_delta_detected) {
364 sanisizer::resize(delta_detected_queues, options.num_threads);
365 }
366
367 const auto ngroups2 = sanisizer::product<typename std::vector<Stat_>::size_type>(ngroups, ngroups);
368
369 tatami::parallelize([&](const int t, const Index_ start, const Index_ length) -> void {
370 if (options.compute_cohens_d) {
371 allocate_best_top_queues(cohens_d_queues[t], ngroups, top, options.largest_cohens_d, options.keep_ties, options.threshold_cohens_d);
372 }
373 if (options.compute_delta_mean) {
374 allocate_best_top_queues(delta_mean_queues[t], ngroups, top, options.largest_delta_mean, options.keep_ties, options.threshold_delta_mean);
375 }
376 if (options.compute_delta_detected) {
377 allocate_best_top_queues(delta_detected_queues[t], ngroups, top, options.largest_delta_detected, options.keep_ties, options.threshold_delta_detected);
378 }
379 std::vector<Stat_> buffer;
380 if (options.compute_cohens_d || options.compute_delta_mean || options.compute_delta_detected) {
381 buffer.resize(ngroups2);
382 }
383
384 for (Index_ gene = start, end = start + length; gene < end; ++gene) {
385 auto in_offset = sanisizer::product_unsafe<std::size_t>(gene, ncombos);
386
387 if (options.compute_group_mean) {
388 const auto tmp_means = combo_means.data() + in_offset;
389 average_group_stats(gene, ngroups, nblocks, tmp_means, combo_weights.data(), total_weights_ptr, mptrs);
390 }
391 if (options.compute_group_detected) {
392 const auto tmp_detected = combo_detected.data() + in_offset;
393 average_group_stats(gene, ngroups, nblocks, tmp_detected, combo_weights.data(), total_weights_ptr, dptrs);
394 }
395
396 // Computing the effect sizes.
397 if (options.compute_cohens_d) {
398 const auto tmp_means = combo_means.data() + in_offset;
399 const auto tmp_variances = combo_vars.data() + in_offset;
400 compute_pairwise_cohens_d(tmp_means, tmp_variances, ngroups, nblocks, preweights, options.threshold, buffer.data());
401 add_best_top_queues(cohens_d_queues[t], gene, ngroups, buffer);
402 }
403
404 if (options.compute_delta_mean) {
405 const auto tmp_means = combo_means.data() + in_offset;
406 compute_pairwise_simple_diff(tmp_means, ngroups, nblocks, preweights, buffer.data());
407 add_best_top_queues(delta_mean_queues[t], gene, ngroups, buffer);
408 }
409
410 if (options.compute_delta_detected) {
411 const auto tmp_detected = combo_detected.data() + in_offset;
412 compute_pairwise_simple_diff(tmp_detected, ngroups, nblocks, preweights, buffer.data());
413 add_best_top_queues(delta_detected_queues[t], gene, ngroups, buffer);
414 }
415 }
416 }, ngenes, options.num_threads);
417
418 // Now figuring out which of these are the top dogs.
419 if (options.compute_cohens_d) {
420 report_best_top_queues(cohens_d_queues, ngroups, output.cohens_d);
421 }
422
423 if (options.compute_delta_mean) {
424 report_best_top_queues(delta_mean_queues, ngroups, output.delta_mean);
425 }
426
427 if (options.compute_delta_detected) {
428 report_best_top_queues(delta_detected_queues, ngroups, output.delta_detected);
429 }
430}
431
432template<
433 bool single_block_,
434 typename Stat_,
435 typename Value_,
436 typename Index_,
437 typename Group_,
438 typename Block_
439>
440ScoreMarkersBestResults<Stat_, Index_> score_markers_best(
441 const tatami::Matrix<Value_, Index_>& matrix,
442 const std::size_t ngroups,
443 const Group_* const group,
444 const std::size_t nblocks,
445 const Block_* const block,
446 const std::size_t ncombos,
447 const std::size_t* const combo,
448 const std::vector<Index_>& combo_sizes,
449 int top,
450 const ScoreMarkersBestOptions& options
451) {
452 const auto ngenes = matrix.nrow();
453 const auto payload_size = sanisizer::product<typename std::vector<Stat_>::size_type>(ngenes, ncombos);
454 std::vector<Stat_> combo_means, combo_vars, combo_detected;
455 if (options.compute_group_mean || options.compute_cohens_d || options.compute_delta_mean) {
456 combo_means.resize(payload_size);
457 }
458 if (options.compute_cohens_d) {
459 combo_vars.resize(payload_size);
460 }
461 if (options.compute_group_detected || options.compute_delta_detected) {
462 combo_detected.resize(payload_size);
463 }
464
465 // For a single block, this usually doesn't really matter, but we do it for consistency with the multi-block case,
466 // and to account for variable weighting where non-zero block sizes get zero weight.
467 const auto combo_weights = scran_blocks::compute_weights<Stat_>(
468 combo_sizes,
469 options.block_weight_policy,
470 options.variable_block_weight_parameters
471 );
472
473 ScoreMarkersBestResults<Stat_, Index_> output;
474
475 if (options.compute_auc) {
476 auto auc_queues = sanisizer::create<std::vector<PairwiseTopQueues<Stat_, Index_> > >(options.num_threads);
477
478 struct AucResultWorkspace {
479 AucResultWorkspace() = default;
480 AucResultWorkspace(const std::size_t ngroups, PairwiseTopQueues<Stat_, Index_>& pqueue) :
481 pairwise_buffer(sanisizer::product<typename std::vector<Stat_>::size_type>(ngroups, ngroups)),
482 queue_ptr(&pqueue)
483 {};
484 std::vector<Stat_> pairwise_buffer;
485 PairwiseTopQueues<Stat_, Index_>* queue_ptr;
486 };
487
488 scan_matrix_by_row_custom_auc<single_block_>(
489 matrix,
490 ngroups,
491 group,
492 nblocks,
493 block,
494 ncombos,
495 combo,
496 combo_means,
497 combo_vars,
498 combo_detected,
499 /* do_auc = */ true,
500 /* auc_result_initialize = */ [&](int t) -> AucResultWorkspace {
501 allocate_best_top_queues(auc_queues[t], ngroups, top, options.largest_auc, options.keep_ties, options.threshold_auc);
502 return AucResultWorkspace(ngroups, auc_queues[t]);
503 },
504 /* auc_result_process = */ [&](const Index_ gene, AucScanWorkspace<Value_, Group_, Index_, Stat_>& auc_work, AucResultWorkspace& res_work) -> void {
505 process_auc_for_rows(auc_work, ngroups, nblocks, options.threshold, res_work.pairwise_buffer.data());
506 add_best_top_queues(*(res_work.queue_ptr), gene, ngroups, res_work.pairwise_buffer);
507 },
508 combo_sizes,
509 combo_weights,
510 options.num_threads
511 );
512
513 report_best_top_queues(auc_queues, ngroups, output.auc);
514
515 } else if (matrix.prefer_rows()) {
516 scan_matrix_by_row_full_auc<single_block_>(
517 matrix,
518 ngroups,
519 group,
520 nblocks,
521 block,
522 ncombos,
523 combo,
524 combo_means,
525 combo_vars,
526 combo_detected,
527 static_cast<Stat_*>(NULL),
528 combo_sizes,
529 combo_weights,
530 options.threshold,
531 options.num_threads
532 );
533
534 } else {
535 scan_matrix_by_column(
536 matrix,
537 [&]{
538 if constexpr(single_block_) {
539 return ngroups;
540 } else {
541 return ncombos;
542 }
543 }(),
544 [&]{
545 if constexpr(single_block_) {
546 return group;
547 } else {
548 return combo;
549 }
550 }(),
551 combo_means,
552 combo_vars,
553 combo_detected,
554 combo_sizes,
555 options.num_threads
556 );
557 }
558
559 find_best_simple_best_effects(
560 matrix.nrow(),
561 ngroups,
562 nblocks,
563 ncombos,
564 combo_weights,
565 combo_means,
566 combo_vars,
567 combo_detected,
568 output,
569 top,
570 options
571 );
572
573 return output;
574}
575
576}
603template<typename Stat_, typename Value_, typename Index_, typename Group_>
605 const tatami::Matrix<Value_, Index_>& matrix,
606 const Group_* const group,
607 int top,
608 const ScoreMarkersBestOptions& options
609) {
610 const Index_ NC = matrix.ncol();
611 const auto group_sizes = tatami_stats::tabulate_groups(group, NC);
612 const auto ngroups = sanisizer::cast<std::size_t>(group_sizes.size());
613
614 return internal::score_markers_best<true, Stat_>(
615 matrix,
616 ngroups,
617 group,
618 1,
619 static_cast<int*>(NULL),
620 ngroups,
621 static_cast<std::size_t*>(NULL),
622 group_sizes,
623 top,
624 options
625 );
626}
627
654template<typename Stat_, typename Value_, typename Index_, typename Group_, typename Block_>
656 const tatami::Matrix<Value_, Index_>& matrix,
657 const Group_* const group,
658 const Block_* const block,
659 int top,
660 const ScoreMarkersBestOptions& options
661) {
662 const Index_ NC = matrix.ncol();
663 const auto ngroups = tatami_stats::total_groups(group, NC);
664 const auto nblocks = tatami_stats::total_groups(block, NC);
665
666 const auto combinations = internal::create_combinations(ngroups, group, block, NC);
667 const auto combo_sizes = internal::tabulate_combinations<Index_>(ngroups, nblocks, combinations);
668 const auto ncombos = combo_sizes.size();
669
670 return internal::score_markers_best<false, Stat_>(
671 matrix,
672 sanisizer::cast<std::size_t>(ngroups),
673 group,
674 sanisizer::cast<std::size_t>(nblocks),
675 block,
676 sanisizer::cast<std::size_t>(ncombos),
677 combinations.data(),
678 combo_sizes,
679 top,
680 options
681 );
682}
683
684}
685
686#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
void compute_weights(const std::size_t num_blocks, const Size_ *const sizes, const WeightPolicy policy, const VariableWeightParameters &variable, Weight_ *const weights)
Marker detection for single-cell data.
Definition score_markers_pairwise.hpp:25
ScoreMarkersBestResults< Stat_, Index_ > score_markers_best(const tatami::Matrix< Value_, Index_ > &matrix, const Group_ *const group, int top, const ScoreMarkersBestOptions &options)
Definition score_markers_best.hpp:604
ScoreMarkersBestResults< Stat_, Index_ > score_markers_best_blocked(const tatami::Matrix< Value_, Index_ > &matrix, const Group_ *const group, const Block_ *const block, int top, const ScoreMarkersBestOptions &options)
Definition score_markers_best.hpp:655
void parallelize(Function_ fun, const Index_ tasks, const int threads)
Options for score_markers_best() and friends.
Definition score_markers_best.hpp:31
bool compute_cohens_d
Definition score_markers_best.hpp:58
std::optional< double > threshold_delta_mean
Definition score_markers_best.hpp:118
double threshold
Definition score_markers_best.hpp:37
bool compute_auc
Definition score_markers_best.hpp:63
bool largest_cohens_d
Definition score_markers_best.hpp:79
bool compute_delta_mean
Definition score_markers_best.hpp:68
bool largest_delta_mean
Definition score_markers_best.hpp:91
bool compute_group_detected
Definition score_markers_best.hpp:53
bool compute_group_mean
Definition score_markers_best.hpp:48
std::optional< double > threshold_cohens_d
Definition score_markers_best.hpp:104
std::optional< double > threshold_delta_detected
Definition score_markers_best.hpp:125
bool largest_delta_detected
Definition score_markers_best.hpp:97
bool largest_auc
Definition score_markers_best.hpp:85
scran_blocks::WeightPolicy block_weight_policy
Definition score_markers_best.hpp:142
std::optional< double > threshold_auc
Definition score_markers_best.hpp:111
int num_threads
Definition score_markers_best.hpp:43
scran_blocks::VariableWeightParameters variable_block_weight_parameters
Definition score_markers_best.hpp:148
bool compute_delta_detected
Definition score_markers_best.hpp:73
bool keep_ties
Definition score_markers_best.hpp:130
Results for score_markers_best() and friends.
Definition score_markers_best.hpp:157
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > auc
Definition score_markers_best.hpp:192
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > delta_mean
Definition score_markers_best.hpp:204
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > cohens_d
Definition score_markers_best.hpp:180
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > delta_detected
Definition score_markers_best.hpp:216
std::vector< std::vector< Stat_ > > mean
Definition score_markers_best.hpp:162
std::vector< std::vector< Stat_ > > detected
Definition score_markers_best.hpp:168
std::optional< Stat_ > bound