scran_markers
Marker detection for single-cell data
Loading...
Searching...
No Matches
score_markers_best.hpp
Go to the documentation of this file.
1#ifndef SCRAN_MARKERS_SCORE_MARKERS_BEST_HPP
2#define SCRAN_MARKERS_SCORE_MARKERS_BEST_HPP
3
4#include <vector>
5#include <cstddef>
6
8#include "tatami/tatami.hpp"
9#include "tatami_stats/tatami_stats.hpp"
10#include "sanisizer/sanisizer.hpp"
11#include "topicks/topicks.hpp"
12
13#include "scan_matrix.hpp"
14#include "average_group_stats.hpp"
15#include "PrecomputedPairwiseWeights.hpp"
16#include "create_combinations.hpp"
17#include "cohens_d.hpp"
18#include "simple_diff.hpp"
19#include "utils.hpp"
20
26namespace scran_markers {
27
37 double threshold = 0;
38
43 int num_threads = 1;
44
48 bool compute_cohens_d = true;
49
53 bool compute_auc = true;
54
58 bool compute_delta_mean = true;
59
64
69 bool largest_cohens_d = true;
70
75 bool largest_auc = true;
76
81 bool largest_delta_mean = true;
82
88
94 std::optional<double> threshold_cohens_d = 0;
95
101 std::optional<double> threshold_auc = 0.5;
102
108 std::optional<double> threshold_delta_mean = 0;
109
115 std::optional<double> threshold_delta_detected = 0;
116
120 bool keep_ties = true;
121
132 scran_blocks::WeightPolicy block_weight_policy = scran_blocks::WeightPolicy::VARIABLE;
133
139};
140
146template<typename Stat_, typename Index_>
152 std::vector<std::vector<Stat_> > mean;
153
158 std::vector<std::vector<Stat_> > detected;
159
170 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > cohens_d;
171
182 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > auc;
183
194 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > delta_mean;
195
206 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > delta_detected;
207};
208
212namespace internal {
213
214template<typename Stat_, typename Index_>
215using PairwiseTopQueues = std::vector<std::vector<topicks::TopQueue<Stat_, Index_> > >;
216
217template<typename Stat_, typename Index_>
218void allocate_best_top_queues(
219 PairwiseTopQueues<Stat_, Index_>& pqueues,
220 const std::size_t ngroups,
221 const int top,
222 const bool larger,
223 const bool keep_ties,
224 const std::optional<Stat_>& bound
225) {
227 opt.check_nan = true;
228 opt.keep_ties = keep_ties;
229 if (bound.has_value()) {
230 opt.bound = *bound;
231 }
232
233 sanisizer::resize(pqueues, ngroups);
234 for (auto& x : pqueues) {
235 x.reserve(ngroups);
236 for (decltype(I(ngroups)) g = 0; g < ngroups; ++g) {
237 x.emplace_back(top, larger, opt);
238 }
239 }
240}
241
242template<typename Stat_, typename Index_>
243void add_best_top_queues(
244 PairwiseTopQueues<Stat_, Index_>& pqueues,
245 const Index_ gene,
246 std::size_t ngroups,
247 const std::vector<Stat_>& effects
248) {
249 for (decltype(I(ngroups)) g1 = 0; g1 < ngroups; ++g1) {
250 for (decltype(I(ngroups)) g2 = 0; g2 < ngroups; ++g2) {
251 const auto val = effects[sanisizer::nd_offset<std::size_t>(g2, ngroups, g1)];
252 if (g1 != g2) {
253 pqueues[g1][g2].emplace(val, gene);
254 }
255 }
256 }
257}
258
259template<typename Stat_, typename Index_>
260void report_best_top_queues(
261 std::vector<PairwiseTopQueues<Stat_, Index_> >& pqueues,
262 std::size_t ngroups,
263 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > >& output
264) {
265 // We know it fits int an 'int' as this is what we got originally.
266 const int num_threads = pqueues.size();
267
268 // Consolidating all of the thread-specific queues into a single queue.
269 auto& true_pqueue = pqueues.front(); // we better have at least one thread.
270 for (int t = 1; t < num_threads; ++t) {
271 for (decltype(I(ngroups)) g1 = 0; g1 < ngroups; ++g1) {
272 for (decltype(I(ngroups)) g2 = 0; g2 < ngroups; ++g2) {
273 auto& current_in = pqueues[t][g1][g2];
274 auto& current_out = true_pqueue[g1][g2];
275 while (!current_in.empty()) {
276 current_out.push(current_in.top());
277 current_in.pop();
278 }
279 }
280 }
281 }
282
283 // Now spilling them out into a single vector.
284 sanisizer::resize(output, ngroups);
285 for (decltype(I(ngroups)) g1 = 0; g1 < ngroups; ++g1) {
286 sanisizer::resize(output[g1], ngroups);
287 for (decltype(I(ngroups)) g2 = 0; g2 < ngroups; ++g2) {
288 if (g1 == g2) {
289 continue;
290 }
291 auto& current_in = true_pqueue[g1][g2];
292 auto& current_out = output[g1][g2];
293 while (!current_in.empty()) {
294 const auto& best = current_in.top();
295 current_out.emplace_back(best.second, best.first);
296 current_in.pop();
297 }
298 std::reverse(current_out.begin(), current_out.end()); // earliest element should have the strongest effect sizes.
299 }
300 }
301}
302
303template<typename Index_, typename Stat_>
304void find_best_simple_best_effects(
305 const Index_ ngenes,
306 const std::size_t ngroups,
307 const std::size_t nblocks,
308 const std::size_t ncombos,
309 const std::vector<Stat_>& combo_weights,
310 std::vector<Stat_>& combo_means,
311 std::vector<Stat_>& combo_vars,
312 std::vector<Stat_>& combo_detected,
313 ScoreMarkersBestResults<Stat_, Index_>& output,
314 int top,
315 const ScoreMarkersBestOptions& options
316) {
317 std::vector<Stat_> total_weights_per_group;
318 const Stat_* total_weights_ptr = combo_weights.data();
319 if (nblocks > 1) {
320 total_weights_per_group = compute_total_weight_per_group(ngroups, nblocks, combo_weights.data());
321 total_weights_ptr = total_weights_per_group.data();
322 }
323 PrecomputedPairwiseWeights<Stat_> preweights(ngroups, nblocks, combo_weights.data());
324
325 std::vector<Stat_*> mptrs;
326 mptrs.reserve(ngroups);
327 sanisizer::resize(output.mean, ngroups);
328 for (auto& x : output.mean) {
329 sanisizer::resize(x, ngenes);
330 mptrs.push_back(x.data());
331 }
332
333 std::vector<Stat_*> dptrs;
334 dptrs.reserve(ngroups);
335 sanisizer::resize(output.detected, ngroups);
336 for (auto& x : output.detected) {
337 sanisizer::resize(x, ngenes);
338 dptrs.push_back(x.data());
339 }
340
341 // Setting up the output queues.
342 std::vector<PairwiseTopQueues<Stat_, Index_> > cohens_d_queues, delta_detected_queues, delta_mean_queues;
343 if (options.compute_cohens_d) {
344 sanisizer::resize(cohens_d_queues, options.num_threads);
345 }
346 if (options.compute_delta_mean) {
347 sanisizer::resize(delta_mean_queues, options.num_threads);
348 }
349 if (options.compute_delta_detected) {
350 sanisizer::resize(delta_detected_queues, options.num_threads);
351 }
352
353 const auto ngroups2 = sanisizer::product<typename std::vector<Stat_>::size_type>(ngroups, ngroups);
354
355 tatami::parallelize([&](const int t, const Index_ start, const Index_ length) -> void {
356 if (options.compute_cohens_d) {
357 allocate_best_top_queues(cohens_d_queues[t], ngroups, top, options.largest_cohens_d, options.keep_ties, options.threshold_cohens_d);
358 }
359 if (options.compute_delta_mean) {
360 allocate_best_top_queues(delta_mean_queues[t], ngroups, top, options.largest_delta_mean, options.keep_ties, options.threshold_delta_mean);
361 }
362 if (options.compute_delta_detected) {
363 allocate_best_top_queues(delta_detected_queues[t], ngroups, top, options.largest_delta_detected, options.keep_ties, options.threshold_delta_detected);
364 }
365 std::vector<Stat_> buffer(ngroups2);
366
367 for (Index_ gene = start, end = start + length; gene < end; ++gene) {
368 auto in_offset = sanisizer::product_unsafe<std::size_t>(gene, ncombos);
369 const auto tmp_means = combo_means.data() + in_offset;
370 const auto tmp_variances = combo_vars.data() + in_offset;
371 const auto tmp_detected = combo_detected.data() + in_offset;
372 average_group_stats(gene, ngroups, nblocks, tmp_means, tmp_detected, combo_weights.data(), total_weights_ptr, mptrs, dptrs);
373
374 // Computing the effect sizes.
375 if (options.compute_cohens_d) {
376 compute_pairwise_cohens_d(tmp_means, tmp_variances, ngroups, nblocks, preweights, options.threshold, buffer.data());
377 add_best_top_queues(cohens_d_queues[t], gene, ngroups, buffer);
378 }
379
380 if (options.compute_delta_mean) {
381 compute_pairwise_simple_diff(tmp_means, ngroups, nblocks, preweights, buffer.data());
382 add_best_top_queues(delta_mean_queues[t], gene, ngroups, buffer);
383 }
384
385 if (options.compute_delta_detected) {
386 compute_pairwise_simple_diff(tmp_detected, ngroups, nblocks, preweights, buffer.data());
387 add_best_top_queues(delta_detected_queues[t], gene, ngroups, buffer);
388 }
389 }
390 }, ngenes, options.num_threads);
391
392 // Now figuring out which of these are the top dogs.
393 if (options.compute_cohens_d) {
394 report_best_top_queues(cohens_d_queues, ngroups, output.cohens_d);
395 }
396
397 if (options.compute_delta_mean) {
398 report_best_top_queues(delta_mean_queues, ngroups, output.delta_mean);
399 }
400
401 if (options.compute_delta_detected) {
402 report_best_top_queues(delta_detected_queues, ngroups, output.delta_detected);
403 }
404}
405
406template<
407 bool single_block_,
408 typename Stat_,
409 typename Value_,
410 typename Index_,
411 typename Group_,
412 typename Block_
413>
414ScoreMarkersBestResults<Stat_, Index_> score_markers_best(
415 const tatami::Matrix<Value_, Index_>& matrix,
416 const std::size_t ngroups,
417 const Group_* const group,
418 const std::size_t nblocks,
419 const Block_* const block,
420 const std::size_t ncombos,
421 const std::size_t* const combo,
422 const std::vector<Index_>& combo_sizes,
423 int top,
424 const ScoreMarkersBestOptions& options
425) {
426 const auto ngenes = matrix.nrow();
427 const auto payload_size = sanisizer::product<typename std::vector<Stat_>::size_type>(ngenes, ncombos);
428 std::vector<Stat_> combo_means(payload_size), combo_vars(payload_size), combo_detected(payload_size);
429
430 // For a single block, this usually doesn't really matter, but we do it for consistency with the multi-block case,
431 // and to account for variable weighting where non-zero block sizes get zero weight.
432 const auto combo_weights = scran_blocks::compute_weights<Stat_>(
433 combo_sizes,
434 options.block_weight_policy,
435 options.variable_block_weight_parameters
436 );
437
438 ScoreMarkersBestResults<Stat_, Index_> output;
439
440 if (options.compute_auc) {
441 auto auc_queues = sanisizer::create<std::vector<PairwiseTopQueues<Stat_, Index_> > >(options.num_threads);
442
443 struct AucResultWorkspace {
444 AucResultWorkspace() = default;
445 AucResultWorkspace(const std::size_t ngroups, PairwiseTopQueues<Stat_, Index_>& pqueue) :
446 pairwise_buffer(sanisizer::product<typename std::vector<Stat_>::size_type>(ngroups, ngroups)),
447 queue_ptr(&pqueue)
448 {};
449 std::vector<Stat_> pairwise_buffer;
450 PairwiseTopQueues<Stat_, Index_>* queue_ptr;
451 };
452
453 scan_matrix_by_row_custom_auc<single_block_>(
454 matrix,
455 ngroups,
456 group,
457 nblocks,
458 block,
459 ncombos,
460 combo,
461 combo_means,
462 combo_vars,
463 combo_detected,
464 /* do_auc = */ true,
465 /* auc_result_initialize = */ [&](int t) -> AucResultWorkspace {
466 allocate_best_top_queues(auc_queues[t], ngroups, top, options.largest_auc, options.keep_ties, options.threshold_auc);
467 return AucResultWorkspace(ngroups, auc_queues[t]);
468 },
469 /* auc_result_process = */ [&](const Index_ gene, AucScanWorkspace<Value_, Group_, Index_, Stat_>& auc_work, AucResultWorkspace& res_work) -> void {
470 process_auc_for_rows(auc_work, ngroups, nblocks, options.threshold, res_work.pairwise_buffer.data());
471 add_best_top_queues(*(res_work.queue_ptr), gene, ngroups, res_work.pairwise_buffer);
472 },
473 combo_sizes,
474 combo_weights,
475 options.num_threads
476 );
477
478 report_best_top_queues(auc_queues, ngroups, output.auc);
479
480 } else if (matrix.prefer_rows()) {
481 scan_matrix_by_row_full_auc<single_block_>(
482 matrix,
483 ngroups,
484 group,
485 nblocks,
486 block,
487 ncombos,
488 combo,
489 combo_means,
490 combo_vars,
491 combo_detected,
492 static_cast<Stat_*>(NULL),
493 combo_sizes,
494 combo_weights,
495 options.threshold,
496 options.num_threads
497 );
498
499 } else {
500 scan_matrix_by_column(
501 matrix,
502 [&]{
503 if constexpr(single_block_) {
504 return ngroups;
505 } else {
506 return ncombos;
507 }
508 }(),
509 [&]{
510 if constexpr(single_block_) {
511 return group;
512 } else {
513 return combo;
514 }
515 }(),
516 combo_means,
517 combo_vars,
518 combo_detected,
519 combo_sizes,
520 options.num_threads
521 );
522 }
523
524 find_best_simple_best_effects(
525 matrix.nrow(),
526 ngroups,
527 nblocks,
528 ncombos,
529 combo_weights,
530 combo_means,
531 combo_vars,
532 combo_detected,
533 output,
534 top,
535 options
536 );
537
538 return output;
539}
540
541}
568template<typename Stat_, typename Value_, typename Index_, typename Group_>
570 const tatami::Matrix<Value_, Index_>& matrix,
571 const Group_* const group,
572 int top,
573 const ScoreMarkersBestOptions& options
574) {
575 const Index_ NC = matrix.ncol();
576 const auto group_sizes = tatami_stats::tabulate_groups(group, NC);
577 const auto ngroups = sanisizer::cast<std::size_t>(group_sizes.size());
578
579 return internal::score_markers_best<true, Stat_>(
580 matrix,
581 ngroups,
582 group,
583 1,
584 static_cast<int*>(NULL),
585 ngroups,
586 static_cast<std::size_t*>(NULL),
587 group_sizes,
588 top,
589 options
590 );
591}
592
619template<typename Stat_, typename Value_, typename Index_, typename Group_, typename Block_>
621 const tatami::Matrix<Value_, Index_>& matrix,
622 const Group_* const group,
623 const Block_* const block,
624 int top,
625 const ScoreMarkersBestOptions& options
626) {
627 const Index_ NC = matrix.ncol();
628 const auto ngroups = tatami_stats::total_groups(group, NC);
629 const auto nblocks = tatami_stats::total_groups(block, NC);
630
631 const auto combinations = internal::create_combinations(ngroups, group, block, NC);
632 const auto combo_sizes = internal::tabulate_combinations<Index_>(ngroups, nblocks, combinations);
633 const auto ncombos = combo_sizes.size();
634
635 return internal::score_markers_best<false, Stat_>(
636 matrix,
637 sanisizer::cast<std::size_t>(ngroups),
638 group,
639 sanisizer::cast<std::size_t>(nblocks),
640 block,
641 sanisizer::cast<std::size_t>(ncombos),
642 combinations.data(),
643 combo_sizes,
644 top,
645 options
646 );
647}
648
649}
650
651#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
void compute_weights(const std::size_t num_blocks, const Size_ *const sizes, const WeightPolicy policy, const VariableWeightParameters &variable, Weight_ *const weights)
Marker detection for single-cell data.
Definition score_markers_pairwise.hpp:25
ScoreMarkersBestResults< Stat_, Index_ > score_markers_best(const tatami::Matrix< Value_, Index_ > &matrix, const Group_ *const group, int top, const ScoreMarkersBestOptions &options)
Definition score_markers_best.hpp:569
ScoreMarkersBestResults< Stat_, Index_ > score_markers_best_blocked(const tatami::Matrix< Value_, Index_ > &matrix, const Group_ *const group, const Block_ *const block, int top, const ScoreMarkersBestOptions &options)
Definition score_markers_best.hpp:620
void parallelize(Function_ fun, const Index_ tasks, const int threads)
Options for score_markers_best() and friends.
Definition score_markers_best.hpp:31
bool compute_cohens_d
Definition score_markers_best.hpp:48
std::optional< double > threshold_delta_mean
Definition score_markers_best.hpp:108
double threshold
Definition score_markers_best.hpp:37
bool compute_auc
Definition score_markers_best.hpp:53
bool largest_cohens_d
Definition score_markers_best.hpp:69
bool compute_delta_mean
Definition score_markers_best.hpp:58
bool largest_delta_mean
Definition score_markers_best.hpp:81
std::optional< double > threshold_cohens_d
Definition score_markers_best.hpp:94
std::optional< double > threshold_delta_detected
Definition score_markers_best.hpp:115
bool largest_delta_detected
Definition score_markers_best.hpp:87
bool largest_auc
Definition score_markers_best.hpp:75
scran_blocks::WeightPolicy block_weight_policy
Definition score_markers_best.hpp:132
std::optional< double > threshold_auc
Definition score_markers_best.hpp:101
int num_threads
Definition score_markers_best.hpp:43
scran_blocks::VariableWeightParameters variable_block_weight_parameters
Definition score_markers_best.hpp:138
bool compute_delta_detected
Definition score_markers_best.hpp:63
bool keep_ties
Definition score_markers_best.hpp:120
Results for score_markers_best() and friends.
Definition score_markers_best.hpp:147
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > auc
Definition score_markers_best.hpp:182
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > delta_mean
Definition score_markers_best.hpp:194
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > cohens_d
Definition score_markers_best.hpp:170
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > delta_detected
Definition score_markers_best.hpp:206
std::vector< std::vector< Stat_ > > mean
Definition score_markers_best.hpp:152
std::vector< std::vector< Stat_ > > detected
Definition score_markers_best.hpp:158
std::optional< Stat_ > bound