1#ifndef SCRAN_MARKERS_SCORE_MARKERS_BEST_HPP
2#define SCRAN_MARKERS_SCORE_MARKERS_BEST_HPP
10#include "tatami_stats/tatami_stats.hpp"
11#include "sanisizer/sanisizer.hpp"
13#include "quickstats/quickstats.hpp"
15#include "scan_matrix.hpp"
16#include "average_group_stats.hpp"
18#include "create_combinations.hpp"
19#include "cohens_d.hpp"
20#include "simple_diff.hpp"
// Result container parameterised on the statistic type (Stat_) and the index type (Index_).
// NOTE(review): the line that names this type is missing from this listing; it is presumably
// ScoreMarkersBestResults, since later code fills an object of that type through these member
// names -- confirm. Members such as `detected` and `delta_detected` are used later in the file
// but are not visible in this fragment.
174template<
typename Stat_,
typename Index_>
// Two-level vector of plain statistics. Later code resizes the outer level to the number of
// groups and each inner vector to the number of genes.
180 std::vector<std::vector<Stat_> >
mean;
// Two-level vector of top queues, filled from the merged per-thread Cohen's d queues.
201 std::vector<std::vector<topicks::TopQueue<Stat_, Index_> > >
cohens_d;
// Same layout as above, filled from the merged per-thread AUC queues.
216 std::vector<std::vector<topicks::TopQueue<Stat_, Index_> > >
auc;
// Same layout as above, filled from the merged per-thread delta-mean queues.
231 std::vector<std::vector<topicks::TopQueue<Stat_, Index_> > >
delta_mean;
// Drains a two-level vector of top queues (`queued`) into a three-level vector of
// (Index_, Stat_) pairs with the same [g1][g2] layout.
// NOTE(review): the signature line, the statement that pops the queue, the closing braces and
// the return statement are missing from this listing.
261template<
typename Stat_,
typename Index_>
263 std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > output;
// The outer size of the input defines the number of groups; both output levels use it.
264 const auto ngroups = queued.size();
265 sanisizer::resize(output, ngroups);
267 for (I<
decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {
268 sanisizer::resize(output[g1], ngroups);
269 for (I<
decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {
274 auto& current_in = queued[g1][g2];
275 auto& current_out = output[g1][g2];
// Reserve up front so that draining the queue does not reallocate.
276 current_out.reserve(current_in.size());
278 while (!current_in.empty()) {
279 const auto& best = current_in.top();
// The queue entry is stored as (first, second); the output pair swaps the two members.
280 current_out.emplace_back(best.second, best.first);
// The drained sequence is flipped so the output is in the opposite order to extraction.
283 std::reverse(current_out.begin(), current_out.end());
// Alias for a two-level vector of top queues. The helpers below index it as [g1][g2], with
// both indices running over the groups.
295template<
typename Stat_,
typename Index_>
296using PairwiseTopQueues = std::vector<std::vector<topicks::TopQueue<Stat_, Index_> > >;
// Populates `pqueues` with one row per group, constructing the queues of each row in place.
// Callers pass (pqueues, ngroups, top, larger, keep_ties, bound); the `top` and `larger`
// parameter lines, and the code that builds `opt`, are missing from this listing.
298template<
typename Stat_,
typename Index_>
299void allocate_best_top_queues(
300 PairwiseTopQueues<Stat_, Index_>& pqueues,
301 const std::size_t ngroups,
304 const bool keep_ties,
305 const std::optional<Stat_>& bound
// NOTE(review): presumably `bound` and `keep_ties` are copied into the queue options (`opt`)
// here -- the body of this branch is not visible, confirm.
310 if (bound.has_value()) {
314 sanisizer::resize(pqueues, ngroups);
315 for (I<
decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {
316 auto& x = pqueues[g1];
318 for (I<
decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {
// Two construction paths: the first argument is either 0 or `top`. The condition choosing
// between them is on a missing line.
// NOTE(review): presumably the g1 == g2 entry gets the zero-sized queue -- confirm.
320 x.emplace_back(0, larger, opt);
322 x.emplace_back(top, larger, opt);
// Offers one gene's pairwise effects to the matching queues in `pqueues`.
// Callers pass (pqueues, gene, ngroups, effects); the `gene` and `ngroups` parameter lines are
// missing from this listing.
328template<
typename Stat_,
typename Index_>
329void add_best_top_queues(
330 PairwiseTopQueues<Stat_, Index_>& pqueues,
333 const std::vector<Stat_>& effects
335 for (I<
decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {
336 for (I<
decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {
// `effects` is a flat buffer addressed by (g2, g1) with extent `ngroups`.
// NOTE(review): presumably this resolves to g2 + ngroups * g1 -- confirm against sanisizer.
337 const auto val = effects[sanisizer::nd_offset<std::size_t>(g2, ngroups, g1)];
// NOTE(review): one line is missing between the lookup and the insertion; it may guard the
// emplace (for example, skipping g1 == g2) -- confirm.
339 pqueues[g1][g2].emplace(val, gene);
// Merges the per-thread queue sets in `pqueues` into the first one and moves that into `output`.
// Callers pass (pqueues, ngroups, output); the `ngroups` and `output` parameter lines, the
// definition of `placeholder`, and the statement that pops each source queue are missing from
// this listing.
345template<
typename Stat_,
typename Index_>
346void report_best_top_queues(
347 std::vector<std::optional<PairwiseTopQueues<Stat_, Index_> > >& pqueues,
// NOTE(review): the container size is narrowed to int here; callers resize `pqueues` to an int
// thread count beforehand, so the value should fit -- confirm.
352 const int num_available = pqueues.size();
// With no queue sets available, `output` is still given an ngroups-by-ngroups shape, each cell
// being a copy of `placeholder`.
355 if (num_available == 0) {
356 sanisizer::resize(output, ngroups);
358 for (I<
decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {
359 sanisizer::resize(output[g1], ngroups, placeholder);
// The first slot serves as the accumulator; it is dereferenced without a has_value() check in
// the visible code.
365 auto& true_pqueue = *(pqueues.front());
366 for (
int t = 1; t < num_available; ++t) {
367 auto& current_pqueue = *(pqueues[t]);
368 for (I<
decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {
369 for (I<
decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {
370 auto& current_in = current_pqueue[g1][g2];
371 auto& current_out = true_pqueue[g1][g2];
// Every entry of the other thread's queue is pushed into the accumulator queue.
372 while (!current_in.empty()) {
373 current_out.push(current_in.top());
380 output = std::move(true_pqueue);
// Computes the requested per-group summaries (mean, detected) and the per-pair top queues for
// Cohen's d, delta-mean and delta-detected from precomputed per-combination statistics, then
// merges the per-thread queues into `output`.
// NOTE(review): the `ngenes` and `top` parameter lines, several `else` lines and all closing
// braces are missing from this listing.
383template<
typename Index_,
typename Stat_>
384void find_best_simple_best_effects(
386 const std::size_t ngroups,
387 const std::size_t nblocks,
388 const std::size_t ncombos,
389 const std::vector<Stat_>& combo_means,
390 const std::vector<Stat_>& combo_vars,
391 const std::vector<Stat_>& combo_detected,
392 const BlockAverageInfo<Stat_>& average_info,
394 const ScoreMarkersBestOptions& options,
395 ScoreMarkersBestResults<Stat_, Index_>& output
// Per-group total weights are only prepared when blocks are averaged by mean and a group-level
// summary was requested. (NULL is used where nullptr would be the modern spelling.)
397 std::optional<std::vector<Stat_> > total_weights_per_group;
398 const Stat_* total_weights_ptr = NULL;
399 if (average_info.use_mean()) {
400 if (options.compute_group_mean || options.compute_group_detected) {
402 total_weights_per_group = compute_total_weight_per_group(ngroups, nblocks, average_info.combo_weights().data());
403 total_weights_ptr = total_weights_per_group->data();
// Alternative assignment that points straight at the combination weights. The condition that
// selects it is on a missing line.
// NOTE(review): presumably this is taken when there is a single block -- confirm.
405 total_weights_ptr = average_info.combo_weights().data();
// One output pointer per group into output.mean, each row sized to the number of genes.
410 std::vector<Stat_*> mptrs;
411 if (options.compute_group_mean) {
412 mptrs.reserve(ngroups);
413 sanisizer::resize(output.mean, ngroups);
414 for (
auto& x : output.mean) {
415 sanisizer::resize(x, ngenes);
416 mptrs.push_back(x.data());
// Same arrangement for output.detected.
420 std::vector<Stat_*> dptrs;
421 if (options.compute_group_detected) {
422 dptrs.reserve(ngroups);
423 sanisizer::resize(output.detected, ngroups);
424 for (
auto& x : output.detected) {
425 sanisizer::resize(x, ngenes);
426 dptrs.push_back(x.data());
// Pairwise weights are only constructed for mean-based averaging when at least one pairwise
// effect is requested.
430 std::optional<PrecomputedPairwiseWeights<Stat_> > preweights;
431 if (average_info.use_mean()) {
432 if (options.compute_cohens_d || options.compute_delta_mean || options.compute_delta_detected) {
433 preweights.emplace(ngroups, nblocks, average_info.combo_weights().data());
// One optional slot per thread for each requested effect; each worker stores its local queues
// in slot `t` when it finishes.
438 std::optional<std::vector<std::optional<PairwiseTopQueues<Stat_, Index_> > > > threaded_cohens_d_queues, threaded_delta_detected_queues, threaded_delta_mean_queues;
439 if (options.compute_cohens_d) {
440 threaded_cohens_d_queues.emplace(sanisizer::cast<I<
decltype(threaded_cohens_d_queues->size())> >(options.num_threads));
442 if (options.compute_delta_mean) {
443 threaded_delta_mean_queues.emplace(sanisizer::cast<I<
decltype(threaded_delta_mean_queues->size())> >(options.num_threads));
445 if (options.compute_delta_detected) {
446 threaded_delta_detected_queues.emplace(sanisizer::cast<I<
decltype(threaded_delta_detected_queues->size())> >(options.num_threads));
// Length of the flat ngroups-by-ngroups effect buffer, expressed in the vector's size type.
449 const auto ngroups2 = sanisizer::product<typename std::vector<Stat_>::size_type>(ngroups, ngroups);
// Each worker handles the genes in [start, start + length); `t` is the worker index used to
// address the per-thread slots. The return value is stored as the number of workers used.
451 int num_used =
tatami::parallelize([&](
const int t,
const Index_ start,
const Index_ length) ->
void {
// Thread-local queues, allocated only for the effects that were requested.
452 std::optional<PairwiseTopQueues<Stat_, Index_> > local_cohens_d_queue, local_delta_mean_queue, local_delta_detected_queue;
453 if (options.compute_cohens_d) {
454 local_cohens_d_queue.emplace();
455 allocate_best_top_queues(*local_cohens_d_queue, ngroups, top, options.largest_cohens_d, options.keep_ties, options.threshold_cohens_d);
457 if (options.compute_delta_mean) {
458 local_delta_mean_queue.emplace();
459 allocate_best_top_queues(*local_delta_mean_queue, ngroups, top, options.largest_delta_mean, options.keep_ties, options.threshold_delta_mean);
461 if (options.compute_delta_detected) {
462 local_delta_detected_queue.emplace();
463 allocate_best_top_queues(*local_delta_detected_queue, ngroups, top, options.largest_delta_detected, options.keep_ties, options.threshold_delta_detected);
// Scratch space for one gene's pairwise effects, shared by the three effect types in turn.
466 std::vector<Stat_> buffer;
467 if (options.compute_cohens_d || options.compute_delta_mean || options.compute_delta_detected) {
468 buffer.resize(ngroups2);
// Quantile-based averaging needs its own scratch buffers and calculator.
// NOTE(review): `qbuffer` is dereferenced below but its emplace() is not visible; presumably it
// sits on the missing line before `qrevbuffer.emplace()` -- confirm.
471 std::optional<std::vector<Stat_> > qbuffer, qrevbuffer;
472 std::optional<quickstats::SingleQuantileVariableNumber<Stat_, std::size_t> > qcalc;
473 if (!average_info.use_mean()) {
475 qrevbuffer.emplace();
476 qcalc.emplace(nblocks, average_info.quantile());
479 for (Index_ gene = start, end = start + length; gene < end; ++gene) {
// Offset of this gene's statistics in the flat combo_* arrays, which hold `ncombos` values per
// gene.
480 auto in_offset = sanisizer::product_unsafe<std::size_t>(gene, ncombos);
// Group-level mean. The second call is the quantile-based alternative; the `else` separating
// the two calls is missing from this listing (the same applies to each pair of calls below).
482 if (options.compute_group_mean) {
483 const auto tmp_means = combo_means.data() + in_offset;
484 if (average_info.use_mean()) {
485 average_group_stats_blockmean(gene, ngroups, nblocks, tmp_means, average_info.combo_weights().data(), total_weights_ptr, mptrs);
487 average_group_stats_blockquantile(gene, ngroups, nblocks, tmp_means, *qbuffer, *qcalc, mptrs);
// Group-level detected proportion, same structure as the mean.
491 if (options.compute_group_detected) {
492 const auto tmp_detected = combo_detected.data() + in_offset;
493 if (average_info.use_mean()) {
494 average_group_stats_blockmean(gene, ngroups, nblocks, tmp_detected, average_info.combo_weights().data(), total_weights_ptr, dptrs);
496 average_group_stats_blockquantile(gene, ngroups, nblocks, tmp_detected, *qbuffer, *qcalc, dptrs);
// Cohen's d: pairwise values are written into `buffer`, then offered to the local queues.
501 if (options.compute_cohens_d) {
502 const auto tmp_means = combo_means.data() + in_offset;
503 const auto tmp_variances = combo_vars.data() + in_offset;
504 if (average_info.use_mean()) {
505 compute_pairwise_cohens_d_blockmean(tmp_means, tmp_variances, ngroups, nblocks, options.threshold, *preweights, buffer.data());
507 compute_pairwise_cohens_d_blockquantile(tmp_means, tmp_variances, ngroups, nblocks, options.threshold, *qbuffer, *qrevbuffer, *qcalc, buffer.data());
509 add_best_top_queues(*local_cohens_d_queue, gene, ngroups, buffer);
// Delta-mean: simple pairwise difference computed on the per-combination means.
512 if (options.compute_delta_mean) {
513 const auto tmp_means = combo_means.data() + in_offset;
514 if (average_info.use_mean()) {
515 compute_pairwise_simple_diff_blockmean(tmp_means, ngroups, nblocks, *preweights, buffer.data());
517 compute_pairwise_simple_diff_blockquantile(tmp_means, ngroups, nblocks, *qbuffer, *qcalc, buffer.data());
519 add_best_top_queues(*local_delta_mean_queue, gene, ngroups, buffer);
// Delta-detected: the same difference computed on the per-combination detected values.
522 if (options.compute_delta_detected) {
523 const auto tmp_detected = combo_detected.data() + in_offset;
524 if (average_info.use_mean()) {
525 compute_pairwise_simple_diff_blockmean(tmp_detected, ngroups, nblocks, *preweights, buffer.data());
527 compute_pairwise_simple_diff_blockquantile(tmp_detected, ngroups, nblocks, *qbuffer, *qcalc, buffer.data());
529 add_best_top_queues(*local_delta_detected_queue, gene, ngroups, buffer);
// Hand the thread-local queues over to this worker's slot.
534 if (options.compute_cohens_d) {
535 (*threaded_cohens_d_queues)[t] = std::move(local_cohens_d_queue);
537 if (options.compute_delta_mean) {
538 (*threaded_delta_mean_queues)[t] = std::move(local_delta_mean_queue);
540 if (options.compute_delta_detected) {
541 (*threaded_delta_detected_queues)[t] = std::move(local_delta_detected_queue);
543 }, ngenes, options.num_threads);
// Slots beyond `num_used` were never filled, so they are dropped before merging.
546 if (options.compute_cohens_d) {
547 threaded_cohens_d_queues->resize(num_used);
548 report_best_top_queues(*threaded_cohens_d_queues, ngroups, output.cohens_d);
550 if (options.compute_delta_mean) {
551 threaded_delta_mean_queues->resize(num_used);
552 report_best_top_queues(*threaded_delta_mean_queues, ngroups, output.delta_mean);
554 if (options.compute_delta_detected) {
555 threaded_delta_detected_queues->resize(num_used);
556 report_best_top_queues(*threaded_delta_detected_queues, ngroups, output.delta_detected);
// Fragment of an internal driver: it allocates the per-combination statistic buffers, sets up
// block averaging, runs a matrix scan and then calls find_best_simple_best_effects().
// NOTE(review): the template header and the function name line are missing from this listing.
// It is presumably internal::score_markers_best<single_block_, Stat_, ...>, which the public
// wrappers call -- confirm. Most call arguments and all closing braces are also missing.
570 const std::size_t ngroups,
571 const Group_*
const group,
572 const std::size_t nblocks,
573 const Block_*
const block,
574 const std::size_t ncombos,
575 const std::size_t*
const combo,
576 const std::vector<Index_>& combo_sizes,
578 const ScoreMarkersBestOptions& options
580 const auto ngenes = matrix.
nrow();
// Each statistic buffer holds ngenes * ncombos values and is only allocated when an enabled
// option needs it.
581 const auto payload_size = sanisizer::product<typename std::vector<Stat_>::size_type>(ngenes, ncombos);
582 std::vector<Stat_> combo_means, combo_vars, combo_detected;
583 if (options.compute_group_mean || options.compute_cohens_d || options.compute_delta_mean) {
584 combo_means.resize(payload_size);
586 if (options.compute_cohens_d) {
587 combo_vars.resize(payload_size);
589 if (options.compute_group_detected || options.compute_delta_detected) {
590 combo_detected.resize(payload_size);
// Block averaging is configured either from the weight policy (MEAN) or from a quantile. The
// `else` between the two assignments is missing from this listing.
595 BlockAverageInfo<Stat_> average_info;
596 if (options.block_average_policy == BlockAveragePolicy::MEAN) {
597 average_info = BlockAverageInfo<Stat_>(
600 options.block_weight_policy,
601 options.variable_block_weight_parameters
605 average_info = BlockAverageInfo<Stat_>(options.block_quantile);
608 ScoreMarkersBestResults<Stat_, Index_> output;
// AUC path: one optional queue slot per thread, filled by the third callback below.
610 if (options.compute_auc) {
611 auto auc_queues = sanisizer::create<std::vector<std::optional<PairwiseTopQueues<Stat_, Index_> > > >(options.num_threads);
// Per-thread workspace: a flat ngroups-by-ngroups buffer for one gene's AUCs, plus that
// thread's top queues.
613 struct AucResultWorkspace {
614 AucResultWorkspace(
const std::size_t ngroups) : pairwise_buffer(sanisizer::product<typename std::vector<Stat_>::size_type>(ngroups, ngroups)) {};
615 std::vector<Stat_> pairwise_buffer;
616 PairwiseTopQueues<Stat_, Index_> queue;
// Three callbacks are passed to the scan: the first builds a workspace, the second handles one
// gene, and the third stores the finished queues in slot `t`.
619 const auto num_used = scan_matrix_by_row_custom_auc<single_block_>(
633 [&](
const int) -> AucResultWorkspace {
634 AucResultWorkspace res_work(ngroups);
635 allocate_best_top_queues(res_work.queue, ngroups, top, options.largest_auc, options.keep_ties, options.threshold_auc);
638 [&](
const Index_ gene, AucScanWorkspace<Value_, Group_, Stat_, Index_>& auc_work, AucResultWorkspace& res_work) ->
void {
639 process_auc_for_rows(auc_work, ngroups, nblocks, options.threshold, res_work.pairwise_buffer.data());
640 add_best_top_queues(res_work.queue, gene, ngroups, res_work.pairwise_buffer);
642 [&](
const int t, AucResultWorkspace& res_work) ->
void {
643 auc_queues[t] = std::move(res_work.queue);
// Unused thread slots are dropped before the per-thread queues are merged into output.auc.
648 auc_queues.resize(num_used);
649 report_best_top_queues(auc_queues, ngroups, output.auc);
// Two other scan calls follow; the conditions that select them are missing from this listing.
// NOTE(review): presumably these run when AUCs are not requested, with the choice between row
// and column scanning depending on the matrix -- confirm. A null Stat_* is passed to the row
// scan.
652 scan_matrix_by_row_full_auc<single_block_>(
665 static_cast<Stat_*
>(NULL),
671 scan_matrix_by_column(
// Compile-time branches on `single_block_`; their bodies are missing from this listing.
674 if constexpr(single_block_) {
681 if constexpr(single_block_) {
// The remaining (non-AUC) effects and group summaries are delegated to this helper.
695 find_best_simple_best_effects(
// Public entry point without blocking: tabulates the groups over the matrix columns and
// forwards to internal::score_markers_best with its first template argument set to true.
// NOTE(review): the signature line is missing from this listing; Doxygen residue elsewhere in
// the listing gives the parameters as (matrix, group, top, options) and the return type as
// ScoreMarkersBestResults<Stat_, Index_> -- confirm.
738template<
typename Stat_,
typename Value_,
typename Index_,
typename Group_>
741 const Group_*
const group,
// `group` is read for each of the NC columns of the matrix.
745 const Index_ NC = matrix.
ncol();
746 const auto group_sizes = tatami_stats::tabulate_groups(group, NC);
747 const auto ngroups = sanisizer::cast<std::size_t>(group_sizes.size());
749 return internal::score_markers_best<true, Stat_>(
// Null pointers of type int* and std::size_t* are passed here.
// NOTE(review): these look like the block and combination arguments of the internal function,
// unused on this path -- confirm.
754 static_cast<int*
>(NULL),
756 static_cast<std::size_t*
>(NULL),
// Public entry point with blocking: derives group/block combinations for every column and
// forwards to internal::score_markers_best with its first template argument set to false.
// NOTE(review): the signature line is missing from this listing; Doxygen residue elsewhere in
// the listing gives the parameters as (matrix, group, block, top, options) and the return type
// as ScoreMarkersBestResults<Stat_, Index_> -- confirm.
787template<
typename Stat_,
typename Value_,
typename Index_,
typename Group_,
typename Block_>
790 const Group_*
const group,
791 const Block_*
const block,
// `group` and `block` are each read for the NC columns of the matrix.
795 const Index_ NC = matrix.
ncol();
796 const auto ngroups = tatami_stats::total_groups(group, NC);
797 const auto nblocks = tatami_stats::total_groups(block, NC);
// The combinations are built from each column's group and block assignment, then tabulated
// into per-combination sizes.
799 const auto combinations = internal::create_combinations(ngroups, group, nblocks, block, NC);
800 const auto combo_sizes = internal::tabulate_combinations<Index_>(ngroups, nblocks, combinations);
801 const auto ncombos = combo_sizes.size();
// The three counts are converted to std::size_t before being forwarded.
803 return internal::score_markers_best<false, Stat_>(
805 sanisizer::cast<std::size_t>(ngroups),
807 sanisizer::cast<std::size_t>(nblocks),
809 sanisizer::cast<std::size_t>(ncombos),
Averaging statistics over blocks.
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
void compute_weights(const std::size_t num_blocks, const Size_ *const sizes, const WeightPolicy policy, const VariableWeightParameters &variable, Weight_ *const weights)
Marker detection for single-cell data.
Definition score_markers_pairwise.hpp:27
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > queues_to_vectors(std::vector< std::vector< topicks::TopQueue< Stat_, Index_ > > > &queued)
Definition score_markers_best.hpp:262
BlockAveragePolicy
Definition block_averages.hpp:27
ScoreMarkersBestResults< Stat_, Index_ > score_markers_best_blocked(const tatami::Matrix< Value_, Index_ > &matrix, const Group_ *const group, const Block_ *const block, const Index_ top, const ScoreMarkersBestOptions &options)
Definition score_markers_best.hpp:788
ScoreMarkersBestResults< Stat_, Index_ > score_markers_best(const tatami::Matrix< Value_, Index_ > &matrix, const Group_ *const group, const Index_ top, const ScoreMarkersBestOptions &options)
Definition score_markers_best.hpp:739
int parallelize(Function_ fun, const Index_ tasks, const int workers)
std::optional< Stat_ > bound