scran_markers/score__markers__best_8hpp_source.html

#ifndef SCRAN_MARKERS_SCORE_MARKERS_BEST_HPP

#define SCRAN_MARKERS_SCORE_MARKERS_BEST_HPP


#include <vector>

#include <cstddef>


#include "scran_blocks/scran_blocks.hpp"

#include "tatami/tatami.hpp"

#include "tatami_stats/tatami_stats.hpp"

#include "sanisizer/sanisizer.hpp"

#include "topicks/topicks.hpp"


#include "scan_matrix.hpp"

#include "average_group_stats.hpp"

#include "block_averages.hpp"

#include "create_combinations.hpp"

#include "cohens_d.hpp"

#include "simple_diff.hpp"

#include "utils.hpp"


namespace scran_markers {


struct ScoreMarkersBestOptions {

    double threshold = 0;


    int num_threads = 1;


    bool compute_group_mean = true;


    bool compute_group_detected = true;


    bool compute_cohens_d = true;


    bool compute_auc = true;


    bool compute_delta_mean = true;


    bool compute_delta_detected = true;


    bool largest_cohens_d = true;


    bool largest_auc = true;


    bool largest_delta_mean = true;


    bool largest_delta_detected = true;


    std::optional<double> threshold_cohens_d = 0;


    std::optional<double> threshold_auc = 0.5;


    std::optional<double> threshold_delta_mean = 0;


    std::optional<double> threshold_delta_detected = 0;


    bool keep_ties = false;


    BlockAveragePolicy block_average_policy = BlockAveragePolicy::MEAN;


    scran_blocks::WeightPolicy block_weight_policy = scran_blocks::WeightPolicy::VARIABLE;


    scran_blocks::VariableWeightParameters variable_block_weight_parameters;


    double block_quantile = 0.5;

};


template<typename Stat_, typename Index_>


struct ScoreMarkersBestResults {

    std::vector<std::vector<Stat_> > mean;


    std::vector<std::vector<Stat_> > detected;


    std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > cohens_d;


    std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > auc;


    std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > delta_mean;


    std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > > delta_detected;

};


namespace internal {


template<typename Stat_, typename Index_>

using PairwiseTopQueues = std::vector<std::vector<topicks::TopQueue<Stat_, Index_> > >;


template<typename Stat_, typename Index_>

void allocate_best_top_queues(

    PairwiseTopQueues<Stat_, Index_>& pqueues,

    const std::size_t ngroups,

    const Index_ top,

    const bool larger,

    const bool keep_ties,

    const std::optional<Stat_>& bound

) {

    topicks::TopQueueOptions<Stat_> opt;

    opt.check_nan = true;

    opt.keep_ties = keep_ties;

    if (bound.has_value()) {

        opt.bound = *bound;

    }


    sanisizer::resize(pqueues, ngroups);

    for (auto& x : pqueues) {

        x.reserve(ngroups);

        for (I<decltype(ngroups)> g = 0; g < ngroups; ++g) {

            x.emplace_back(top, larger, opt);

        }

    }

}


template<typename Stat_, typename Index_>

void add_best_top_queues(

    PairwiseTopQueues<Stat_, Index_>& pqueues,

    const Index_ gene,

    std::size_t ngroups,

    const std::vector<Stat_>& effects

) {

    for (I<decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {

        for (I<decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {

            const auto val = effects[sanisizer::nd_offset<std::size_t>(g2, ngroups, g1)];

            if (g1 != g2) {

                pqueues[g1][g2].emplace(val, gene);

            }

        }

    }

}


template<typename Stat_, typename Index_>

void report_best_top_queues(

    std::vector<PairwiseTopQueues<Stat_, Index_> >& pqueues,

    std::size_t ngroups,

    std::vector<std::vector<std::vector<std::pair<Index_, Stat_> > > >& output

) {

    // We know it fits into an 'int' as this is what we got originally.

    const int num_threads = pqueues.size();


    // Consolidating all of the thread-specific queues into a single queue.

    auto& true_pqueue = pqueues.front(); // we better have at least one thread.

    for (int t = 1; t < num_threads; ++t) {

        for (I<decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {

            for (I<decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {

                auto& current_in = pqueues[t][g1][g2];

                auto& current_out = true_pqueue[g1][g2];

                while (!current_in.empty()) {

                    current_out.push(current_in.top());

                    current_in.pop();

                }

            }

        }

    }


    // Now spilling them out into a single vector.

    sanisizer::resize(output, ngroups);

    for (I<decltype(ngroups)> g1 = 0; g1 < ngroups; ++g1) {

        sanisizer::resize(output[g1], ngroups);

        for (I<decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {

            if (g1 == g2) {

                continue;

            }

            auto& current_in = true_pqueue[g1][g2];

            auto& current_out = output[g1][g2];

            while (!current_in.empty()) {

                const auto& best = current_in.top();

                current_out.emplace_back(best.second, best.first);

                current_in.pop();

            }

            std::reverse(current_out.begin(), current_out.end()); // earliest element should have the strongest effect sizes.

        }

    }

}


template<typename Index_, typename Stat_>

void find_best_simple_best_effects(

    const Index_ ngenes,

    const std::size_t ngroups,

    const std::size_t nblocks,

    const std::size_t ncombos,

    const std::vector<Stat_>& combo_means,

    const std::vector<Stat_>& combo_vars,

    const std::vector<Stat_>& combo_detected,

    const BlockAverageInfo<Stat_>& average_info,

    const Index_ top,

    const ScoreMarkersBestOptions& options,

    ScoreMarkersBestResults<Stat_, Index_>& output

) {

    std::optional<std::vector<Stat_> > total_weights_per_group;

    const Stat_* total_weights_ptr = NULL;

    if (average_info.use_mean()) {

        if (options.compute_group_mean || options.compute_group_detected) {

            if (nblocks > 1) {

                total_weights_per_group = compute_total_weight_per_group(ngroups, nblocks, average_info.combo_weights().data());

                total_weights_ptr = total_weights_per_group->data();

            } else {

                total_weights_ptr = average_info.combo_weights().data();

            }

        }

    }


    std::vector<Stat_*> mptrs;

    if (options.compute_group_mean) {

        mptrs.reserve(ngroups);

        sanisizer::resize(output.mean, ngroups);

        for (auto& x : output.mean) {

            sanisizer::resize(x, ngenes);

            mptrs.push_back(x.data());

        }

    }


    std::vector<Stat_*> dptrs;

    if (options.compute_group_detected) {

        dptrs.reserve(ngroups);

        sanisizer::resize(output.detected, ngroups);

        for (auto& x : output.detected) {

            sanisizer::resize(x, ngenes);

            dptrs.push_back(x.data());

        }

    }


    std::optional<PrecomputedPairwiseWeights<Stat_> > preweights;

    if (average_info.use_mean()) {

        if (options.compute_cohens_d || options.compute_delta_mean || options.compute_delta_detected) {

            preweights.emplace(ngroups, nblocks, average_info.combo_weights().data());

        }

    }


    // Setting up the output queues.

    std::vector<PairwiseTopQueues<Stat_, Index_> > cohens_d_queues, delta_detected_queues, delta_mean_queues;

    if (options.compute_cohens_d) {

        sanisizer::resize(cohens_d_queues, options.num_threads);

    }

    if (options.compute_delta_mean) {

        sanisizer::resize(delta_mean_queues, options.num_threads);

    }

    if (options.compute_delta_detected) {

        sanisizer::resize(delta_detected_queues, options.num_threads);

    }


    const auto ngroups2 = sanisizer::product<typename std::vector<Stat_>::size_type>(ngroups, ngroups);


    tatami::parallelize([&](const int t, const Index_ start, const Index_ length) -> void {

        if (options.compute_cohens_d) {

            allocate_best_top_queues(cohens_d_queues[t], ngroups, top, options.largest_cohens_d, options.keep_ties, options.threshold_cohens_d);

        }

        if (options.compute_delta_mean) {

            allocate_best_top_queues(delta_mean_queues[t], ngroups, top, options.largest_delta_mean, options.keep_ties, options.threshold_delta_mean);

        }

        if (options.compute_delta_detected) {

            allocate_best_top_queues(delta_detected_queues[t], ngroups, top, options.largest_delta_detected, options.keep_ties, options.threshold_delta_detected);

        }

        std::vector<Stat_> buffer;

        if (options.compute_cohens_d || options.compute_delta_mean || options.compute_delta_detected) {

            buffer.resize(ngroups2);

        }


        std::optional<std::vector<Stat_> > qbuffer, qrevbuffer;

        std::optional<scran_blocks::SingleQuantileVariable<Stat_, typename std::vector<Stat_>::iterator> > qcalc;

        if (!average_info.use_mean()) {

            qbuffer.emplace();

            qrevbuffer.emplace();

            qcalc.emplace(nblocks, average_info.quantile());

        }


        for (Index_ gene = start, end = start + length; gene < end; ++gene) {

            auto in_offset = sanisizer::product_unsafe<std::size_t>(gene, ncombos);


            if (options.compute_group_mean) {

                const auto tmp_means = combo_means.data() + in_offset;

                if (average_info.use_mean()) {

                    average_group_stats_blockmean(gene, ngroups, nblocks, tmp_means, average_info.combo_weights().data(), total_weights_ptr, mptrs);

                } else {

                    average_group_stats_blockquantile(gene, ngroups, nblocks, tmp_means, *qbuffer, *qcalc, mptrs);

                }

            }


            if (options.compute_group_detected) {

                const auto tmp_detected = combo_detected.data() + in_offset;

                if (average_info.use_mean()) {

                    average_group_stats_blockmean(gene, ngroups, nblocks, tmp_detected, average_info.combo_weights().data(), total_weights_ptr, dptrs);

                } else {

                    average_group_stats_blockquantile(gene, ngroups, nblocks, tmp_detected, *qbuffer, *qcalc, dptrs);

                }

            }


            // Computing the effect sizes.

            if (options.compute_cohens_d) {

                const auto tmp_means = combo_means.data() + in_offset;

                const auto tmp_variances = combo_vars.data() + in_offset;

                if (average_info.use_mean()) {

                    compute_pairwise_cohens_d_blockmean(tmp_means, tmp_variances, ngroups, nblocks, options.threshold, *preweights, buffer.data());

                } else {

                    compute_pairwise_cohens_d_blockquantile(tmp_means, tmp_variances, ngroups, nblocks, options.threshold, *qbuffer, *qrevbuffer, *qcalc, buffer.data());

                }

                add_best_top_queues(cohens_d_queues[t], gene, ngroups, buffer);

            }


            if (options.compute_delta_mean) {

                const auto tmp_means = combo_means.data() + in_offset;

                if (average_info.use_mean()) {

                    compute_pairwise_simple_diff_blockmean(tmp_means, ngroups, nblocks, *preweights, buffer.data());

                } else {

                    compute_pairwise_simple_diff_blockquantile(tmp_means, ngroups, nblocks, *qbuffer, *qcalc, buffer.data());

                }

                add_best_top_queues(delta_mean_queues[t], gene, ngroups, buffer);

            }


            if (options.compute_delta_detected) {

                const auto tmp_detected = combo_detected.data() + in_offset;

                if (average_info.use_mean()) {

                    compute_pairwise_simple_diff_blockmean(tmp_detected, ngroups, nblocks, *preweights, buffer.data());

                } else {

                    compute_pairwise_simple_diff_blockquantile(tmp_detected, ngroups, nblocks, *qbuffer, *qcalc, buffer.data());

                }

                add_best_top_queues(delta_detected_queues[t], gene, ngroups, buffer);

            }

        }

    }, ngenes, options.num_threads);


    // Now figuring out which of these are the top dogs.

    if (options.compute_cohens_d) {

        report_best_top_queues(cohens_d_queues, ngroups, output.cohens_d);

    }


    if (options.compute_delta_mean) {

        report_best_top_queues(delta_mean_queues, ngroups, output.delta_mean);

    }


    if (options.compute_delta_detected) {

        report_best_top_queues(delta_detected_queues, ngroups, output.delta_detected);

    }

}


template<

    bool single_block_,

    typename Stat_,

    typename Value_,

    typename Index_,

    typename Group_,

    typename Block_

>

ScoreMarkersBestResults<Stat_, Index_> score_markers_best(

    const tatami::Matrix<Value_, Index_>& matrix,

    const std::size_t ngroups,

    const Group_* const group,

    const std::size_t nblocks,

    const Block_* const block,

    const std::size_t ncombos,

    const std::size_t* const combo,

    const std::vector<Index_>& combo_sizes,

    const Index_ top,

    const ScoreMarkersBestOptions& options

) {

    const auto ngenes = matrix.nrow();

    const auto payload_size = sanisizer::product<typename std::vector<Stat_>::size_type>(ngenes, ncombos);

    std::vector<Stat_> combo_means, combo_vars, combo_detected;

    if (options.compute_group_mean || options.compute_cohens_d || options.compute_delta_mean) {

        combo_means.resize(payload_size);

    }

    if (options.compute_cohens_d) {

        combo_vars.resize(payload_size);

    }

    if (options.compute_group_detected || options.compute_delta_detected) {

        combo_detected.resize(payload_size);

    }


    // For a single block, this usually doesn't really matter, but we do it for consistency with the multi-block case,

    // and to account for variable weighting where non-zero block sizes get zero weight.

    BlockAverageInfo<Stat_> average_info;

    if (options.block_average_policy == BlockAveragePolicy::MEAN) {

        average_info = BlockAverageInfo<Stat_>(

            scran_blocks::compute_weights<Stat_>(

                combo_sizes,

                options.block_weight_policy,

                options.variable_block_weight_parameters

            )

        );

    } else {

        average_info = BlockAverageInfo<Stat_>(options.block_quantile);

    }


    ScoreMarkersBestResults<Stat_, Index_> output;


    if (options.compute_auc) {

        auto auc_queues = sanisizer::create<std::vector<PairwiseTopQueues<Stat_, Index_> > >(options.num_threads);


        struct AucResultWorkspace {

            AucResultWorkspace(const std::size_t ngroups, PairwiseTopQueues<Stat_, Index_>& pqueue) :

                pairwise_buffer(sanisizer::product<typename std::vector<Stat_>::size_type>(ngroups, ngroups)),

                queue_ptr(&pqueue)

            {};

            std::vector<Stat_> pairwise_buffer;

            PairwiseTopQueues<Stat_, Index_>* queue_ptr;

        };


        scan_matrix_by_row_custom_auc<single_block_>(

            matrix,

            ngroups,

            group,

            nblocks,

            block,

            ncombos,

            combo,

            combo_sizes,

            average_info,

            combo_means,

            combo_vars,

            combo_detected,

            /* do_auc = */ true,

            /* auc_result_initialize = */ [&](int t) -> AucResultWorkspace {

                allocate_best_top_queues(auc_queues[t], ngroups, top, options.largest_auc, options.keep_ties, options.threshold_auc);

                return AucResultWorkspace(ngroups, auc_queues[t]);

            },

            /* auc_result_process = */ [&](const Index_ gene, AucScanWorkspace<Value_, Group_, Stat_, Index_>& auc_work, AucResultWorkspace& res_work) -> void {

                process_auc_for_rows(auc_work, ngroups, nblocks, options.threshold, res_work.pairwise_buffer.data());

                add_best_top_queues(*(res_work.queue_ptr), gene, ngroups, res_work.pairwise_buffer);

            },

            options.num_threads

        );


        report_best_top_queues(auc_queues, ngroups, output.auc);


    } else if (matrix.prefer_rows()) {

        scan_matrix_by_row_full_auc<single_block_>(

            matrix,

            ngroups,

            group,

            nblocks,

            block,

            ncombos,

            combo,

            combo_sizes,

            average_info,

            combo_means,

            combo_vars,

            combo_detected,

            static_cast<Stat_*>(NULL),

            options.threshold,

            options.num_threads

        );


    } else {

        scan_matrix_by_column(

            matrix,

            [&]{

                if constexpr(single_block_) {

                    return ngroups;

                } else {

                    return ncombos;

                }

            }(),

            [&]{

                if constexpr(single_block_) {

                    return group;

                } else {

                    return combo;

                }

            }(),

            combo_sizes,

            combo_means,

            combo_vars,

            combo_detected,

            options.num_threads

        );

    }


    find_best_simple_best_effects(

        matrix.nrow(),

        ngroups,

        nblocks,

        ncombos,

        combo_means,

        combo_vars,

        combo_detected,

        average_info,

        top,

        options,

        output

    );


    return output;

}


}

template<typename Stat_, typename Value_, typename Index_, typename Group_>


ScoreMarkersBestResults<Stat_, Index_> score_markers_best(

    const tatami::Matrix<Value_, Index_>& matrix,

    const Group_* const group,

    const Index_ top,

    const ScoreMarkersBestOptions& options

) {

    const Index_ NC = matrix.ncol();

    const auto group_sizes = tatami_stats::tabulate_groups(group, NC);

    const auto ngroups = sanisizer::cast<std::size_t>(group_sizes.size());


    return internal::score_markers_best<true, Stat_>(

        matrix,

        ngroups,

        group,

        1,

        static_cast<int*>(NULL),

        ngroups,

        static_cast<std::size_t*>(NULL),

        group_sizes,

        top,

        options

    );

}


template<typename Stat_, typename Value_, typename Index_, typename Group_, typename Block_>


ScoreMarkersBestResults<Stat_, Index_> score_markers_best_blocked(

    const tatami::Matrix<Value_, Index_>& matrix,

    const Group_* const group,

    const Block_* const block,

    const Index_ top,

    const ScoreMarkersBestOptions& options

) {

    const Index_ NC = matrix.ncol();

    const auto ngroups = tatami_stats::total_groups(group, NC);

    const auto nblocks = tatami_stats::total_groups(block, NC);


    const auto combinations = internal::create_combinations(ngroups, group, block, NC);

    const auto combo_sizes = internal::tabulate_combinations<Index_>(ngroups, nblocks, combinations);

    const auto ncombos = combo_sizes.size();


    return internal::score_markers_best<false, Stat_>(

        matrix,

        sanisizer::cast<std::size_t>(ngroups),

        group,

        sanisizer::cast<std::size_t>(nblocks),

        block,

        sanisizer::cast<std::size_t>(ncombos),

        combinations.data(),

        combo_sizes,

        top,

        options

    );

}


}


#endif

block_averages.hpp
Averaging statistics over blocks.

tatami::Matrix

tatami::Matrix::ncol
virtual Index_ ncol() const=0

tatami::Matrix::nrow
virtual Index_ nrow() const=0

tatami::Matrix::prefer_rows
virtual bool prefer_rows() const=0

scran_blocks::compute_weights
void compute_weights(const std::size_t num_blocks, const Size_ *const sizes, const WeightPolicy policy, const VariableWeightParameters &variable, Weight_ *const weights)

scran_blocks::WeightPolicy
WeightPolicy

scran_markers
Marker detection for single-cell data.
Definition score_markers_pairwise.hpp:26

scran_markers::BlockAveragePolicy
BlockAveragePolicy
Definition block_averages.hpp:27

scran_markers::score_markers_best_blocked
ScoreMarkersBestResults< Stat_, Index_ > score_markers_best_blocked(const tatami::Matrix< Value_, Index_ > &matrix, const Group_ *const group, const Block_ *const block, const Index_ top, const ScoreMarkersBestOptions &options)
Definition score_markers_best.hpp:718

scran_markers::score_markers_best
ScoreMarkersBestResults< Stat_, Index_ > score_markers_best(const tatami::Matrix< Value_, Index_ > &matrix, const Group_ *const group, const Index_ top, const ScoreMarkersBestOptions &options)
Definition score_markers_best.hpp:667

tatami::parallelize
void parallelize(Function_ fun, const Index_ tasks, const int threads)

scran_blocks.hpp

scran_blocks::VariableWeightParameters

scran_markers::ScoreMarkersBestOptions
Options for score_markers_best() and friends.
Definition score_markers_best.hpp:31

scran_markers::ScoreMarkersBestOptions::compute_cohens_d
bool compute_cohens_d
Definition score_markers_best.hpp:58

scran_markers::ScoreMarkersBestOptions::threshold_delta_mean
std::optional< double > threshold_delta_mean
Definition score_markers_best.hpp:118

scran_markers::ScoreMarkersBestOptions::threshold
double threshold
Definition score_markers_best.hpp:37

scran_markers::ScoreMarkersBestOptions::compute_auc
bool compute_auc
Definition score_markers_best.hpp:63

scran_markers::ScoreMarkersBestOptions::largest_cohens_d
bool largest_cohens_d
Definition score_markers_best.hpp:79

scran_markers::ScoreMarkersBestOptions::block_average_policy
BlockAveragePolicy block_average_policy
Definition score_markers_best.hpp:137

scran_markers::ScoreMarkersBestOptions::compute_delta_mean
bool compute_delta_mean
Definition score_markers_best.hpp:68

scran_markers::ScoreMarkersBestOptions::largest_delta_mean
bool largest_delta_mean
Definition score_markers_best.hpp:91

scran_markers::ScoreMarkersBestOptions::compute_group_detected
bool compute_group_detected
Definition score_markers_best.hpp:53

scran_markers::ScoreMarkersBestOptions::compute_group_mean
bool compute_group_mean
Definition score_markers_best.hpp:48

scran_markers::ScoreMarkersBestOptions::block_quantile
double block_quantile
Definition score_markers_best.hpp:164

scran_markers::ScoreMarkersBestOptions::threshold_cohens_d
std::optional< double > threshold_cohens_d
Definition score_markers_best.hpp:104

scran_markers::ScoreMarkersBestOptions::threshold_delta_detected
std::optional< double > threshold_delta_detected
Definition score_markers_best.hpp:125

scran_markers::ScoreMarkersBestOptions::largest_delta_detected
bool largest_delta_detected
Definition score_markers_best.hpp:97

scran_markers::ScoreMarkersBestOptions::largest_auc
bool largest_auc
Definition score_markers_best.hpp:85

scran_markers::ScoreMarkersBestOptions::block_weight_policy
scran_blocks::WeightPolicy block_weight_policy
Definition score_markers_best.hpp:151

scran_markers::ScoreMarkersBestOptions::threshold_auc
std::optional< double > threshold_auc
Definition score_markers_best.hpp:111

scran_markers::ScoreMarkersBestOptions::num_threads
int num_threads
Definition score_markers_best.hpp:43

scran_markers::ScoreMarkersBestOptions::variable_block_weight_parameters
scran_blocks::VariableWeightParameters variable_block_weight_parameters
Definition score_markers_best.hpp:158

scran_markers::ScoreMarkersBestOptions::compute_delta_detected
bool compute_delta_detected
Definition score_markers_best.hpp:73

scran_markers::ScoreMarkersBestOptions::keep_ties
bool keep_ties
Definition score_markers_best.hpp:130

scran_markers::ScoreMarkersBestResults
Results for score_markers_best() and friends.
Definition score_markers_best.hpp:173

scran_markers::ScoreMarkersBestResults::auc
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > auc
Definition score_markers_best.hpp:208

scran_markers::ScoreMarkersBestResults::delta_mean
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > delta_mean
Definition score_markers_best.hpp:220

scran_markers::ScoreMarkersBestResults::cohens_d
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > cohens_d
Definition score_markers_best.hpp:196

scran_markers::ScoreMarkersBestResults::delta_detected
std::vector< std::vector< std::vector< std::pair< Index_, Stat_ > > > > delta_detected
Definition score_markers_best.hpp:232

scran_markers::ScoreMarkersBestResults::mean
std::vector< std::vector< Stat_ > > mean
Definition score_markers_best.hpp:178

scran_markers::ScoreMarkersBestResults::detected
std::vector< std::vector< Stat_ > > detected
Definition score_markers_best.hpp:184

topicks::TopQueueOptions

topicks::TopQueueOptions::bound
std::optional< Stat_ > bound

topicks::TopQueueOptions::check_nan
bool check_nan

topicks::TopQueueOptions::keep_ties
bool keep_ties

tatami.hpp

topicks.hpp