scran_aggregate/aggregate__across__genes_8hpp_source.html

#ifndef SCRAN_AGGREGATE_AGGREGATE_ACROSS_GENES_HPP

#define SCRAN_AGGREGATE_AGGREGATE_ACROSS_GENES_HPP


#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <numeric>
#include <stdexcept>
#include <tuple>
#include <unordered_set>
#include <utility>
#include <vector>

#include "tatami/tatami.hpp"
#include "tatami_stats/tatami_stats.hpp"


namespace scran_aggregate {


/**
 * @brief Options for `aggregate_across_genes()`.
 */
struct AggregateAcrossGenesOptions {
    /**
     * Thread count that is forwarded to every `tatami::parallelize()` call
     * made while aggregating.
     */
    int num_threads{1};

    /**
     * When true, each set's per-column sum is divided by the sum of the set's
     * weights (or by the number of genes in the set if no weights were given).
     * When false, the raw (weighted) sums are reported.
     */
    bool average{false};
};


/**
 * @brief Caller-owned output buffers for `aggregate_across_genes()`.
 *
 * @tparam Sum_ Numeric type of the aggregated values.
 */
template<typename Sum_>
struct AggregateAcrossGenesBuffers {
    /**
     * One pointer per gene set. The aggregation routines write the value for
     * column `c` of set `s` to `sum[s][c]`, so each pointer must refer to an
     * array with one element per column of the input matrix.
     */
    std::vector<Sum_*> sum;
};


/**
 * @brief Owning results of `aggregate_across_genes()`.
 *
 * @tparam Sum_ Numeric type of the aggregated values.
 */
template<typename Sum_>
struct AggregateAcrossGenesResults {
    /**
     * One inner vector per gene set; the result-returning overload of
     * `aggregate_across_genes()` sizes each inner vector to the number of
     * columns in the input matrix.
     */
    std::vector<std::vector<Sum_>> sum;
};


namespace aggregate_across_genes_internal {


/**
 * Collect the unique row indices referenced by any gene set.
 *
 * @tparam Index_ Integer type of the matrix row index (type of the returned indices).
 * @tparam Gene_ Integer type used to store the gene indices inside each set.
 * @tparam Weight_ Type of the per-gene weights (unused here).
 *
 * @param gene_sets One tuple per set: (number of genes, pointer to the gene
 * row indices, pointer to the weights).
 * @param nrow Number of rows; every gene index must lie in `[0, nrow)`.
 *
 * @return Sorted vector of the distinct row indices used by at least one set.
 * Empty if no set contains any gene.
 *
 * @throws std::runtime_error if any gene index is negative or `>= nrow`.
 */
template<typename Index_, typename Gene_, typename Weight_>
std::vector<Index_> create_subset(const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets, Index_ nrow) {
    // De-duplicate in the gene's own type so that distinct genes stay distinct.
    std::unordered_set<Gene_> of_interest;
    for (const auto& set : gene_sets) {
        auto set_size = std::get<0>(set);
        auto set_genes = std::get<1>(set);
        of_interest.insert(set_genes, set_genes + set_size);
    }

    std::vector<Index_> subset;
    subset.reserve(of_interest.size());
    for (auto gene : of_interest) {
        // Validate BEFORE converting to Index_: a narrowing or sign-changing
        // conversion could otherwise wrap an invalid gene into a valid-looking row.
        // Once both sides are known to be non-negative, comparing them as
        // uintmax_t avoids any signed/unsigned mismatch between Gene_ and Index_.
        if (gene < 0 || nrow <= 0 || static_cast<std::uintmax_t>(gene) >= static_cast<std::uintmax_t>(nrow)) {
            throw std::runtime_error("set indices are out of range");
        }
        subset.push_back(static_cast<Index_>(gene));
    }

    std::sort(subset.begin(), subset.end());
    return subset;
}


/**
 * Build a lookup table from row index to position within `subset`.
 *
 * @tparam Index_ Integer type of the row indices.
 *
 * @param subset Sorted row indices (as produced by `create_subset()`).
 *
 * @return A pair `(mapping, offset)` such that, for every `i`,
 * `mapping[subset[i] - offset] == i`. `offset` is the smallest index in
 * `subset`, so `mapping` only spans `[subset.front(), subset.back()]`.
 * Entries of `mapping` for rows that are not in `subset` are zero-initialized
 * and carry no meaning. If `subset` is empty, an empty mapping and a zero
 * offset are returned.
 */
template<typename Index_>
std::pair<std::vector<Index_>, Index_> create_subset_mapping(const std::vector<Index_>& subset) {
    // front()/back() are undefined on an empty vector, so bail out early.
    if (subset.empty()) {
        return std::make_pair(std::vector<Index_>(), static_cast<Index_>(0));
    }

    // Anchoring the table at the smallest index avoids allocating slots for
    // all the rows below it.
    const Index_ offset = subset.front();
    const size_t span = static_cast<size_t>(subset.back() - offset) + 1;

    std::vector<Index_> mapping(span);
    const size_t nsubs = subset.size();
    for (size_t i = 0; i < nsubs; ++i) {
        mapping[subset[i] - offset] = static_cast<Index_>(i);
    }

    return std::make_pair(std::move(mapping), offset);
}


/**
 * Compute the (weighted) sum of each gene set by iterating over columns.
 *
 * For every column, the rows used by at least one gene set are fetched and
 * each set's total is accumulated from that fetched vector.
 *
 * @param p Input matrix.
 * @param gene_sets One tuple per set: (number of genes, pointer to gene row
 * indices, pointer to weights or null for an unweighted sum).
 * @param buffers Output; `buffers.sum[s][c]` receives the total of set `s` in column `c`.
 * @param options Only `num_threads` is used here.
 */
template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
void compute_aggregate_by_column(
    const tatami::Matrix<Data_, Index_>& p,
    const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
    const AggregateAcrossGenesBuffers<Sum_>& buffers,
    const AggregateAcrossGenesOptions& options)
{
    // Only the rows referenced by some set are ever extracted.
    tatami::VectorPtr<Index_> subset_of_interest = std::make_shared<std::vector<Index_> >(create_subset<Index_>(gene_sets, p.nrow()));
    const auto& rows = *subset_of_interest;
    const size_t nsubs = rows.size();
    const size_t num_sets = gene_sets.size();

    // For each set: positions of its genes within 'rows', plus its weight pointer.
    std::vector<std::pair<std::vector<Index_>, const Weight_*> > remapping(num_sets);
    if (nsubs != 0) {
        auto sub_mapping = create_subset_mapping(rows);
        const auto& lookup = sub_mapping.first;
        Gene_ offset = sub_mapping.second;

        for (size_t s = 0; s < num_sets; ++s) {
            const auto& current = gene_sets[s];
            const auto count = std::get<0>(current);
            const auto genes = std::get<1>(current);

            auto& target = remapping[s];
            target.first.reserve(count);
            for (size_t g = 0; g < count; ++g) {
                target.first.push_back(lookup[genes[g] - offset]);
            }
            target.second = std::get<2>(current);
        }
    }

    tatami::parallelize([&](size_t, Index_ start, Index_ length) -> void {
        // The fetched pointer is indexed directly by position within 'rows',
        // which is what the remapped positions refer to.
        // NOTE(review): the <false> template argument presumably requests a
        // dense extractor regardless of the matrix's sparsity - confirm against tatami.
        auto ext = tatami::consecutive_extractor<false>(&p, false, start, length, subset_of_interest);
        std::vector<Data_> vbuffer(nsubs);

        const Index_ last = start + length;
        for (Index_ col = start; col < last; ++col) {
            auto values = ext->fetch(vbuffer.data());

            for (size_t s = 0; s < num_sets; ++s) {
                const auto& positions = remapping[s].first;
                const auto weights = remapping[s].second;

                Sum_ total = 0;
                if (!weights) {
                    // Unweighted: plain sum over the set's genes.
                    for (auto ix : positions) {
                        total += values[ix];
                    }
                } else {
                    // Weighted: weights are parallel to the set's gene list.
                    const size_t npos = positions.size();
                    for (size_t i = 0; i < npos; ++i) {
                        total += values[positions[i]] * weights[i];
                    }
                }

                buffers.sum[s][col] = total;
            }
        }
    }, p.ncol(), options.num_threads);
}


/**
 * Compute the (weighted) sum of each gene set by iterating over rows.
 *
 * Each row used by at least one gene set is fetched once per thread and its
 * values are added, scaled by the relevant weight, into the running totals
 * of every set that contains that row.
 *
 * @param p Input matrix.
 * @param gene_sets One tuple per set: (number of genes, pointer to gene row
 * indices, pointer to weights or null, in which case a weight of 1 is used).
 * @param buffers Output; one pointer per set, indexed by column.
 * @param options Only `num_threads` is used here.
 */
template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
void compute_aggregate_by_row(
    const tatami::Matrix<Data_, Index_>& p,
    const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
    const AggregateAcrossGenesBuffers<Sum_>& buffers,
    const AggregateAcrossGenesOptions& options)
{
    // Only the rows referenced by some set are ever extracted. The oracle
    // views 'rows' through a raw pointer, so 'rows' must stay alive and
    // unmodified for the rest of this function.
    auto rows = create_subset<Index_>(gene_sets, p.nrow());
    size_t nsubs = rows.size();
    auto sub_oracle = std::make_shared<tatami::FixedViewOracle<Index_> >(rows.data(), nsubs);

    // For each extracted row: the (set, weight) pairs it contributes to.
    const size_t num_sets = gene_sets.size();
    std::vector<std::vector<std::pair<size_t, Weight_> > > remapping(nsubs);
    if (nsubs != 0) {
        auto sub_mapping = create_subset_mapping(rows);
        const auto& lookup = sub_mapping.first;
        Gene_ offset = sub_mapping.second;

        for (size_t s = 0; s < num_sets; ++s) {
            const auto& current = gene_sets[s];
            const auto count = std::get<0>(current);
            const auto genes = std::get<1>(current);
            const auto weights = std::get<2>(current);

            for (size_t g = 0; g < count; ++g) {
                auto& contributions = remapping[lookup[genes[g] - offset]];
                if (weights) {
                    contributions.emplace_back(s, weights[g]);
                } else {
                    contributions.emplace_back(s, 1);
                }
            }
        }
    }

    tatami::parallelize([&](size_t t, Index_ start, Index_ length) -> void {
        // NOTE(review): LocalOutputBuffers presumably provides per-thread
        // scratch space that transfer() copies back into 'buffers.sum' -
        // confirm against tatami_stats.
        auto get_sum = [&](Index_ i) -> Sum_* { return buffers.sum[i]; };
        tatami_stats::LocalOutputBuffers<Sum_, decltype(get_sum)> local_sums(t, num_sets, start, length, std::move(get_sum));

        if (!p.sparse()) {
            auto ext = tatami::new_extractor<false, true>(&p, true, sub_oracle, start, length);
            std::vector<Data_> vbuffer(length);

            for (size_t r = 0; r < nsubs; ++r) {
                auto values = ext->fetch(vbuffer.data());
                for (const auto& contribution : remapping[r]) {
                    auto dest = local_sums.data(contribution.first);
                    auto weight = contribution.second;
                    // Dense values are already relative to 'start'.
                    for (Index_ cell = 0; cell < length; ++cell) {
                        dest[cell] += values[cell] * weight;
                    }
                }
            }

        } else {
            auto ext = tatami::new_extractor<true, true>(&p, true, sub_oracle, start, length);
            std::vector<Data_> vbuffer(length);
            std::vector<Index_> ibuffer(length);

            for (size_t r = 0; r < nsubs; ++r) {
                auto range = ext->fetch(vbuffer.data(), ibuffer.data());
                for (const auto& contribution : remapping[r]) {
                    auto dest = local_sums.data(contribution.first);
                    auto weight = contribution.second;
                    // Reported column indices are shifted by 'start' to
                    // address the local buffer.
                    for (Index_ c = 0; c < range.number; ++c) {
                        dest[range.index[c] - start] += range.value[c] * weight;
                    }
                }
            }
        }

        local_sums.transfer();
    }, p.ncol(), options.num_threads);
}


}

/**
 * Aggregate expression values across gene sets, writing into caller-supplied buffers.
 *
 * The per-column (weighted) sum of each gene set is computed by row or by
 * column, depending on `input.prefer_rows()`. If `options.average` is set,
 * every sum is then divided by the sum of the set's weights, or by the number
 * of genes in the set when no weights were supplied.
 *
 * @tparam Data_ Type of the matrix values.
 * @tparam Index_ Integer type of the matrix indices.
 * @tparam Gene_ Integer type of the gene indices in each set.
 * @tparam Weight_ Type of the per-gene weights.
 * @tparam Sum_ Type of the aggregated output.
 *
 * @param input Matrix to aggregate.
 * @param gene_sets One tuple per set: (number of genes, pointer to gene row
 * indices, pointer to weights or null for unweighted aggregation).
 * @param buffers Output; `buffers.sum[s]` must point to an array with one
 * element per column of `input`.
 * @param options Further options.
 */
template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
void aggregate_across_genes(
    const tatami::Matrix<Data_, Index_>& input,
    const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
    const AggregateAcrossGenesBuffers<Sum_>& buffers,
    const AggregateAcrossGenesOptions& options)
{
    // Iterate along whichever dimension the matrix prefers.
    if (!input.prefer_rows()) {
        aggregate_across_genes_internal::compute_aggregate_by_column(input, gene_sets, buffers, options);
    } else {
        aggregate_across_genes_internal::compute_aggregate_by_row(input, gene_sets, buffers, options);
    }

    if (!options.average) {
        return;
    }

    // Turn each set's sums into averages, parallelizing over sets.
    size_t nsets = gene_sets.size();
    tatami::parallelize([&](int, size_t start, size_t length) -> void {
        const size_t NC = input.ncol();
        const size_t last = start + length;

        for (size_t s = start; s < last; ++s) {
            const auto& current = gene_sets[s];
            const auto count = std::get<0>(current);
            const auto weights = std::get<2>(current);

            // Denominator: total weight if weighted, otherwise the set size.
            Sum_ denom = 0;
            if (!weights) {
                denom = count;
            } else {
                denom = std::accumulate(weights, weights + count, static_cast<Sum_>(0));
            }

            auto totals = buffers.sum[s];
            for (size_t c = 0; c < NC; ++c) {
                totals[c] /= denom;
            }
        }
    }, nsets, options.num_threads);
}


/**
 * Aggregate expression values across gene sets, returning newly allocated results.
 *
 * Allocates one output vector per gene set (one element per column of
 * `input`), points a set of buffers at them and delegates to the
 * buffer-based overload of `aggregate_across_genes()`.
 *
 * @tparam Sum_ Type of the aggregated output, `double` by default.
 * @tparam Data_ Type of the matrix values.
 * @tparam Index_ Integer type of the matrix indices.
 * @tparam Gene_ Integer type of the gene indices in each set.
 * @tparam Weight_ Type of the per-gene weights.
 *
 * @param input Matrix to aggregate.
 * @param gene_sets One tuple per set: (number of genes, pointer to gene row
 * indices, pointer to weights or null for unweighted aggregation).
 * @param options Further options.
 *
 * @return Results holding, for each set, the aggregated value in every column.
 */
template<typename Sum_ = double, typename Data_, typename Index_, typename Gene_, typename Weight_>
AggregateAcrossGenesResults<Sum_> aggregate_across_genes(
    const tatami::Matrix<Data_, Index_>& input,
    const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
    const AggregateAcrossGenesOptions& options)
{
    const size_t NC = input.ncol();
    const size_t nsets = gene_sets.size();

    AggregateAcrossGenesResults<Sum_> output;
    output.sum.resize(nsets);

    AggregateAcrossGenesBuffers<Sum_> buffers;
    buffers.sum.resize(nsets);

    for (size_t s = 0; s < nsets; ++s) {
        auto& storage = output.sum[s];
#ifdef SCRAN_AGGREGATE_TEST_INIT
        // Test builds can fill the output with a sentinel value.
        storage.resize(NC, SCRAN_AGGREGATE_TEST_INIT);
#else
        storage.resize(NC);
#endif
        buffers.sum[s] = storage.data();
    }

    aggregate_across_genes(input, gene_sets, buffers, options);
    return output;
}


}


#endif

tatami::Matrix

tatami::Matrix::ncol
virtual Index_ ncol() const=0

tatami::Matrix::nrow
virtual Index_ nrow() const=0

tatami::Matrix::prefer_rows
virtual bool prefer_rows() const=0

tatami::Matrix::sparse
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0

scran_aggregate
Aggregate single-cell expression values.
Definition aggregate_across_cells.hpp:15

scran_aggregate::aggregate_across_genes
void aggregate_across_genes(const tatami::Matrix< Data_, Index_ > &input, const std::vector< std::tuple< size_t, const Gene_ *, const Weight_ * > > &gene_sets, const AggregateAcrossGenesBuffers< Sum_ > &buffers, const AggregateAcrossGenesOptions &options)
Definition aggregate_across_genes.hpp:268

tatami::VectorPtr
std::shared_ptr< const std::vector< Index_ > > VectorPtr

tatami::parallelize
void parallelize(Function_ fun, Index_ tasks, int threads)

tatami::new_extractor
auto new_extractor(const Matrix< Value_, Index_ > &matrix, bool row, MaybeOracle< oracle_, Index_ > oracle, Args_ &&... args)

tatami::consecutive_extractor
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)

scran_aggregate::AggregateAcrossGenesBuffers
Buffers for aggregate_across_genes().
Definition aggregate_across_genes.hpp:41

scran_aggregate::AggregateAcrossGenesBuffers::sum
std::vector< Sum_ * > sum
Definition aggregate_across_genes.hpp:47

scran_aggregate::AggregateAcrossGenesOptions
Options for aggregate_across_genes().
Definition aggregate_across_genes.hpp:22

scran_aggregate::AggregateAcrossGenesOptions::average
bool average
Definition aggregate_across_genes.hpp:33

scran_aggregate::AggregateAcrossGenesOptions::num_threads
int num_threads
Definition aggregate_across_genes.hpp:27

scran_aggregate::AggregateAcrossGenesResults
Results of aggregate_across_genes().
Definition aggregate_across_genes.hpp:55

scran_aggregate::AggregateAcrossGenesResults::sum
std::vector< std::vector< Sum_ > > sum
Definition aggregate_across_genes.hpp:61

tatami.hpp