Loading [MathJax]/jax/output/HTML-CSS/config.js
scran_aggregate
Aggregate expression values across cells
All Classes Namespaces Files Functions Variables Typedefs Enumerations Friends Pages
aggregate_across_genes.hpp
Go to the documentation of this file.
1#ifndef SCRAN_AGGREGATE_AGGREGATE_ACROSS_GENES_HPP
2#define SCRAN_AGGREGATE_AGGREGATE_ACROSS_GENES_HPP
3
4#include <algorithm>
5#include <vector>
6#include <unordered_set>
7#include <stdexcept>
8
9#include "tatami/tatami.hpp"
10#include "tatami_stats/tatami_stats.hpp"
11
17namespace scran_aggregate {
18
27 int num_threads = 1;
28
33 bool average = false;
34};
35
40template <typename Sum_>
47 std::vector<Sum_*> sum;
48};
49
54template <typename Sum_>
61 std::vector<std::vector<Sum_> > sum;
62};
63
67namespace aggregate_across_genes_internal {
68
69template<typename Index_, typename Gene_, typename Weight_>
70std::vector<Gene_> create_subset(const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets, Index_ nrow) {
71 std::unordered_set<Gene_> of_interest;
72 for (const auto& set : gene_sets) {
73 auto set_size = std::get<0>(set);
74 auto set_genes = std::get<1>(set);
75 of_interest.insert(set_genes, set_genes + set_size);
76 }
77
78 std::vector<Index_> subset(of_interest.begin(), of_interest.end());
79 if (!subset.empty()) {
80 std::sort(subset.begin(), subset.end());
81 if (subset.front() < 0 || subset.back() >= nrow) {
82 throw std::runtime_error("set indices are out of range");
83 }
84 }
85
86 return subset;
87}
88
89template<typename Index_>
90std::pair<std::vector<Index_>, Index_> create_subset_mapping(const std::vector<Index_>& subset) {
91 Index_ offset = 0;
92 size_t span = subset.back() - offset + 1;
93 std::vector<Index_> mapping(span);
94 size_t nsubs = subset.size();
95 for (size_t i = 0; i < nsubs; ++i) {
96 mapping[subset[i] - offset] = i;
97 }
98 return std::make_pair(std::move(mapping), offset);
99}
100
101template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
102void compute_aggregate_by_column(
104 const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
105 const AggregateAcrossGenesBuffers<Sum_>& buffers,
106 const AggregateAcrossGenesOptions& options)
107{
108 // Identifying the subset of rows that actually need to be extracted.
109 tatami::VectorPtr<Index_> subset_of_interest = std::make_shared<std::vector<Index_> >(create_subset<Index_>(gene_sets, p.nrow()));
110 const auto& subset = *subset_of_interest;
111 size_t nsubs = subset.size();
112
113 // Creating a mapping back to the gene indices in the subset.
114 const size_t num_sets = gene_sets.size();
115 std::vector<std::pair<std::vector<Index_>, const Weight_*> > remapping(num_sets);
116 if (nsubs) {
117 auto sub_mapping = create_subset_mapping(subset);
118 const auto& mapping = sub_mapping.first;
119 Gene_ offset = sub_mapping.second;
120
121 for (size_t s = 0; s < num_sets; ++s) {
122 const auto& set = gene_sets[s];
123 auto set_size = std::get<0>(set);
124 auto set_genes = std::get<1>(set);
125
126 auto& remapped = remapping[s].first;
127 remapped.reserve(set_size);
128 for (size_t g = 0; g < set_size; ++g) {
129 remapped.push_back(mapping[set_genes[g] - offset]);
130 }
131 remapping[s].second = std::get<2>(set);
132 }
133 }
134
135 tatami::parallelize([&](size_t, Index_ start, Index_ length) -> void {
136 // We extract as sparse even if it is dense, as it's just
137 // easier to index from a dense vector.
138 auto ext = tatami::consecutive_extractor<false>(&p, false, start, length, subset_of_interest);
139 std::vector<Data_> vbuffer(nsubs);
140
141 for (Index_ x = start, end = start + length; x < end; ++x) {
142 auto ptr = ext->fetch(vbuffer.data());
143 for (size_t s = 0; s < num_sets; ++s) {
144 const auto& set = remapping[s];
145
146 Sum_ value = 0;
147 if (set.second) {
148 for (size_t i = 0, send = set.first.size(); i < send; ++i) {
149 value += ptr[set.first[i]] * set.second[i];
150 }
151 } else {
152 for (auto ix : set.first) {
153 value += ptr[ix];
154 }
155 }
156
157 buffers.sum[s][x] = value;
158 }
159 }
160
161 }, p.ncol(), options.num_threads);
162}
163
164template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
165void compute_aggregate_by_row(
167 const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
168 const AggregateAcrossGenesBuffers<Sum_>& buffers,
169 const AggregateAcrossGenesOptions& options)
170{
171 // Identifying the subset of rows that actually need to be extracted.
172 auto subset = create_subset<Index_>(gene_sets, p.nrow());
173 size_t nsubs = subset.size();
174 auto sub_oracle = std::make_shared<tatami::FixedViewOracle<Index_> >(subset.data(), nsubs);
175
176 const size_t num_sets = gene_sets.size();
177 std::vector<std::vector<std::pair<size_t, Weight_> > > remapping(nsubs);
178 if (nsubs) {
179 auto sub_mapping = create_subset_mapping(subset);
180 const auto& mapping = sub_mapping.first;
181 Gene_ offset = sub_mapping.second;
182
183 for (size_t s = 0; s < num_sets; ++s) {
184 const auto& set = gene_sets[s];
185 auto set_size = std::get<0>(set);
186 auto set_genes = std::get<1>(set);
187 auto set_weights = std::get<2>(set);
188
189 if (set_weights) {
190 for (size_t g = 0; g < set_size; ++g) {
191 remapping[mapping[set_genes[g] - offset]].emplace_back(s, set_weights[g]);
192 }
193 } else {
194 for (size_t g = 0; g < set_size; ++g) {
195 remapping[mapping[set_genes[g] - offset]].emplace_back(s, 1);
196 }
197 }
198 }
199 }
200
201 tatami::parallelize([&](size_t t, Index_ start, Index_ length) -> void {
202 auto get_sum = [&](Index_ i) -> Sum_* { return buffers.sum[i]; };
203 tatami_stats::LocalOutputBuffers<Sum_, decltype(get_sum)> local_sums(t, num_sets, start, length, std::move(get_sum));
204
205 if (p.sparse()) {
206 auto ext = tatami::new_extractor<true, true>(&p, true, sub_oracle, start, length);
207 std::vector<Data_> vbuffer(length);
208 std::vector<Index_> ibuffer(length);
209
210 for (size_t sub = 0; sub < nsubs; ++sub) {
211 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
212
213 for (const auto& sw : remapping[sub]) {
214 auto outptr = local_sums.data(sw.first);
215 auto wt = sw.second;
216 for (Index_ c = 0; c < range.number; ++c) {
217 outptr[range.index[c] - start] += range.value[c] * wt;
218 }
219 }
220 }
221
222 } else {
223 auto ext = tatami::new_extractor<false, true>(&p, true, sub_oracle, start, length);
224 std::vector<Data_> vbuffer(length);
225
226 for (size_t sub = 0; sub < nsubs; ++sub) {
227 auto ptr = ext->fetch(vbuffer.data());
228 for (const auto& sw : remapping[sub]) {
229 auto outptr = local_sums.data(sw.first);
230 auto wt = sw.second;
231 for (Index_ cell = 0; cell < length; ++cell) {
232 outptr[cell] += ptr[cell] * wt;
233 }
234 }
235 }
236 }
237
238 local_sums.transfer();
239 }, p.ncol(), options.num_threads);
240}
241
242}
267template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
270 const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
272 const AggregateAcrossGenesOptions& options)
273{
274 if (input.prefer_rows()) {
275 aggregate_across_genes_internal::compute_aggregate_by_row(input, gene_sets, buffers, options);
276 } else {
277 aggregate_across_genes_internal::compute_aggregate_by_column(input, gene_sets, buffers, options);
278 }
279
280 if (options.average) {
281 size_t nsets = gene_sets.size();
282 tatami::parallelize([&](int, size_t start, size_t length) -> void {
283 size_t NC = input.ncol();
284 for (size_t s = start, end = start + length; s < end; ++s) {
285 const auto& set = gene_sets[s];
286 auto set_size = std::get<0>(set);
287
288 Sum_ denom = 0;
289 auto set_weights = std::get<2>(set);
290 if (set_weights) {
291 denom = std::accumulate(set_weights, set_weights + set_size, static_cast<Sum_>(0));
292 } else {
293 denom = set_size;
294 }
295
296 auto current = buffers.sum[s];
297 for (size_t c = 0; c < NC; ++c) {
298 current[c] /= denom;
299 }
300 }
301 }, nsets, options.num_threads);
302 }
303}
304
324template<typename Sum_ = double, typename Data_, typename Index_, typename Gene_, typename Weight_>
327 const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
328 const AggregateAcrossGenesOptions& options)
329{
332
333 size_t NC = input.ncol();
334 size_t nsets = gene_sets.size();
335 output.sum.resize(nsets);
336 buffers.sum.resize(nsets);
337
338 for (size_t s = 0; s < nsets; ++s) {
339 output.sum[s].resize(NC
340#ifdef SCRAN_AGGREGATE_TEST_INIT
341 , SCRAN_AGGREGATE_TEST_INIT
342#endif
343 );
344 buffers.sum[s] = output.sum[s].data();
345 }
346
347 aggregate_across_genes(input, gene_sets, buffers, options);
348 return output;
349}
350
351}
352
353#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Aggregate single-cell expression values.
Definition aggregate_across_cells.hpp:15
void aggregate_across_genes(const tatami::Matrix< Data_, Index_ > &input, const std::vector< std::tuple< size_t, const Gene_ *, const Weight_ * > > &gene_sets, const AggregateAcrossGenesBuffers< Sum_ > &buffers, const AggregateAcrossGenesOptions &options)
Definition aggregate_across_genes.hpp:268
std::shared_ptr< const std::vector< Index_ > > VectorPtr
void parallelize(Function_ fun, Index_ tasks, int threads)
auto new_extractor(const Matrix< Value_, Index_ > *ptr, bool row, MaybeOracle< oracle_, Index_ > oracle, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > *mat, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
Buffers for aggregate_across_genes().
Definition aggregate_across_genes.hpp:41
std::vector< Sum_ * > sum
Definition aggregate_across_genes.hpp:47
Options for aggregate_across_genes().
Definition aggregate_across_genes.hpp:22
bool average
Definition aggregate_across_genes.hpp:33
int num_threads
Definition aggregate_across_genes.hpp:27
Results of aggregate_across_genes().
Definition aggregate_across_genes.hpp:55
std::vector< std::vector< Sum_ > > sum
Definition aggregate_across_genes.hpp:61