scran_aggregate
Aggregate expression values across cells
Loading...
Searching...
No Matches
aggregate_across_genes.hpp
Go to the documentation of this file.
1#ifndef SCRAN_AGGREGATE_AGGREGATE_ACROSS_GENES_HPP
2#define SCRAN_AGGREGATE_AGGREGATE_ACROSS_GENES_HPP
3
4#include <algorithm>
5#include <vector>
6#include <unordered_set>
7#include <stdexcept>
8#include <cstddef>
9
10#include "tatami/tatami.hpp"
11#include "tatami_stats/tatami_stats.hpp"
12#include "sanisizer/sanisizer.hpp"
13
19namespace scran_aggregate {
20
29 int num_threads = 1;
30
35 bool average = false;
36};
37
42template <typename Sum_>
49 std::vector<Sum_*> sum;
50};
51
56template <typename Sum_>
63 std::vector<std::vector<Sum_> > sum;
64};
65
69namespace aggregate_across_genes_internal {
70
71template<typename Index_, typename Gene_, typename Weight_>
72std::vector<Gene_> create_subset(const std::vector<std::tuple<std::size_t, const Gene_*, const Weight_*> >& gene_sets, Index_ nrow) {
73 std::unordered_set<Gene_> of_interest;
74 for (const auto& set : gene_sets) {
75 auto set_size = std::get<0>(set);
76 auto set_genes = std::get<1>(set);
77 of_interest.insert(set_genes, set_genes + set_size);
78 }
79
80 std::vector<Index_> subset(of_interest.begin(), of_interest.end());
81 if (!subset.empty()) {
82 std::sort(subset.begin(), subset.end());
83 if (subset.front() < 0 || subset.back() >= nrow) {
84 throw std::runtime_error("set indices are out of range");
85 }
86 }
87
88 return subset;
89}
90
91template<typename Index_>
92std::pair<std::vector<Index_>, Index_> create_subset_mapping(const std::vector<Index_>& subset) {
93 Index_ offset = subset.front();
94 Index_ span = subset.back() - offset + 1;
96 auto nsubs = subset.size();
97 for (decltype(nsubs) i = 0; i < nsubs; ++i) {
98 mapping[subset[i] - offset] = i;
99 }
100 return std::make_pair(std::move(mapping), offset);
101}
102
103template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
104void compute_aggregate_by_column(
106 const std::vector<std::tuple<std::size_t, const Gene_*, const Weight_*> >& gene_sets,
107 const AggregateAcrossGenesBuffers<Sum_>& buffers,
108 const AggregateAcrossGenesOptions& options)
109{
110 // Identifying the subset of rows that actually need to be extracted.
111 tatami::VectorPtr<Index_> subset_of_interest = std::make_shared<std::vector<Index_> >(create_subset<Index_>(gene_sets, p.nrow()));
112 const auto& subset = *subset_of_interest;
113 Index_ nsubs = subset.size();
114
115 // Creating a mapping back to the gene indices in the subset.
116 auto num_sets = gene_sets.size();
117 auto remapping = sanisizer::create<std::vector<std::pair<std::vector<Index_>, const Weight_*> > >(num_sets);
118 if (nsubs) {
119 auto sub_mapping = create_subset_mapping(subset);
120 const auto& mapping = sub_mapping.first;
121 Gene_ offset = sub_mapping.second;
122
123 for (decltype(num_sets) s = 0; s < num_sets; ++s) {
124 const auto& set = gene_sets[s];
125 auto set_size = std::get<0>(set);
126 auto set_genes = std::get<1>(set);
127
128 auto& remapped = remapping[s].first;
129 remapped.reserve(set_size);
130 for (decltype(set_size) g = 0; g < set_size; ++g) {
131 remapped.push_back(mapping[set_genes[g] - offset]);
132 }
133 remapping[s].second = std::get<2>(set);
134 }
135 }
136
137 tatami::parallelize([&](int, Index_ start, Index_ length) -> void {
138 // We extract as sparse even if it is dense, as it's just
139 // easier to index from a dense vector.
140 auto ext = tatami::consecutive_extractor<false>(&p, false, start, length, subset_of_interest);
142
143 for (Index_ x = start, end = start + length; x < end; ++x) {
144 auto ptr = ext->fetch(vbuffer.data());
145 for (decltype(num_sets) s = 0; s < num_sets; ++s) {
146 const auto& set = remapping[s];
147
148 Sum_ value = 0;
149 if (set.second) {
150 for (decltype(set.first.size()) i = 0, send = set.first.size(); i < send; ++i) {
151 value += ptr[set.first[i]] * set.second[i];
152 }
153 } else {
154 for (auto ix : set.first) {
155 value += ptr[ix];
156 }
157 }
158
159 buffers.sum[s][x] = value;
160 }
161 }
162
163 }, p.ncol(), options.num_threads);
164}
165
166template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
167void compute_aggregate_by_row(
169 const std::vector<std::tuple<std::size_t, const Gene_*, const Weight_*> >& gene_sets,
170 const AggregateAcrossGenesBuffers<Sum_>& buffers,
171 const AggregateAcrossGenesOptions& options)
172{
173 // Identifying the subset of rows that actually need to be extracted.
174 auto subset = create_subset<Index_>(gene_sets, p.nrow());
175 Index_ nsubs = subset.size();
176 auto sub_oracle = std::make_shared<tatami::FixedViewOracle<Index_> >(subset.data(), nsubs);
177
178 auto num_sets = gene_sets.size();
180 if (nsubs) {
181 auto sub_mapping = create_subset_mapping(subset);
182 const auto& mapping = sub_mapping.first;
183 Gene_ offset = sub_mapping.second;
184
185 for (decltype(num_sets) s = 0; s < num_sets; ++s) {
186 const auto& set = gene_sets[s];
187 auto set_size = std::get<0>(set);
188 auto set_genes = std::get<1>(set);
189 auto set_weights = std::get<2>(set);
190
191 if (set_weights) {
192 for (decltype(set_size) g = 0; g < set_size; ++g) {
193 remapping[mapping[set_genes[g] - offset]].emplace_back(s, set_weights[g]);
194 }
195 } else {
196 for (decltype(set_size) g = 0; g < set_size; ++g) {
197 remapping[mapping[set_genes[g] - offset]].emplace_back(s, 1);
198 }
199 }
200 }
201 }
202
203 tatami::parallelize([&](int t, Index_ start, Index_ length) -> void {
204 auto get_sum = [&](Index_ i) -> Sum_* { return buffers.sum[i]; };
205 tatami_stats::LocalOutputBuffers<Sum_, decltype(get_sum)> local_sums(t, num_sets, start, length, std::move(get_sum));
206
207 if (p.sparse()) {
208 auto ext = tatami::new_extractor<true, true>(&p, true, sub_oracle, start, length);
211
212 for (Index_ sub = 0; sub < nsubs; ++sub) {
213 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
214
215 for (const auto& sw : remapping[sub]) {
216 auto outptr = local_sums.data(sw.first);
217 auto wt = sw.second;
218 for (Index_ c = 0; c < range.number; ++c) {
219 outptr[range.index[c] - start] += range.value[c] * wt;
220 }
221 }
222 }
223
224 } else {
225 auto ext = tatami::new_extractor<false, true>(&p, true, sub_oracle, start, length);
227
228 for (Index_ sub = 0; sub < nsubs; ++sub) {
229 auto ptr = ext->fetch(vbuffer.data());
230 for (const auto& sw : remapping[sub]) {
231 auto outptr = local_sums.data(sw.first);
232 auto wt = sw.second;
233 for (Index_ cell = 0; cell < length; ++cell) {
234 outptr[cell] += ptr[cell] * wt;
235 }
236 }
237 }
238 }
239
240 local_sums.transfer();
241 }, p.ncol(), options.num_threads);
242}
243
244}
269template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
272 const std::vector<std::tuple<std::size_t, const Gene_*, const Weight_*> >& gene_sets,
274 const AggregateAcrossGenesOptions& options)
275{
276 if (input.prefer_rows()) {
277 aggregate_across_genes_internal::compute_aggregate_by_row(input, gene_sets, buffers, options);
278 } else {
279 aggregate_across_genes_internal::compute_aggregate_by_column(input, gene_sets, buffers, options);
280 }
281
282 if (options.average) {
283 auto nsets = gene_sets.size();
284 tatami::parallelize([&](int, Index_ start, Index_ length) -> void {
285 Index_ NC = input.ncol();
286 for (Index_ s = start, end = start + length; s < end; ++s) {
287 const auto& set = gene_sets[s];
288 auto set_size = std::get<0>(set);
289
290 Sum_ denom = 0;
291 auto set_weights = std::get<2>(set);
292 if (set_weights) {
293 denom = std::accumulate(set_weights, set_weights + set_size, static_cast<Sum_>(0));
294 } else {
295 denom = set_size;
296 }
297
298 auto current = buffers.sum[s];
299 for (Index_ c = 0; c < NC; ++c) {
300 current[c] /= denom;
301 }
302 }
303 }, nsets, options.num_threads);
304 }
305}
306
326template<typename Sum_ = double, typename Data_, typename Index_, typename Gene_, typename Weight_>
329 const std::vector<std::tuple<std::size_t, const Gene_*, const Weight_*> >& gene_sets,
330 const AggregateAcrossGenesOptions& options)
331{
334
335 Index_ NC = input.ncol();
336 auto nsets = gene_sets.size();
337 output.sum.resize(sanisizer::cast<decltype(output.sum.size())>(nsets));
338 buffers.sum.resize(sanisizer::cast<decltype(buffers.sum.size())>(nsets));
339
340 for (decltype(nsets) s = 0; s < nsets; ++s) {
342 output.sum[s],
343 NC
344#ifdef SCRAN_AGGREGATE_TEST_INIT
345 , SCRAN_AGGREGATE_TEST_INIT
346#endif
347 );
348 buffers.sum[s] = output.sum[s].data();
349 }
350
351 aggregate_across_genes(input, gene_sets, buffers, options);
352 return output;
353}
354
355}
356
357#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Aggregate single-cell expression values.
Definition aggregate_across_cells.hpp:18
void aggregate_across_genes(const tatami::Matrix< Data_, Index_ > &input, const std::vector< std::tuple< std::size_t, const Gene_ *, const Weight_ * > > &gene_sets, const AggregateAcrossGenesBuffers< Sum_ > &buffers, const AggregateAcrossGenesOptions &options)
Definition aggregate_across_genes.hpp:270
std::shared_ptr< const std::vector< Index_ > > VectorPtr
void parallelize(Function_ fun, Index_ tasks, int threads)
void resize_container_to_Index_size(Container_ &container, Index_ x, Args_ &&... args)
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)
auto new_extractor(const Matrix< Value_, Index_ > &matrix, bool row, MaybeOracle< oracle_, Index_ > oracle, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
Buffers for aggregate_across_genes().
Definition aggregate_across_genes.hpp:43
std::vector< Sum_ * > sum
Definition aggregate_across_genes.hpp:49
Options for aggregate_across_genes().
Definition aggregate_across_genes.hpp:24
bool average
Definition aggregate_across_genes.hpp:35
int num_threads
Definition aggregate_across_genes.hpp:29
Results of aggregate_across_genes().
Definition aggregate_across_genes.hpp:57
std::vector< std::vector< Sum_ > > sum
Definition aggregate_across_genes.hpp:63