scran_aggregate
Aggregate expression values across cells
Loading...
Searching...
No Matches
aggregate_across_genes.hpp
Go to the documentation of this file.
1#ifndef SCRAN_AGGREGATE_AGGREGATE_ACROSS_GENES_HPP
2#define SCRAN_AGGREGATE_AGGREGATE_ACROSS_GENES_HPP
3
4#include <algorithm>
5#include <vector>
6#include <unordered_set>
7#include <stdexcept>
8#include <cstddef>
9
10#include "tatami/tatami.hpp"
11#include "tatami_stats/tatami_stats.hpp"
12#include "sanisizer/sanisizer.hpp"
13
14#include "utils.hpp"
15
21namespace scran_aggregate {
22
31 int num_threads = 1;
32
37 bool average = false;
38};
39
44template <typename Sum_>
51 std::vector<Sum_*> sum;
52};
53
58template <typename Sum_>
65 std::vector<std::vector<Sum_> > sum;
66};
67
71namespace aggregate_across_genes_internal {
72
73template<typename Index_, typename Gene_, typename Weight_>
74std::vector<Gene_> create_subset(const std::vector<std::tuple<std::size_t, const Gene_*, const Weight_*> >& gene_sets, const Index_ nrow) {
75 std::unordered_set<Gene_> of_interest;
76 for (const auto& set : gene_sets) {
77 const auto set_size = std::get<0>(set);
78 const auto set_genes = std::get<1>(set);
79 of_interest.insert(set_genes, set_genes + set_size);
80 }
81
82 std::vector<Index_> subset(of_interest.begin(), of_interest.end());
83 if (!subset.empty()) {
84 std::sort(subset.begin(), subset.end());
85 if (subset.front() < 0 || subset.back() >= nrow) {
86 throw std::runtime_error("set indices are out of range");
87 }
88 }
89
90 return subset;
91}
92
93template<typename Index_>
94std::pair<std::vector<Index_>, Index_> create_subset_mapping(const std::vector<Index_>& subset) {
95 const Index_ offset = subset.front();
96 const Index_ span = subset.back() - offset + 1;
98 const auto nsubs = subset.size();
99 for (decltype(I(nsubs)) i = 0; i < nsubs; ++i) {
100 mapping[subset[i] - offset] = i;
101 }
102 return std::make_pair(std::move(mapping), offset);
103}
104
105template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
106void compute_aggregate_by_column(
108 const std::vector<std::tuple<std::size_t, const Gene_*, const Weight_*> >& gene_sets,
109 const AggregateAcrossGenesBuffers<Sum_>& buffers,
110 const AggregateAcrossGenesOptions& options)
111{
112 // Identifying the subset of rows that actually need to be extracted.
113 const tatami::VectorPtr<Index_> subset_of_interest = std::make_shared<std::vector<Index_> >(create_subset<Index_>(gene_sets, p.nrow()));
114 const auto& subset = *subset_of_interest;
115 const Index_ nsubs = subset.size();
116
117 // Creating a mapping back to the gene indices in the subset.
118 const auto num_sets = gene_sets.size();
119 auto remapping = sanisizer::create<std::vector<std::pair<std::vector<Index_>, const Weight_*> > >(num_sets);
120 if (nsubs) {
121 const auto sub_mapping = create_subset_mapping(subset);
122 const auto& mapping = sub_mapping.first;
123 const Gene_ offset = sub_mapping.second;
124
125 for (decltype(I(num_sets)) s = 0; s < num_sets; ++s) {
126 const auto& set = gene_sets[s];
127 const auto set_size = std::get<0>(set);
128 const auto set_genes = std::get<1>(set);
129
130 auto& remapped = remapping[s].first;
131 remapped.reserve(set_size);
132 for (decltype(I(set_size)) g = 0; g < set_size; ++g) {
133 remapped.push_back(mapping[set_genes[g] - offset]);
134 }
135 remapping[s].second = std::get<2>(set);
136 }
137 }
138
139 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
140 // We extract as sparse even if it is dense, as it's just
141 // easier to index from a dense vector.
142 auto ext = tatami::consecutive_extractor<false>(p, false, start, length, subset_of_interest);
144
145 for (Index_ x = start, end = start + length; x < end; ++x) {
146 const auto ptr = ext->fetch(vbuffer.data());
147 for (decltype(I(num_sets)) s = 0; s < num_sets; ++s) {
148 const auto& set = remapping[s];
149
150 Sum_ value = 0;
151 if (set.second) {
152 for (decltype(I(set.first.size())) i = 0, send = set.first.size(); i < send; ++i) {
153 value += ptr[set.first[i]] * set.second[i];
154 }
155 } else {
156 for (const auto ix : set.first) {
157 value += ptr[ix];
158 }
159 }
160
161 buffers.sum[s][x] = value;
162 }
163 }
164
165 }, p.ncol(), options.num_threads);
166}
167
168template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
169void compute_aggregate_by_row(
171 const std::vector<std::tuple<std::size_t, const Gene_*, const Weight_*> >& gene_sets,
172 const AggregateAcrossGenesBuffers<Sum_>& buffers,
173 const AggregateAcrossGenesOptions& options)
174{
175 // Identifying the subset of rows that actually need to be extracted.
176 const auto subset = create_subset<Index_>(gene_sets, p.nrow());
177 const Index_ nsubs = subset.size();
178 const auto sub_oracle = std::make_shared<tatami::FixedViewOracle<Index_> >(subset.data(), nsubs);
179
180 const auto num_sets = gene_sets.size();
182 if (nsubs) {
183 const auto sub_mapping = create_subset_mapping(subset);
184 const auto& mapping = sub_mapping.first;
185 const Gene_ offset = sub_mapping.second;
186
187 for (decltype(I(num_sets)) s = 0; s < num_sets; ++s) {
188 const auto& set = gene_sets[s];
189 const auto set_size = std::get<0>(set);
190 const auto set_genes = std::get<1>(set);
191 const auto set_weights = std::get<2>(set);
192
193 if (set_weights) {
194 for (decltype(I(set_size)) g = 0; g < set_size; ++g) {
195 remapping[mapping[set_genes[g] - offset]].emplace_back(s, set_weights[g]);
196 }
197 } else {
198 for (decltype(I(set_size)) g = 0; g < set_size; ++g) {
199 remapping[mapping[set_genes[g] - offset]].emplace_back(s, 1);
200 }
201 }
202 }
203 }
204
205 tatami::parallelize([&](const int t, const Index_ start, const Index_ length) -> void {
206 auto get_sum = [&](Index_ i) -> Sum_* { return buffers.sum[i]; };
207 tatami_stats::LocalOutputBuffers<Sum_, decltype(I(get_sum))> local_sums(t, num_sets, start, length, std::move(get_sum));
208
209 if (p.sparse()) {
210 auto ext = tatami::new_extractor<true, true>(p, true, sub_oracle, start, length);
213
214 for (Index_ sub = 0; sub < nsubs; ++sub) {
215 const auto range = ext->fetch(vbuffer.data(), ibuffer.data());
216
217 for (const auto& sw : remapping[sub]) {
218 const auto outptr = local_sums.data(sw.first);
219 const auto wt = sw.second;
220 for (Index_ c = 0; c < range.number; ++c) {
221 outptr[range.index[c] - start] += range.value[c] * wt;
222 }
223 }
224 }
225
226 } else {
227 auto ext = tatami::new_extractor<false, true>(&p, true, sub_oracle, start, length);
229
230 for (Index_ sub = 0; sub < nsubs; ++sub) {
231 const auto ptr = ext->fetch(vbuffer.data());
232 for (const auto& sw : remapping[sub]) {
233 const auto outptr = local_sums.data(sw.first);
234 const auto wt = sw.second;
235 for (Index_ cell = 0; cell < length; ++cell) {
236 outptr[cell] += ptr[cell] * wt;
237 }
238 }
239 }
240 }
241
242 local_sums.transfer();
243 }, p.ncol(), options.num_threads);
244}
245
246}
273template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
276 const std::vector<std::tuple<std::size_t, const Gene_*, const Weight_*> >& gene_sets,
278 const AggregateAcrossGenesOptions& options)
279{
280 if (input.prefer_rows()) {
281 aggregate_across_genes_internal::compute_aggregate_by_row(input, gene_sets, buffers, options);
282 } else {
283 aggregate_across_genes_internal::compute_aggregate_by_column(input, gene_sets, buffers, options);
284 }
285
286 if (options.average) {
287 const auto nsets = gene_sets.size();
288 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
289 const Index_ NC = input.ncol();
290 for (Index_ s = start, end = start + length; s < end; ++s) {
291 const auto& set = gene_sets[s];
292 const auto set_size = std::get<0>(set);
293
294 Sum_ denom = 0;
295 const auto set_weights = std::get<2>(set);
296 if (set_weights) {
297 denom = std::accumulate(set_weights, set_weights + set_size, static_cast<Sum_>(0));
298 } else {
299 denom = set_size;
300 }
301
302 const auto current = buffers.sum[s];
303 for (Index_ c = 0; c < NC; ++c) {
304 current[c] /= denom;
305 }
306 }
307 }, nsets, options.num_threads);
308 }
309}
310
330template<typename Sum_ = double, typename Data_, typename Index_, typename Gene_, typename Weight_>
333 const std::vector<std::tuple<std::size_t, const Gene_*, const Weight_*> >& gene_sets,
334 const AggregateAcrossGenesOptions& options)
335{
338
339 const Index_ NC = input.ncol();
340 const auto nsets = gene_sets.size();
341 sanisizer::resize(output.sum, nsets);
342 sanisizer::resize(buffers.sum, nsets);
343
344 for (decltype(I(nsets)) s = 0; s < nsets; ++s) {
346 output.sum[s],
347 NC
348#ifdef SCRAN_AGGREGATE_TEST_INIT
349 , SCRAN_AGGREGATE_TEST_INIT
350#endif
351 );
352 buffers.sum[s] = output.sum[s].data();
353 }
354
355 aggregate_across_genes(input, gene_sets, buffers, options);
356 return output;
357}
358
359}
360
361#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Aggregate single-cell expression values.
Definition aggregate_across_cells.hpp:20
void aggregate_across_genes(const tatami::Matrix< Data_, Index_ > &input, const std::vector< std::tuple< std::size_t, const Gene_ *, const Weight_ * > > &gene_sets, const AggregateAcrossGenesBuffers< Sum_ > &buffers, const AggregateAcrossGenesOptions &options)
Definition aggregate_across_genes.hpp:274
std::shared_ptr< const std::vector< Index_ > > VectorPtr
void parallelize(Function_ fun, Index_ tasks, int threads)
void resize_container_to_Index_size(Container_ &container, Index_ x, Args_ &&... args)
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)
auto new_extractor(const Matrix< Value_, Index_ > &matrix, bool row, MaybeOracle< oracle_, Index_ > oracle, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
Buffers for aggregate_across_genes().
Definition aggregate_across_genes.hpp:45
std::vector< Sum_ * > sum
Definition aggregate_across_genes.hpp:51
Options for aggregate_across_genes().
Definition aggregate_across_genes.hpp:26
bool average
Definition aggregate_across_genes.hpp:37
int num_threads
Definition aggregate_across_genes.hpp:31
Results of aggregate_across_genes().
Definition aggregate_across_genes.hpp:59
std::vector< std::vector< Sum_ > > sum
Definition aggregate_across_genes.hpp:65