scran_aggregate
Aggregate expression values across cells
Loading...
Searching...
No Matches
aggregate_across_genes.hpp
Go to the documentation of this file.
1#ifndef SCRAN_AGGREGATE_AGGREGATE_ACROSS_GENES_HPP
2#define SCRAN_AGGREGATE_AGGREGATE_ACROSS_GENES_HPP
3
4#include <algorithm>
5#include <vector>
6#include <unordered_set>
7#include <stdexcept>
8
9#include "tatami/tatami.hpp"
10
16namespace scran_aggregate {
17
26 int num_threads = 1;
27
32 bool average = false;
33};
34
39template <typename Sum_>
46 std::vector<Sum_*> sum;
47};
48
53template <typename Sum_>
60 std::vector<std::vector<Sum_> > sum;
61};
62
66namespace aggregate_across_genes_internal {
67
68template<typename Index_, typename Gene_, typename Weight_>
69std::vector<Gene_> create_subset(const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets, Index_ nrow) {
70 std::unordered_set<Gene_> of_interest;
71 for (const auto& set : gene_sets) {
72 auto set_size = std::get<0>(set);
73 auto set_genes = std::get<1>(set);
74 of_interest.insert(set_genes, set_genes + set_size);
75 }
76
77 std::vector<Index_> subset(of_interest.begin(), of_interest.end());
78 if (!subset.empty()) {
79 std::sort(subset.begin(), subset.end());
80 if (subset.front() < 0 || subset.back() >= nrow) {
81 throw std::runtime_error("set indices are out of range");
82 }
83 }
84
85 return subset;
86}
87
88template<typename Index_>
89std::pair<std::vector<Index_>, Index_> create_subset_mapping(const std::vector<Index_>& subset) {
90 Index_ offset = 0;
91 size_t span = subset.back() - offset + 1;
92 std::vector<Index_> mapping(span);
93 size_t nsubs = subset.size();
94 for (size_t i = 0; i < nsubs; ++i) {
95 mapping[subset[i] - offset] = i;
96 }
97 return std::make_pair(std::move(mapping), offset);
98}
99
100template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
101void compute_aggregate_by_column(
103 const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
104 const AggregateAcrossGenesBuffers<Sum_>& buffers,
105 const AggregateAcrossGenesOptions& options)
106{
107 // Identifying the subset of rows that actually need to be extracted.
108 tatami::VectorPtr<Index_> subset_of_interest = std::make_shared<std::vector<Index_> >(create_subset<Index_>(gene_sets, p.nrow()));
109 const auto& subset = *subset_of_interest;
110 size_t nsubs = subset.size();
111
112 // Creating a mapping back to the gene indices in the subset.
113 const size_t num_sets = gene_sets.size();
114 std::vector<std::pair<std::vector<Index_>, const Weight_*> > remapping(num_sets);
115 if (nsubs) {
116 auto sub_mapping = create_subset_mapping(subset);
117 const auto& mapping = sub_mapping.first;
118 Gene_ offset = sub_mapping.second;
119
120 for (size_t s = 0; s < num_sets; ++s) {
121 const auto& set = gene_sets[s];
122 auto set_size = std::get<0>(set);
123 auto set_genes = std::get<1>(set);
124
125 auto& remapped = remapping[s].first;
126 remapped.reserve(set_size);
127 for (size_t g = 0; g < set_size; ++g) {
128 remapped.push_back(mapping[set_genes[g] - offset]);
129 }
130 remapping[s].second = std::get<2>(set);
131 }
132 }
133
134 tatami::parallelize([&](size_t, Index_ start, Index_ length) {
135 // We extract as sparse even if it is dense, as it's just
136 // easier to index from a dense vector.
137 auto ext = tatami::consecutive_extractor<false>(&p, false, start, length, subset_of_interest);
138 std::vector<Data_> vbuffer(nsubs);
139
140 for (Index_ x = start, end = start + length; x < end; ++x) {
141 auto ptr = ext->fetch(vbuffer.data());
142 for (size_t s = 0; s < num_sets; ++s) {
143 const auto& set = remapping[s];
144
145 Sum_ value = 0;
146 if (set.second) {
147 for (size_t i = 0, send = set.first.size(); i < send; ++i) {
148 value += ptr[set.first[i]] * set.second[i];
149 }
150 } else {
151 for (auto ix : set.first) {
152 value += ptr[ix];
153 }
154 }
155
156 buffers.sum[s][x] = value;
157 }
158 }
159
160 }, p.ncol(), options.num_threads);
161}
162
163template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
164void compute_aggregate_by_row(
166 const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
167 const AggregateAcrossGenesBuffers<Sum_>& buffers,
168 const AggregateAcrossGenesOptions& options)
169{
170 // Identifying the subset of rows that actually need to be extracted.
171 auto subset = create_subset<Index_>(gene_sets, p.nrow());
172 size_t nsubs = subset.size();
173 auto sub_oracle = std::make_shared<tatami::FixedViewOracle<Index_> >(subset.data(), nsubs);
174
175 const size_t num_sets = gene_sets.size();
176 std::vector<std::vector<std::pair<size_t, Weight_> > > remapping(nsubs);
177 if (nsubs) {
178 auto sub_mapping = create_subset_mapping(subset);
179 const auto& mapping = sub_mapping.first;
180 Gene_ offset = sub_mapping.second;
181
182 for (size_t s = 0; s < num_sets; ++s) {
183 const auto& set = gene_sets[s];
184 auto set_size = std::get<0>(set);
185 auto set_genes = std::get<1>(set);
186 auto set_weights = std::get<2>(set);
187
188 if (set_weights) {
189 for (size_t g = 0; g < set_size; ++g) {
190 remapping[mapping[set_genes[g] - offset]].emplace_back(s, set_weights[g]);
191 }
192 } else {
193 for (size_t g = 0; g < set_size; ++g) {
194 remapping[mapping[set_genes[g] - offset]].emplace_back(s, 1);
195 }
196 }
197 }
198 }
199
200 // Zeroing all of the buffers before iterating.
201 Index_ NC = p.ncol();
202 for (size_t s = 0; s < num_sets; ++s) {
203 std::fill_n(buffers.sum[s], NC, static_cast<Sum_>(0));
204 }
205
206 if (p.sparse()) {
207 tatami::parallelize([&](size_t, Index_ start, Index_ length) {
208 auto ext = tatami::new_extractor<true, true>(&p, true, sub_oracle, start, length);
209 std::vector<Data_> vbuffer(length);
210 std::vector<Index_> ibuffer(length);
211
212 for (size_t sub = 0; sub < nsubs; ++sub) {
213 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
214 for (const auto& sw : remapping[sub]) {
215 auto outptr = buffers.sum[sw.first];
216 auto wt = sw.second;
217 for (Index_ c = 0; c < range.number; ++c) {
218 outptr[range.index[c]] += range.value[c] * wt;
219 }
220 }
221 }
222 }, NC, options.num_threads);
223
224 } else {
225 tatami::parallelize([&](size_t, Index_ start, Index_ length) {
226 auto ext = tatami::new_extractor<false, true>(&p, true, sub_oracle, start, length);
227 std::vector<Data_> vbuffer(length);
228
229 for (size_t sub = 0; sub < nsubs; ++sub) {
230 auto ptr = ext->fetch(vbuffer.data());
231 for (const auto& sw : remapping[sub]) {
232 auto outptr = buffers.sum[sw.first];
233 auto wt = sw.second;
234 for (Index_ cell = 0; cell < length; ++cell) {
235 outptr[cell + start] += ptr[cell] * wt;
236 }
237 }
238 }
239 }, NC, options.num_threads);
240 }
241}
242
243}
268template<typename Data_, typename Index_, typename Gene_, typename Weight_, typename Sum_>
271 const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
273 const AggregateAcrossGenesOptions& options)
274{
275 if (input.prefer_rows()) {
276 aggregate_across_genes_internal::compute_aggregate_by_row(input, gene_sets, buffers, options);
277 } else {
278 aggregate_across_genes_internal::compute_aggregate_by_column(input, gene_sets, buffers, options);
279 }
280
281 if (options.average) {
282 size_t nsets = gene_sets.size();
283 tatami::parallelize([&](int, size_t start, size_t length) {
284 size_t NC = input.ncol();
285 for (size_t s = start, end = start + length; s < end; ++s) {
286 const auto& set = gene_sets[s];
287 auto set_size = std::get<0>(set);
288
289 Sum_ denom = 0;
290 auto set_weights = std::get<2>(set);
291 if (set_weights) {
292 denom = std::accumulate(set_weights, set_weights + set_size, static_cast<Sum_>(0));
293 } else {
294 denom = set_size;
295 }
296
297 auto current = buffers.sum[s];
298 for (size_t c = 0; c < NC; ++c) {
299 current[c] /= denom;
300 }
301 }
302 }, nsets, options.num_threads);
303 }
304}
305
325template<typename Sum_ = double, typename Data_, typename Index_, typename Gene_, typename Weight_>
328 const std::vector<std::tuple<size_t, const Gene_*, const Weight_*> >& gene_sets,
329 const AggregateAcrossGenesOptions& options)
330{
333
334 size_t NC = input.ncol();
335 size_t nsets = gene_sets.size();
336 output.sum.resize(nsets);
337 buffers.sum.resize(nsets);
338
339 for (size_t s = 0; s < nsets; ++s) {
340 output.sum[s].resize(NC);
341 buffers.sum[s] = output.sum[s].data();
342 }
343
344 aggregate_across_genes(input, gene_sets, buffers, options);
345 return output;
346}
347
348}
349
350#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Aggregate single-cell expression values.
Definition aggregate_across_cells.hpp:13
void aggregate_across_genes(const tatami::Matrix< Data_, Index_ > &input, const std::vector< std::tuple< size_t, const Gene_ *, const Weight_ * > > &gene_sets, const AggregateAcrossGenesBuffers< Sum_ > &buffers, const AggregateAcrossGenesOptions &options)
Definition aggregate_across_genes.hpp:269
void parallelize(Function_ fun, Index_ tasks, int threads)
std::shared_ptr< const std::vector< Index_ > > VectorPtr
auto consecutive_extractor(const Matrix< Value_, Index_ > *mat, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
Buffers for aggregate_across_genes().
Definition aggregate_across_genes.hpp:40
std::vector< Sum_ * > sum
Definition aggregate_across_genes.hpp:46
Options for aggregate_across_genes().
Definition aggregate_across_genes.hpp:21
bool average
Definition aggregate_across_genes.hpp:32
int num_threads
Definition aggregate_across_genes.hpp:26
Results of aggregate_across_genes().
Definition aggregate_across_genes.hpp:54
std::vector< std::vector< Sum_ > > sum
Definition aggregate_across_genes.hpp:60