scran_variances
Model per-gene variance in expression
Loading...
Searching...
No Matches
model_gene_variances.hpp
Go to the documentation of this file.
1#ifndef SCRAN_MODEL_GENE_VARIANCES_H
2#define SCRAN_MODEL_GENE_VARIANCES_H
3
4#include "tatami/tatami.hpp"
5#include "tatami_stats/tatami_stats.hpp"
7
9
10#include <algorithm>
11#include <vector>
12#include <limits>
13
19namespace scran_variances {
20
55
64template<typename Stat_>
86
91template<typename Stat_>
96 ModelGeneVariancesResults() = default;
97
106 std::vector<Stat_> means;
107
111 std::vector<Stat_> variances;
112
116 std::vector<Stat_> fitted;
117
121 std::vector<Stat_> residuals;
122};
123
128template<typename Stat_>
134 std::vector<ModelGeneVariancesBuffers<Stat_> > per_block;
135
141};
142
147template<typename Stat_>
153
154 ModelGeneVariancesBlockedResults(size_t ngenes, size_t nblocks, bool compute_average) : average(compute_average ? ngenes : 0) {
155 per_block.reserve(nblocks);
156 for (size_t b = 0; b < nblocks; ++b) {
157 per_block.emplace_back(ngenes);
158 }
159 }
167 std::vector<ModelGeneVariancesResults<Stat_> > per_block;
168
174};
175
179namespace internal {
180
181template<typename Value_, typename Index_, typename Stat_, typename Block_>
184 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
185 const Block_* block,
186 const std::vector<Index_>& block_size,
187 int num_threads)
188{
189 bool blocked = (block != NULL);
190 auto nblocks = block_size.size();
191 auto NR = mat.nrow(), NC = mat.ncol();
192
193 tatami::parallelize([&](int, Index_ start, Index_ length) -> void {
194 std::vector<Stat_> tmp_means(blocked ? nblocks : 0);
195 std::vector<Stat_> tmp_vars(blocked ? nblocks : 0);
196
197 std::vector<Value_> buffer(NC);
198 auto ext = tatami::consecutive_extractor<false>(&mat, true, start, length);
199 for (Index_ r = start, end = start + length; r < end; ++r) {
200 auto ptr = ext->fetch(buffer.data());
201
202 if (blocked) {
203 tatami_stats::grouped_variances::direct(
204 ptr,
205 NC,
206 block,
207 nblocks,
208 block_size.data(),
209 tmp_means.data(),
210 tmp_vars.data(),
211 false,
212 static_cast<Index_*>(NULL)
213 );
214 for (size_t b = 0; b < nblocks; ++b) {
215 buffers[b].means[r] = tmp_means[b];
216 buffers[b].variances[r] = tmp_vars[b];
217 }
218 } else {
219 auto stat = tatami_stats::variances::direct(ptr, NC, false);
220 buffers[0].means[r] = stat.first;
221 buffers[0].variances[r] = stat.second;
222 }
223 }
224 }, NR, num_threads);
225}
226
227template<typename Value_, typename Index_, typename Stat_, typename Block_>
230 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
231 const Block_* block,
232 const std::vector<Index_>& block_size,
233 int num_threads)
234{
235 bool blocked = (block != NULL);
236 auto nblocks = block_size.size();
237 auto NR = mat.nrow(), NC = mat.ncol();
238
239 tatami::parallelize([&](int, Index_ start, Index_ length) -> void {
240 std::vector<Stat_> tmp_means(nblocks);
241 std::vector<Stat_> tmp_vars(nblocks);
242 std::vector<Index_> tmp_nzero(nblocks);
243
244 std::vector<Value_> vbuffer(NC);
245 std::vector<Index_> ibuffer(NC);
248 auto ext = tatami::consecutive_extractor<true>(&mat, true, start, length, opt);
249
250 for (Index_ r = start, end = start + length; r < end; ++r) {
251 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
252
253 if (blocked) {
254 tatami_stats::grouped_variances::direct(
255 range.value,
256 range.index,
257 range.number,
258 block,
259 nblocks,
260 block_size.data(),
261 tmp_means.data(),
262 tmp_vars.data(),
263 tmp_nzero.data(),
264 false,
265 static_cast<Index_*>(NULL)
266 );
267 for (size_t b = 0; b < nblocks; ++b) {
268 buffers[b].means[r] = tmp_means[b];
269 buffers[b].variances[r] = tmp_vars[b];
270 }
271 } else {
272 auto stat = tatami_stats::variances::direct(range.value, range.number, NC, false);
273 buffers[0].means[r] = stat.first;
274 buffers[0].variances[r] = stat.second;
275 }
276 }
277 }, NR, num_threads);
278}
279
280template<typename Value_, typename Index_, typename Stat_, typename Block_>
283 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
284 const Block_* block,
285 const std::vector<Index_>& block_size,
286 int num_threads)
287{
288 bool blocked = (block != NULL);
289 auto nblocks = block_size.size();
290 auto NR = mat.nrow(), NC = mat.ncol();
291
293 std::vector<Value_> buffer(length);
294 auto ext = tatami::consecutive_extractor<false>(&mat, false, 0, NC, start, length);
295
296 std::vector<tatami_stats::LocalOutputBuffer<Stat_> > local_var_output;
297 local_var_output.reserve(nblocks);
298 std::vector<tatami_stats::LocalOutputBuffer<Stat_> > local_mean_output;
299 local_mean_output.reserve(nblocks);
300 std::vector<tatami_stats::variances::RunningDense<Stat_, Value_, Index_> > runners;
301 runners.reserve(nblocks);
302
303 for (size_t b = 0; b < nblocks; ++b) {
304 local_var_output.emplace_back(thread, start, length, buffers[b].variances);
305 local_mean_output.emplace_back(thread, start, length, buffers[b].means);
306 runners.emplace_back(length, local_mean_output.back().data(), local_var_output.back().data(), false);
307 }
308
309 if (blocked) {
310 for (Index_ c = 0; c < NC; ++c) {
311 auto ptr = ext->fetch(buffer.data());
312 runners[block[c]].add(ptr);
313 }
314 } else {
315 for (Index_ c = 0; c < NC; ++c) {
316 auto ptr = ext->fetch(buffer.data());
317 runners[0].add(ptr);
318 }
319 }
320
321 for (size_t b = 0; b < nblocks; ++b) {
322 runners[b].finish();
323 local_var_output[b].transfer();
324 local_mean_output[b].transfer();
325 }
326 }, NR, num_threads);
327}
328
329template<typename Value_, typename Index_, typename Stat_, typename Block_>
332 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
333 const Block_* block,
334 const std::vector<Index_>& block_size,
335 int num_threads)
336{
337 bool blocked = (block != NULL);
338 auto nblocks = block_size.size();
339 auto NR = mat.nrow(), NC = mat.ncol();
340 std::vector<std::vector<Index_> > nonzeros(nblocks, std::vector<Index_>(NR));
341
343 std::vector<Value_> vbuffer(length);
344 std::vector<Index_> ibuffer(length);
347 auto ext = tatami::consecutive_extractor<true>(&mat, false, 0, NC, start, length, opt);
348
349 std::vector<tatami_stats::LocalOutputBuffer<Stat_> > local_var_output;
350 local_var_output.reserve(nblocks);
351 std::vector<tatami_stats::LocalOutputBuffer<Stat_> > local_mean_output;
352 local_mean_output.reserve(nblocks);
353 std::vector<tatami_stats::variances::RunningSparse<Stat_, Value_, Index_> > runners;
354 runners.reserve(nblocks);
355
356 for (size_t b = 0; b < nblocks; ++b) {
357 local_var_output.emplace_back(thread, start, length, buffers[b].variances);
358 local_mean_output.emplace_back(thread, start, length, buffers[b].means);
359 runners.emplace_back(length, local_mean_output.back().data(), local_var_output.back().data(), false, start);
360 }
361
362 if (blocked) {
363 for (Index_ c = 0; c < NC; ++c) {
364 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
365 runners[block[c]].add(range.value, range.index, range.number);
366 }
367 } else {
368 for (Index_ c = 0; c < NC; ++c) {
369 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
370 runners[0].add(range.value, range.index, range.number);
371 }
372 }
373
374 for (size_t b = 0; b < nblocks; ++b) {
375 runners[b].finish();
376 local_var_output[b].transfer();
377 local_mean_output[b].transfer();
378 }
379 }, NR, num_threads);
380}
381
382template<typename Value_, typename Index_, typename Stat_, typename Block_>
385 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
386 const Block_* block,
387 const std::vector<Index_>& block_size,
388 int num_threads)
389{
390 if (mat.prefer_rows()) {
391 if (mat.sparse()) {
393 } else {
395 }
396 } else {
397 if (mat.sparse()) {
399 } else {
401 }
402 }
403}
404
405template<typename Index_, typename Stat_, class Function_>
406void compute_average(
407 Index_ ngenes,
408 const std::vector<ModelGeneVariancesBuffers<Stat_> >& per_block,
409 const std::vector<Index_>& block_size,
410 const std::vector<Stat_>& block_weights,
413 std::vector<Stat_*>& tmp_pointers,
414 std::vector<Stat_>& tmp_weights,
415 Stat_* output)
416{
417 if (!output) {
418 return;
419 }
420
421 tmp_pointers.clear();
422 tmp_weights.clear();
423 for (size_t b = 0, nblocks = per_block.size(); b < nblocks; ++b) {
424 if (block_size[b] < min_size) { // skip blocks with insufficient cells.
425 continue;
426 }
427 tmp_weights.push_back(block_weights[b]);
428 tmp_pointers.push_back(fun(per_block[b]));
429 }
430
432}
433
434}
462template<typename Value_, typename Index_, typename Block_, typename Stat_>
465 const Block_* block,
468{
469 Index_ NR = mat.nrow(), NC = mat.ncol();
470 std::vector<Index_> block_size;
471
472 if (block) {
473 block_size = tatami_stats::tabulate_groups(block, NC);
474 internal::compute_variances(mat, buffers.per_block, block, block_size, options.num_threads);
475 } else {
476 block_size.push_back(NC); // everything is one big block.
477 internal::compute_variances(mat, buffers.per_block, block, block_size, options.num_threads);
478 }
479 size_t nblocks = block_size.size();
480
482 auto fopt = options.fit_variance_trend_options;
483 fopt.num_threads = options.num_threads;
484 for (size_t b = 0; b < nblocks; ++b) {
485 const auto& current = buffers.per_block[b];
486 if (block_size[b] >= 2) {
487 fit_variance_trend(NR, current.means, current.variances, current.fitted, current.residuals, work, fopt);
488 } else {
489 std::fill_n(current.fitted, NR, std::numeric_limits<double>::quiet_NaN());
490 std::fill_n(current.residuals, NR, std::numeric_limits<double>::quiet_NaN());
491 }
492 }
493
494 auto ave_means = buffers.average.means;
495 auto ave_variances = buffers.average.variances;
496 auto ave_fitted = buffers.average.fitted;
497 auto ave_residuals = buffers.average.residuals;
498
500 auto block_weight = scran_blocks::compute_weights<Stat_>(block_size, options.block_weight_policy, options.variable_block_weight_parameters);
501
502 std::vector<Stat_*> tmp_pointers;
503 std::vector<Stat_> tmp_weights;
504 tmp_pointers.reserve(nblocks);
505 tmp_weights.reserve(nblocks);
506
507 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
508 /* min_size = */ 1, // skip blocks without enough cells to compute the mean.
509 [](const auto& x) -> Stat_* { return x.means; }, tmp_pointers, tmp_weights, ave_means);
510
511 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
512 /* min_size = */ 2, // skip blocks without enough cells to compute the variance.
513 [](const auto& x) -> Stat_* { return x.variances; }, tmp_pointers, tmp_weights, ave_variances);
514
515 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
516 /* min_size = */ 2, // ditto.
517 [](const auto& x) -> Stat_* { return x.fitted; }, tmp_pointers, tmp_weights, ave_fitted);
518
519 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
520 /* min_size = */ 2, // ditto.
521 [](const auto& x) -> Stat_* { return x.residuals; }, tmp_pointers, tmp_weights, ave_residuals);
522 }
523}
524
541template<typename Value_, typename Index_, typename Stat_>
543 ModelGeneVariancesBuffers<Stat_> buffers, // yes, the lack of a const ref here is deliberate, we need to move it into bbuffers anyway.
545
547 bbuffers.per_block.emplace_back(std::move(buffers));
548
549 bbuffers.average.means = NULL;
550 bbuffers.average.variances = NULL;
551 bbuffers.average.fitted = NULL;
552 bbuffers.average.residuals = NULL;
553
555}
556
570template<typename Stat_ = double, typename Value_, typename Index_>
573
575 buffers.means = output.means.data();
576 buffers.variances = output.variances.data();
577 buffers.fitted = output.fitted.data();
578 buffers.residuals = output.residuals.data();
579
581 return output;
582}
583
601template<typename Stat_ = double, typename Value_, typename Index_, typename Block_>
603 size_t nblocks = (block ? tatami_stats::total_groups(block, mat.ncol()) : 1);
605
607 buffers.per_block.resize(nblocks);
608 for (size_t b = 0; b < nblocks; ++b) {
609 auto& current = buffers.per_block[b];
610 current.means = output.per_block[b].means.data();
611 current.variances = output.per_block[b].variances.data();
612 current.fitted = output.per_block[b].fitted.data();
613 current.residuals = output.per_block[b].residuals.data();
614 }
615
616 if (!options.compute_average) {
617 buffers.average.means = NULL;
618 buffers.average.variances = NULL;
619 buffers.average.fitted = NULL;
620 buffers.average.residuals = NULL;
621 } else {
622 buffers.average.means = output.average.means.data();
623 buffers.average.variances = output.average.variances.data();
624 buffers.average.fitted = output.average.fitted.data();
625 buffers.average.residuals = output.average.residuals.data();
626 }
627
629 return output;
630}
631
632}
633
634#endif
Fit a mean-variance trend to log-count data.
void average_vectors_weighted(size_t n, std::vector< Stat_ * > in, const Weight_ *w, Output_ *out, bool skip_nan)
Variance modelling for single-cell expression data.
Definition choose_highly_variable_genes.hpp:14
std::vector< Index_ > choose_highly_variable_genes_index(Index_ n, const Stat_ *statistic, const ChooseHighlyVariableGenesOptions &options)
Definition choose_highly_variable_genes.hpp:247
void fit_variance_trend(size_t n, const Float_ *mean, const Float_ *variance, Float_ *fitted, Float_ *residuals, FitVarianceTrendWorkspace< Float_ > &workspace, const FitVarianceTrendOptions &options)
Definition fit_variance_trend.hpp:120
void model_gene_variances_blocked(const tatami::Matrix< Value_, Index_ > &mat, const Block_ *block, const ModelGeneVariancesBlockedBuffers< Stat_ > &buffers, const ModelGeneVariancesOptions &options)
Definition model_gene_variances.hpp:463
void model_gene_variances(const tatami::Matrix< Value_, Index_ > &mat, ModelGeneVariancesBuffers< Stat_ > buffers, const ModelGeneVariancesOptions &options)
Definition model_gene_variances.hpp:542
void parallelize(Function_ fun, Index_ tasks, int threads)
Options for fit_variance_trend().
Definition fit_variance_trend.hpp:19
Workspace for fit_variance_trend().
Definition fit_variance_trend.hpp:80
Buffers for model_gene_variances_blocked().
Definition model_gene_variances.hpp:129
ModelGeneVariancesBuffers< Stat_ > average
Definition model_gene_variances.hpp:140
std::vector< ModelGeneVariancesBuffers< Stat_ > > per_block
Definition model_gene_variances.hpp:134
Results of model_gene_variances_blocked().
Definition model_gene_variances.hpp:148
std::vector< ModelGeneVariancesResults< Stat_ > > per_block
Definition model_gene_variances.hpp:167
ModelGeneVariancesResults< Stat_ > average
Definition model_gene_variances.hpp:173
Buffers for model_gene_variances() and friends.
Definition model_gene_variances.hpp:65
Stat_ * means
Definition model_gene_variances.hpp:69
Stat_ * residuals
Definition model_gene_variances.hpp:84
Stat_ * variances
Definition model_gene_variances.hpp:74
Stat_ * fitted
Definition model_gene_variances.hpp:79
Options for model_gene_variances() and friends.
Definition model_gene_variances.hpp:24
FitVarianceTrendOptions fit_variance_trend_options
Definition model_gene_variances.hpp:28
bool compute_average
Definition model_gene_variances.hpp:47
scran_blocks::VariableWeightParameters variable_block_weight_parameters
Definition model_gene_variances.hpp:41
int num_threads
Definition model_gene_variances.hpp:53
scran_blocks::WeightPolicy block_weight_policy
Definition model_gene_variances.hpp:34
Results of model_gene_variances().
Definition model_gene_variances.hpp:92
std::vector< Stat_ > fitted
Definition model_gene_variances.hpp:116
std::vector< Stat_ > means
Definition model_gene_variances.hpp:106
std::vector< Stat_ > residuals
Definition model_gene_variances.hpp:121
std::vector< Stat_ > variances
Definition model_gene_variances.hpp:111
bool sparse_ordered_index