scran_variances
Model per-gene variance in expression
Loading...
Searching...
No Matches
model_gene_variances.hpp
Go to the documentation of this file.
1#ifndef SCRAN_MODEL_GENE_VARIANCES_H
2#define SCRAN_MODEL_GENE_VARIANCES_H
3
4#include <algorithm>
5#include <vector>
6#include <limits>
7#include <cstddef>
8
9#include "tatami/tatami.hpp"
10#include "tatami_stats/tatami_stats.hpp"
12#include "sanisizer/sanisizer.hpp"
13
15#include "utils.hpp"
16
22namespace scran_variances {
23
64
73template<typename Stat_>
78 Stat_* means;
79
83 Stat_* variances;
84
88 Stat_* fitted;
89
93 Stat_* residuals;
94};
95
100template<typename Stat_>
105 ModelGeneVariancesResults() = default;
106
107 template<typename Ngenes_>
108 ModelGeneVariancesResults(const Ngenes_ ngenes) :
109 means(sanisizer::cast<decltype(I(means.size()))>(ngenes)
110#ifdef SCRAN_VARIANCES_TEST_INIT
111 , SCRAN_VARIANCES_TEST_INIT
112#endif
113 ),
114 variances(sanisizer::cast<decltype(I(variances.size()))>(ngenes)
115#ifdef SCRAN_VARIANCES_TEST_INIT
116 , SCRAN_VARIANCES_TEST_INIT
117#endif
118 ),
119 fitted(sanisizer::cast<decltype(I(fitted.size()))>(ngenes)
120#ifdef SCRAN_VARIANCES_TEST_INIT
121 , SCRAN_VARIANCES_TEST_INIT
122#endif
123 ),
124 residuals(sanisizer::cast<decltype(I(residuals.size()))>(ngenes)
125#ifdef SCRAN_VARIANCES_TEST_INIT
126 , SCRAN_VARIANCES_TEST_INIT
127#endif
128 )
129 {}
137 std::vector<Stat_> means;
138
142 std::vector<Stat_> variances;
143
147 std::vector<Stat_> fitted;
148
152 std::vector<Stat_> residuals;
153};
154
159template<typename Stat_>
165 std::vector<ModelGeneVariancesBuffers<Stat_> > per_block;
166
172};
173
178template<typename Stat_>
184
185 template<typename Ngenes_, typename Nblocks_>
186 ModelGeneVariancesBlockedResults(Ngenes_ ngenes, Nblocks_ nblocks, const bool compute_average) : average(compute_average ? ngenes : 0) {
187 per_block.reserve(nblocks);
188 for (decltype(I(nblocks)) b = 0; b < nblocks; ++b) {
189 per_block.emplace_back(ngenes);
190 }
191 }
199 std::vector<ModelGeneVariancesResults<Stat_> > per_block;
200
206};
207
211namespace internal {
212
213template<typename Value_, typename Index_, typename Stat_, typename Block_>
214void compute_variances_dense_row(
216 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
217 const Block_* const block,
218 const std::vector<Index_>& block_size,
219 const int num_threads)
220{
221 const bool blocked = (block != NULL);
222 const auto nblocks = block_size.size();
223 const auto NR = mat.nrow(), NC = mat.ncol();
224
225 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
226 auto tmp_means = sanisizer::create<std::vector<Stat_> >(blocked ? nblocks : 0);
227 auto tmp_vars = sanisizer::create<std::vector<Stat_> >(blocked ? nblocks : 0);
228
230 auto ext = tatami::consecutive_extractor<false>(mat, true, start, length);
231 for (Index_ r = start, end = start + length; r < end; ++r) {
232 auto ptr = ext->fetch(buffer.data());
233
234 if (blocked) {
235 tatami_stats::grouped_variances::direct(
236 ptr,
237 NC,
238 block,
239 nblocks,
240 block_size.data(),
241 tmp_means.data(),
242 tmp_vars.data(),
243 false,
244 static_cast<Index_*>(NULL)
245 );
246 for (decltype(I(nblocks)) b = 0; b < nblocks; ++b) {
247 buffers[b].means[r] = tmp_means[b];
248 buffers[b].variances[r] = tmp_vars[b];
249 }
250 } else {
251 const auto stat = tatami_stats::variances::direct(ptr, NC, false);
252 buffers[0].means[r] = stat.first;
253 buffers[0].variances[r] = stat.second;
254 }
255 }
256 }, NR, num_threads);
257}
258
259template<typename Value_, typename Index_, typename Stat_, typename Block_>
260void compute_variances_sparse_row(
262 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
263 const Block_* const block,
264 const std::vector<Index_>& block_size,
265 const int num_threads)
266{
267 const bool blocked = (block != NULL);
268 const auto nblocks = block_size.size();
269 const auto NR = mat.nrow(), NC = mat.ncol();
270
271 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
272 auto tmp_means = sanisizer::create<std::vector<Stat_> >(nblocks);
273 auto tmp_vars = sanisizer::create<std::vector<Stat_> >(nblocks);
274 auto tmp_nzero = sanisizer::create<std::vector<Index_> >(nblocks);
275
278 auto ext = tatami::consecutive_extractor<true>(mat, true, start, length, [&]{
279 tatami::Options opt;
280 opt.sparse_ordered_index = false;
281 return opt;
282 }());
283
284 for (Index_ r = start, end = start + length; r < end; ++r) {
285 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
286
287 if (blocked) {
288 tatami_stats::grouped_variances::direct(
289 range.value,
290 range.index,
291 range.number,
292 block,
293 nblocks,
294 block_size.data(),
295 tmp_means.data(),
296 tmp_vars.data(),
297 tmp_nzero.data(),
298 false,
299 static_cast<Index_*>(NULL)
300 );
301 for (decltype(I(nblocks)) b = 0; b < nblocks; ++b) {
302 buffers[b].means[r] = tmp_means[b];
303 buffers[b].variances[r] = tmp_vars[b];
304 }
305 } else {
306 const auto stat = tatami_stats::variances::direct(range.value, range.number, NC, false);
307 buffers[0].means[r] = stat.first;
308 buffers[0].variances[r] = stat.second;
309 }
310 }
311 }, NR, num_threads);
312}
313
314template<typename Value_, typename Index_, typename Stat_, typename Block_>
315void compute_variances_dense_column(
317 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
318 const Block_* const block,
319 const std::vector<Index_>& block_size,
320 const int num_threads)
321{
322 const bool blocked = (block != NULL);
323 const auto nblocks = block_size.size();
324 const auto NR = mat.nrow(), NC = mat.ncol();
325
326 tatami::parallelize([&](const int thread, const Index_ start, const Index_ length) -> void {
328 auto ext = tatami::consecutive_extractor<false>(mat, false, static_cast<Index_>(0), NC, start, length);
329
330 auto get_var = [&](Index_ b) -> Stat_* { return buffers[b].variances; };
331 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_var)> local_vars(thread, nblocks, start, length, std::move(get_var));
332 auto get_mean = [&](Index_ b) -> Stat_* { return buffers[b].means; };
333 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_mean)> local_means(thread, nblocks, start, length, std::move(get_mean));
334
335 std::vector<tatami_stats::variances::RunningDense<Stat_, Value_, Index_> > runners;
336 runners.reserve(nblocks);
337 for (decltype(I(nblocks)) b = 0; b < nblocks; ++b) {
338 runners.emplace_back(length, local_means.data(b), local_vars.data(b), false);
339 }
340
341 if (blocked) {
342 for (decltype(I(NC)) c = 0; c < NC; ++c) {
343 auto ptr = ext->fetch(buffer.data());
344 runners[block[c]].add(ptr);
345 }
346 } else {
347 for (decltype(I(NC)) c = 0; c < NC; ++c) {
348 auto ptr = ext->fetch(buffer.data());
349 runners[0].add(ptr);
350 }
351 }
352
353 for (decltype(I(nblocks)) b = 0; b < nblocks; ++b) {
354 runners[b].finish();
355 }
356 local_vars.transfer();
357 local_means.transfer();
358 }, NR, num_threads);
359}
360
361template<typename Value_, typename Index_, typename Stat_, typename Block_>
362void compute_variances_sparse_column(
364 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
365 const Block_* const block,
366 const std::vector<Index_>& block_size,
367 const int num_threads)
368{
369 const bool blocked = (block != NULL);
370 const auto nblocks = block_size.size();
371 const auto NR = mat.nrow(), NC = mat.ncol();
372 auto nonzeros = sanisizer::create<std::vector<std::vector<Index_> > >(
373 nblocks,
375 );
376
377 tatami::parallelize([&](const int thread, const Index_ start, const Index_ length) -> void {
380 auto ext = tatami::consecutive_extractor<true>(mat, false, static_cast<Index_>(0), NC, start, length, [&]{
381 tatami::Options opt;
382 opt.sparse_ordered_index = false;
383 return opt;
384 }());
385
386 auto get_var = [&](Index_ b) -> Stat_* { return buffers[b].variances; };
387 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_var)> local_vars(thread, nblocks, start, length, std::move(get_var));
388 auto get_mean = [&](Index_ b) -> Stat_* { return buffers[b].means; };
389 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_mean)> local_means(thread, nblocks, start, length, std::move(get_mean));
390
391 std::vector<tatami_stats::variances::RunningSparse<Stat_, Value_, Index_> > runners;
392 runners.reserve(nblocks);
393 for (decltype(I(nblocks)) b = 0; b < nblocks; ++b) {
394 runners.emplace_back(length, local_means.data(b), local_vars.data(b), false, start);
395 }
396
397 if (blocked) {
398 for (decltype(I(NC)) c = 0; c < NC; ++c) {
399 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
400 runners[block[c]].add(range.value, range.index, range.number);
401 }
402 } else {
403 for (decltype(I(NC)) c = 0; c < NC; ++c) {
404 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
405 runners[0].add(range.value, range.index, range.number);
406 }
407 }
408
409 for (decltype(I(nblocks)) b = 0; b < nblocks; ++b) {
410 runners[b].finish();
411 }
412 local_vars.transfer();
413 local_means.transfer();
414 }, NR, num_threads);
415}
416
417template<typename Value_, typename Index_, typename Stat_, typename Block_>
418void compute_variances(
420 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
421 const Block_* const block,
422 const std::vector<Index_>& block_size,
423 const int num_threads)
424{
425 if (mat.prefer_rows()) {
426 if (mat.sparse()) {
427 compute_variances_sparse_row(mat, buffers, block, block_size, num_threads);
428 } else {
429 compute_variances_dense_row(mat, buffers, block, block_size, num_threads);
430 }
431 } else {
432 if (mat.sparse()) {
433 compute_variances_sparse_column(mat, buffers, block, block_size, num_threads);
434 } else {
435 compute_variances_dense_column(mat, buffers, block, block_size, num_threads);
436 }
437 }
438}
439
440template<typename Index_, typename Stat_, class Function_>
441void compute_average(
442 const Index_ ngenes,
443 const std::vector<ModelGeneVariancesBuffers<Stat_> >& per_block,
444 const std::vector<Index_>& block_size,
445 const std::vector<Stat_>& block_weights,
446 const Index_ min_size,
447 const Function_ fun,
448 std::vector<Stat_*>& tmp_pointers,
449 std::vector<Stat_>& tmp_weights,
450 Stat_* const output)
451{
452 if (!output) {
453 return;
454 }
455
456 tmp_pointers.clear();
457 tmp_weights.clear();
458
459 const auto nblocks = per_block.size();
460 for (decltype(I(nblocks)) b = 0; b < nblocks; ++b) {
461 if (block_size[b] < min_size) { // skip blocks with insufficient cells.
462 continue;
463 }
464 tmp_weights.push_back(block_weights[b]);
465 tmp_pointers.push_back(fun(per_block[b]));
466 }
467
468 scran_blocks::average_vectors_weighted(ngenes, tmp_pointers, tmp_weights.data(), output, /* skip_nan = */ false);
469}
470
471}
499template<typename Value_, typename Index_, typename Block_, typename Stat_>
502 const Block_* const block,
504 const ModelGeneVariancesOptions& options)
505{
506 const Index_ NR = mat.nrow(), NC = mat.ncol();
507 std::vector<Index_> block_size;
508
509 if (block) {
510 block_size = tatami_stats::tabulate_groups(block, NC);
511 internal::compute_variances(mat, buffers.per_block, block, block_size, options.num_threads);
512 } else {
513 block_size.push_back(NC); // everything is one big block.
514 internal::compute_variances(mat, buffers.per_block, block, block_size, options.num_threads);
515 }
516 const auto nblocks = block_size.size();
517
519 auto fopt = options.fit_variance_trend_options;
520 fopt.num_threads = options.num_threads;
521 for (decltype(I(nblocks)) b = 0; b < nblocks; ++b) {
522 const auto& current = buffers.per_block[b];
523 if (block_size[b] >= 2) {
524 fit_variance_trend(NR, current.means, current.variances, current.fitted, current.residuals, work, fopt);
525 } else {
526 std::fill_n(current.fitted, NR, std::numeric_limits<double>::quiet_NaN());
527 std::fill_n(current.residuals, NR, std::numeric_limits<double>::quiet_NaN());
528 }
529 }
530
531 const auto ave_means = buffers.average.means;
532 const auto ave_variances = buffers.average.variances;
533 const auto ave_fitted = buffers.average.fitted;
534 const auto ave_residuals = buffers.average.residuals;
535
536 if (ave_means || ave_variances || ave_fitted || ave_residuals) {
537 const auto block_weight = scran_blocks::compute_weights<Stat_>(block_size, options.block_weight_policy, options.variable_block_weight_parameters);
538
539 std::vector<Stat_*> tmp_pointers;
540 std::vector<Stat_> tmp_weights;
541 tmp_pointers.reserve(nblocks);
542 tmp_weights.reserve(nblocks);
543
544 internal::compute_average(
545 NR,
546 buffers.per_block,
547 block_size,
548 block_weight,
549 /* min_size = */ static_cast<Index_>(1), // skip blocks without enough cells to compute the mean.
550 [](const auto& x) -> Stat_* { return x.means; },
551 tmp_pointers,
552 tmp_weights,
553 ave_means
554 );
555
556 internal::compute_average(
557 NR,
558 buffers.per_block,
559 block_size,
560 block_weight,
561 /* min_size = */ static_cast<Index_>(2), // skip blocks without enough cells to compute the variance.
562 [](const auto& x) -> Stat_* { return x.variances; },
563 tmp_pointers,
564 tmp_weights,
565 ave_variances
566 );
567
568 internal::compute_average(
569 NR,
570 buffers.per_block,
571 block_size,
572 block_weight,
573 /* min_size = */ static_cast<Index_>(2), // ditto.
574 [](const auto& x) -> Stat_* { return x.fitted; },
575 tmp_pointers,
576 tmp_weights,
577 ave_fitted
578 );
579
580 internal::compute_average(
581 NR,
582 buffers.per_block,
583 block_size,
584 block_weight,
585 /* min_size = */ static_cast<Index_>(2), // ditto.
586 [](const auto& x) -> Stat_* { return x.residuals; },
587 tmp_pointers,
588 tmp_weights,
589 ave_residuals
590 );
591 }
592}
593
611template<typename Value_, typename Index_, typename Stat_>
614 ModelGeneVariancesBuffers<Stat_> buffers, // yes, the lack of a const ref here is deliberate, we need to move it into bbuffers anyway.
615 const ModelGeneVariancesOptions& options)
616{
618 bbuffers.per_block.emplace_back(std::move(buffers));
619
620 bbuffers.average.means = NULL;
621 bbuffers.average.variances = NULL;
622 bbuffers.average.fitted = NULL;
623 bbuffers.average.residuals = NULL;
624
625 model_gene_variances_blocked(mat, static_cast<Index_*>(NULL), bbuffers, options);
626}
627
641template<typename Stat_ = double, typename Value_, typename Index_>
644
646 buffers.means = output.means.data();
647 buffers.variances = output.variances.data();
648 buffers.fitted = output.fitted.data();
649 buffers.residuals = output.residuals.data();
650
651 model_gene_variances(mat, std::move(buffers), options);
652 return output;
653}
654
672template<typename Stat_ = double, typename Value_, typename Index_, typename Block_>
674 const auto nblocks = (block ? tatami_stats::total_groups(block, mat.ncol()) : 1);
675 ModelGeneVariancesBlockedResults<Stat_> output(mat.nrow(), nblocks, options.compute_average);
676
678 sanisizer::resize(buffers.per_block, nblocks);
679 for (decltype(I(nblocks)) b = 0; b < nblocks; ++b) {
680 auto& current = buffers.per_block[b];
681 current.means = output.per_block[b].means.data();
682 current.variances = output.per_block[b].variances.data();
683 current.fitted = output.per_block[b].fitted.data();
684 current.residuals = output.per_block[b].residuals.data();
685 }
686
687 if (!options.compute_average) {
688 buffers.average.means = NULL;
689 buffers.average.variances = NULL;
690 buffers.average.fitted = NULL;
691 buffers.average.residuals = NULL;
692 } else {
693 buffers.average.means = output.average.means.data();
694 buffers.average.variances = output.average.variances.data();
695 buffers.average.fitted = output.average.fitted.data();
696 buffers.average.residuals = output.average.residuals.data();
697 }
698
699 model_gene_variances_blocked(mat, block, buffers, options);
700 return output;
701}
702
703}
704
705#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Fit a mean-variance trend to log-count data.
void average_vectors_weighted(size_t n, std::vector< Stat_ * > in, const Weight_ *w, Output_ *out, bool skip_nan)
void compute_weights(std::size_t num_blocks, const Size_ *sizes, WeightPolicy policy, const VariableWeightParameters &variable, Weight_ *weights)
Variance modelling for single-cell expression data.
Definition choose_highly_variable_genes.hpp:15
void model_gene_variances_blocked(const tatami::Matrix< Value_, Index_ > &mat, const Block_ *const block, const ModelGeneVariancesBlockedBuffers< Stat_ > &buffers, const ModelGeneVariancesOptions &options)
Definition model_gene_variances.hpp:500
void fit_variance_trend(const std::size_t n, const Float_ *const mean, const Float_ *const variance, Float_ *const fitted, Float_ *const residuals, FitVarianceTrendWorkspace< Float_ > &workspace, const FitVarianceTrendOptions &options)
Definition fit_variance_trend.hpp:131
void model_gene_variances(const tatami::Matrix< Value_, Index_ > &mat, ModelGeneVariancesBuffers< Stat_ > buffers, const ModelGeneVariancesOptions &options)
Definition model_gene_variances.hpp:612
void parallelize(Function_ fun, Index_ tasks, int threads)
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
Options for fit_variance_trend().
Definition fit_variance_trend.hpp:24
int num_threads
Definition fit_variance_trend.hpp:82
Workspace for fit_variance_trend().
Definition fit_variance_trend.hpp:91
Buffers for model_gene_variances_blocked().
Definition model_gene_variances.hpp:160
ModelGeneVariancesBuffers< Stat_ > average
Definition model_gene_variances.hpp:171
std::vector< ModelGeneVariancesBuffers< Stat_ > > per_block
Definition model_gene_variances.hpp:165
Results of model_gene_variances_blocked().
Definition model_gene_variances.hpp:179
std::vector< ModelGeneVariancesResults< Stat_ > > per_block
Definition model_gene_variances.hpp:199
ModelGeneVariancesResults< Stat_ > average
Definition model_gene_variances.hpp:205
Buffers for model_gene_variances() and friends.
Definition model_gene_variances.hpp:74
Stat_ * means
Definition model_gene_variances.hpp:78
Stat_ * residuals
Definition model_gene_variances.hpp:93
Stat_ * variances
Definition model_gene_variances.hpp:83
Stat_ * fitted
Definition model_gene_variances.hpp:88
Options for model_gene_variances() and friends.
Definition model_gene_variances.hpp:27
FitVarianceTrendOptions fit_variance_trend_options
Definition model_gene_variances.hpp:31
bool compute_average
Definition model_gene_variances.hpp:56
scran_blocks::VariableWeightParameters variable_block_weight_parameters
Definition model_gene_variances.hpp:50
int num_threads
Definition model_gene_variances.hpp:62
scran_blocks::WeightPolicy block_weight_policy
Definition model_gene_variances.hpp:43
Results of model_gene_variances().
Definition model_gene_variances.hpp:101
std::vector< Stat_ > fitted
Definition model_gene_variances.hpp:147
std::vector< Stat_ > means
Definition model_gene_variances.hpp:137
std::vector< Stat_ > residuals
Definition model_gene_variances.hpp:152
std::vector< Stat_ > variances
Definition model_gene_variances.hpp:142
bool sparse_ordered_index