scran_variances
Model per-gene variance in expression
Loading...
Searching...
No Matches
model_gene_variances.hpp
Go to the documentation of this file.
1#ifndef SCRAN_MODEL_GENE_VARIANCES_H
2#define SCRAN_MODEL_GENE_VARIANCES_H
3
5
6#include <algorithm>
7#include <vector>
8#include <limits>
9#include <cstddef>
10
11#include "tatami/tatami.hpp"
12#include "tatami_stats/tatami_stats.hpp"
14#include "sanisizer/sanisizer.hpp"
15
21namespace scran_variances {
22
57
66template<typename Stat_>
71 Stat_* means;
72
76 Stat_* variances;
77
81 Stat_* fitted;
82
86 Stat_* residuals;
87};
88
93template<typename Stat_>
98 ModelGeneVariancesResults() = default;
99
100 template<typename Ngenes_>
101 ModelGeneVariancesResults(Ngenes_ ngenes) :
102 means(sanisizer::cast<decltype(means.size())>(ngenes)
103#ifdef SCRAN_VARIANCES_TEST_INIT
104 , SCRAN_VARIANCES_TEST_INIT
105#endif
106 ),
107 variances(sanisizer::cast<decltype(variances.size())>(ngenes)
108#ifdef SCRAN_VARIANCES_TEST_INIT
109 , SCRAN_VARIANCES_TEST_INIT
110#endif
111 ),
112 fitted(sanisizer::cast<decltype(fitted.size())>(ngenes)
113#ifdef SCRAN_VARIANCES_TEST_INIT
114 , SCRAN_VARIANCES_TEST_INIT
115#endif
116 ),
117 residuals(sanisizer::cast<decltype(residuals.size())>(ngenes)
118#ifdef SCRAN_VARIANCES_TEST_INIT
119 , SCRAN_VARIANCES_TEST_INIT
120#endif
121 )
122 {}
130 std::vector<Stat_> means;
131
135 std::vector<Stat_> variances;
136
140 std::vector<Stat_> fitted;
141
145 std::vector<Stat_> residuals;
146};
147
152template<typename Stat_>
158 std::vector<ModelGeneVariancesBuffers<Stat_> > per_block;
159
165};
166
171template<typename Stat_>
177
178 template<typename Ngenes_, typename Nblocks_>
179 ModelGeneVariancesBlockedResults(Ngenes_ ngenes, Nblocks_ nblocks, bool compute_average) : average(compute_average ? ngenes : 0) {
180 per_block.reserve(nblocks);
181 for (decltype(nblocks) b = 0; b < nblocks; ++b) {
182 per_block.emplace_back(ngenes);
183 }
184 }
192 std::vector<ModelGeneVariancesResults<Stat_> > per_block;
193
199};
200
204namespace internal {
205
206template<typename Value_, typename Index_, typename Stat_, typename Block_>
207void compute_variances_dense_row(
209 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
210 const Block_* block,
211 const std::vector<Index_>& block_size,
212 int num_threads)
213{
214 bool blocked = (block != NULL);
215 auto nblocks = block_size.size();
216 auto NR = mat.nrow(), NC = mat.ncol();
217
218 tatami::parallelize([&](int, Index_ start, Index_ length) -> void {
219 auto tmp_means = sanisizer::create<std::vector<Stat_> >(blocked ? nblocks : 0);
220 auto tmp_vars = sanisizer::create<std::vector<Stat_> >(blocked ? nblocks : 0);
221
223 auto ext = tatami::consecutive_extractor<false>(&mat, true, start, length);
224 for (Index_ r = start, end = start + length; r < end; ++r) {
225 auto ptr = ext->fetch(buffer.data());
226
227 if (blocked) {
228 tatami_stats::grouped_variances::direct(
229 ptr,
230 NC,
231 block,
232 nblocks,
233 block_size.data(),
234 tmp_means.data(),
235 tmp_vars.data(),
236 false,
237 static_cast<Index_*>(NULL)
238 );
239 for (decltype(nblocks) b = 0; b < nblocks; ++b) {
240 buffers[b].means[r] = tmp_means[b];
241 buffers[b].variances[r] = tmp_vars[b];
242 }
243 } else {
244 auto stat = tatami_stats::variances::direct(ptr, NC, false);
245 buffers[0].means[r] = stat.first;
246 buffers[0].variances[r] = stat.second;
247 }
248 }
249 }, NR, num_threads);
250}
251
252template<typename Value_, typename Index_, typename Stat_, typename Block_>
253void compute_variances_sparse_row(
255 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
256 const Block_* block,
257 const std::vector<Index_>& block_size,
258 int num_threads)
259{
260 bool blocked = (block != NULL);
261 auto nblocks = block_size.size();
262 auto NR = mat.nrow(), NC = mat.ncol();
263
264 tatami::parallelize([&](int, Index_ start, Index_ length) -> void {
265 auto tmp_means = sanisizer::create<std::vector<Stat_> >(nblocks);
266 auto tmp_vars = sanisizer::create<std::vector<Stat_> >(nblocks);
267 auto tmp_nzero = sanisizer::create<std::vector<Index_> >(nblocks);
268
271 tatami::Options opt;
272 opt.sparse_ordered_index = false;
273 auto ext = tatami::consecutive_extractor<true>(&mat, true, start, length, opt);
274
275 for (Index_ r = start, end = start + length; r < end; ++r) {
276 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
277
278 if (blocked) {
279 tatami_stats::grouped_variances::direct(
280 range.value,
281 range.index,
282 range.number,
283 block,
284 nblocks,
285 block_size.data(),
286 tmp_means.data(),
287 tmp_vars.data(),
288 tmp_nzero.data(),
289 false,
290 static_cast<Index_*>(NULL)
291 );
292 for (decltype(nblocks) b = 0; b < nblocks; ++b) {
293 buffers[b].means[r] = tmp_means[b];
294 buffers[b].variances[r] = tmp_vars[b];
295 }
296 } else {
297 auto stat = tatami_stats::variances::direct(range.value, range.number, NC, false);
298 buffers[0].means[r] = stat.first;
299 buffers[0].variances[r] = stat.second;
300 }
301 }
302 }, NR, num_threads);
303}
304
305template<typename Value_, typename Index_, typename Stat_, typename Block_>
306void compute_variances_dense_column(
308 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
309 const Block_* block,
310 const std::vector<Index_>& block_size,
311 int num_threads)
312{
313 bool blocked = (block != NULL);
314 auto nblocks = block_size.size();
315 auto NR = mat.nrow(), NC = mat.ncol();
316
317 tatami::parallelize([&](int thread, Index_ start, Index_ length) -> void {
319 auto ext = tatami::consecutive_extractor<false>(&mat, false, static_cast<Index_>(0), NC, start, length);
320
321 auto get_var = [&](Index_ b) -> Stat_* { return buffers[b].variances; };
322 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_var)> local_vars(thread, nblocks, start, length, std::move(get_var));
323 auto get_mean = [&](Index_ b) -> Stat_* { return buffers[b].means; };
324 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_mean)> local_means(thread, nblocks, start, length, std::move(get_mean));
325
326 std::vector<tatami_stats::variances::RunningDense<Stat_, Value_, Index_> > runners;
327 runners.reserve(nblocks);
328 for (decltype(nblocks) b = 0; b < nblocks; ++b) {
329 runners.emplace_back(length, local_means.data(b), local_vars.data(b), false);
330 }
331
332 if (blocked) {
333 for (decltype(NC) c = 0; c < NC; ++c) {
334 auto ptr = ext->fetch(buffer.data());
335 runners[block[c]].add(ptr);
336 }
337 } else {
338 for (decltype(NC) c = 0; c < NC; ++c) {
339 auto ptr = ext->fetch(buffer.data());
340 runners[0].add(ptr);
341 }
342 }
343
344 for (decltype(nblocks) b = 0; b < nblocks; ++b) {
345 runners[b].finish();
346 }
347 local_vars.transfer();
348 local_means.transfer();
349 }, NR, num_threads);
350}
351
352template<typename Value_, typename Index_, typename Stat_, typename Block_>
353void compute_variances_sparse_column(
355 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
356 const Block_* block,
357 const std::vector<Index_>& block_size,
358 int num_threads)
359{
360 bool blocked = (block != NULL);
361 auto nblocks = block_size.size();
362 auto NR = mat.nrow(), NC = mat.ncol();
363 auto nonzeros = sanisizer::create<std::vector<std::vector<Index_> > >(
364 nblocks,
366 );
367
368 tatami::parallelize([&](int thread, Index_ start, Index_ length) -> void {
371 tatami::Options opt;
372 opt.sparse_ordered_index = false;
373 auto ext = tatami::consecutive_extractor<true>(&mat, false, static_cast<Index_>(0), NC, start, length, opt);
374
375 auto get_var = [&](Index_ b) -> Stat_* { return buffers[b].variances; };
376 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_var)> local_vars(thread, nblocks, start, length, std::move(get_var));
377 auto get_mean = [&](Index_ b) -> Stat_* { return buffers[b].means; };
378 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_mean)> local_means(thread, nblocks, start, length, std::move(get_mean));
379
380 std::vector<tatami_stats::variances::RunningSparse<Stat_, Value_, Index_> > runners;
381 runners.reserve(nblocks);
382 for (decltype(nblocks) b = 0; b < nblocks; ++b) {
383 runners.emplace_back(length, local_means.data(b), local_vars.data(b), false, start);
384 }
385
386 if (blocked) {
387 for (decltype(NC) c = 0; c < NC; ++c) {
388 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
389 runners[block[c]].add(range.value, range.index, range.number);
390 }
391 } else {
392 for (decltype(NC) c = 0; c < NC; ++c) {
393 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
394 runners[0].add(range.value, range.index, range.number);
395 }
396 }
397
398 for (decltype(nblocks) b = 0; b < nblocks; ++b) {
399 runners[b].finish();
400 }
401 local_vars.transfer();
402 local_means.transfer();
403 }, NR, num_threads);
404}
405
406template<typename Value_, typename Index_, typename Stat_, typename Block_>
407void compute_variances(
409 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
410 const Block_* block,
411 const std::vector<Index_>& block_size,
412 int num_threads)
413{
414 if (mat.prefer_rows()) {
415 if (mat.sparse()) {
416 compute_variances_sparse_row(mat, buffers, block, block_size, num_threads);
417 } else {
418 compute_variances_dense_row(mat, buffers, block, block_size, num_threads);
419 }
420 } else {
421 if (mat.sparse()) {
422 compute_variances_sparse_column(mat, buffers, block, block_size, num_threads);
423 } else {
424 compute_variances_dense_column(mat, buffers, block, block_size, num_threads);
425 }
426 }
427}
428
429template<typename Index_, typename Stat_, class Function_>
430void compute_average(
431 Index_ ngenes,
432 const std::vector<ModelGeneVariancesBuffers<Stat_> >& per_block,
433 const std::vector<Index_>& block_size,
434 const std::vector<Stat_>& block_weights,
435 Index_ min_size,
436 Function_ fun,
437 std::vector<Stat_*>& tmp_pointers,
438 std::vector<Stat_>& tmp_weights,
439 Stat_* output)
440{
441 if (!output) {
442 return;
443 }
444
445 tmp_pointers.clear();
446 tmp_weights.clear();
447 for (decltype(per_block.size()) b = 0, nblocks = per_block.size(); b < nblocks; ++b) {
448 if (block_size[b] < min_size) { // skip blocks with insufficient cells.
449 continue;
450 }
451 tmp_weights.push_back(block_weights[b]);
452 tmp_pointers.push_back(fun(per_block[b]));
453 }
454
455 scran_blocks::average_vectors_weighted(ngenes, tmp_pointers, tmp_weights.data(), output, /* skip_nan = */ false);
456}
457
458}
486template<typename Value_, typename Index_, typename Block_, typename Stat_>
489 const Block_* block,
491 const ModelGeneVariancesOptions& options)
492{
493 Index_ NR = mat.nrow(), NC = mat.ncol();
494 std::vector<Index_> block_size;
495
496 if (block) {
497 block_size = tatami_stats::tabulate_groups(block, NC);
498 internal::compute_variances(mat, buffers.per_block, block, block_size, options.num_threads);
499 } else {
500 block_size.push_back(NC); // everything is one big block.
501 internal::compute_variances(mat, buffers.per_block, block, block_size, options.num_threads);
502 }
503 auto nblocks = block_size.size();
504
506 auto fopt = options.fit_variance_trend_options;
507 fopt.num_threads = options.num_threads;
508 for (decltype(nblocks) b = 0; b < nblocks; ++b) {
509 const auto& current = buffers.per_block[b];
510 if (block_size[b] >= 2) {
511 fit_variance_trend(NR, current.means, current.variances, current.fitted, current.residuals, work, fopt);
512 } else {
513 std::fill_n(current.fitted, NR, std::numeric_limits<double>::quiet_NaN());
514 std::fill_n(current.residuals, NR, std::numeric_limits<double>::quiet_NaN());
515 }
516 }
517
518 auto ave_means = buffers.average.means;
519 auto ave_variances = buffers.average.variances;
520 auto ave_fitted = buffers.average.fitted;
521 auto ave_residuals = buffers.average.residuals;
522
523 if (ave_means || ave_variances || ave_fitted || ave_residuals) {
524 auto block_weight = scran_blocks::compute_weights<Stat_>(block_size, options.block_weight_policy, options.variable_block_weight_parameters);
525
526 std::vector<Stat_*> tmp_pointers;
527 std::vector<Stat_> tmp_weights;
528 tmp_pointers.reserve(nblocks);
529 tmp_weights.reserve(nblocks);
530
531 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
532 /* min_size = */ static_cast<Index_>(1), // skip blocks without enough cells to compute the mean.
533 [](const auto& x) -> Stat_* { return x.means; }, tmp_pointers, tmp_weights, ave_means);
534
535 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
536 /* min_size = */ static_cast<Index_>(2), // skip blocks without enough cells to compute the variance.
537 [](const auto& x) -> Stat_* { return x.variances; }, tmp_pointers, tmp_weights, ave_variances);
538
539 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
540 /* min_size = */ static_cast<Index_>(2), // ditto.
541 [](const auto& x) -> Stat_* { return x.fitted; }, tmp_pointers, tmp_weights, ave_fitted);
542
543 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
544 /* min_size = */ static_cast<Index_>(2), // ditto.
545 [](const auto& x) -> Stat_* { return x.residuals; }, tmp_pointers, tmp_weights, ave_residuals);
546 }
547}
548
565template<typename Value_, typename Index_, typename Stat_>
567 ModelGeneVariancesBuffers<Stat_> buffers, // yes, the lack of a const ref here is deliberate, we need to move it into bbuffers anyway.
568 const ModelGeneVariancesOptions& options) {
569
571 bbuffers.per_block.emplace_back(std::move(buffers));
572
573 bbuffers.average.means = NULL;
574 bbuffers.average.variances = NULL;
575 bbuffers.average.fitted = NULL;
576 bbuffers.average.residuals = NULL;
577
578 model_gene_variances_blocked(mat, static_cast<Index_*>(NULL), bbuffers, options);
579}
580
594template<typename Stat_ = double, typename Value_, typename Index_>
597
599 buffers.means = output.means.data();
600 buffers.variances = output.variances.data();
601 buffers.fitted = output.fitted.data();
602 buffers.residuals = output.residuals.data();
603
604 model_gene_variances(mat, std::move(buffers), options);
605 return output;
606}
607
625template<typename Stat_ = double, typename Value_, typename Index_, typename Block_>
627 auto nblocks = (block ? tatami_stats::total_groups(block, mat.ncol()) : 1);
628 ModelGeneVariancesBlockedResults<Stat_> output(mat.nrow(), nblocks, options.compute_average);
629
631 buffers.per_block.resize(nblocks);
632 for (decltype(nblocks) b = 0; b < nblocks; ++b) {
633 auto& current = buffers.per_block[b];
634 current.means = output.per_block[b].means.data();
635 current.variances = output.per_block[b].variances.data();
636 current.fitted = output.per_block[b].fitted.data();
637 current.residuals = output.per_block[b].residuals.data();
638 }
639
640 if (!options.compute_average) {
641 buffers.average.means = NULL;
642 buffers.average.variances = NULL;
643 buffers.average.fitted = NULL;
644 buffers.average.residuals = NULL;
645 } else {
646 buffers.average.means = output.average.means.data();
647 buffers.average.variances = output.average.variances.data();
648 buffers.average.fitted = output.average.fitted.data();
649 buffers.average.residuals = output.average.residuals.data();
650 }
651
652 model_gene_variances_blocked(mat, block, buffers, options);
653 return output;
654}
655
656}
657
658#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Fit a mean-variance trend to log-count data.
void average_vectors_weighted(size_t n, std::vector< Stat_ * > in, const Weight_ *w, Output_ *out, bool skip_nan)
void compute_weights(size_t num_blocks, const Size_ *sizes, WeightPolicy policy, const VariableWeightParameters &variable, Weight_ *weights)
Variance modelling for single-cell expression data.
Definition choose_highly_variable_genes.hpp:16
void fit_variance_trend(std::size_t n, const Float_ *mean, const Float_ *variance, Float_ *fitted, Float_ *residuals, FitVarianceTrendWorkspace< Float_ > &workspace, const FitVarianceTrendOptions &options)
Definition fit_variance_trend.hpp:123
void model_gene_variances_blocked(const tatami::Matrix< Value_, Index_ > &mat, const Block_ *block, const ModelGeneVariancesBlockedBuffers< Stat_ > &buffers, const ModelGeneVariancesOptions &options)
Definition model_gene_variances.hpp:487
void model_gene_variances(const tatami::Matrix< Value_, Index_ > &mat, ModelGeneVariancesBuffers< Stat_ > buffers, const ModelGeneVariancesOptions &options)
Definition model_gene_variances.hpp:566
void parallelize(Function_ fun, Index_ tasks, int threads)
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
Options for fit_variance_trend().
Definition fit_variance_trend.hpp:22
int num_threads
Definition fit_variance_trend.hpp:74
Workspace for fit_variance_trend().
Definition fit_variance_trend.hpp:83
Buffers for model_gene_variances_blocked().
Definition model_gene_variances.hpp:153
ModelGeneVariancesBuffers< Stat_ > average
Definition model_gene_variances.hpp:164
std::vector< ModelGeneVariancesBuffers< Stat_ > > per_block
Definition model_gene_variances.hpp:158
Results of model_gene_variances_blocked().
Definition model_gene_variances.hpp:172
std::vector< ModelGeneVariancesResults< Stat_ > > per_block
Definition model_gene_variances.hpp:192
ModelGeneVariancesResults< Stat_ > average
Definition model_gene_variances.hpp:198
Buffers for model_gene_variances() and friends.
Definition model_gene_variances.hpp:67
Stat_ * means
Definition model_gene_variances.hpp:71
Stat_ * residuals
Definition model_gene_variances.hpp:86
Stat_ * variances
Definition model_gene_variances.hpp:76
Stat_ * fitted
Definition model_gene_variances.hpp:81
Options for model_gene_variances() and friends.
Definition model_gene_variances.hpp:26
FitVarianceTrendOptions fit_variance_trend_options
Definition model_gene_variances.hpp:30
bool compute_average
Definition model_gene_variances.hpp:49
scran_blocks::VariableWeightParameters variable_block_weight_parameters
Definition model_gene_variances.hpp:43
int num_threads
Definition model_gene_variances.hpp:55
scran_blocks::WeightPolicy block_weight_policy
Definition model_gene_variances.hpp:36
Results of model_gene_variances().
Definition model_gene_variances.hpp:94
std::vector< Stat_ > fitted
Definition model_gene_variances.hpp:140
std::vector< Stat_ > means
Definition model_gene_variances.hpp:130
std::vector< Stat_ > residuals
Definition model_gene_variances.hpp:145
std::vector< Stat_ > variances
Definition model_gene_variances.hpp:135
bool sparse_ordered_index