scran_variances
Model per-gene variance in expression
Loading...
Searching...
No Matches
model_gene_variances.hpp
Go to the documentation of this file.
1#ifndef SCRAN_MODEL_GENE_VARIANCES_H
2#define SCRAN_MODEL_GENE_VARIANCES_H
3
4#include "tatami/tatami.hpp"
5#include "tatami_stats/tatami_stats.hpp"
7
9
10#include <algorithm>
11#include <vector>
12#include <limits>
13
19namespace scran_variances {
20
55
64template<typename Stat_>
86
91template<typename Stat_>
144
149template<typename Stat_>
155 std::vector<ModelGeneVariancesBuffers<Stat_> > per_block;
156
162};
163
168template<typename Stat_>
174
175 ModelGeneVariancesBlockedResults(size_t ngenes, size_t nblocks, bool compute_average) : average(compute_average ? ngenes : 0) {
176 per_block.reserve(nblocks);
177 for (size_t b = 0; b < nblocks; ++b) {
178 per_block.emplace_back(ngenes);
179 }
180 }
188 std::vector<ModelGeneVariancesResults<Stat_> > per_block;
189
195};
196
200namespace internal {
201
202template<typename Value_, typename Index_, typename Stat_, typename Block_>
205 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
206 const Block_* block,
207 const std::vector<Index_>& block_size,
208 int num_threads)
209{
210 bool blocked = (block != NULL);
211 auto nblocks = block_size.size();
212 auto NR = mat.nrow(), NC = mat.ncol();
213
214 tatami::parallelize([&](int, Index_ start, Index_ length) -> void {
215 std::vector<Stat_> tmp_means(blocked ? nblocks : 0);
216 std::vector<Stat_> tmp_vars(blocked ? nblocks : 0);
217
218 std::vector<Value_> buffer(NC);
219 auto ext = tatami::consecutive_extractor<false>(&mat, true, start, length);
220 for (Index_ r = start, end = start + length; r < end; ++r) {
221 auto ptr = ext->fetch(buffer.data());
222
223 if (blocked) {
224 tatami_stats::grouped_variances::direct(
225 ptr,
226 NC,
227 block,
228 nblocks,
229 block_size.data(),
230 tmp_means.data(),
231 tmp_vars.data(),
232 false,
233 static_cast<Index_*>(NULL)
234 );
235 for (size_t b = 0; b < nblocks; ++b) {
236 buffers[b].means[r] = tmp_means[b];
237 buffers[b].variances[r] = tmp_vars[b];
238 }
239 } else {
240 auto stat = tatami_stats::variances::direct(ptr, NC, false);
241 buffers[0].means[r] = stat.first;
242 buffers[0].variances[r] = stat.second;
243 }
244 }
245 }, NR, num_threads);
246}
247
248template<typename Value_, typename Index_, typename Stat_, typename Block_>
251 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
252 const Block_* block,
253 const std::vector<Index_>& block_size,
254 int num_threads)
255{
256 bool blocked = (block != NULL);
257 auto nblocks = block_size.size();
258 auto NR = mat.nrow(), NC = mat.ncol();
259
260 tatami::parallelize([&](int, Index_ start, Index_ length) -> void {
261 std::vector<Stat_> tmp_means(nblocks);
262 std::vector<Stat_> tmp_vars(nblocks);
263 std::vector<Index_> tmp_nzero(nblocks);
264
265 std::vector<Value_> vbuffer(NC);
266 std::vector<Index_> ibuffer(NC);
269 auto ext = tatami::consecutive_extractor<true>(&mat, true, start, length, opt);
270
271 for (Index_ r = start, end = start + length; r < end; ++r) {
272 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
273
274 if (blocked) {
275 tatami_stats::grouped_variances::direct(
276 range.value,
277 range.index,
278 range.number,
279 block,
280 nblocks,
281 block_size.data(),
282 tmp_means.data(),
283 tmp_vars.data(),
284 tmp_nzero.data(),
285 false,
286 static_cast<Index_*>(NULL)
287 );
288 for (size_t b = 0; b < nblocks; ++b) {
289 buffers[b].means[r] = tmp_means[b];
290 buffers[b].variances[r] = tmp_vars[b];
291 }
292 } else {
293 auto stat = tatami_stats::variances::direct(range.value, range.number, NC, false);
294 buffers[0].means[r] = stat.first;
295 buffers[0].variances[r] = stat.second;
296 }
297 }
298 }, NR, num_threads);
299}
300
301template<typename Value_, typename Index_, typename Stat_, typename Block_>
304 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
305 const Block_* block,
306 const std::vector<Index_>& block_size,
307 int num_threads)
308{
309 bool blocked = (block != NULL);
310 auto nblocks = block_size.size();
311 auto NR = mat.nrow(), NC = mat.ncol();
312
314 std::vector<Value_> buffer(length);
315 auto ext = tatami::consecutive_extractor<false>(&mat, false, static_cast<Index_>(0), NC, start, length);
316
317 auto get_var = [&](Index_ b) -> Stat_* { return buffers[b].variances; };
318 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_var)> local_vars(thread, nblocks, start, length, std::move(get_var));
319 auto get_mean = [&](Index_ b) -> Stat_* { return buffers[b].means; };
320 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_mean)> local_means(thread, nblocks, start, length, std::move(get_mean));
321
322 std::vector<tatami_stats::variances::RunningDense<Stat_, Value_, Index_> > runners;
323 runners.reserve(nblocks);
324 for (size_t b = 0; b < nblocks; ++b) {
325 runners.emplace_back(length, local_means.data(b), local_vars.data(b), false);
326 }
327
328 if (blocked) {
329 for (Index_ c = 0; c < NC; ++c) {
330 auto ptr = ext->fetch(buffer.data());
331 runners[block[c]].add(ptr);
332 }
333 } else {
334 for (Index_ c = 0; c < NC; ++c) {
335 auto ptr = ext->fetch(buffer.data());
336 runners[0].add(ptr);
337 }
338 }
339
340 for (size_t b = 0; b < nblocks; ++b) {
341 runners[b].finish();
342 }
343 local_vars.transfer();
344 local_means.transfer();
345 }, NR, num_threads);
346}
347
348template<typename Value_, typename Index_, typename Stat_, typename Block_>
351 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
352 const Block_* block,
353 const std::vector<Index_>& block_size,
354 int num_threads)
355{
356 bool blocked = (block != NULL);
357 auto nblocks = block_size.size();
358 auto NR = mat.nrow(), NC = mat.ncol();
359 std::vector<std::vector<Index_> > nonzeros(nblocks, std::vector<Index_>(NR));
360
362 std::vector<Value_> vbuffer(length);
363 std::vector<Index_> ibuffer(length);
366 auto ext = tatami::consecutive_extractor<true>(&mat, false, static_cast<Index_>(0), NC, start, length, opt);
367
368 auto get_var = [&](Index_ b) -> Stat_* { return buffers[b].variances; };
369 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_var)> local_vars(thread, nblocks, start, length, std::move(get_var));
370 auto get_mean = [&](Index_ b) -> Stat_* { return buffers[b].means; };
371 tatami_stats::LocalOutputBuffers<Stat_, decltype(get_mean)> local_means(thread, nblocks, start, length, std::move(get_mean));
372
373 std::vector<tatami_stats::variances::RunningSparse<Stat_, Value_, Index_> > runners;
374 runners.reserve(nblocks);
375 for (size_t b = 0; b < nblocks; ++b) {
376 runners.emplace_back(length, local_means.data(b), local_vars.data(b), false, start);
377 }
378
379 if (blocked) {
380 for (Index_ c = 0; c < NC; ++c) {
381 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
382 runners[block[c]].add(range.value, range.index, range.number);
383 }
384 } else {
385 for (Index_ c = 0; c < NC; ++c) {
386 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
387 runners[0].add(range.value, range.index, range.number);
388 }
389 }
390
391 for (size_t b = 0; b < nblocks; ++b) {
392 runners[b].finish();
393 }
394 local_vars.transfer();
395 local_means.transfer();
396 }, NR, num_threads);
397}
398
399template<typename Value_, typename Index_, typename Stat_, typename Block_>
402 const std::vector<ModelGeneVariancesBuffers<Stat_> >& buffers,
403 const Block_* block,
404 const std::vector<Index_>& block_size,
405 int num_threads)
406{
407 if (mat.prefer_rows()) {
408 if (mat.sparse()) {
410 } else {
412 }
413 } else {
414 if (mat.sparse()) {
416 } else {
418 }
419 }
420}
421
422template<typename Index_, typename Stat_, class Function_>
423void compute_average(
424 Index_ ngenes,
425 const std::vector<ModelGeneVariancesBuffers<Stat_> >& per_block,
426 const std::vector<Index_>& block_size,
427 const std::vector<Stat_>& block_weights,
430 std::vector<Stat_*>& tmp_pointers,
431 std::vector<Stat_>& tmp_weights,
432 Stat_* output)
433{
434 if (!output) {
435 return;
436 }
437
438 tmp_pointers.clear();
439 tmp_weights.clear();
440 for (size_t b = 0, nblocks = per_block.size(); b < nblocks; ++b) {
441 if (block_size[b] < min_size) { // skip blocks with insufficient cells.
442 continue;
443 }
444 tmp_weights.push_back(block_weights[b]);
445 tmp_pointers.push_back(fun(per_block[b]));
446 }
447
449}
450
451}
479template<typename Value_, typename Index_, typename Block_, typename Stat_>
482 const Block_* block,
485{
486 Index_ NR = mat.nrow(), NC = mat.ncol();
487 std::vector<Index_> block_size;
488
489 if (block) {
490 block_size = tatami_stats::tabulate_groups(block, NC);
491 internal::compute_variances(mat, buffers.per_block, block, block_size, options.num_threads);
492 } else {
493 block_size.push_back(NC); // everything is one big block.
494 internal::compute_variances(mat, buffers.per_block, block, block_size, options.num_threads);
495 }
496 size_t nblocks = block_size.size();
497
499 auto fopt = options.fit_variance_trend_options;
500 fopt.num_threads = options.num_threads;
501 for (size_t b = 0; b < nblocks; ++b) {
502 const auto& current = buffers.per_block[b];
503 if (block_size[b] >= 2) {
504 fit_variance_trend(NR, current.means, current.variances, current.fitted, current.residuals, work, fopt);
505 } else {
506 std::fill_n(current.fitted, NR, std::numeric_limits<double>::quiet_NaN());
507 std::fill_n(current.residuals, NR, std::numeric_limits<double>::quiet_NaN());
508 }
509 }
510
511 auto ave_means = buffers.average.means;
512 auto ave_variances = buffers.average.variances;
513 auto ave_fitted = buffers.average.fitted;
514 auto ave_residuals = buffers.average.residuals;
515
517 auto block_weight = scran_blocks::compute_weights<Stat_>(block_size, options.block_weight_policy, options.variable_block_weight_parameters);
518
519 std::vector<Stat_*> tmp_pointers;
520 std::vector<Stat_> tmp_weights;
521 tmp_pointers.reserve(nblocks);
522 tmp_weights.reserve(nblocks);
523
524 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
525 /* min_size = */ static_cast<Index_>(1), // skip blocks without enough cells to compute the mean.
526 [](const auto& x) -> Stat_* { return x.means; }, tmp_pointers, tmp_weights, ave_means);
527
528 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
529 /* min_size = */ static_cast<Index_>(2), // skip blocks without enough cells to compute the variance.
530 [](const auto& x) -> Stat_* { return x.variances; }, tmp_pointers, tmp_weights, ave_variances);
531
532 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
533 /* min_size = */ static_cast<Index_>(2), // ditto.
534 [](const auto& x) -> Stat_* { return x.fitted; }, tmp_pointers, tmp_weights, ave_fitted);
535
536 internal::compute_average(NR, buffers.per_block, block_size, block_weight,
537 /* min_size = */ static_cast<Index_>(2), // ditto.
538 [](const auto& x) -> Stat_* { return x.residuals; }, tmp_pointers, tmp_weights, ave_residuals);
539 }
540}
541
558template<typename Value_, typename Index_, typename Stat_>
560 ModelGeneVariancesBuffers<Stat_> buffers, // yes, the lack of a const ref here is deliberate, we need to move it into bbuffers anyway.
562
564 bbuffers.per_block.emplace_back(std::move(buffers));
565
566 bbuffers.average.means = NULL;
567 bbuffers.average.variances = NULL;
568 bbuffers.average.fitted = NULL;
569 bbuffers.average.residuals = NULL;
570
572}
573
587template<typename Stat_ = double, typename Value_, typename Index_>
590
592 buffers.means = output.means.data();
593 buffers.variances = output.variances.data();
594 buffers.fitted = output.fitted.data();
595 buffers.residuals = output.residuals.data();
596
598 return output;
599}
600
618template<typename Stat_ = double, typename Value_, typename Index_, typename Block_>
620 size_t nblocks = (block ? tatami_stats::total_groups(block, mat.ncol()) : 1);
622
624 buffers.per_block.resize(nblocks);
625 for (size_t b = 0; b < nblocks; ++b) {
626 auto& current = buffers.per_block[b];
627 current.means = output.per_block[b].means.data();
628 current.variances = output.per_block[b].variances.data();
629 current.fitted = output.per_block[b].fitted.data();
630 current.residuals = output.per_block[b].residuals.data();
631 }
632
633 if (!options.compute_average) {
634 buffers.average.means = NULL;
635 buffers.average.variances = NULL;
636 buffers.average.fitted = NULL;
637 buffers.average.residuals = NULL;
638 } else {
639 buffers.average.means = output.average.means.data();
640 buffers.average.variances = output.average.variances.data();
641 buffers.average.fitted = output.average.fitted.data();
642 buffers.average.residuals = output.average.residuals.data();
643 }
644
646 return output;
647}
648
649}
650
651#endif
Fit a mean-variance trend to log-count data.
void average_vectors_weighted(size_t n, std::vector< Stat_ * > in, const Weight_ *w, Output_ *out, bool skip_nan)
Variance modelling for single-cell expression data.
Definition choose_highly_variable_genes.hpp:14
void fit_variance_trend(size_t n, const Float_ *mean, const Float_ *variance, Float_ *fitted, Float_ *residuals, FitVarianceTrendWorkspace< Float_ > &workspace, const FitVarianceTrendOptions &options)
Definition fit_variance_trend.hpp:120
void model_gene_variances_blocked(const tatami::Matrix< Value_, Index_ > &mat, const Block_ *block, const ModelGeneVariancesBlockedBuffers< Stat_ > &buffers, const ModelGeneVariancesOptions &options)
Definition model_gene_variances.hpp:480
void model_gene_variances(const tatami::Matrix< Value_, Index_ > &mat, ModelGeneVariancesBuffers< Stat_ > buffers, const ModelGeneVariancesOptions &options)
Definition model_gene_variances.hpp:559
void choose_highly_variable_genes(size_t n, const Stat_ *statistic, Bool_ *output, const ChooseHighlyVariableGenesOptions &options)
Definition choose_highly_variable_genes.hpp:188
void parallelize(Function_ fun, Index_ tasks, int threads)
Options for fit_variance_trend().
Definition fit_variance_trend.hpp:19
Workspace for fit_variance_trend().
Definition fit_variance_trend.hpp:80
Buffers for model_gene_variances_blocked().
Definition model_gene_variances.hpp:150
ModelGeneVariancesBuffers< Stat_ > average
Definition model_gene_variances.hpp:161
std::vector< ModelGeneVariancesBuffers< Stat_ > > per_block
Definition model_gene_variances.hpp:155
Results of model_gene_variances_blocked().
Definition model_gene_variances.hpp:169
std::vector< ModelGeneVariancesResults< Stat_ > > per_block
Definition model_gene_variances.hpp:188
ModelGeneVariancesResults< Stat_ > average
Definition model_gene_variances.hpp:194
Buffers for model_gene_variances() and friends.
Definition model_gene_variances.hpp:65
Stat_ * means
Definition model_gene_variances.hpp:69
Stat_ * residuals
Definition model_gene_variances.hpp:84
Stat_ * variances
Definition model_gene_variances.hpp:74
Stat_ * fitted
Definition model_gene_variances.hpp:79
Options for model_gene_variances() and friends.
Definition model_gene_variances.hpp:24
FitVarianceTrendOptions fit_variance_trend_options
Definition model_gene_variances.hpp:28
bool compute_average
Definition model_gene_variances.hpp:47
scran_blocks::VariableWeightParameters variable_block_weight_parameters
Definition model_gene_variances.hpp:41
int num_threads
Definition model_gene_variances.hpp:53
scran_blocks::WeightPolicy block_weight_policy
Definition model_gene_variances.hpp:34
Results of model_gene_variances().
Definition model_gene_variances.hpp:92
std::vector< Stat_ > fitted
Definition model_gene_variances.hpp:137
std::vector< Stat_ > means
Definition model_gene_variances.hpp:127
std::vector< Stat_ > residuals
Definition model_gene_variances.hpp:142
std::vector< Stat_ > variances
Definition model_gene_variances.hpp:132
bool sparse_ordered_index