scran_pca
Principal component analysis for single-cell data
Loading...
Searching...
No Matches
simple_pca.hpp
Go to the documentation of this file.
1#ifndef SCRAN_PCA_SIMPLE_PCA_HPP
2#define SCRAN_PCA_SIMPLE_PCA_HPP
3
4#include <vector>
5#include <type_traits>
6#include <algorithm>
7#include <memory>
8
9#include "tatami/tatami.hpp"
10#include "tatami_stats/tatami_stats.hpp"
11#include "irlba/irlba.hpp"
12#include "irlba/parallel.hpp"
13#include "irlba_tatami/irlba_tatami.hpp"
14#include "Eigen/Dense"
15#include "sanisizer/sanisizer.hpp"
16
17#include "utils.hpp"
18
24namespace scran_pca {
25
31template<typename EigenVector_ = Eigen::VectorXd>
37 // Avoid throwing an error if too many PCs are requested.
38 irlba_options.cap_number = true;
39 }
49 int number = 25;
50
56 bool scale = false;
57
62 bool transpose = true;
63
68 bool realize_matrix = true;
69
76 int num_threads = 1;
77
82};
83
87template<bool sparse_, typename Value_, typename Index_, class EigenVector_>
88void compute_row_means_and_variances(const tatami::Matrix<Value_, Index_>& mat, const int num_threads, EigenVector_& center_v, EigenVector_& scale_v) {
89 const auto ngenes = mat.nrow();
90
91 if (mat.prefer_rows()) {
92 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
93 auto ext = tatami::consecutive_extractor<sparse_>(mat, true, start, length, [&]{
95 opt.sparse_extract_index = false;
96 return opt;
97 }());
98 const auto ncells = mat.ncol();
100
101 for (Index_ g = start, end = start + length; g < end; ++g) {
102 const auto results = [&]{
103 if constexpr(sparse_) {
104 auto range = ext->fetch(vbuffer.data(), NULL);
105 return tatami_stats::variances::direct(range.value, range.number, ncells, /* skip_nan = */ false);
106 } else {
107 auto ptr = ext->fetch(vbuffer.data());
108 return tatami_stats::variances::direct(ptr, ncells, /* skip_nan = */ false);
109 }
110 }();
111 center_v.coeffRef(g) = results.first;
112 scale_v.coeffRef(g) = results.second;
113 }
114 }, ngenes, num_threads);
115
116 } else {
117 tatami::parallelize([&](int t, Index_ start, Index_ length) -> void {
118 const auto ncells = mat.ncol();
119 auto ext = tatami::consecutive_extractor<sparse_>(mat, false, static_cast<Index_>(0), ncells, start, length);
120
121 typedef typename EigenVector_::Scalar Scalar;
122 tatami_stats::LocalOutputBuffer<Scalar> cbuffer(t, start, length, center_v.data());
123 tatami_stats::LocalOutputBuffer<Scalar> sbuffer(t, start, length, scale_v.data());
124
125 auto running = [&]{
126 if constexpr(sparse_) {
127 return tatami_stats::variances::RunningSparse<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false, /* subtract = */ start);
128 } else {
129 return tatami_stats::variances::RunningDense<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false);
130 }
131 }();
132
134 auto ibuffer = [&]{
135 if constexpr(sparse_) {
137 } else {
138 return false;
139 }
140 }();
141
142 for (Index_ c = 0; c < ncells; ++c) {
143 if constexpr(sparse_) {
144 const auto range = ext->fetch(vbuffer.data(), ibuffer.data());
145 running.add(range.value, range.index, range.number);
146 } else {
147 const auto ptr = ext->fetch(vbuffer.data());
148 running.add(ptr);
149 }
150 }
151
152 running.finish();
153 cbuffer.transfer();
154 sbuffer.transfer();
155 }, ngenes, num_threads);
156 }
157}
158
159template<class EigenVector_, class EigenMatrix_>
160std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_deferred_matrix_for_irlba(
161 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > ptr,
162 const SimplePcaOptions<EigenVector_>& options,
163 const EigenVector_& center_v,
164 const EigenVector_& scale_v
165) {
166 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > alt;
167 alt.reset(new irlba::CenteredMatrix<EigenVector_, EigenMatrix_, I<decltype(ptr)>, I<decltype(&center_v)> >(std::move(ptr), &center_v));
168 ptr.swap(alt);
169
170 if (options.scale) {
171 alt.reset(new irlba::ScaledMatrix<EigenVector_, EigenMatrix_, I<decltype(ptr)>, I<decltype(&scale_v)> >(std::move(ptr), &scale_v, true, true));
172 ptr.swap(alt);
173 }
174
175 return ptr;
176}
177
178template<class EigenMatrix_, typename Value_, typename Index_, class EigenVector_>
179std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_sparse_matrix_for_irlba(
181 const SimplePcaOptions<EigenVector_>& options,
182 EigenVector_& center_v,
183 EigenVector_& scale_v,
184 typename EigenVector_::Scalar& total_var
185) {
186 const auto ngenes = mat.nrow();
187 sanisizer::resize(center_v, ngenes);
188 sanisizer::resize(scale_v, ngenes);
189 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > output;
190
191 if (options.realize_matrix) {
192 // 'extracted' contains row-major contents...
194 mat,
195 /* row = */ true,
196 [&]{
198 opt.two_pass = false;
199 opt.num_threads = options.num_threads;
200 return opt;
201 }()
202 );
203
204 // But we effectively transpose it to CSC with genes in columns.
205 const Index_ ncells = mat.ncol();
206 const auto sparse_ptr = new irlba::ParallelSparseMatrix<
207 EigenVector_,
208 EigenMatrix_,
209 I<decltype(extracted.value)>,
210 I<decltype(extracted.index)>,
211 I<decltype(extracted.pointers)>
212 >(
213 ncells,
214 ngenes,
215 std::move(extracted.value),
216 std::move(extracted.index),
217 std::move(extracted.pointers),
218 true,
219 options.num_threads
220 );
221 output.reset(sparse_ptr);
222
223 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
224 const auto& pointers = sparse_ptr->get_pointers();
225 const auto& values = sparse_ptr->get_values();
226 for (Index_ g = start, end = start + length; g < end; ++g) {
227 const auto offset = pointers[g];
228 const auto next_offset = pointers[g + 1]; // increment won't overflow as 'g + 1 <= end'.
229 const Index_ num_nonzero = next_offset - offset;
230 const auto results = tatami_stats::variances::direct(values.data() + offset, num_nonzero, ncells, /* skip_nan = */ false);
231 center_v.coeffRef(g) = results.first;
232 scale_v.coeffRef(g) = results.second;
233 }
234 }, ngenes, options.num_threads);
235
236 total_var = process_scale_vector(options.scale, scale_v);
237
238 } else {
239 compute_row_means_and_variances<true>(mat, options.num_threads, center_v, scale_v);
240 total_var = process_scale_vector(options.scale, scale_v);
241
242 output.reset(
243 new irlba_tatami::Transposed<EigenVector_, EigenMatrix_, Value_, Index_, decltype(&mat)>(&mat, options.num_threads)
244 );
245 }
246
247 return prepare_deferred_matrix_for_irlba(std::move(output), options, center_v, scale_v);
248}
249
250template<class EigenMatrix_, typename Value_, typename Index_, class EigenVector_>
251std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_dense_matrix_for_irlba(
253 const SimplePcaOptions<EigenVector_>& options,
254 EigenVector_& center_v,
255 EigenVector_& scale_v,
256 typename EigenVector_::Scalar& total_var
257) {
258 const Index_ ngenes = mat.nrow();
259 sanisizer::resize(center_v, ngenes);
260 sanisizer::resize(scale_v, ngenes);
261
262 if (options.realize_matrix) {
263 // Create a matrix with genes in columns.
264 const Index_ ncells = mat.ncol();
265 auto emat = std::make_unique<EigenMatrix_>(
266 sanisizer::cast<I<decltype(std::declval<EigenMatrix_>().rows())> >(ncells),
267 sanisizer::cast<I<decltype(std::declval<EigenMatrix_>().cols())> >(ngenes)
268 );
269
270 // By default, Eigen's matrices are column major. In such cases, because we want to do
271 // a transposition, we pretend it's row major during the conversion.
272 static_assert(!EigenMatrix_::IsRowMajor);
274 mat,
275 /* row_major = */ true,
276 emat->data(),
277 [&]{
278 tatami::ConvertToDenseOptions opt;
279 opt.num_threads = options.num_threads;
280 return opt;
281 }()
282 );
283
284 center_v.array() = emat->array().colwise().sum();
285 if (ncells) {
286 center_v /= ncells;
287 } else {
288 std::fill(center_v.begin(), center_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
289 }
290 emat->array().rowwise() -= center_v.adjoint().array(); // applying it to avoid wasting time with deferred operations inside IRLBA.
291
292 scale_v.array() = emat->array().colwise().squaredNorm();
293 if (ncells > 1) {
294 scale_v /= ncells - 1;
295 } else {
296 std::fill(scale_v.begin(), scale_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
297 }
298
299 total_var = process_scale_vector(options.scale, scale_v);
300 if (options.scale) {
301 emat->array().rowwise() /= scale_v.adjoint().array();
302 }
303
304 return std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> >(
306 );
307
308 } else {
309 compute_row_means_and_variances<false>(mat, options.num_threads, center_v, scale_v);
310 total_var = process_scale_vector(options.scale, scale_v);
311
312 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > output(
313 new irlba_tatami::Transposed<EigenVector_, EigenMatrix_, Value_, Index_, decltype(&mat)>(&mat, options.num_threads)
314 );
315 return prepare_deferred_matrix_for_irlba(std::move(output), options, center_v, scale_v);
316 }
317}
327template<typename EigenMatrix_, typename EigenVector_>
337 EigenMatrix_ components;
338
344 EigenVector_ variance_explained;
345
350 typename EigenVector_::Scalar total_variance = 0;
351
357 EigenMatrix_ rotation;
358
363 EigenVector_ center;
364
369 EigenVector_ scale;
370
375};
376
380template<typename Value_, typename Index_, typename EigenMatrix_, class EigenVector_, class SubsetFunction_>
381void simple_pca_internal(
383 const SimplePcaOptions<EigenVector_>& options,
385 SubsetFunction_ subset_fun
386) {
388
389 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > ptr;
390 if (mat.sparse()) {
391 ptr = prepare_sparse_matrix_for_irlba<EigenMatrix_>(mat, options, output.center, output.scale, output.total_variance);
392 } else {
393 ptr = prepare_dense_matrix_for_irlba<EigenMatrix_>(mat, options, output.center, output.scale, output.total_variance);
394 }
395
396 output.metrics = irlba::compute(*ptr, options.number, output.components, output.rotation, output.variance_explained, options.irlba_options);
397
398 subset_fun(output.components, output.variance_explained);
399
400 clean_up(mat.ncol(), output.components, output.variance_explained);
401 if (options.transpose) {
402 output.components.adjointInPlace();
403 }
404
405 if (!options.scale) {
406 output.scale = EigenVector_();
407 }
408}
436template<typename Value_, typename Index_, typename EigenMatrix_, class EigenVector_>
438 simple_pca_internal(
439 mat,
440 options,
441 output,
442 [](const EigenMatrix_&, const EigenVector_&) -> void {}
443 );
444}
445
461template<typename EigenMatrix_ = Eigen::MatrixXd, class EigenVector_ = Eigen::VectorXd, typename Value_, typename Index_>
467
468}
469
470#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Metrics compute(const Matrix_ &matrix, const Eigen::Index number, EigenMatrix_ &outU, EigenMatrix_ &outV, EigenVector_ &outD, const Options< EigenVector_ > &options)
Principal component analysis on single-cell data.
void simple_pca(const tatami::Matrix< Value_, Index_ > &mat, const SimplePcaOptions< EigenVector_ > &options, SimplePcaResults< EigenMatrix_, EigenVector_ > &output)
Definition simple_pca.hpp:437
CompressedSparseContents< StoredValue_, StoredIndex_, StoredPointer_ > retrieve_compressed_sparse_contents(const Matrix< InputValue_, InputIndex_ > &matrix, const bool row, const RetrieveCompressedSparseContentsOptions &options)
int parallelize(Function_ fun, const Index_ tasks, const int workers)
void convert_to_dense(const Matrix< InputValue_, InputIndex_ > &matrix, const bool row_major, StoredValue_ *const store, const ConvertToDenseOptions &options)
Container_ create_container_of_Index_size(const Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, const bool row, const Index_ iter_start, const Index_ iter_length, Args_ &&... args)
Options for simple_pca().
Definition simple_pca.hpp:32
bool scale
Definition simple_pca.hpp:56
int number
Definition simple_pca.hpp:49
bool realize_matrix
Definition simple_pca.hpp:68
int num_threads
Definition simple_pca.hpp:76
bool transpose
Definition simple_pca.hpp:62
irlba::Options< EigenVector_ > irlba_options
Definition simple_pca.hpp:81
Results of simple_pca().
Definition simple_pca.hpp:328
EigenMatrix_ components
Definition simple_pca.hpp:337
EigenMatrix_ rotation
Definition simple_pca.hpp:357
EigenVector_ center
Definition simple_pca.hpp:363
EigenVector_ variance_explained
Definition simple_pca.hpp:344
EigenVector_ scale
Definition simple_pca.hpp:369
EigenVector_::Scalar total_variance
Definition simple_pca.hpp:350
irlba::Metrics metrics
Definition simple_pca.hpp:374
bool sparse_extract_index