scran_pca
Principal component analysis for single-cell data
Loading...
Searching...
No Matches
simple_pca.hpp
Go to the documentation of this file.
1#ifndef SCRAN_PCA_SIMPLE_PCA_HPP
2#define SCRAN_PCA_SIMPLE_PCA_HPP
3
4#include <vector>
5#include <type_traits>
6#include <algorithm>
7#include <memory>
8
9#include "tatami/tatami.hpp"
10#include "tatami_stats/tatami_stats.hpp"
11#include "irlba/irlba.hpp"
12#include "irlba/parallel.hpp"
13#include "Eigen/Dense"
14#include "sanisizer/sanisizer.hpp"
15
16#include "utils.hpp"
17
23namespace scran_pca {
24
33 // Avoid throwing an error if too many PCs are requested.
35 }
45 int number = 25;
46
52 bool scale = false;
53
58 bool transpose = true;
59
64 bool realize_matrix = true;
65
70 int num_threads = 1;
71
76};
77
81template<bool sparse_, typename Value_, typename Index_, class EigenVector_>
82void compute_row_means_and_variances(const tatami::Matrix<Value_, Index_>& mat, const int num_threads, EigenVector_& center_v, EigenVector_& scale_v) {
83 const auto ngenes = mat.nrow();
84
85 if (mat.prefer_rows()) {
86 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
87 auto ext = tatami::consecutive_extractor<sparse_>(mat, true, start, length, [&]{
89 opt.sparse_extract_index = false;
90 return opt;
91 }());
92 const auto ncells = mat.ncol();
94
95 for (Index_ g = start, end = start + length; g < end; ++g) {
96 const auto results = [&]{
97 if constexpr(sparse_) {
98 auto range = ext->fetch(vbuffer.data(), NULL);
99 return tatami_stats::variances::direct(range.value, range.number, ncells, /* skip_nan = */ false);
100 } else {
101 auto ptr = ext->fetch(vbuffer.data());
102 return tatami_stats::variances::direct(ptr, ncells, /* skip_nan = */ false);
103 }
104 }();
105 center_v.coeffRef(g) = results.first;
106 scale_v.coeffRef(g) = results.second;
107 }
108 }, ngenes, num_threads);
109
110 } else {
111 tatami::parallelize([&](int t, Index_ start, Index_ length) -> void {
112 const auto ncells = mat.ncol();
113 auto ext = tatami::consecutive_extractor<sparse_>(mat, false, static_cast<Index_>(0), ncells, start, length);
114
115 typedef typename EigenVector_::Scalar Scalar;
116 tatami_stats::LocalOutputBuffer<Scalar> cbuffer(t, start, length, center_v.data());
117 tatami_stats::LocalOutputBuffer<Scalar> sbuffer(t, start, length, scale_v.data());
118
119 auto running = [&]{
120 if constexpr(sparse_) {
121 return tatami_stats::variances::RunningSparse<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false, /* subtract = */ start);
122 } else {
123 return tatami_stats::variances::RunningDense<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false);
124 }
125 }();
126
128 auto ibuffer = [&]{
129 if constexpr(sparse_) {
131 } else {
132 return false;
133 }
134 }();
135
136 for (Index_ c = 0; c < ncells; ++c) {
137 if constexpr(sparse_) {
138 const auto range = ext->fetch(vbuffer.data(), ibuffer.data());
139 running.add(range.value, range.index, range.number);
140 } else {
141 const auto ptr = ext->fetch(vbuffer.data());
142 running.add(ptr);
143 }
144 }
145
146 running.finish();
147 cbuffer.transfer();
148 sbuffer.transfer();
149 }, ngenes, num_threads);
150 }
151}
152
153template<class EigenVector_, class EigenMatrix_>
154std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_deferred_matrix_for_irlba(
155 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > ptr,
156 const SimplePcaOptions& options,
157 const EigenVector_& center_v,
158 const EigenVector_& scale_v
159) {
160 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > alt;
161 alt.reset(new irlba::CenteredMatrix<EigenVector_, EigenMatrix_, I<decltype(ptr)>, I<decltype(&center_v)> >(std::move(ptr), &center_v));
162 ptr.swap(alt);
163
164 if (options.scale) {
165 alt.reset(new irlba::ScaledMatrix<EigenVector_, EigenMatrix_, I<decltype(ptr)>, I<decltype(&scale_v)> >(std::move(ptr), &scale_v, true, true));
166 ptr.swap(alt);
167 }
168
169 return ptr;
170}
171
172template<class EigenMatrix_, typename Value_, typename Index_, class EigenVector_>
173std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_sparse_matrix_for_irlba(
175 const SimplePcaOptions& options,
176 EigenVector_& center_v,
177 EigenVector_& scale_v,
178 typename EigenVector_::Scalar& total_var
179) {
180 const auto ngenes = mat.nrow();
181 sanisizer::resize(center_v, ngenes);
182 sanisizer::resize(scale_v, ngenes);
183 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > output;
184
185 if (options.realize_matrix) {
186 // 'extracted' contains row-major contents...
188 mat,
189 /* row = */ true,
190 [&]{
192 opt.two_pass = false;
193 opt.num_threads = options.num_threads;
194 return opt;
195 }()
196 );
197
198 // But we effectively transpose it to CSC with genes in columns.
199 const Index_ ncells = mat.ncol();
200 const auto sparse_ptr = new irlba::ParallelSparseMatrix<
201 EigenVector_,
202 EigenMatrix_,
203 I<decltype(extracted.value)>,
204 I<decltype(extracted.index)>,
205 I<decltype(extracted.pointers)>
206 >(
207 ncells,
208 ngenes,
209 std::move(extracted.value),
210 std::move(extracted.index),
211 std::move(extracted.pointers),
212 true,
213 options.num_threads
214 );
215 output.reset(sparse_ptr);
216
217 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
218 const auto& pointers = sparse_ptr->get_pointers();
219 const auto& values = sparse_ptr->get_values();
220 for (Index_ g = start, end = start + length; g < end; ++g) {
221 const auto offset = pointers[g];
222 const auto next_offset = pointers[g + 1]; // increment won't overflow as 'g + 1 <= end'.
223 const Index_ num_nonzero = next_offset - offset;
224 const auto results = tatami_stats::variances::direct(values.data() + offset, num_nonzero, ncells, /* skip_nan = */ false);
225 center_v.coeffRef(g) = results.first;
226 scale_v.coeffRef(g) = results.second;
227 }
228 }, ngenes, options.num_threads);
229
230 total_var = process_scale_vector(options.scale, scale_v);
231
232 } else {
233 compute_row_means_and_variances<true>(mat, options.num_threads, center_v, scale_v);
234 total_var = process_scale_vector(options.scale, scale_v);
235
236 output.reset(new TransposedTatamiWrapperMatrix<EigenVector_, EigenMatrix_, Value_, Index_>(mat, options.num_threads));
237 }
238
239 return prepare_deferred_matrix_for_irlba(std::move(output), options, center_v, scale_v);
240}
241
242template<class EigenMatrix_, typename Value_, typename Index_, class EigenVector_>
243std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_dense_matrix_for_irlba(
245 const SimplePcaOptions& options,
246 EigenVector_& center_v,
247 EigenVector_& scale_v,
248 typename EigenVector_::Scalar& total_var
249) {
250 const Index_ ngenes = mat.nrow();
251 sanisizer::resize(center_v, ngenes);
252 sanisizer::resize(scale_v, ngenes);
253
254 if (options.realize_matrix) {
255 // Create a matrix with genes in columns.
256 const Index_ ncells = mat.ncol();
257 auto emat = std::make_unique<EigenMatrix_>(
258 sanisizer::cast<I<decltype(std::declval<EigenMatrix_>().rows())> >(ncells),
259 sanisizer::cast<I<decltype(std::declval<EigenMatrix_>().cols())> >(ngenes)
260 );
261
262 // By default, Eigen's matrices are column major. In such cases, because we want to do
263 // a transposition, we pretend it's row major during the conversion.
264 static_assert(!EigenMatrix_::IsRowMajor);
266 mat,
267 /* row_major = */ true,
268 emat->data(),
269 [&]{
270 tatami::ConvertToDenseOptions opt;
271 opt.num_threads = options.num_threads;
272 return opt;
273 }()
274 );
275
276 center_v.array() = emat->array().colwise().sum();
277 if (ncells) {
278 center_v /= ncells;
279 } else {
280 std::fill(center_v.begin(), center_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
281 }
282 emat->array().rowwise() -= center_v.adjoint().array(); // applying it to avoid wasting time with deferred operations inside IRLBA.
283
284 scale_v.array() = emat->array().colwise().squaredNorm();
285 if (ncells > 1) {
286 scale_v /= ncells - 1;
287 } else {
288 std::fill(scale_v.begin(), scale_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
289 }
290
291 total_var = process_scale_vector(options.scale, scale_v);
292 if (options.scale) {
293 emat->array().rowwise() /= scale_v.adjoint().array();
294 }
295
296 return std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> >(
298 );
299
300 } else {
301 compute_row_means_and_variances<false>(mat, options.num_threads, center_v, scale_v);
302 total_var = process_scale_vector(options.scale, scale_v);
303
304 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > output(
305 new TransposedTatamiWrapperMatrix<EigenVector_, EigenMatrix_, Value_, Index_>(mat, options.num_threads)
306 );
307 return prepare_deferred_matrix_for_irlba(std::move(output), options, center_v, scale_v);
308 }
309}
319template<typename EigenMatrix_, typename EigenVector_>
329 EigenMatrix_ components;
330
336 EigenVector_ variance_explained;
337
342 typename EigenVector_::Scalar total_variance = 0;
343
349 EigenMatrix_ rotation;
350
355 EigenVector_ center;
356
361 EigenVector_ scale;
362
366 bool converged = false;
367};
368
372template<typename Value_, typename Index_, typename EigenMatrix_, class EigenVector_, class SubsetFunction_>
373void simple_pca_internal(
375 const SimplePcaOptions& options,
377 SubsetFunction_ subset_fun
378) {
380
381 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > ptr;
382 if (mat.sparse()) {
383 ptr = prepare_sparse_matrix_for_irlba<EigenMatrix_>(mat, options, output.center, output.scale, output.total_variance);
384 } else {
385 ptr = prepare_dense_matrix_for_irlba<EigenMatrix_>(mat, options, output.center, output.scale, output.total_variance);
386 }
387
388 const auto stats = irlba::compute(*ptr, options.number, output.components, output.rotation, output.variance_explained, options.irlba_options);
389 output.converged = stats.first;
390
391 subset_fun(output.components, output.variance_explained);
392
393 clean_up(mat.ncol(), output.components, output.variance_explained);
394 if (options.transpose) {
395 output.components.adjointInPlace();
396 }
397
398 if (!options.scale) {
399 output.scale = EigenVector_();
400 }
401}
429template<typename Value_, typename Index_, typename EigenMatrix_, class EigenVector_>
431 simple_pca_internal(
432 mat,
433 options,
434 output,
435 [](const EigenMatrix_&, const EigenVector_&) -> void {}
436 );
437}
438
454template<typename EigenMatrix_ = Eigen::MatrixXd, class EigenVector_ = Eigen::VectorXd, typename Value_, typename Index_>
460
461}
462
463#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
std::pair< bool, int > compute(const Matrix_ &matrix, const Eigen::Index number, EigenMatrix_ &outU, EigenMatrix_ &outV, EigenVector_ &outD, const Options< EigenVector_ > &options)
Principal component analysis on single-cell data.
void simple_pca(const tatami::Matrix< Value_, Index_ > &mat, const SimplePcaOptions &options, SimplePcaResults< EigenMatrix_, EigenVector_ > &output)
Definition simple_pca.hpp:430
void parallelize(Function_ fun, const Index_ tasks, const int threads)
CompressedSparseContents< StoredValue_, StoredIndex_, StoredPointer_ > retrieve_compressed_sparse_contents(const Matrix< InputValue_, InputIndex_ > &matrix, const bool row, const RetrieveCompressedSparseContentsOptions &options)
void convert_to_dense(const Matrix< InputValue_, InputIndex_ > &matrix, const bool row_major, StoredValue_ *const store, const ConvertToDenseOptions &options)
Container_ create_container_of_Index_size(const Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, const bool row, const Index_ iter_start, const Index_ iter_length, Args_ &&... args)
Options for simple_pca().
Definition simple_pca.hpp:28
bool realize_matrix
Definition simple_pca.hpp:64
irlba::Options< Eigen::VectorXd > irlba_options
Definition simple_pca.hpp:75
bool transpose
Definition simple_pca.hpp:58
int num_threads
Definition simple_pca.hpp:70
bool scale
Definition simple_pca.hpp:52
int number
Definition simple_pca.hpp:45
Results of simple_pca().
Definition simple_pca.hpp:320
EigenMatrix_ components
Definition simple_pca.hpp:329
EigenMatrix_ rotation
Definition simple_pca.hpp:349
bool converged
Definition simple_pca.hpp:366
EigenVector_ center
Definition simple_pca.hpp:355
EigenVector_ variance_explained
Definition simple_pca.hpp:336
EigenVector_ scale
Definition simple_pca.hpp:361
EigenVector_::Scalar total_variance
Definition simple_pca.hpp:342
bool sparse_extract_index