scran_pca
Principal component analysis for single-cell data
Loading...
Searching...
No Matches
simple_pca.hpp
Go to the documentation of this file.
1#ifndef SCRAN_PCA_SIMPLE_PCA_HPP
2#define SCRAN_PCA_SIMPLE_PCA_HPP
3
4#include <vector>
5#include <type_traits>
6#include <algorithm>
7#include <memory>
8
9#include "tatami/tatami.hpp"
10#include "tatami_stats/tatami_stats.hpp"
11#include "irlba/irlba.hpp"
12#include "irlba/parallel.hpp"
13#include "irlba_tatami/irlba_tatami.hpp"
14#include "Eigen/Dense"
15#include "sanisizer/sanisizer.hpp"
16
17#include "utils.hpp"
18
24namespace scran_pca {
25
34 // Avoid throwing an error if too many PCs are requested.
36 }
46 int number = 25;
47
53 bool scale = false;
54
59 bool transpose = true;
60
65 bool realize_matrix = true;
66
71 int num_threads = 1;
72
77};
78
82template<bool sparse_, typename Value_, typename Index_, class EigenVector_>
83void compute_row_means_and_variances(const tatami::Matrix<Value_, Index_>& mat, const int num_threads, EigenVector_& center_v, EigenVector_& scale_v) {
84 const auto ngenes = mat.nrow();
85
86 if (mat.prefer_rows()) {
87 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
88 auto ext = tatami::consecutive_extractor<sparse_>(mat, true, start, length, [&]{
90 opt.sparse_extract_index = false;
91 return opt;
92 }());
93 const auto ncells = mat.ncol();
95
96 for (Index_ g = start, end = start + length; g < end; ++g) {
97 const auto results = [&]{
98 if constexpr(sparse_) {
99 auto range = ext->fetch(vbuffer.data(), NULL);
100 return tatami_stats::variances::direct(range.value, range.number, ncells, /* skip_nan = */ false);
101 } else {
102 auto ptr = ext->fetch(vbuffer.data());
103 return tatami_stats::variances::direct(ptr, ncells, /* skip_nan = */ false);
104 }
105 }();
106 center_v.coeffRef(g) = results.first;
107 scale_v.coeffRef(g) = results.second;
108 }
109 }, ngenes, num_threads);
110
111 } else {
112 tatami::parallelize([&](int t, Index_ start, Index_ length) -> void {
113 const auto ncells = mat.ncol();
114 auto ext = tatami::consecutive_extractor<sparse_>(mat, false, static_cast<Index_>(0), ncells, start, length);
115
116 typedef typename EigenVector_::Scalar Scalar;
117 tatami_stats::LocalOutputBuffer<Scalar> cbuffer(t, start, length, center_v.data());
118 tatami_stats::LocalOutputBuffer<Scalar> sbuffer(t, start, length, scale_v.data());
119
120 auto running = [&]{
121 if constexpr(sparse_) {
122 return tatami_stats::variances::RunningSparse<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false, /* subtract = */ start);
123 } else {
124 return tatami_stats::variances::RunningDense<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false);
125 }
126 }();
127
129 auto ibuffer = [&]{
130 if constexpr(sparse_) {
132 } else {
133 return false;
134 }
135 }();
136
137 for (Index_ c = 0; c < ncells; ++c) {
138 if constexpr(sparse_) {
139 const auto range = ext->fetch(vbuffer.data(), ibuffer.data());
140 running.add(range.value, range.index, range.number);
141 } else {
142 const auto ptr = ext->fetch(vbuffer.data());
143 running.add(ptr);
144 }
145 }
146
147 running.finish();
148 cbuffer.transfer();
149 sbuffer.transfer();
150 }, ngenes, num_threads);
151 }
152}
153
154template<class EigenVector_, class EigenMatrix_>
155std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_deferred_matrix_for_irlba(
156 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > ptr,
157 const SimplePcaOptions& options,
158 const EigenVector_& center_v,
159 const EigenVector_& scale_v
160) {
161 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > alt;
162 alt.reset(new irlba::CenteredMatrix<EigenVector_, EigenMatrix_, I<decltype(ptr)>, I<decltype(&center_v)> >(std::move(ptr), &center_v));
163 ptr.swap(alt);
164
165 if (options.scale) {
166 alt.reset(new irlba::ScaledMatrix<EigenVector_, EigenMatrix_, I<decltype(ptr)>, I<decltype(&scale_v)> >(std::move(ptr), &scale_v, true, true));
167 ptr.swap(alt);
168 }
169
170 return ptr;
171}
172
173template<class EigenMatrix_, typename Value_, typename Index_, class EigenVector_>
174std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_sparse_matrix_for_irlba(
176 const SimplePcaOptions& options,
177 EigenVector_& center_v,
178 EigenVector_& scale_v,
179 typename EigenVector_::Scalar& total_var
180) {
181 const auto ngenes = mat.nrow();
182 sanisizer::resize(center_v, ngenes);
183 sanisizer::resize(scale_v, ngenes);
184 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > output;
185
186 if (options.realize_matrix) {
187 // 'extracted' contains row-major contents...
189 mat,
190 /* row = */ true,
191 [&]{
193 opt.two_pass = false;
194 opt.num_threads = options.num_threads;
195 return opt;
196 }()
197 );
198
199 // But we effectively transpose it to CSC with genes in columns.
200 const Index_ ncells = mat.ncol();
201 const auto sparse_ptr = new irlba::ParallelSparseMatrix<
202 EigenVector_,
203 EigenMatrix_,
204 I<decltype(extracted.value)>,
205 I<decltype(extracted.index)>,
206 I<decltype(extracted.pointers)>
207 >(
208 ncells,
209 ngenes,
210 std::move(extracted.value),
211 std::move(extracted.index),
212 std::move(extracted.pointers),
213 true,
214 options.num_threads
215 );
216 output.reset(sparse_ptr);
217
218 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
219 const auto& pointers = sparse_ptr->get_pointers();
220 const auto& values = sparse_ptr->get_values();
221 for (Index_ g = start, end = start + length; g < end; ++g) {
222 const auto offset = pointers[g];
223 const auto next_offset = pointers[g + 1]; // increment won't overflow as 'g + 1 <= end'.
224 const Index_ num_nonzero = next_offset - offset;
225 const auto results = tatami_stats::variances::direct(values.data() + offset, num_nonzero, ncells, /* skip_nan = */ false);
226 center_v.coeffRef(g) = results.first;
227 scale_v.coeffRef(g) = results.second;
228 }
229 }, ngenes, options.num_threads);
230
231 total_var = process_scale_vector(options.scale, scale_v);
232
233 } else {
234 compute_row_means_and_variances<true>(mat, options.num_threads, center_v, scale_v);
235 total_var = process_scale_vector(options.scale, scale_v);
236
237 output.reset(
238 new irlba_tatami::Transposed<EigenVector_, EigenMatrix_, Value_, Index_, decltype(&mat)>(&mat, options.num_threads)
239 );
240 }
241
242 return prepare_deferred_matrix_for_irlba(std::move(output), options, center_v, scale_v);
243}
244
245template<class EigenMatrix_, typename Value_, typename Index_, class EigenVector_>
246std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_dense_matrix_for_irlba(
248 const SimplePcaOptions& options,
249 EigenVector_& center_v,
250 EigenVector_& scale_v,
251 typename EigenVector_::Scalar& total_var
252) {
253 const Index_ ngenes = mat.nrow();
254 sanisizer::resize(center_v, ngenes);
255 sanisizer::resize(scale_v, ngenes);
256
257 if (options.realize_matrix) {
258 // Create a matrix with genes in columns.
259 const Index_ ncells = mat.ncol();
260 auto emat = std::make_unique<EigenMatrix_>(
261 sanisizer::cast<I<decltype(std::declval<EigenMatrix_>().rows())> >(ncells),
262 sanisizer::cast<I<decltype(std::declval<EigenMatrix_>().cols())> >(ngenes)
263 );
264
265 // By default, Eigen's matrices are column major. In such cases, because we want to do
266 // a transposition, we pretend it's row major during the conversion.
267 static_assert(!EigenMatrix_::IsRowMajor);
269 mat,
270 /* row_major = */ true,
271 emat->data(),
272 [&]{
273 tatami::ConvertToDenseOptions opt;
274 opt.num_threads = options.num_threads;
275 return opt;
276 }()
277 );
278
279 center_v.array() = emat->array().colwise().sum();
280 if (ncells) {
281 center_v /= ncells;
282 } else {
283 std::fill(center_v.begin(), center_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
284 }
285 emat->array().rowwise() -= center_v.adjoint().array(); // applying it to avoid wasting time with deferred operations inside IRLBA.
286
287 scale_v.array() = emat->array().colwise().squaredNorm();
288 if (ncells > 1) {
289 scale_v /= ncells - 1;
290 } else {
291 std::fill(scale_v.begin(), scale_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
292 }
293
294 total_var = process_scale_vector(options.scale, scale_v);
295 if (options.scale) {
296 emat->array().rowwise() /= scale_v.adjoint().array();
297 }
298
299 return std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> >(
301 );
302
303 } else {
304 compute_row_means_and_variances<false>(mat, options.num_threads, center_v, scale_v);
305 total_var = process_scale_vector(options.scale, scale_v);
306
307 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > output(
308 new irlba_tatami::Transposed<EigenVector_, EigenMatrix_, Value_, Index_, decltype(&mat)>(&mat, options.num_threads)
309 );
310 return prepare_deferred_matrix_for_irlba(std::move(output), options, center_v, scale_v);
311 }
312}
322template<typename EigenMatrix_, typename EigenVector_>
332 EigenMatrix_ components;
333
339 EigenVector_ variance_explained;
340
345 typename EigenVector_::Scalar total_variance = 0;
346
352 EigenMatrix_ rotation;
353
358 EigenVector_ center;
359
364 EigenVector_ scale;
365
369 bool converged = false;
370};
371
375template<typename Value_, typename Index_, typename EigenMatrix_, class EigenVector_, class SubsetFunction_>
376void simple_pca_internal(
378 const SimplePcaOptions& options,
380 SubsetFunction_ subset_fun
381) {
383
384 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > ptr;
385 if (mat.sparse()) {
386 ptr = prepare_sparse_matrix_for_irlba<EigenMatrix_>(mat, options, output.center, output.scale, output.total_variance);
387 } else {
388 ptr = prepare_dense_matrix_for_irlba<EigenMatrix_>(mat, options, output.center, output.scale, output.total_variance);
389 }
390
391 const auto stats = irlba::compute(*ptr, options.number, output.components, output.rotation, output.variance_explained, options.irlba_options);
392 output.converged = stats.first;
393
394 subset_fun(output.components, output.variance_explained);
395
396 clean_up(mat.ncol(), output.components, output.variance_explained);
397 if (options.transpose) {
398 output.components.adjointInPlace();
399 }
400
401 if (!options.scale) {
402 output.scale = EigenVector_();
403 }
404}
432template<typename Value_, typename Index_, typename EigenMatrix_, class EigenVector_>
434 simple_pca_internal(
435 mat,
436 options,
437 output,
438 [](const EigenMatrix_&, const EigenVector_&) -> void {}
439 );
440}
441
457template<typename EigenMatrix_ = Eigen::MatrixXd, class EigenVector_ = Eigen::VectorXd, typename Value_, typename Index_>
463
464}
465
466#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
Principal component analysis on single-cell data.
void simple_pca(const tatami::Matrix< Value_, Index_ > &mat, const SimplePcaOptions &options, SimplePcaResults< EigenMatrix_, EigenVector_ > &output)
Definition simple_pca.hpp:433
void parallelize(Function_ fun, const Index_ tasks, const int threads)
CompressedSparseContents< StoredValue_, StoredIndex_, StoredPointer_ > retrieve_compressed_sparse_contents(const Matrix< InputValue_, InputIndex_ > &matrix, const bool row, const RetrieveCompressedSparseContentsOptions &options)
void convert_to_dense(const Matrix< InputValue_, InputIndex_ > &matrix, const bool row_major, StoredValue_ *const store, const ConvertToDenseOptions &options)
Container_ create_container_of_Index_size(const Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, const bool row, const Index_ iter_start, const Index_ iter_length, Args_ &&... args)
Options for simple_pca().
Definition simple_pca.hpp:29
bool realize_matrix
Definition simple_pca.hpp:65
irlba::Options< Eigen::VectorXd > irlba_options
Definition simple_pca.hpp:76
bool transpose
Definition simple_pca.hpp:59
int num_threads
Definition simple_pca.hpp:71
bool scale
Definition simple_pca.hpp:53
int number
Definition simple_pca.hpp:46
Results of simple_pca().
Definition simple_pca.hpp:323
EigenMatrix_ components
Definition simple_pca.hpp:332
EigenMatrix_ rotation
Definition simple_pca.hpp:352
bool converged
Definition simple_pca.hpp:369
EigenVector_ center
Definition simple_pca.hpp:358
EigenVector_ variance_explained
Definition simple_pca.hpp:339
EigenVector_ scale
Definition simple_pca.hpp:364
EigenVector_::Scalar total_variance
Definition simple_pca.hpp:345
bool sparse_extract_index