scran_pca
Principal component analysis for single-cell data
Loading...
Searching...
No Matches
simple_pca.hpp
Go to the documentation of this file.
1#ifndef SCRAN_PCA_SIMPLE_PCA_HPP
2#define SCRAN_PCA_SIMPLE_PCA_HPP
3
4#include <vector>
5#include <type_traits>
6#include <algorithm>
7
8#include "tatami/tatami.hpp"
9#include "tatami_stats/tatami_stats.hpp"
10#include "irlba/irlba.hpp"
11#include "irlba/parallel.hpp"
12#include "Eigen/Dense"
13#include "sanisizer/sanisizer.hpp"
14
15#include "utils.hpp"
16
22namespace scran_pca {
23
32 // Avoid throwing an error if too many PCs are requested.
34 }
44 int number = 25;
45
51 bool scale = false;
52
57 bool transpose = true;
58
63 bool realize_matrix = true;
64
69 int num_threads = 1;
70
75};
76
80namespace internal {
81
82template<bool sparse_, typename Value_, typename Index_, class EigenVector_>
83void compute_row_means_and_variances(const tatami::Matrix<Value_, Index_>& mat, const int num_threads, EigenVector_& center_v, EigenVector_& scale_v) {
84 const auto ngenes = mat.nrow();
85
86 if (mat.prefer_rows()) {
87 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
88 auto ext = tatami::consecutive_extractor<sparse_>(mat, true, start, length, [&]{
90 opt.sparse_extract_index = false;
91 return opt;
92 }());
93 const auto ncells = mat.ncol();
95
96 for (Index_ g = start, end = start + length; g < end; ++g) {
97 const auto results = [&]{
98 if constexpr(sparse_) {
99 auto range = ext->fetch(vbuffer.data(), NULL);
100 return tatami_stats::variances::direct(range.value, range.number, ncells, /* skip_nan = */ false);
101 } else {
102 auto ptr = ext->fetch(vbuffer.data());
103 return tatami_stats::variances::direct(ptr, ncells, /* skip_nan = */ false);
104 }
105 }();
106 center_v.coeffRef(g) = results.first;
107 scale_v.coeffRef(g) = results.second;
108 }
109 }, ngenes, num_threads);
110
111 } else {
112 tatami::parallelize([&](int t, Index_ start, Index_ length) -> void {
113 const auto ncells = mat.ncol();
114 auto ext = tatami::consecutive_extractor<sparse_>(mat, false, static_cast<Index_>(0), ncells, start, length);
115
116 typedef typename EigenVector_::Scalar Scalar;
117 tatami_stats::LocalOutputBuffer<Scalar> cbuffer(t, start, length, center_v.data());
118 tatami_stats::LocalOutputBuffer<Scalar> sbuffer(t, start, length, scale_v.data());
119
120 auto running = [&]{
121 if constexpr(sparse_) {
122 return tatami_stats::variances::RunningSparse<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false, /* subtract = */ start);
123 } else {
124 return tatami_stats::variances::RunningDense<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false);
125 }
126 }();
127
129 auto ibuffer = [&]{
130 if constexpr(sparse_) {
132 } else {
133 return false;
134 }
135 }();
136
137 for (Index_ c = 0; c < ncells; ++c) {
138 if constexpr(sparse_) {
139 const auto range = ext->fetch(vbuffer.data(), ibuffer.data());
140 running.add(range.value, range.index, range.number);
141 } else {
142 const auto ptr = ext->fetch(vbuffer.data());
143 running.add(ptr);
144 }
145 }
146
147 running.finish();
148 cbuffer.transfer();
149 sbuffer.transfer();
150 }, ngenes, num_threads);
151 }
152}
153
154template<class IrlbaMatrix_, class EigenMatrix_, class EigenVector_>
155auto run_irlba_deferred(
156 const IrlbaMatrix_& mat,
157 const SimplePcaOptions& options,
158 EigenMatrix_& components,
159 EigenMatrix_& rotation,
160 EigenVector_& variance_explained,
161 EigenVector_& center_v,
162 EigenVector_& scale_v)
163{
164 irlba::Centered<IrlbaMatrix_, EigenVector_> centered(mat, center_v);
165 if (options.scale) {
166 irlba::Scaled<true, decltype(I(centered)), EigenVector_> scaled(centered, scale_v, true);
167 return irlba::compute(scaled, options.number, components, rotation, variance_explained, options.irlba_options);
168 } else {
169 return irlba::compute(centered, options.number, components, rotation, variance_explained, options.irlba_options);
170 }
171}
172
173template<typename Value_, typename Index_, class EigenMatrix_, class EigenVector_>
174void run_sparse(
176 const SimplePcaOptions& options,
177 EigenMatrix_& components,
178 EigenMatrix_& rotation,
179 EigenVector_& variance_explained,
180 EigenVector_& center_v,
181 EigenVector_& scale_v,
182 typename EigenVector_::Scalar& total_var,
183 bool& converged)
184{
185 const auto ngenes = mat.nrow();
186 sanisizer::resize(center_v, ngenes);
187 sanisizer::resize(scale_v, ngenes);
188
189 if (options.realize_matrix) {
190 // 'extracted' contains row-major contents...
192 mat,
193 /* row = */ true,
194 [&]{
196 opt.two_pass = false;
197 opt.num_threads = options.num_threads;
198 return opt;
199 }()
200 );
201
202 // But we effectively transpose it to CSC with genes in columns.
203 const Index_ ncells = mat.ncol();
205 ncells,
206 ngenes,
207 std::move(extracted.value),
208 std::move(extracted.index),
209 std::move(extracted.pointers),
210 true,
211 options.num_threads
212 );
213
214 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
215 const auto& pointers = emat.get_pointers();
216 const auto& values = emat.get_values();
217 for (Index_ g = start, end = start + length; g < end; ++g) {
218 const auto offset = pointers[g];
219 const auto next_offset = pointers[g + 1]; // increment won't overflow as 'g + 1 <= end'.
220 const Index_ num_nonzero = next_offset - offset;
221 const auto results = tatami_stats::variances::direct(values.data() + offset, num_nonzero, ncells, /* skip_nan = */ false);
222 center_v.coeffRef(g) = results.first;
223 scale_v.coeffRef(g) = results.second;
224 }
225 }, ngenes, options.num_threads);
226
227 total_var = internal::process_scale_vector(options.scale, scale_v);
228 const auto out = run_irlba_deferred(emat, options, components, rotation, variance_explained, center_v, scale_v);
229 converged = out.first;
230
231 } else {
232 compute_row_means_and_variances<true>(mat, options.num_threads, center_v, scale_v);
233 total_var = internal::process_scale_vector(options.scale, scale_v);
234 const auto out = run_irlba_deferred(
235 internal::TransposedTatamiWrapper<EigenVector_, Value_, Index_>(mat, options.num_threads),
236 options,
237 components,
238 rotation,
239 variance_explained,
240 center_v,
241 scale_v
242 );
243 converged = out.first;
244 }
245}
246
247template<typename Value_, typename Index_, class EigenMatrix_, class EigenVector_>
248void run_dense(
250 const SimplePcaOptions& options,
251 EigenMatrix_& components,
252 EigenMatrix_& rotation,
253 EigenVector_& variance_explained,
254 EigenVector_& center_v,
255 EigenVector_& scale_v,
256 typename EigenVector_::Scalar& total_var,
257 bool& converged)
258{
259 const Index_ ngenes = mat.nrow();
260 sanisizer::resize(center_v, ngenes);
261 sanisizer::resize(scale_v, ngenes);
262
263 if (options.realize_matrix) {
264 // Create a matrix with genes in columns.
265 const Index_ ncells = mat.ncol();
266 EigenMatrix_ emat(
267 sanisizer::cast<decltype(I(std::declval<EigenMatrix_>().rows()))>(ncells),
268 sanisizer::cast<decltype(I(std::declval<EigenMatrix_>().cols()))>(ngenes)
269 );
270
271 // If emat is row-major, we want to fill it with columns of 'mat', so row_major = false.
272 // If emat is column-major, we want to fill it with rows of 'mat', so row_major = true.
274 mat,
275 /* row_major = */ !emat.IsRowMajor,
276 emat.data(),
277 [&]{
278 tatami::ConvertToDenseOptions opt;
279 opt.num_threads = options.num_threads;
280 return opt;
281 }()
282 );
283
284 center_v.array() = emat.array().colwise().sum();
285 if (ncells) {
286 center_v /= ncells;
287 } else {
288 std::fill(center_v.begin(), center_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
289 }
290 emat.array().rowwise() -= center_v.adjoint().array(); // applying it to avoid wasting time with deferred operations inside IRLBA.
291
292 scale_v.array() = emat.array().colwise().squaredNorm();
293 if (ncells > 1) {
294 scale_v /= ncells - 1;
295 } else {
296 std::fill(scale_v.begin(), scale_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
297 }
298
299 total_var = internal::process_scale_vector(options.scale, scale_v);
300 if (options.scale) {
301 emat.array().rowwise() /= scale_v.adjoint().array();
302 }
303
304 const auto out = irlba::compute(emat, options.number, components, rotation, variance_explained, options.irlba_options);
305 converged = out.first;
306
307 } else {
308 compute_row_means_and_variances<false>(mat, options.num_threads, center_v, scale_v);
309 total_var = internal::process_scale_vector(options.scale, scale_v);
310 const auto out = run_irlba_deferred(
311 internal::TransposedTatamiWrapper<EigenVector_, Value_, Index_>(mat, options.num_threads),
312 options,
313 components,
314 rotation,
315 variance_explained,
316 center_v,
317 scale_v
318 );
319 converged = out.first;
320 }
321}
322
323}
333template<typename EigenMatrix_, typename EigenVector_>
341 EigenMatrix_ components;
342
347 EigenVector_ variance_explained;
348
353 typename EigenVector_::Scalar total_variance = 0;
354
360 EigenMatrix_ rotation;
361
366 EigenVector_ center;
367
372 EigenVector_ scale;
373
377 bool converged = false;
378};
379
403template<typename Value_, typename Index_, typename EigenMatrix_, class EigenVector_>
406
407 if (mat.sparse()) {
408 internal::run_sparse(mat, options, output.components, output.rotation, output.variance_explained, output.center, output.scale, output.total_variance, output.converged);
409 } else {
410 internal::run_dense(mat, options, output.components, output.rotation, output.variance_explained, output.center, output.scale, output.total_variance, output.converged);
411 }
412
413 internal::clean_up(mat.ncol(), output.components, output.variance_explained);
414 if (options.transpose) {
415 output.components.adjointInPlace();
416 }
417
418 if (!options.scale) {
419 output.scale = EigenVector_();
420 }
421}
422
438template<typename EigenMatrix_ = Eigen::MatrixXd, class EigenVector_ = Eigen::VectorXd, typename Value_, typename Index_>
444
445}
446
447#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
std::pair< bool, int > compute(const Matrix_ &matrix, Eigen::Index number, EigenMatrix_ &outU, EigenMatrix_ &outV, EigenVector_ &outD, const Options &options)
Principal component analysis on single-cell data.
void simple_pca(const tatami::Matrix< Value_, Index_ > &mat, const SimplePcaOptions &options, SimplePcaResults< EigenMatrix_, EigenVector_ > &output)
Definition simple_pca.hpp:404
CompressedSparseContents< StoredValue_, StoredIndex_, StoredPointer_ > retrieve_compressed_sparse_contents(const Matrix< InputValue_, InputIndex_ > &matrix, bool row, const RetrieveCompressedSparseContentsOptions &options)
void parallelize(Function_ fun, Index_ tasks, int threads)
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)
void convert_to_dense(const Matrix< InputValue_, InputIndex_ > &matrix, bool row_major, StoredValue_ *store, const ConvertToDenseOptions &options)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
Options for simple_pca().
Definition simple_pca.hpp:27
bool realize_matrix
Definition simple_pca.hpp:63
bool transpose
Definition simple_pca.hpp:57
int num_threads
Definition simple_pca.hpp:69
irlba::Options irlba_options
Definition simple_pca.hpp:74
bool scale
Definition simple_pca.hpp:51
int number
Definition simple_pca.hpp:44
Results of simple_pca().
Definition simple_pca.hpp:334
EigenMatrix_ components
Definition simple_pca.hpp:341
EigenMatrix_ rotation
Definition simple_pca.hpp:360
bool converged
Definition simple_pca.hpp:377
EigenVector_ center
Definition simple_pca.hpp:366
EigenVector_ variance_explained
Definition simple_pca.hpp:347
EigenVector_ scale
Definition simple_pca.hpp:372
EigenVector_::Scalar total_variance
Definition simple_pca.hpp:353
bool sparse_extract_index