scran_pca
Principal component analysis for single-cell data
Loading...
Searching...
No Matches
simple_pca.hpp
Go to the documentation of this file.
1#ifndef SCRAN_PCA_SIMPLE_PCA_HPP
2#define SCRAN_PCA_SIMPLE_PCA_HPP
3
4#include <vector>
5#include <type_traits>
6#include <algorithm>
7
8#include "tatami/tatami.hpp"
9#include "tatami_stats/tatami_stats.hpp"
10#include "irlba/irlba.hpp"
11#include "irlba/parallel.hpp"
12#include "Eigen/Dense"
13#include "sanisizer/sanisizer.hpp"
14
15#include "utils.hpp"
16
22namespace scran_pca {
23
33 }
43 int number = 25;
44
49 bool scale = false;
50
55 bool transpose = true;
56
61 int num_threads = 1;
62
67 bool realize_matrix = true;
68
73};
74
78namespace internal {
79
80template<bool sparse_, typename Value_, typename Index_, class EigenVector_>
81void compute_row_means_and_variances(const tatami::Matrix<Value_, Index_>& mat, int num_threads, EigenVector_& center_v, EigenVector_& scale_v) {
82 auto ngenes = mat.nrow();
83
84 if (mat.prefer_rows()) {
85 tatami::parallelize([&](int , Index_ start, Index_ length) -> void {
86 auto ext = tatami::consecutive_extractor<sparse_>(mat, true, start, length, [&]{
88 opt.sparse_extract_index = false;
89 return opt;
90 }());
91 auto ncells = mat.ncol();
93
94 for (Index_ g = start, end = start + length; g < end; ++g) {
95 auto results = [&]{
96 if constexpr(sparse_) {
97 auto range = ext->fetch(vbuffer.data(), NULL);
98 return tatami_stats::variances::direct(range.value, range.number, ncells, /* skip_nan = */ false);
99 } else {
100 auto ptr = ext->fetch(vbuffer.data());
101 return tatami_stats::variances::direct(ptr, ncells, /* skip_nan = */ false);
102 }
103 }();
104 center_v.coeffRef(g) = results.first;
105 scale_v.coeffRef(g) = results.second;
106 }
107 }, ngenes, num_threads);
108
109 } else {
110 tatami::parallelize([&](int t, Index_ start, Index_ length) -> void {
111 auto ncells = mat.ncol();
112 auto ext = tatami::consecutive_extractor<sparse_>(mat, false, static_cast<Index_>(0), ncells, start, length);
113
114 typedef typename EigenVector_::Scalar Scalar;
115 tatami_stats::LocalOutputBuffer<Scalar> cbuffer(t, start, length, center_v.data());
116 tatami_stats::LocalOutputBuffer<Scalar> sbuffer(t, start, length, scale_v.data());
117
118 auto running = [&]{
119 if constexpr(sparse_) {
120 return tatami_stats::variances::RunningSparse<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false, /* subtract = */ start);
121 } else {
122 return tatami_stats::variances::RunningDense<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false);
123 }
124 }();
125
127 auto ibuffer = [&]{
128 if constexpr(sparse_) {
130 } else {
131 return false;
132 }
133 }();
134
135 for (Index_ c = 0; c < ncells; ++c) {
136 if constexpr(sparse_) {
137 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
138 running.add(range.value, range.index, range.number);
139 } else {
140 auto ptr = ext->fetch(vbuffer.data());
141 running.add(ptr);
142 }
143 }
144
145 running.finish();
146 cbuffer.transfer();
147 sbuffer.transfer();
148 }, ngenes, num_threads);
149 }
150}
151
152template<class IrlbaMatrix_, class EigenMatrix_, class EigenVector_>
153auto run_irlba_deferred(
154 const IrlbaMatrix_& mat,
155 const SimplePcaOptions& options,
156 EigenMatrix_& components,
157 EigenMatrix_& rotation,
158 EigenVector_& variance_explained,
159 EigenVector_& center_v,
160 EigenVector_& scale_v)
161{
162 irlba::Centered<IrlbaMatrix_, EigenVector_> centered(mat, center_v);
163 if (options.scale) {
164 irlba::Scaled<true, decltype(centered), EigenVector_> scaled(centered, scale_v, true);
165 return irlba::compute(scaled, options.number, components, rotation, variance_explained, options.irlba_options);
166 } else {
167 return irlba::compute(centered, options.number, components, rotation, variance_explained, options.irlba_options);
168 }
169}
170
171template<typename Value_, typename Index_, class EigenMatrix_, class EigenVector_>
172void run_sparse(
174 const SimplePcaOptions& options,
175 EigenMatrix_& components,
176 EigenMatrix_& rotation,
177 EigenVector_& variance_explained,
178 EigenVector_& center_v,
179 EigenVector_& scale_v,
180 typename EigenVector_::Scalar& total_var,
181 bool& converged)
182{
183 auto ngenes = mat.nrow();
184 center_v.resize(tatami::cast_Index_to_container_size<decltype(center_v)>(ngenes));
185 scale_v.resize(tatami::cast_Index_to_container_size<decltype(scale_v)>(ngenes));
186
187 if (options.realize_matrix) {
188 // 'extracted' contains row-major contents...
190 mat,
191 /* row = */ true,
192 [&]{
194 opt.two_pass = false;
195 opt.num_threads = options.num_threads;
196 return opt;
197 }()
198 );
199
200 // But we effectively transpose it to CSC with genes in columns.
201 Index_ ncells = mat.ncol();
203 ncells,
204 ngenes,
205 std::move(extracted.value),
206 std::move(extracted.index),
207 std::move(extracted.pointers),
208 true,
209 options.num_threads
210 );
211
212 tatami::parallelize([&](int, Index_ start, Index_ length) -> void {
213 const auto& pointers = emat.get_pointers();
214 const auto& values = emat.get_values();
215 for (Index_ g = start, end = start + length; g < end; ++g) {
216 auto offset = pointers[g];
217 auto next_offset = pointers[g + 1]; // increment won't overflow as 'g + 1 <= end'.
218 Index_ num_nonzero = next_offset - offset;
219 auto results = tatami_stats::variances::direct(values.data() + offset, num_nonzero, ncells, /* skip_nan = */ false);
220 center_v.coeffRef(g) = results.first;
221 scale_v.coeffRef(g) = results.second;
222 }
223 }, ngenes, options.num_threads);
224
225 total_var = internal::process_scale_vector(options.scale, scale_v);
226 auto out = run_irlba_deferred(emat, options, components, rotation, variance_explained, center_v, scale_v);
227 converged = out.first;
228
229 } else {
230 compute_row_means_and_variances<true>(mat, options.num_threads, center_v, scale_v);
231 total_var = internal::process_scale_vector(options.scale, scale_v);
232 auto out = run_irlba_deferred(
233 internal::TransposedTatamiWrapper<EigenVector_, Value_, Index_>(mat, options.num_threads),
234 options,
235 components,
236 rotation,
237 variance_explained,
238 center_v,
239 scale_v
240 );
241 converged = out.first;
242 }
243}
244
245template<typename Value_, typename Index_, class EigenMatrix_, class EigenVector_>
246void run_dense(
248 const SimplePcaOptions& options,
249 EigenMatrix_& components,
250 EigenMatrix_& rotation,
251 EigenVector_& variance_explained,
252 EigenVector_& center_v,
253 EigenVector_& scale_v,
254 typename EigenVector_::Scalar& total_var,
255 bool& converged)
256{
257 Index_ ngenes = mat.nrow();
258 center_v.resize(tatami::cast_Index_to_container_size<decltype(center_v)>(ngenes));
259 scale_v.resize(tatami::cast_Index_to_container_size<decltype(scale_v)>(ngenes));
260
261 if (options.realize_matrix) {
262 // Create a matrix with genes in columns.
263 Index_ ncells = mat.ncol();
264 EigenMatrix_ emat(
265 sanisizer::cast<decltype(std::declval<EigenMatrix_>().rows())>(ncells),
266 sanisizer::cast<decltype(std::declval<EigenMatrix_>().cols())>(ngenes)
267 );
268
269 // If emat is row-major, we want to fill it with columns of 'mat', so row_major = false.
270 // If emat is column-major, we want to fill it with rows of 'mat', so row_major = true.
272 mat,
273 /* row_major = */ !emat.IsRowMajor,
274 emat.data(),
275 [&]{
276 tatami::ConvertToDenseOptions opt;
277 opt.num_threads = options.num_threads;
278 return opt;
279 }()
280 );
281
282 center_v.array() = emat.array().colwise().sum();
283 if (ncells) {
284 center_v /= ncells;
285 } else {
286 std::fill(center_v.begin(), center_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
287 }
288 emat.array().rowwise() -= center_v.adjoint().array(); // applying it to avoid wasting time with deferred operations inside IRLBA.
289
290 scale_v.array() = emat.array().colwise().squaredNorm();
291 if (ncells > 1) {
292 scale_v /= ncells - 1;
293 } else {
294 std::fill(scale_v.begin(), scale_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
295 }
296
297 total_var = internal::process_scale_vector(options.scale, scale_v);
298 if (options.scale) {
299 emat.array().rowwise() /= scale_v.adjoint().array();
300 }
301
302 auto out = irlba::compute(emat, options.number, components, rotation, variance_explained, options.irlba_options);
303 converged = out.first;
304
305 } else {
306 compute_row_means_and_variances<false>(mat, options.num_threads, center_v, scale_v);
307 total_var = internal::process_scale_vector(options.scale, scale_v);
308 auto out = run_irlba_deferred(
309 internal::TransposedTatamiWrapper<EigenVector_, Value_, Index_>(mat, options.num_threads),
310 options,
311 components,
312 rotation,
313 variance_explained,
314 center_v,
315 scale_v
316 );
317 converged = out.first;
318 }
319}
320
321}
331template<typename EigenMatrix_, typename EigenVector_>
339 EigenMatrix_ components;
340
345 EigenVector_ variance_explained;
346
351 typename EigenVector_::Scalar total_variance = 0;
352
358 EigenMatrix_ rotation;
359
364 EigenVector_ center;
365
370 EigenVector_ scale;
371
375 bool converged = false;
376};
377
401template<typename Value_, typename Index_, typename EigenMatrix_, class EigenVector_>
404
405 if (mat.sparse()) {
406 internal::run_sparse(mat, options, output.components, output.rotation, output.variance_explained, output.center, output.scale, output.total_variance, output.converged);
407 } else {
408 internal::run_dense(mat, options, output.components, output.rotation, output.variance_explained, output.center, output.scale, output.total_variance, output.converged);
409 }
410
411 internal::clean_up(mat.ncol(), output.components, output.variance_explained);
412 if (options.transpose) {
413 output.components.adjointInPlace();
414 }
415
416 if (!options.scale) {
417 output.scale = EigenVector_();
418 }
419}
420
436template<typename EigenMatrix_ = Eigen::MatrixXd, class EigenVector_ = Eigen::VectorXd, typename Value_, typename Index_>
442
443}
444
445#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
std::pair< bool, int > compute(const Matrix_ &matrix, Eigen::Index number, EigenMatrix_ &outU, EigenMatrix_ &outV, EigenVector_ &outD, const Options &options)
Principal component analysis on single-cell data.
void simple_pca(const tatami::Matrix< Value_, Index_ > &mat, const SimplePcaOptions &options, SimplePcaResults< EigenMatrix_, EigenVector_ > &output)
Definition simple_pca.hpp:402
CompressedSparseContents< StoredValue_, StoredIndex_, StoredPointer_ > retrieve_compressed_sparse_contents(const Matrix< InputValue_, InputIndex_ > &matrix, bool row, const RetrieveCompressedSparseContentsOptions &options)
decltype(std::declval< Container_ >().size()) cast_Index_to_container_size(Index_ x)
void parallelize(Function_ fun, Index_ tasks, int threads)
Container_ create_container_of_Index_size(Index_ x, Args_ &&... args)
void convert_to_dense(const Matrix< InputValue_, InputIndex_ > &matrix, bool row_major, StoredValue_ *store, const ConvertToDenseOptions &options)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, bool row, Index_ iter_start, Index_ iter_length, Args_ &&... args)
Options for simple_pca().
Definition simple_pca.hpp:27
bool realize_matrix
Definition simple_pca.hpp:67
bool transpose
Definition simple_pca.hpp:55
int num_threads
Definition simple_pca.hpp:61
irlba::Options irlba_options
Definition simple_pca.hpp:72
bool scale
Definition simple_pca.hpp:49
int number
Definition simple_pca.hpp:43
Results of simple_pca().
Definition simple_pca.hpp:332
EigenMatrix_ components
Definition simple_pca.hpp:339
EigenMatrix_ rotation
Definition simple_pca.hpp:358
bool converged
Definition simple_pca.hpp:375
EigenVector_ center
Definition simple_pca.hpp:364
EigenVector_ variance_explained
Definition simple_pca.hpp:345
EigenVector_ scale
Definition simple_pca.hpp:370
EigenVector_::Scalar total_variance
Definition simple_pca.hpp:351
bool sparse_extract_index