scran_pca
Principal component analysis for single-cell data
Loading...
Searching...
No Matches
simple_pca.hpp
Go to the documentation of this file.
1#ifndef SCRAN_PCA_SIMPLE_PCA_HPP
2#define SCRAN_PCA_SIMPLE_PCA_HPP
3
4#include <vector>
5#include <type_traits>
6#include <algorithm>
7
8#include "tatami/tatami.hpp"
9#include "tatami_stats/tatami_stats.hpp"
10#include "irlba/irlba.hpp"
11#include "irlba/parallel.hpp"
12#include "Eigen/Dense"
13#include "sanisizer/sanisizer.hpp"
14
15#include "utils.hpp"
16
22namespace scran_pca {
23
32 // Avoid throwing an error if too many PCs are requested.
34 }
44 int number = 25;
45
51 bool scale = false;
52
57 bool transpose = true;
58
63 bool realize_matrix = true;
64
69 int num_threads = 1;
70
75};
76
80template<bool sparse_, typename Value_, typename Index_, class EigenVector_>
81void compute_row_means_and_variances(const tatami::Matrix<Value_, Index_>& mat, const int num_threads, EigenVector_& center_v, EigenVector_& scale_v) {
82 const auto ngenes = mat.nrow();
83
84 if (mat.prefer_rows()) {
85 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
86 auto ext = tatami::consecutive_extractor<sparse_>(mat, true, start, length, [&]{
88 opt.sparse_extract_index = false;
89 return opt;
90 }());
91 const auto ncells = mat.ncol();
93
94 for (Index_ g = start, end = start + length; g < end; ++g) {
95 const auto results = [&]{
96 if constexpr(sparse_) {
97 auto range = ext->fetch(vbuffer.data(), NULL);
98 return tatami_stats::variances::direct(range.value, range.number, ncells, /* skip_nan = */ false);
99 } else {
100 auto ptr = ext->fetch(vbuffer.data());
101 return tatami_stats::variances::direct(ptr, ncells, /* skip_nan = */ false);
102 }
103 }();
104 center_v.coeffRef(g) = results.first;
105 scale_v.coeffRef(g) = results.second;
106 }
107 }, ngenes, num_threads);
108
109 } else {
110 tatami::parallelize([&](int t, Index_ start, Index_ length) -> void {
111 const auto ncells = mat.ncol();
112 auto ext = tatami::consecutive_extractor<sparse_>(mat, false, static_cast<Index_>(0), ncells, start, length);
113
114 typedef typename EigenVector_::Scalar Scalar;
115 tatami_stats::LocalOutputBuffer<Scalar> cbuffer(t, start, length, center_v.data());
116 tatami_stats::LocalOutputBuffer<Scalar> sbuffer(t, start, length, scale_v.data());
117
118 auto running = [&]{
119 if constexpr(sparse_) {
120 return tatami_stats::variances::RunningSparse<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false, /* subtract = */ start);
121 } else {
122 return tatami_stats::variances::RunningDense<Scalar, Value_, Index_>(length, cbuffer.data(), sbuffer.data(), /* skip_nan = */ false);
123 }
124 }();
125
127 auto ibuffer = [&]{
128 if constexpr(sparse_) {
130 } else {
131 return false;
132 }
133 }();
134
135 for (Index_ c = 0; c < ncells; ++c) {
136 if constexpr(sparse_) {
137 const auto range = ext->fetch(vbuffer.data(), ibuffer.data());
138 running.add(range.value, range.index, range.number);
139 } else {
140 const auto ptr = ext->fetch(vbuffer.data());
141 running.add(ptr);
142 }
143 }
144
145 running.finish();
146 cbuffer.transfer();
147 sbuffer.transfer();
148 }, ngenes, num_threads);
149 }
150}
151
152template<class EigenVector_, class EigenMatrix_>
153std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_deferred_matrix_for_irlba(
154 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > ptr,
155 const SimplePcaOptions& options,
156 const EigenVector_& center_v,
157 const EigenVector_& scale_v
158) {
159 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > alt;
160 alt.reset(new irlba::CenteredMatrix<EigenVector_, EigenMatrix_, I<decltype(ptr)>, I<decltype(&center_v)>>(std::move(ptr), &center_v));
161 ptr.swap(alt);
162
163 if (options.scale) {
164 alt.reset(new irlba::ScaledMatrix<EigenVector_, EigenMatrix_, I<decltype(ptr)>, I<decltype(&scale_v)>>(std::move(ptr), &scale_v, true, true));
165 ptr.swap(alt);
166 }
167
168 return ptr;
169}
170
171template<class EigenMatrix_, typename Value_, typename Index_, class EigenVector_>
172std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_sparse_matrix_for_irlba(
174 const SimplePcaOptions& options,
175 EigenVector_& center_v,
176 EigenVector_& scale_v,
177 typename EigenVector_::Scalar& total_var
178) {
179 const auto ngenes = mat.nrow();
180 sanisizer::resize(center_v, ngenes);
181 sanisizer::resize(scale_v, ngenes);
182 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > output;
183
184 if (options.realize_matrix) {
185 // 'extracted' contains row-major contents...
187 mat,
188 /* row = */ true,
189 [&]{
191 opt.two_pass = false;
192 opt.num_threads = options.num_threads;
193 return opt;
194 }()
195 );
196
197 // But we effectively transpose it to CSC with genes in columns.
198 const Index_ ncells = mat.ncol();
199 const auto sparse_ptr = new irlba::ParallelSparseMatrix<
200 EigenVector_,
201 EigenMatrix_,
202 I<decltype(extracted.value)>,
203 I<decltype(extracted.index)>,
204 I<decltype(extracted.pointers)>
205 >(
206 ncells,
207 ngenes,
208 std::move(extracted.value),
209 std::move(extracted.index),
210 std::move(extracted.pointers),
211 true,
212 options.num_threads
213 );
214 output.reset(sparse_ptr);
215
216 tatami::parallelize([&](const int, const Index_ start, const Index_ length) -> void {
217 const auto& pointers = sparse_ptr->get_pointers();
218 const auto& values = sparse_ptr->get_values();
219 for (Index_ g = start, end = start + length; g < end; ++g) {
220 const auto offset = pointers[g];
221 const auto next_offset = pointers[g + 1]; // increment won't overflow as 'g + 1 <= end'.
222 const Index_ num_nonzero = next_offset - offset;
223 const auto results = tatami_stats::variances::direct(values.data() + offset, num_nonzero, ncells, /* skip_nan = */ false);
224 center_v.coeffRef(g) = results.first;
225 scale_v.coeffRef(g) = results.second;
226 }
227 }, ngenes, options.num_threads);
228
229 total_var = process_scale_vector(options.scale, scale_v);
230
231 } else {
232 compute_row_means_and_variances<true>(mat, options.num_threads, center_v, scale_v);
233 total_var = process_scale_vector(options.scale, scale_v);
234
235 output.reset(new TransposedTatamiWrapperMatrix<EigenVector_, EigenMatrix_, Value_, Index_>(mat, options.num_threads));
236 }
237
238 return prepare_deferred_matrix_for_irlba(std::move(output), options, center_v, scale_v);
239}
240
241template<class EigenMatrix_, typename Value_, typename Index_, class EigenVector_>
242std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > prepare_dense_matrix_for_irlba(
244 const SimplePcaOptions& options,
245 EigenVector_& center_v,
246 EigenVector_& scale_v,
247 typename EigenVector_::Scalar& total_var
248) {
249 const Index_ ngenes = mat.nrow();
250 sanisizer::resize(center_v, ngenes);
251 sanisizer::resize(scale_v, ngenes);
252
253 if (options.realize_matrix) {
254 // Create a matrix with genes in columns.
255 const Index_ ncells = mat.ncol();
256 auto emat = std::make_unique<Eigen::MatrixXd>(
257 sanisizer::cast<I<decltype(std::declval<EigenMatrix_>().rows())> >(ncells),
258 sanisizer::cast<I<decltype(std::declval<EigenMatrix_>().cols())> >(ngenes)
259 );
260
261 // If emat is row-major, we want to fill it with columns of 'mat', so row_major = false.
262 // If emat is column-major, we want to fill it with rows of 'mat', so row_major = true.
264 mat,
265 /* row_major = */ !(emat->IsRowMajor),
266 emat->data(),
267 [&]{
268 tatami::ConvertToDenseOptions opt;
269 opt.num_threads = options.num_threads;
270 return opt;
271 }()
272 );
273
274 center_v.array() = emat->array().colwise().sum();
275 if (ncells) {
276 center_v /= ncells;
277 } else {
278 std::fill(center_v.begin(), center_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
279 }
280 emat->array().rowwise() -= center_v.adjoint().array(); // applying it to avoid wasting time with deferred operations inside IRLBA.
281
282 scale_v.array() = emat->array().colwise().squaredNorm();
283 if (ncells > 1) {
284 scale_v /= ncells - 1;
285 } else {
286 std::fill(scale_v.begin(), scale_v.end(), std::numeric_limits<typename EigenVector_::Scalar>::quiet_NaN());
287 }
288
289 total_var = process_scale_vector(options.scale, scale_v);
290 if (options.scale) {
291 emat->array().rowwise() /= scale_v.adjoint().array();
292 }
293
294 return std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> >(
296 );
297
298 } else {
299 compute_row_means_and_variances<false>(mat, options.num_threads, center_v, scale_v);
300 total_var = process_scale_vector(options.scale, scale_v);
301
302 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > output(
303 new TransposedTatamiWrapperMatrix<EigenVector_, EigenMatrix_, Value_, Index_>(mat, options.num_threads)
304 );
305 return prepare_deferred_matrix_for_irlba(std::move(output), options, center_v, scale_v);
306 }
307}
317template<typename EigenMatrix_, typename EigenVector_>
325 EigenMatrix_ components;
326
331 EigenVector_ variance_explained;
332
337 typename EigenVector_::Scalar total_variance = 0;
338
344 EigenMatrix_ rotation;
345
350 EigenVector_ center;
351
356 EigenVector_ scale;
357
361 bool converged = false;
362};
363
387template<typename Value_, typename Index_, typename EigenMatrix_, class EigenVector_>
390
391 std::unique_ptr<irlba::Matrix<EigenVector_, EigenMatrix_> > ptr;
392 if (mat.sparse()) {
393 ptr = prepare_sparse_matrix_for_irlba<EigenMatrix_>(mat, options, output.center, output.scale, output.total_variance);
394 } else {
395 ptr = prepare_dense_matrix_for_irlba<EigenMatrix_>(mat, options, output.center, output.scale, output.total_variance);
396 }
397
398 const auto stats = irlba::compute(*ptr, options.number, output.components, output.rotation, output.variance_explained, options.irlba_options);
399 output.converged = stats.first;
400
401 clean_up(mat.ncol(), output.components, output.variance_explained);
402 if (options.transpose) {
403 output.components.adjointInPlace();
404 }
405
406 if (!options.scale) {
407 output.scale = EigenVector_();
408 }
409}
410
426template<typename EigenMatrix_ = Eigen::MatrixXd, class EigenVector_ = Eigen::VectorXd, typename Value_, typename Index_>
432
433}
434
435#endif
virtual Index_ ncol() const=0
virtual Index_ nrow() const=0
virtual bool prefer_rows() const=0
virtual std::unique_ptr< MyopicSparseExtractor< Value_, Index_ > > sparse(bool row, const Options &opt) const=0
std::pair< bool, int > compute(const Matrix_ &matrix, const Eigen::Index number, EigenMatrix_ &outU, EigenMatrix_ &outV, EigenVector_ &outD, const Options< EigenVector_ > &options)
Principal component analysis on single-cell data.
void simple_pca(const tatami::Matrix< Value_, Index_ > &mat, const SimplePcaOptions &options, SimplePcaResults< EigenMatrix_, EigenVector_ > &output)
Definition simple_pca.hpp:388
void parallelize(Function_ fun, const Index_ tasks, const int threads)
CompressedSparseContents< StoredValue_, StoredIndex_, StoredPointer_ > retrieve_compressed_sparse_contents(const Matrix< InputValue_, InputIndex_ > &matrix, const bool row, const RetrieveCompressedSparseContentsOptions &options)
void convert_to_dense(const Matrix< InputValue_, InputIndex_ > &matrix, const bool row_major, StoredValue_ *const store, const ConvertToDenseOptions &options)
Container_ create_container_of_Index_size(const Index_ x, Args_ &&... args)
auto consecutive_extractor(const Matrix< Value_, Index_ > &matrix, const bool row, const Index_ iter_start, const Index_ iter_length, Args_ &&... args)
Options for simple_pca().
Definition simple_pca.hpp:27
bool realize_matrix
Definition simple_pca.hpp:63
irlba::Options< Eigen::VectorXd > irlba_options
Definition simple_pca.hpp:74
bool transpose
Definition simple_pca.hpp:57
int num_threads
Definition simple_pca.hpp:69
bool scale
Definition simple_pca.hpp:51
int number
Definition simple_pca.hpp:44
Results of simple_pca().
Definition simple_pca.hpp:318
EigenMatrix_ components
Definition simple_pca.hpp:325
EigenMatrix_ rotation
Definition simple_pca.hpp:344
bool converged
Definition simple_pca.hpp:361
EigenVector_ center
Definition simple_pca.hpp:350
EigenVector_ variance_explained
Definition simple_pca.hpp:331
EigenVector_ scale
Definition simple_pca.hpp:356
EigenVector_::Scalar total_variance
Definition simple_pca.hpp:337
bool sparse_extract_index