irlba
A C++ library for IRLBA
Loading...
Searching...
No Matches
sparse.hpp
Go to the documentation of this file.
1#ifndef IRLBA_MATRIX_SPARSE_HPP
2#define IRLBA_MATRIX_SPARSE_HPP
3
4#include <vector>
5#include <memory>
6#include <cstddef>
7#include <cassert>
8
9#include "../utils.hpp"
10#include "../parallel.hpp"
11#include "interface.hpp"
12
13#include "Eigen/Dense"
14#include "sanisizer/sanisizer.hpp"
15
16#ifndef IRLBA_CUSTOM_PARALLEL
17#include "subpar/subpar.hpp"
18#endif
19
25namespace irlba {
26
30template<class ValueArray_, class IndexArray_, class PointerArray_ >
/**
 * Shared implementation for a compressed sparse matrix with parallelized
 * multiplication. The matrix is stored in the canonical three-array
 * compressed form (values, secondary indices, primary-dimension pointers);
 * `column_major` selects whether the primary dimension is columns (CSC) or
 * rows (CSR). Non-zero elements are partitioned across threads up front so
 * that each thread handles roughly the same number of non-zeros.
 */
template<class ValueArray_, class IndexArray_, class PointerArray_ >
class ParallelSparseMatrixCore {
public:
    // Integer type of the pointer array's elements, deduced from its
    // subscript operator (I<> presumably strips references/cv — defined in utils.hpp).
    typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;

public:
    /**
     * @param nrow Number of rows in the matrix.
     * @param ncol Number of columns in the matrix.
     * @param x Values of the non-zero elements.
     * @param i Secondary-dimension indices of the non-zero elements
     *   (row indices if `column_major`, column indices otherwise).
     * @param p Offsets into `x`/`i` for the start of each primary-dimension
     *   element; must be addressable up to and including the primary dimension.
     * @param column_major Whether `(x, i, p)` is in compressed sparse column format.
     * @param num_threads Number of threads for multiplication; must be positive.
     */
    ParallelSparseMatrixCore(
        Eigen::Index nrow,
        Eigen::Index ncol,
        ValueArray_ x,
        IndexArray_ i,
        PointerArray_ p,
        bool column_major,
        int num_threads
    ) :
        my_primary_dim(column_major ? ncol : nrow),
        my_secondary_dim(column_major ? nrow : ncol),
        my_num_threads(num_threads),
        my_values(std::move(x)),
        my_indices(std::move(i)),
        my_ptrs(std::move(p)),
        my_column_major(column_major)
    {
        assert(num_threads > 0);
        if (num_threads > 1) {
            const auto total_nzeros = my_ptrs[my_primary_dim]; // last element - not using back() to avoid an extra requirement on PointerArray.
            const PointerType per_thread_floor = total_nzeros / my_num_threads;
            const int per_thread_extra = total_nzeros % my_num_threads;

            // Note that we do a lot of 't + 1' incrementing, but this is guaranteed to fit in an int because 't + 1 <= my_num_threads'.
            // We just need 'my_num_threads + 1' to fit in a size_t for the various vector allocations.
            const auto nthreads_p1 = sanisizer::sum<std::size_t>(my_num_threads, 1);

            // Splitting primary dimension elements across threads so each thread processes the same number of nonzero elements.
            sanisizer::resize(my_primary_boundaries, nthreads_p1);

            Eigen::Index primary_counter = 0;
            PointerType sofar = 0;
            for (int t = 0; t < my_num_threads; ++t) {
                sofar += per_thread_floor + (t < per_thread_extra); // first few threads might get an extra element to deal with the remainder.
                // Advance past every primary element whose non-zeros are fully
                // covered by this thread's cumulative allotment; boundaries
                // always land on whole primary elements (rows/columns).
                while (primary_counter < my_primary_dim && my_ptrs[primary_counter + 1] <= sofar) {
                    ++primary_counter;
                }
                my_primary_boundaries[t + 1] = primary_counter;
            }
        }
    }

private:
    Eigen::Index my_primary_dim, my_secondary_dim;
    int my_num_threads;

    ValueArray_ my_values;
    IndexArray_ my_indices;
    PointerArray_ my_ptrs;
    bool my_column_major;

    // my_primary_boundaries[t] .. my_primary_boundaries[t + 1] is the range of
    // primary-dimension elements assigned to thread 't'. Empty when single-threaded.
    std::vector<Eigen::Index> my_primary_boundaries;

public:
    // Number of matrix rows, mapping primary/secondary back through the storage order.
    Eigen::Index rows() const {
        if (my_column_major) {
            return my_secondary_dim;
        } else {
            return my_primary_dim;
        }
    }

    // Number of matrix columns.
    Eigen::Index cols() const {
        if (my_column_major) {
            return my_primary_dim;
        } else {
            return my_secondary_dim;
        }
    }

    // Accessor for the non-zero values.
    const ValueArray_& get_values() const {
        return my_values;
    }

    // Accessor for the secondary-dimension indices of the non-zeros.
    const IndexArray_& get_indices() const {
        return my_indices;
    }

    // Accessor for the primary-dimension offset pointers.
    const PointerArray_& get_pointers() const {
        return my_ptrs;
    }

    // Number of threads used for multiplication.
    int get_num_threads() const {
        return my_num_threads;
    }

    // Whether the storage is compressed sparse column.
    bool get_column_major() const {
        return my_column_major;
    }

    // Per-thread partition boundaries on the primary dimension (see above).
    const std::vector<Eigen::Index>& get_primary_boundaries() const {
        return my_primary_boundaries;
    }

public:
    /**
     * Scatter-style multiplication along the secondary dimension:
     * for each primary element p, accumulates values[s] * rhs[p] into
     * output[indices[s]]. 'output' must have length equal to the secondary
     * dimension; 'rhs' must cover the primary dimension.
     * 'thread_buffers' must have size 'num_threads - 1' when multi-threaded.
     */
    template<typename EigenVector_>
    void indirect_multiply(const EigenVector_& rhs, std::vector<std::vector<typename EigenVector_::Scalar> >& thread_buffers, EigenVector_& output) const {
        if (my_num_threads == 1) {
            output.setZero();
            for (Eigen::Index c = 0; c < my_primary_dim; ++c) {
                const auto start = my_ptrs[c];
                const auto end = my_ptrs[c + 1];
                const auto val = rhs.coeff(c);
                for (PointerType s = start; s < end; ++s) {
                    output.coeffRef(my_indices[s]) += my_values[s] * val;
                }
            }
            return;
        }

        parallelize(my_num_threads, [&](int t) -> void {
            // Using a separate buffer for the other threads to avoid false
            // sharing. On first use, each buffer is allocated within each
            // thread to give malloc a chance of using thread-specific arenas.
            typename EigenVector_::Scalar* optr;
            if (t != 0) {
                auto& curbuffer = thread_buffers[t - 1];
                sanisizer::resize(curbuffer, my_secondary_dim);
                optr = curbuffer.data();
            } else {
                optr = output.data();
            }
            std::fill_n(optr, my_secondary_dim, static_cast<typename EigenVector_::Scalar>(0));

            // Using a thread-dependent reduction strategy. This results in slightly
            // different results depending on the number of threads, oh well.
            const auto curstart = my_primary_boundaries[t];
            const auto curend = my_primary_boundaries[t + 1];
            for (Eigen::Index p = curstart; p < curend; ++p) {
                const auto start = my_ptrs[p];
                const auto end = my_ptrs[p + 1];
                const auto val = rhs.coeff(p);
                for (PointerType s = start; s < end; ++s) {
                    optr[my_indices[s]] += my_values[s] * val;
                }
            }
        });

        // Serial reduction of the per-thread partial sums into the output.
        assert(sanisizer::is_equal(thread_buffers.size(), my_num_threads - 1));
        for (const auto& buffer : thread_buffers) {
            for (Eigen::Index x = 0; x < my_secondary_dim; ++x) {
                output.coeffRef(x) += buffer[x];
            }
        }
    }

public:
    /**
     * Gather-style multiplication along the primary dimension:
     * output[p] is the dot product of primary element p with 'rhs'.
     * 'output' must have length equal to the primary dimension; 'rhs' must
     * cover the secondary dimension. No thread buffers are needed as each
     * output element is written by exactly one thread.
     */
    template<typename EigenVector_>
    void direct_multiply(const EigenVector_& rhs, EigenVector_& output) const {
        if (my_num_threads == 1) {
            for (Eigen::Index c = 0; c < my_primary_dim; ++c) {
                output.coeffRef(c) = column_dot_product<typename EigenVector_::Scalar>(c, rhs);
            }
            return;
        }

        parallelize(my_num_threads, [&](int t) -> void {
            const auto curstart = my_primary_boundaries[t];
            const auto curend = my_primary_boundaries[t + 1];
            for (auto c = curstart; c < curend; ++c) {
                output.coeffRef(c) = column_dot_product<typename EigenVector_::Scalar>(c, rhs);
            }
        });
    }

private:
    // Dot product of primary element 'p' against 'rhs', accumulated in Scalar_.
    template<typename Scalar_, class EigenVector_>
    Scalar_ column_dot_product(Eigen::Index p, const EigenVector_& rhs) const {
        const PointerType primary_start = my_ptrs[p], primary_end = my_ptrs[p + 1];
        if (primary_start == primary_end) {
            return 0;
        }

        // Copying Eigen's use of two accumulators; effectively unrolls the loop a little for speed.
        Scalar_ dot1 = 0, dot2 = 0;

        PointerType s = primary_start;
        const PointerType primary_end_m1 = primary_end - 1;
        for (; s < primary_end_m1; s += 2) {
            dot1 += my_values[s] * rhs.coeff(my_indices[s]);
            dot2 += my_values[s + 1] * rhs.coeff(my_indices[s + 1]);
        }

        // Pick up the trailing element when the non-zero count is odd.
        if (s < primary_end) {
            dot1 += my_values[s] * rhs.coeff(my_indices[s]);
        }

        return dot1 + dot2;
    }
};
226
227template<class EigenVector_, class ValueArray_, class IndexArray_, class PointerArray_ >
228class ParallelSparseWorkspace final : public Workspace<EigenVector_> {
229public:
230 ParallelSparseWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :
231 my_core(core)
232 {
233 if (my_core.get_num_threads() > 1 && my_core.get_column_major()) {
234 sanisizer::resize(my_thread_buffers, my_core.get_num_threads() - 1);
235 }
236 }
237
238private:
239 const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;
240 std::vector<std::vector<typename EigenVector_::Scalar> > my_thread_buffers;
241
242public:
243 void multiply(const EigenVector_& right, EigenVector_& output) {
244 if (my_core.get_column_major()) {
245 my_core.indirect_multiply(right, my_thread_buffers, output);
246 } else {
247 my_core.direct_multiply(right, output);
248 }
249 }
250};
251
252template<class EigenVector_, class ValueArray_, class IndexArray_, class PointerArray_ >
253class ParallelSparseAdjointWorkspace final : public AdjointWorkspace<EigenVector_> {
254public:
255 ParallelSparseAdjointWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :
256 my_core(core)
257 {
258 if (my_core.get_num_threads() > 1 && !my_core.get_column_major()) {
259 sanisizer::resize(my_thread_buffers, my_core.get_num_threads() - 1);
260 }
261 }
262
263private:
264 const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;
265 std::vector<std::vector<typename EigenVector_::Scalar> > my_thread_buffers;
266
267public:
268 void multiply(const EigenVector_& right, EigenVector_& output) {
269 if (my_core.get_column_major()) {
270 my_core.direct_multiply(right, output);
271 } else {
272 my_core.indirect_multiply(right, my_thread_buffers, output);
273 }
274 }
275};
276
277template<class EigenMatrix_, class ValueArray_, class IndexArray_, class PointerArray_ >
278class ParallelSparseRealizeWorkspace final : public RealizeWorkspace<EigenMatrix_> {
279public:
280 ParallelSparseRealizeWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :
281 my_core(core)
282 {}
283
284private:
285 const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;
286
287public:
288 const EigenMatrix_& realize(EigenMatrix_& buffer) {
289 const auto nr = my_core.rows(), nc = my_core.cols();
290 buffer.resize(nr, nc);
291 buffer.setZero();
292
293 const auto& ptrs = my_core.get_pointers();
294 const auto& indices = my_core.get_indices();
295 const auto& values = my_core.get_values();
296
297 typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;
298 if (my_core.get_column_major()) {
299 for (Eigen::Index c = 0; c < nc; ++c) {
300 PointerType col_start = ptrs[c], col_end = ptrs[c + 1];
301 for (PointerType s = col_start; s < col_end; ++s) {
302 buffer.coeffRef(indices[s], c) = values[s];
303 }
304 }
305 } else {
306 for (Eigen::Index r = 0; r < nr; ++r) {
307 PointerType row_start = ptrs[r], row_end = ptrs[r + 1];
308 for (PointerType s = row_start; s < row_end; ++s) {
309 buffer.coeffRef(r, indices[s]) = values[s];
310 }
311 }
312 }
313
314 return buffer;
315 }
316};
352template<
353 class EigenVector_,
354 class EigenMatrix_,
355 class ValueArray_,
356 class IndexArray_,
357 class PointerArray_
358>
359class ParallelSparseMatrix final : public Matrix<EigenVector_, EigenMatrix_> {
360public:
366
384 ParallelSparseMatrix(Eigen::Index nrow, Eigen::Index ncol, ValueArray_ x, IndexArray_ i, PointerArray_ p, bool column_major, int num_threads) :
385 my_core(nrow, ncol, std::move(x), std::move(i), std::move(p), column_major, num_threads)
386 {}
387
388private:
389 ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_> my_core;
390
391public:
395 Eigen::Index rows() const {
396 return my_core.rows();
397 }
398
402 Eigen::Index cols() const {
403 return my_core.cols();
404 }
405
410 const ValueArray_& get_values() const {
411 return my_core.get_values();
412 }
413
418 const IndexArray_& get_indices() const {
419 return my_core.get_indices();
420 }
421
425 const PointerArray_& get_pointers() const {
426 return my_core.get_pointers();
427 }
428
432 typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;
433
441 const std::vector<Eigen::Index>& get_primary_boundaries() const {
442 return my_core.get_primary_boundaries();
443 }
444
445public:
446 std::unique_ptr<Workspace<EigenVector_> > new_workspace() const {
447 return new_known_workspace();
448 }
449
450 std::unique_ptr<AdjointWorkspace<EigenVector_> > new_adjoint_workspace() const {
452 }
453
454 std::unique_ptr<RealizeWorkspace<EigenMatrix_> > new_realize_workspace() const {
456 }
457
458public:
462 auto new_known_workspace() const {
463 return std::make_unique<ParallelSparseWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> >(my_core);
464 }
465
470 return std::make_unique<ParallelSparseAdjointWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> >(my_core);
471 }
472
477 return std::make_unique<ParallelSparseRealizeWorkspace<EigenMatrix_, ValueArray_, IndexArray_, PointerArray_> >(my_core);
478 }
479
480};
481
482}
483
484#endif
Interface for a matrix to use in compute().
Definition interface.hpp:142
Sparse matrix with customizable parallelization.
Definition sparse.hpp:359
ParallelSparseMatrix(Eigen::Index nrow, Eigen::Index ncol, ValueArray_ x, IndexArray_ i, PointerArray_ p, bool column_major, int num_threads)
Definition sparse.hpp:384
const std::vector< Eigen::Index > & get_primary_boundaries() const
Definition sparse.hpp:441
Eigen::Index cols() const
Definition sparse.hpp:402
const ValueArray_ & get_values() const
Definition sparse.hpp:410
std::unique_ptr< AdjointWorkspace< EigenVector_ > > new_adjoint_workspace() const
Definition sparse.hpp:450
Eigen::Index rows() const
Definition sparse.hpp:395
I< decltype(std::declval< PointerArray_ >()[0])> PointerType
Definition sparse.hpp:432
ParallelSparseMatrix()
Definition sparse.hpp:365
std::unique_ptr< RealizeWorkspace< EigenMatrix_ > > new_realize_workspace() const
Definition sparse.hpp:454
auto new_known_adjoint_workspace() const
Definition sparse.hpp:469
const PointerArray_ & get_pointers() const
Definition sparse.hpp:425
const IndexArray_ & get_indices() const
Definition sparse.hpp:418
std::unique_ptr< Workspace< EigenVector_ > > new_workspace() const
Definition sparse.hpp:446
auto new_known_realize_workspace() const
Definition sparse.hpp:476
auto new_known_workspace() const
Definition sparse.hpp:462
Interfaces for matrix inputs.
Implements IRLBA for approximate SVD.
Definition compute.hpp:23
void parallelize(Task_ num_tasks, Run_ run_task)
Definition parallel.hpp:33
Classes for parallelized multiplication.