irlba
A C++ library for IRLBA
Loading...
Searching...
No Matches
sparse.hpp
Go to the documentation of this file.
1#ifndef IRLBA_MATRIX_SPARSE_HPP
2#define IRLBA_MATRIX_SPARSE_HPP
3
4#include <vector>
5#include <memory>
6#include <cstddef>
7
8#include "../utils.hpp"
9#include "../parallel.hpp"
10#include "interface.hpp"
11
12#include "Eigen/Dense"
13#include "sanisizer/sanisizer.hpp"
14
15#ifndef IRLBA_CUSTOM_PARALLEL
16#include "subpar/subpar.hpp"
17#endif
18
24namespace irlba {
25
/**
 * @brief Shared core of the parallel sparse matrix classes.
 *
 * Holds a compressed sparse matrix (CSC when `column_major = true`, CSR otherwise)
 * and, when more than one thread is requested, precomputes the per-thread boundaries
 * used by `direct_multiply()` and `indirect_multiply()` so that each thread handles
 * approximately the same number of non-zero elements.
 *
 * "Primary" refers to the compressed dimension (columns for CSC, rows for CSR);
 * "secondary" refers to the other dimension.
 */
template<class ValueArray_, class IndexArray_, class PointerArray_ >
class ParallelSparseMatrixCore {
public:
    // Element type of the pointer array. 'I<...>' is presumably a project alias
    // (from utils.hpp) that strips references/cv-qualifiers from the result of
    // the subscript operator -- TODO confirm.
    typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;

public:
    /**
     * @param nrow Number of rows.
     * @param ncol Number of columns.
     * @param x Array of non-zero values.
     * @param i Secondary-dimension index of each non-zero value.
     * @param p Array of length (primary dimension + 1) of offsets into `x`/`i`
     *   for each primary-dimension element; `p[primary_dim]` is the total
     *   number of non-zeros.
     * @param column_major Whether the data is compressed sparse column.
     * @param num_threads Number of threads for the multiplication methods.
     */
    ParallelSparseMatrixCore(
        Eigen::Index nrow,
        Eigen::Index ncol,
        ValueArray_ x,
        IndexArray_ i,
        PointerArray_ p,
        bool column_major,
        int num_threads
    ) :
        my_primary_dim(column_major ? ncol : nrow),
        my_secondary_dim(column_major ? nrow : ncol),
        my_num_threads(num_threads),
        my_values(std::move(x)),
        my_indices(std::move(i)),
        my_ptrs(std::move(p)),
        my_column_major(column_major)
    {
        // Boundary precomputation is only needed for the parallel code paths.
        if (num_threads > 1) {
            const auto total_nzeros = my_ptrs[my_primary_dim]; // last element - not using back() to avoid an extra requirement on PointerArray.
            const PointerType per_thread_floor = total_nzeros / my_num_threads;
            const int per_thread_extra = total_nzeros % my_num_threads; // remainder fits in an int as it is < my_num_threads.

            // Note that we do a lot of 't + 1' incrementing, but this is guaranteed to fit in an int because 't + 1 <= my_num_threads'.
            // We just need 'my_num_threads + 1' to fit in a size_t for the various vector allocations.
            const auto nthreads_p1 = sanisizer::sum<std::size_t>(my_num_threads, 1);

            // Splitting primary dimension elements across threads so each thread processes the same number of nonzero elements.
            {
                // Presumably sanisizer::resize() value-initializes new elements,
                // so my_primary_boundaries[0] stays 0 -- TODO confirm.
                sanisizer::resize(my_primary_boundaries, nthreads_p1);

                Eigen::Index primary_counter = 0;
                PointerType sofar = 0;
                for (int t = 0; t < my_num_threads; ++t) {
                    sofar += per_thread_floor + (t < per_thread_extra); // first few threads might get an extra element to deal with the remainder.
                    // Advance while the cumulative nonzero count up to and
                    // including 'primary_counter' does not exceed this thread's quota.
                    while (primary_counter < my_primary_dim && my_ptrs[primary_counter + 1] <= sofar) {
                        ++primary_counter;
                    }
                    my_primary_boundaries[t + 1] = primary_counter;
                }
            }

            // Splitting secondary dimension elements across threads so each thread processes the same number of nonzero elements.
            {
                // Histogram of non-zeros per secondary-dimension element.
                auto secondary_nonzeros = sanisizer::create<std::vector<PointerType> >(my_secondary_dim);
                for (PointerType i = 0; i < total_nzeros; ++i) { // not using a range-based for loop to avoid an extra requirement on IndexArray.
                    ++(secondary_nonzeros[my_indices[i]]);
                }

                sanisizer::resize(my_secondary_boundaries, nthreads_p1);
                Eigen::Index secondary_counter = 0;
                PointerType sofar = 0;
                PointerType cum_secondary = 0; // running total of non-zeros in secondary elements [0, secondary_counter).
                for (int t = 0; t < my_num_threads; ++t) {
                    sofar += per_thread_floor + (t < per_thread_extra); // first few threads might get an extra element to deal with the remainder.
                    while (secondary_counter < my_secondary_dim && cum_secondary <= sofar) {
                        cum_secondary += secondary_nonzeros[secondary_counter];
                        ++secondary_counter;
                    }
                    my_secondary_boundaries[t + 1] = secondary_counter;
                }

                // For each thread/primary-element pair, find where that thread's
                // slice of secondary indices starts within the primary element's run
                // of non-zeros. Relies on indices being sorted within each run.
                sanisizer::resize(my_secondary_nonzero_boundaries, nthreads_p1);
                for (auto& starts : my_secondary_nonzero_boundaries) {
                    sanisizer::resize(starts, my_primary_dim);
                }

                for (Eigen::Index c = 0; c < my_primary_dim; ++c) {
                    const auto primary_start = my_ptrs[c], primary_end = my_ptrs[c + 1];
                    my_secondary_nonzero_boundaries[0][c] = primary_start;
                    auto s = primary_start;
                    for (int thread = 0; thread < my_num_threads; ++thread) {
                        const auto limit = my_secondary_boundaries[thread + 1];
                        // Scan forward to the first non-zero at or beyond this thread's
                        // secondary boundary; that position ends thread's slice for 'c'.
                        while (s < primary_end && my_indices[s] < limit) {
                            ++s;
                        }
                        my_secondary_nonzero_boundaries[thread + 1][c] = s;
                    }
                }
            }
        }
    }

private:
    Eigen::Index my_primary_dim, my_secondary_dim;
    int my_num_threads;

    ValueArray_ my_values;   // non-zero values.
    IndexArray_ my_indices;  // secondary index of each non-zero value.
    PointerArray_ my_ptrs;   // per-primary-element offsets into my_values/my_indices.
    bool my_column_major;

    // my_primary_boundaries[t] .. my_primary_boundaries[t + 1] is the range of
    // primary elements processed by thread 't' in direct_multiply().
    std::vector<Eigen::Index> my_primary_boundaries;
    // Same idea for secondary elements, used by indirect_multiply().
    std::vector<Eigen::Index> my_secondary_boundaries;
    // my_secondary_nonzero_boundaries[t][c] is the offset of the first non-zero of
    // primary element 'c' that belongs to thread 't' (the [t + 1] entry is its end).
    std::vector<std::vector<PointerType> > my_secondary_nonzero_boundaries;

public:
    // Number of rows of the represented matrix.
    Eigen::Index rows() const {
        if (my_column_major) {
            return my_secondary_dim;
        } else {
            return my_primary_dim;
        }
    }

    // Number of columns of the represented matrix.
    Eigen::Index cols() const {
        if (my_column_major) {
            return my_primary_dim;
        } else {
            return my_secondary_dim;
        }
    }

    // Reference to the array of non-zero values.
    const ValueArray_& get_values() const {
        return my_values;
    }

    // Reference to the array of secondary indices of the non-zeros.
    const IndexArray_& get_indices() const {
        return my_indices;
    }

    // Reference to the array of per-primary-element offsets.
    const PointerArray_& get_pointers() const {
        return my_ptrs;
    }

    // Number of threads used for multiplication.
    int get_num_threads() const {
        return my_num_threads;
    }

    // Whether the matrix is stored in compressed sparse column format.
    bool get_column_major() const {
        return my_column_major;
    }

    // Per-thread boundaries along the primary dimension (empty if single-threaded).
    const std::vector<Eigen::Index>& get_primary_boundaries() const {
        return my_primary_boundaries;
    }

    // Per-thread boundaries along the secondary dimension (empty if single-threaded).
    const std::vector<Eigen::Index>& get_secondary_boundaries() const {
        return my_secondary_boundaries;
    }

    // Per-thread, per-primary-element non-zero offsets (empty if single-threaded).
    const std::vector<std::vector<PointerType> >& get_secondary_nonzero_boundaries() const {
        return my_secondary_nonzero_boundaries;
    }

public:
    /**
     * Computes output[my_indices[s]] += my_values[s] * rhs[c] over all non-zeros,
     * i.e., the product where 'rhs' runs along the primary dimension and 'output'
     * along the secondary dimension. In the parallel path, each thread accumulates
     * its own contiguous slice of the secondary dimension.
     *
     * @param rhs Vector of length equal to the primary dimension.
     * @param thread_buffers Scratch vectors, one per thread beyond the first;
     *   resized and filled here.
     * @param output Preallocated vector of length equal to the secondary dimension.
     */
    template<typename EigenVector_>
    void indirect_multiply(const EigenVector_& rhs, std::vector<std::vector<typename EigenVector_::Scalar> >& thread_buffers, EigenVector_& output) const {
        if (my_num_threads == 1) {
            output.setZero();
            for (Eigen::Index c = 0; c < my_primary_dim; ++c) {
                auto start = my_ptrs[c];
                auto end = my_ptrs[c + 1];
                auto val = rhs.coeff(c);
                for (PointerType s = start; s < end; ++s) {
                    output.coeffRef(my_indices[s]) += my_values[s] * val;
                }
            }
            return;
        }

        parallelize(my_num_threads, [&](int t) -> void {
            const auto secondary_start = my_secondary_boundaries[t];
            const auto secondary_end = my_secondary_boundaries[t + 1];
            const auto secondary_len = secondary_end - secondary_start;

            // Using a separate buffer for the other threads to avoid false
            // sharing. On first use, each buffer is allocated within each
            // thread to give malloc a chance of using thread-specific arenas.
            typename EigenVector_::Scalar* optr;
            if (t != 0) {
                auto& curbuffer = thread_buffers[t - 1];
                sanisizer::resize(curbuffer, secondary_len);
                optr = curbuffer.data();
            } else {
                // Thread 0 writes directly into its slice of the output.
                optr = output.data() + secondary_start;
            }
            std::fill_n(optr, secondary_len, static_cast<typename EigenVector_::Scalar>(0));

            // For each primary element, only traverse the non-zeros whose
            // secondary index falls in this thread's slice.
            const auto& nz_starts = my_secondary_nonzero_boundaries[t];
            const auto& nz_ends = my_secondary_nonzero_boundaries[t + 1];
            for (Eigen::Index c = 0; c < my_primary_dim; ++c) {
                const auto nz_start = nz_starts[c];
                const auto nz_end = nz_ends[c];
                const auto val = rhs.coeff(c);
                for (PointerType s = nz_start; s < nz_end; ++s) {
                    optr[my_indices[s] - secondary_start] += my_values[s] * val;
                }
            }

            // Copy the scratch results back to the caller's output vector.
            if (t != 0) {
                std::copy_n(optr, secondary_len, output.data() + secondary_start);
            }
        });

        return;
    }

public:
    /**
     * Computes output[c] = dot(non-zeros of primary element c, rhs), i.e., the
     * product where 'rhs' runs along the secondary dimension and 'output' along
     * the primary dimension. Threads take disjoint ranges of primary elements.
     *
     * @param rhs Vector of length equal to the secondary dimension.
     * @param output Preallocated vector of length equal to the primary dimension.
     */
    template<typename EigenVector_>
    void direct_multiply(const EigenVector_& rhs, EigenVector_& output) const {
        if (my_num_threads == 1) {
            for (Eigen::Index c = 0; c < my_primary_dim; ++c) {
                output.coeffRef(c) = column_dot_product<typename EigenVector_::Scalar>(c, rhs);
            }
            return;
        }

        parallelize(my_num_threads, [&](int t) -> void {
            const auto curstart = my_primary_boundaries[t];
            const auto curend = my_primary_boundaries[t + 1];
            for (auto c = curstart; c < curend; ++c) {
                output.coeffRef(c) = column_dot_product<typename EigenVector_::Scalar>(c, rhs);
            }
        });

        return;
    }

private:
    // Dot product of the non-zeros of primary element 'p' with 'rhs' (indexed by
    // the secondary dimension), accumulated in the requested scalar type.
    template<typename Scalar_, class EigenVector_>
    Scalar_ column_dot_product(Eigen::Index p, const EigenVector_& rhs) const {
        PointerType primary_start = my_ptrs[p], primary_end = my_ptrs[p + 1];
        Scalar_ dot = 0;
        for (PointerType s = primary_start; s < primary_end; ++s) {
            dot += my_values[s] * rhs.coeff(my_indices[s]);
        }
        return dot;
    }
};
278template<class EigenVector_, class ValueArray_, class IndexArray_, class PointerArray_ >
279class ParallelSparseWorkspace final : public Workspace<EigenVector_> {
280public:
284 ParallelSparseWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :
285 my_core(core)
286 {
287 if (my_core.get_num_threads() > 1 && my_core.get_column_major()) {
288 my_thread_buffers.resize(my_core.get_num_threads() - 1);
289 }
290 }
295private:
296 const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;
297 std::vector<std::vector<typename EigenVector_::Scalar> > my_thread_buffers;
298
299public:
300 void multiply(const EigenVector_& right, EigenVector_& output) {
301 if (my_core.get_column_major()) {
302 my_core.indirect_multiply(right, my_thread_buffers, output);
303 } else {
304 my_core.direct_multiply(right, output);
305 }
306 }
307};
308
319template<class EigenVector_, class ValueArray_, class IndexArray_, class PointerArray_ >
320class ParallelSparseAdjointWorkspace final : public AdjointWorkspace<EigenVector_> {
321public:
325 ParallelSparseAdjointWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :
326 my_core(core)
327 {
328 if (my_core.get_num_threads() > 1 && !my_core.get_column_major()) {
329 my_thread_buffers.resize(my_core.get_num_threads() - 1);
330 }
331 }
336private:
337 const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;
338 std::vector<std::vector<typename EigenVector_::Scalar> > my_thread_buffers;
339
340public:
341 void multiply(const EigenVector_& right, EigenVector_& output) {
342 if (my_core.get_column_major()) {
343 my_core.direct_multiply(right, output);
344 } else {
345 my_core.indirect_multiply(right, my_thread_buffers, output);
346 }
347 }
348};
349
360template<class EigenMatrix_, class ValueArray_, class IndexArray_, class PointerArray_ >
361class ParallelSparseRealizeWorkspace final : public RealizeWorkspace<EigenMatrix_> {
362public:
366 ParallelSparseRealizeWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :
367 my_core(core)
368 {}
373private:
374 const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;
375
376public:
377 const EigenMatrix_& realize(EigenMatrix_& buffer) {
378 const auto nr = my_core.rows(), nc = my_core.cols();
379 buffer.resize(nr, nc);
380 buffer.setZero();
381
382 const auto& ptrs = my_core.get_pointers();
383 const auto& indices = my_core.get_indices();
384 const auto& values = my_core.get_values();
385
386 typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;
387 if (my_core.get_column_major()) {
388 for (Eigen::Index c = 0; c < nc; ++c) {
389 PointerType col_start = ptrs[c], col_end = ptrs[c + 1];
390 for (PointerType s = col_start; s < col_end; ++s) {
391 buffer.coeffRef(indices[s], c) = values[s];
392 }
393 }
394 } else {
395 for (Eigen::Index r = 0; r < nr; ++r) {
396 PointerType row_start = ptrs[r], row_end = ptrs[r + 1];
397 for (PointerType s = row_start; s < row_end; ++s) {
398 buffer.coeffRef(r, indices[s]) = values[s];
399 }
400 }
401 }
402
403 return buffer;
404 }
405};
406
433template<
434 class EigenVector_,
435 class EigenMatrix_,
436 class ValueArray_,
437 class IndexArray_,
438 class PointerArray_
439>
440class ParallelSparseMatrix final : public Matrix<EigenVector_, EigenMatrix_> {
441public:
447
464 ParallelSparseMatrix(Eigen::Index nrow, Eigen::Index ncol, ValueArray_ x, IndexArray_ i, PointerArray_ p, bool column_major, int num_threads) :
465 my_core(nrow, ncol, std::move(x), std::move(i), std::move(p), column_major, num_threads)
466 {}
467
468private:
469 ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_> my_core;
470
471public:
475 Eigen::Index rows() const {
476 return my_core.rows();
477 }
478
482 Eigen::Index cols() const {
483 return my_core.cols();
484 }
485
490 const ValueArray_& get_values() const {
491 return my_core.get_values();
492 }
493
498 const IndexArray_& get_indices() const {
499 return my_core.get_indices();
500 }
501
505 const PointerArray_& get_pointers() const {
506 return my_core.get_pointers();
507 }
508
512 typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;
513
521 const std::vector<Eigen::Index>& get_primary_boundaries() const {
522 return my_core.get_primary_boundaries();
523 }
524
532 const std::vector<Eigen::Index>& get_secondary_boundaries() const {
533 return my_core.get_secondary_boundaries();
534 }
535
545 const std::vector<std::vector<PointerType> >& get_secondary_nonzero_boundaries() const {
546 return my_core.get_secondary_nonzero_boundaries();
547 }
548
549public:
550 std::unique_ptr<Workspace<EigenVector_> > new_workspace() const {
551 return new_known_workspace();
552 }
553
554 std::unique_ptr<AdjointWorkspace<EigenVector_> > new_adjoint_workspace() const {
556 }
557
558 std::unique_ptr<RealizeWorkspace<EigenMatrix_> > new_realize_workspace() const {
560 }
561
562public:
566 std::unique_ptr<ParallelSparseWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> > new_known_workspace() const {
567 return std::make_unique<ParallelSparseWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> >(my_core);
568 }
569
573 std::unique_ptr<ParallelSparseAdjointWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> > new_known_adjoint_workspace() const {
574 return std::make_unique<ParallelSparseAdjointWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> >(my_core);
575 }
576
580 std::unique_ptr<ParallelSparseRealizeWorkspace<EigenMatrix_, ValueArray_, IndexArray_, PointerArray_> > new_known_realize_workspace() const {
581 return std::make_unique<ParallelSparseRealizeWorkspace<EigenMatrix_, ValueArray_, IndexArray_, PointerArray_> >(my_core);
582 }
583
584};
585
586}
587
588#endif
Workspace class for multiplying a transposed Matrix.
Definition interface.hpp:61
Interface for a matrix to use in compute().
Definition interface.hpp:142
Workspace for multiplication of a transposed ParallelSparseMatrix.
Definition sparse.hpp:320
void multiply(const EigenVector_ &right, EigenVector_ &output)
Definition sparse.hpp:341
Sparse matrix with customizable parallelization.
Definition sparse.hpp:440
ParallelSparseMatrix(Eigen::Index nrow, Eigen::Index ncol, ValueArray_ x, IndexArray_ i, PointerArray_ p, bool column_major, int num_threads)
Definition sparse.hpp:464
const std::vector< Eigen::Index > & get_primary_boundaries() const
Definition sparse.hpp:521
Eigen::Index cols() const
Definition sparse.hpp:482
const ValueArray_ & get_values() const
Definition sparse.hpp:490
std::unique_ptr< AdjointWorkspace< EigenVector_ > > new_adjoint_workspace() const
Definition sparse.hpp:554
Eigen::Index rows() const
Definition sparse.hpp:475
I< decltype(std::declval< PointerArray_ >()[0])> PointerType
Definition sparse.hpp:512
ParallelSparseMatrix()
Definition sparse.hpp:446
std::unique_ptr< RealizeWorkspace< EigenMatrix_ > > new_realize_workspace() const
Definition sparse.hpp:558
std::unique_ptr< ParallelSparseWorkspace< EigenVector_, ValueArray_, IndexArray_, PointerArray_ > > new_known_workspace() const
Definition sparse.hpp:566
const std::vector< std::vector< PointerType > > & get_secondary_nonzero_boundaries() const
Definition sparse.hpp:545
const PointerArray_ & get_pointers() const
Definition sparse.hpp:505
std::unique_ptr< ParallelSparseRealizeWorkspace< EigenMatrix_, ValueArray_, IndexArray_, PointerArray_ > > new_known_realize_workspace() const
Definition sparse.hpp:580
const IndexArray_ & get_indices() const
Definition sparse.hpp:498
std::unique_ptr< Workspace< EigenVector_ > > new_workspace() const
Definition sparse.hpp:550
const std::vector< Eigen::Index > & get_secondary_boundaries() const
Definition sparse.hpp:532
std::unique_ptr< ParallelSparseAdjointWorkspace< EigenVector_, ValueArray_, IndexArray_, PointerArray_ > > new_known_adjoint_workspace() const
Definition sparse.hpp:573
Workspace for realizing a ParallelSparseMatrix.
Definition sparse.hpp:361
const EigenMatrix_ & realize(EigenMatrix_ &buffer)
Definition sparse.hpp:377
Workspace for multiplication of a ParallelSparseMatrix.
Definition sparse.hpp:279
void multiply(const EigenVector_ &right, EigenVector_ &output)
Definition sparse.hpp:300
Workspace class for realizing a Matrix.
Definition interface.hpp:99
Workspace class for multiplying a Matrix.
Definition interface.hpp:24
Interfaces for matrix inputs.
Implements IRLBA for approximate SVD.
Definition compute.hpp:22
void parallelize(Task_ num_tasks, Run_ run_task)
Definition parallel.hpp:33
Classes for parallelized multiplication.