irlba
A C++ library for IRLBA
Loading...
Searching...
No Matches
sparse.hpp
Go to the documentation of this file.
1#ifndef IRLBA_MATRIX_SPARSE_HPP
2#define IRLBA_MATRIX_SPARSE_HPP
3
4#include <vector>
5#include <memory>
6#include <cstddef>
7
8#include "../utils.hpp"
9#include "../parallel.hpp"
10#include "interface.hpp"
11
12#include "Eigen/Dense"
13#include "sanisizer/sanisizer.hpp"
14
15#ifndef IRLBA_CUSTOM_PARALLEL
16#include "subpar/subpar.hpp"
17#endif
18
24namespace irlba {
25
/**
 * @brief Shared core of the parallel sparse matrix classes.
 *
 * Holds a compressed sparse matrix (CSC when `column_major = true`, CSR otherwise)
 * and, when more than one thread is requested, precomputes the per-thread boundaries
 * used by `direct_multiply()` and `indirect_multiply()` so that each thread handles
 * approximately the same number of non-zero elements.
 *
 * "Primary" refers to the compressed dimension (columns for CSC, rows for CSR);
 * "secondary" refers to the other dimension.
 */
template<class ValueArray_, class IndexArray_, class PointerArray_ >
class ParallelSparseMatrixCore {
public:
    // Element type of the pointer array. 'I<...>' is presumably a project alias
    // (from utils.hpp) that strips references/cv-qualifiers from the result of
    // the subscript operator -- TODO confirm.
    typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;

public:
    /**
     * @param nrow Number of rows.
     * @param ncol Number of columns.
     * @param x Array of non-zero values.
     * @param i Secondary-dimension index of each non-zero value.
     * @param p Array of length (primary dimension + 1) of offsets into `x`/`i`
     *   for each primary-dimension element; `p[primary_dim]` is the total
     *   number of non-zeros.
     * @param column_major Whether the data is compressed sparse column.
     * @param num_threads Number of threads for the multiplication methods.
     */
    ParallelSparseMatrixCore(
        Eigen::Index nrow,
        Eigen::Index ncol,
        ValueArray_ x,
        IndexArray_ i,
        PointerArray_ p,
        bool column_major,
        int num_threads
    ) :
        my_primary_dim(column_major ? ncol : nrow),
        my_secondary_dim(column_major ? nrow : ncol),
        my_num_threads(num_threads),
        my_values(std::move(x)),
        my_indices(std::move(i)),
        my_ptrs(std::move(p)),
        my_column_major(column_major)
    {
        // Boundary precomputation is only needed for the parallel code paths.
        if (num_threads > 1) {
            const auto total_nzeros = my_ptrs[my_primary_dim]; // last element - not using back() to avoid an extra requirement on PointerArray.
            const PointerType per_thread_floor = total_nzeros / my_num_threads;
            const int per_thread_extra = total_nzeros % my_num_threads; // remainder fits in an int as it is < my_num_threads.

            // Note that we do a lot of 't + 1' incrementing, but this is guaranteed to fit in an int because 't + 1 <= my_num_threads'.
            // We just need 'my_num_threads + 1' to fit in a size_t for the various vector allocations.
            const auto nthreads_p1 = sanisizer::sum<std::size_t>(my_num_threads, 1);

            // Splitting primary dimension elements across threads so each thread processes the same number of nonzero elements.
            {
                // Presumably sanisizer::resize() value-initializes new elements,
                // so my_primary_boundaries[0] stays 0 -- TODO confirm.
                sanisizer::resize(my_primary_boundaries, nthreads_p1);

                Eigen::Index primary_counter = 0;
                PointerType sofar = 0;
                for (int t = 0; t < my_num_threads; ++t) {
                    sofar += per_thread_floor + (t < per_thread_extra); // first few threads might get an extra element to deal with the remainder.
                    // Advance while the cumulative nonzero count up to and
                    // including 'primary_counter' does not exceed this thread's quota.
                    while (primary_counter < my_primary_dim && my_ptrs[primary_counter + 1] <= sofar) {
                        ++primary_counter;
                    }
                    my_primary_boundaries[t + 1] = primary_counter;
                }
            }

            // Splitting secondary dimension elements across threads so each thread processes the same number of nonzero elements.
            {
                // Histogram of non-zeros per secondary-dimension element.
                auto secondary_nonzeros = sanisizer::create<std::vector<PointerType> >(my_secondary_dim);
                for (PointerType i = 0; i < total_nzeros; ++i) { // not using a range-based for loop to avoid an extra requirement on IndexArray.
                    ++(secondary_nonzeros[my_indices[i]]);
                }

                sanisizer::resize(my_secondary_boundaries, nthreads_p1);
                Eigen::Index secondary_counter = 0;
                PointerType sofar = 0;
                PointerType cum_secondary = 0; // running total of non-zeros in secondary elements [0, secondary_counter).
                for (int t = 0; t < my_num_threads; ++t) {
                    sofar += per_thread_floor + (t < per_thread_extra); // first few threads might get an extra element to deal with the remainder.
                    while (secondary_counter < my_secondary_dim && cum_secondary <= sofar) {
                        cum_secondary += secondary_nonzeros[secondary_counter];
                        ++secondary_counter;
                    }
                    my_secondary_boundaries[t + 1] = secondary_counter;
                }

                // For each thread/primary-element pair, find where that thread's
                // slice of secondary indices starts within the primary element's run
                // of non-zeros. Relies on indices being sorted within each run.
                sanisizer::resize(my_secondary_nonzero_boundaries, nthreads_p1);
                for (auto& starts : my_secondary_nonzero_boundaries) {
                    sanisizer::resize(starts, my_primary_dim);
                }

                for (Eigen::Index c = 0; c < my_primary_dim; ++c) {
                    const auto primary_start = my_ptrs[c], primary_end = my_ptrs[c + 1];
                    my_secondary_nonzero_boundaries[0][c] = primary_start;
                    auto s = primary_start;
                    for (int thread = 0; thread < my_num_threads; ++thread) {
                        const auto limit = my_secondary_boundaries[thread + 1];
                        // Scan forward to the first non-zero at or beyond this thread's
                        // secondary boundary; that position ends thread's slice for 'c'.
                        while (s < primary_end && my_indices[s] < limit) {
                            ++s;
                        }
                        my_secondary_nonzero_boundaries[thread + 1][c] = s;
                    }
                }
            }
        }
    }

private:
    Eigen::Index my_primary_dim, my_secondary_dim;
    int my_num_threads;

    ValueArray_ my_values;   // non-zero values.
    IndexArray_ my_indices;  // secondary index of each non-zero value.
    PointerArray_ my_ptrs;   // per-primary-element offsets into my_values/my_indices.
    bool my_column_major;

    // my_primary_boundaries[t] .. my_primary_boundaries[t + 1] is the range of
    // primary elements processed by thread 't' in direct_multiply().
    std::vector<Eigen::Index> my_primary_boundaries;
    // Same idea for secondary elements, used by indirect_multiply().
    std::vector<Eigen::Index> my_secondary_boundaries;
    // my_secondary_nonzero_boundaries[t][c] is the offset of the first non-zero of
    // primary element 'c' that belongs to thread 't' (the [t + 1] entry is its end).
    std::vector<std::vector<PointerType> > my_secondary_nonzero_boundaries;

public:
    // Number of rows of the represented matrix.
    Eigen::Index rows() const {
        if (my_column_major) {
            return my_secondary_dim;
        } else {
            return my_primary_dim;
        }
    }

    // Number of columns of the represented matrix.
    Eigen::Index cols() const {
        if (my_column_major) {
            return my_primary_dim;
        } else {
            return my_secondary_dim;
        }
    }

    // Reference to the array of non-zero values.
    const ValueArray_& get_values() const {
        return my_values;
    }

    // Reference to the array of secondary indices of the non-zeros.
    const IndexArray_& get_indices() const {
        return my_indices;
    }

    // Reference to the array of per-primary-element offsets.
    const PointerArray_& get_pointers() const {
        return my_ptrs;
    }

    // Number of threads used for multiplication.
    int get_num_threads() const {
        return my_num_threads;
    }

    // Whether the matrix is stored in compressed sparse column format.
    bool get_column_major() const {
        return my_column_major;
    }

    // Per-thread boundaries along the primary dimension (empty if single-threaded).
    const std::vector<Eigen::Index>& get_primary_boundaries() const {
        return my_primary_boundaries;
    }

    // Per-thread boundaries along the secondary dimension (empty if single-threaded).
    const std::vector<Eigen::Index>& get_secondary_boundaries() const {
        return my_secondary_boundaries;
    }

    // Per-thread, per-primary-element non-zero offsets (empty if single-threaded).
    const std::vector<std::vector<PointerType> >& get_secondary_nonzero_boundaries() const {
        return my_secondary_nonzero_boundaries;
    }

public:
    /**
     * Computes output[my_indices[s]] += my_values[s] * rhs[c] over all non-zeros,
     * i.e., the product where 'rhs' runs along the primary dimension and 'output'
     * along the secondary dimension. In the parallel path, each thread accumulates
     * its own contiguous slice of the secondary dimension.
     *
     * @param rhs Vector of length equal to the primary dimension.
     * @param thread_buffers Scratch vectors, one per thread beyond the first;
     *   resized and filled here.
     * @param output Preallocated vector of length equal to the secondary dimension.
     */
    template<typename EigenVector_>
    void indirect_multiply(const EigenVector_& rhs, std::vector<std::vector<typename EigenVector_::Scalar> >& thread_buffers, EigenVector_& output) const {
        if (my_num_threads == 1) {
            output.setZero();
            for (Eigen::Index c = 0; c < my_primary_dim; ++c) {
                auto start = my_ptrs[c];
                auto end = my_ptrs[c + 1];
                auto val = rhs.coeff(c);
                for (PointerType s = start; s < end; ++s) {
                    output.coeffRef(my_indices[s]) += my_values[s] * val;
                }
            }
            return;
        }

        parallelize(my_num_threads, [&](int t) -> void {
            const auto secondary_start = my_secondary_boundaries[t];
            const auto secondary_end = my_secondary_boundaries[t + 1];
            const auto secondary_len = secondary_end - secondary_start;

            // Using a separate buffer for the other threads to avoid false
            // sharing. On first use, each buffer is allocated within each
            // thread to give malloc a chance of using thread-specific arenas.
            typename EigenVector_::Scalar* optr;
            if (t != 0) {
                auto& curbuffer = thread_buffers[t - 1];
                sanisizer::resize(curbuffer, secondary_len);
                optr = curbuffer.data();
            } else {
                // Thread 0 writes directly into its slice of the output.
                optr = output.data() + secondary_start;
            }
            std::fill_n(optr, secondary_len, static_cast<typename EigenVector_::Scalar>(0));

            // For each primary element, only traverse the non-zeros whose
            // secondary index falls in this thread's slice.
            const auto& nz_starts = my_secondary_nonzero_boundaries[t];
            const auto& nz_ends = my_secondary_nonzero_boundaries[t + 1];
            for (Eigen::Index c = 0; c < my_primary_dim; ++c) {
                const auto nz_start = nz_starts[c];
                const auto nz_end = nz_ends[c];
                const auto val = rhs.coeff(c);
                for (PointerType s = nz_start; s < nz_end; ++s) {
                    optr[my_indices[s] - secondary_start] += my_values[s] * val;
                }
            }

            // Copy the scratch results back to the caller's output vector.
            if (t != 0) {
                std::copy_n(optr, secondary_len, output.data() + secondary_start);
            }
        });

        return;
    }

public:
    /**
     * Computes output[c] = dot(non-zeros of primary element c, rhs), i.e., the
     * product where 'rhs' runs along the secondary dimension and 'output' along
     * the primary dimension. Threads take disjoint ranges of primary elements.
     *
     * @param rhs Vector of length equal to the secondary dimension.
     * @param output Preallocated vector of length equal to the primary dimension.
     */
    template<typename EigenVector_>
    void direct_multiply(const EigenVector_& rhs, EigenVector_& output) const {
        if (my_num_threads == 1) {
            for (Eigen::Index c = 0; c < my_primary_dim; ++c) {
                output.coeffRef(c) = column_dot_product<typename EigenVector_::Scalar>(c, rhs);
            }
            return;
        }

        parallelize(my_num_threads, [&](int t) -> void {
            const auto curstart = my_primary_boundaries[t];
            const auto curend = my_primary_boundaries[t + 1];
            for (auto c = curstart; c < curend; ++c) {
                output.coeffRef(c) = column_dot_product<typename EigenVector_::Scalar>(c, rhs);
            }
        });

        return;
    }

private:
    // Dot product of the non-zeros of primary element 'p' with 'rhs' (indexed by
    // the secondary dimension), accumulated in the requested scalar type.
    template<typename Scalar_, class EigenVector_>
    Scalar_ column_dot_product(Eigen::Index p, const EigenVector_& rhs) const {
        PointerType primary_start = my_ptrs[p], primary_end = my_ptrs[p + 1];
        Scalar_ dot = 0;
        for (PointerType s = primary_start; s < primary_end; ++s) {
            dot += my_values[s] * rhs.coeff(my_indices[s]);
        }
        return dot;
    }
};
278template<class EigenVector_, class ValueArray_, class IndexArray_, class PointerArray_ >
279class ParallelSparseWorkspace final : public Workspace<EigenVector_> {
280public:
284 ParallelSparseWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :
285 my_core(core)
286 {
287 if (my_core.get_num_threads() > 1 && my_core.get_column_major()) {
288 my_thread_buffers.resize(my_core.get_num_threads() - 1);
289 }
290 }
295private:
296 const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;
297 std::vector<std::vector<typename EigenVector_::Scalar> > my_thread_buffers;
298
299public:
300 void multiply(const EigenVector_& right, EigenVector_& output) {
301 if (my_core.get_column_major()) {
302 my_core.indirect_multiply(right, my_thread_buffers, output);
303 } else {
304 my_core.direct_multiply(right, output);
305 }
306 }
307};
308
319template<class EigenVector_, class ValueArray_, class IndexArray_, class PointerArray_ >
320class ParallelSparseAdjointWorkspace final : public AdjointWorkspace<EigenVector_> {
321public:
325 ParallelSparseAdjointWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :
326 my_core(core)
327 {
328 if (my_core.get_num_threads() > 1 && !my_core.get_column_major()) {
329 my_thread_buffers.resize(my_core.get_num_threads() - 1);
330 }
331 }
336private:
337 const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;
338 std::vector<std::vector<typename EigenVector_::Scalar> > my_thread_buffers;
339
340public:
341 void multiply(const EigenVector_& right, EigenVector_& output) {
342 if (my_core.get_column_major()) {
343 my_core.direct_multiply(right, output);
344 } else {
345 my_core.indirect_multiply(right, my_thread_buffers, output);
346 }
347 }
348};
349
360template<class EigenMatrix_, class ValueArray_, class IndexArray_, class PointerArray_ >
361class ParallelSparseRealizeWorkspace final : public RealizeWorkspace<EigenMatrix_> {
362public:
366 ParallelSparseRealizeWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :
367 my_core(core)
368 {}
373private:
374 const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;
375
376public:
377 const EigenMatrix_& realize(EigenMatrix_& buffer) {
378 const auto nr = my_core.rows(), nc = my_core.cols();
379 buffer.resize(nr, nc);
380 buffer.setZero();
381
382 const auto& ptrs = my_core.get_pointers();
383 const auto& indices = my_core.get_indices();
384 const auto& values = my_core.get_values();
385
386 typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;
387 if (my_core.get_column_major()) {
388 for (Eigen::Index c = 0; c < nc; ++c) {
389 PointerType col_start = ptrs[c], col_end = ptrs[c + 1];
390 for (PointerType s = col_start; s < col_end; ++s) {
391 buffer.coeffRef(indices[s], c) = values[s];
392 }
393 }
394 } else {
395 for (Eigen::Index r = 0; r < nr; ++r) {
396 PointerType row_start = ptrs[r], row_end = ptrs[r + 1];
397 for (PointerType s = row_start; s < row_end; ++s) {
398 buffer.coeffRef(r, indices[s]) = values[s];
399 }
400 }
401 }
402
403 return buffer;
404 }
405};
406
433template<
434 class EigenVector_,
435 class EigenMatrix_,
436 class ValueArray_,
437 class IndexArray_,
438 class PointerArray_
439>
440class ParallelSparseMatrix final : public Matrix<EigenVector_, EigenMatrix_> {
441public:
447
464 ParallelSparseMatrix(Eigen::Index nrow, Eigen::Index ncol, ValueArray_ x, IndexArray_ i, PointerArray_ p, bool column_major, int num_threads) :
465 my_core(nrow, ncol, std::move(x), std::move(i), std::move(p), column_major, num_threads)
466 {}
467
468private:
469 ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_> my_core;
470
471public:
475 Eigen::Index rows() const {
476 return my_core.rows();
477 }
478
482 Eigen::Index cols() const {
483 return my_core.cols();
484 }
485
490 const ValueArray_& get_values() const {
491 return my_core.get_values();
492 }
493
498 const IndexArray_& get_indices() const {
499 return my_core.get_indices();
500 }
501
505 const PointerArray_& get_pointers() const {
506 return my_core.get_pointers();
507 }
508
512 typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;
513
521 const std::vector<Eigen::Index>& get_primary_boundaries() const {
522 return my_core.get_primary_boundaries();
523 }
524
532 const std::vector<Eigen::Index>& get_secondary_boundaries() const {
533 return my_core.get_secondary_boundaries();
534 }
535
545 const std::vector<std::vector<PointerType> >& get_secondary_nonzero_boundaries() const {
546 return my_core.get_secondary_nonzero_boundaries();
547 }
548
549public:
550 std::unique_ptr<Workspace<EigenVector_> > new_workspace() const {
551 return new_known_workspace();
552 }
553
554 std::unique_ptr<AdjointWorkspace<EigenVector_> > new_adjoint_workspace() const {
556 }
557
558 std::unique_ptr<RealizeWorkspace<EigenMatrix_> > new_realize_workspace() const {
560 }
561
562public:
566 std::unique_ptr<ParallelSparseWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> > new_known_workspace() const {
567 return std::make_unique<ParallelSparseWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> >(my_core);
568 }
569
573 std::unique_ptr<ParallelSparseAdjointWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> > new_known_adjoint_workspace() const {
574 return std::make_unique<ParallelSparseAdjointWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> >(my_core);
575 }
576
580 std::unique_ptr<ParallelSparseRealizeWorkspace<EigenMatrix_, ValueArray_, IndexArray_, PointerArray_> > new_known_realize_workspace() const {
581 return std::make_unique<ParallelSparseRealizeWorkspace<EigenMatrix_, ValueArray_, IndexArray_, PointerArray_> >(my_core);
582 }
583
584};
585
586}
587
588#endif
Workspace class for multiplying a transposed Matrix.
Definition interface.hpp:61
Interface for a matrix to use in compute().
Definition interface.hpp:142
Workspace for multiplication of a transposed ParallelSparseMatrix.
Definition sparse.hpp:320
void multiply(const EigenVector_ &right, EigenVector_ &output)
Definition sparse.hpp:341
Sparse matrix with customizable parallelization.
Definition sparse.hpp:440
ParallelSparseMatrix(Eigen::Index nrow, Eigen::Index ncol, ValueArray_ x, IndexArray_ i, PointerArray_ p, bool column_major, int num_threads)
Definition sparse.hpp:464
const std::vector< Eigen::Index > & get_primary_boundaries() const
Definition sparse.hpp:521
Eigen::Index cols() const
Definition sparse.hpp:482
const ValueArray_ & get_values() const
Definition sparse.hpp:490
std::unique_ptr< AdjointWorkspace< EigenVector_ > > new_adjoint_workspace() const
Definition sparse.hpp:554
Eigen::Index rows() const
Definition sparse.hpp:475
I< decltype(std::declval< PointerArray_ >()[0])> PointerType
Definition sparse.hpp:512
ParallelSparseMatrix()
Definition sparse.hpp:446
std::unique_ptr< RealizeWorkspace< EigenMatrix_ > > new_realize_workspace() const
Definition sparse.hpp:558
std::unique_ptr< ParallelSparseWorkspace< EigenVector_, ValueArray_, IndexArray_, PointerArray_ > > new_known_workspace() const
Definition sparse.hpp:566
const std::vector< std::vector< PointerType > > & get_secondary_nonzero_boundaries() const
Definition sparse.hpp:545
const PointerArray_ & get_pointers() const
Definition sparse.hpp:505
std::unique_ptr< ParallelSparseRealizeWorkspace< EigenMatrix_, ValueArray_, IndexArray_, PointerArray_ > > new_known_realize_workspace() const
Definition sparse.hpp:580
const IndexArray_ & get_indices() const
Definition sparse.hpp:498
std::unique_ptr< Workspace< EigenVector_ > > new_workspace() const
Definition sparse.hpp:550
const std::vector< Eigen::Index > & get_secondary_boundaries() const
Definition sparse.hpp:532
std::unique_ptr< ParallelSparseAdjointWorkspace< EigenVector_, ValueArray_, IndexArray_, PointerArray_ > > new_known_adjoint_workspace() const
Definition sparse.hpp:573
Workspace for realizing a ParallelSparseMatrix.
Definition sparse.hpp:361
const EigenMatrix_ & realize(EigenMatrix_ &buffer)
Definition sparse.hpp:377
Workspace for multiplication of a ParallelSparseMatrix.
Definition sparse.hpp:279
void multiply(const EigenVector_ &right, EigenVector_ &output)
Definition sparse.hpp:300
Workspace class for realizing a Matrix.
Definition interface.hpp:99
Workspace class for multiplying a Matrix.
Definition interface.hpp:24
Interfaces for matrix inputs.
Implements IRLBA for approximate SVD.
Definition compute.hpp:22
void parallelize(Task_ num_tasks, Run_ run_task)
Definition parallel.hpp:33
Classes for parallelized multiplication.