irlba/sparse_8hpp_source.html

#ifndef IRLBA_MATRIX_SPARSE_HPP

#define IRLBA_MATRIX_SPARSE_HPP


#include <vector>

#include <memory>

#include <cstddef>

#include <cassert>


#include "../utils.hpp"

#include "../parallel.hpp"

#include "interface.hpp"


#include "Eigen/Dense"

#include "sanisizer/sanisizer.hpp"


#ifndef IRLBA_CUSTOM_PARALLEL

#include "subpar/subpar.hpp"

#endif


namespace irlba {


template<class ValueArray_, class IndexArray_, class PointerArray_ >

class ParallelSparseMatrixCore {

public:

    typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;


public:

    ParallelSparseMatrixCore(

        Eigen::Index nrow,

        Eigen::Index ncol,

        ValueArray_ x,

        IndexArray_ i,

        PointerArray_ p,

        bool column_major,

        int num_threads

    ) :

        my_primary_dim(column_major ? ncol : nrow),

        my_secondary_dim(column_major ? nrow : ncol),

        my_num_threads(num_threads),

        my_values(std::move(x)),

        my_indices(std::move(i)),

        my_ptrs(std::move(p)),

        my_column_major(column_major)

    {

        assert(num_threads > 0);

        if (num_threads > 1) {

            const auto total_nzeros = my_ptrs[my_primary_dim]; // last element - not using back() to avoid an extra requirement on PointerArray.

            const PointerType per_thread_floor = total_nzeros / my_num_threads;

            const int per_thread_extra = total_nzeros % my_num_threads;


            // Note that we do a lot of 't + 1' incrementing, but this is guaranteed to fit in an int because 't + 1 <= my_num_threads'.

            // We just need 'my_num_threads + 1' to fit in a size_t for the various vector allocations.

            const auto nthreads_p1 = sanisizer::sum<std::size_t>(my_num_threads, 1);


            // Splitting primary dimension elements across threads so each thread processes the same number of nonzero elements.

            sanisizer::resize(my_primary_boundaries, nthreads_p1);


            Eigen::Index primary_counter = 0;

            PointerType sofar = 0;

            for (int t = 0; t < my_num_threads; ++t) {

                sofar += per_thread_floor + (t < per_thread_extra); // first few threads might get an extra element to deal with the remainder.

                while (primary_counter < my_primary_dim && my_ptrs[primary_counter + 1] <= sofar) {

                    ++primary_counter;

                }

                my_primary_boundaries[t + 1] = primary_counter;

            }

        }

    }


private:

    Eigen::Index my_primary_dim, my_secondary_dim;

    int my_num_threads;


    ValueArray_ my_values;

    IndexArray_ my_indices;

    PointerArray_ my_ptrs;

    bool my_column_major;


    std::vector<Eigen::Index> my_primary_boundaries;


public:

    Eigen::Index rows() const {

        if (my_column_major) {

            return my_secondary_dim;

        } else {

            return my_primary_dim;

        }

    }


    Eigen::Index cols() const {

        if (my_column_major) {

            return my_primary_dim;

        } else {

            return my_secondary_dim;

        }

    }


    const ValueArray_& get_values() const {

        return my_values;

    }


    const IndexArray_& get_indices() const {

        return my_indices;

    }


    const PointerArray_& get_pointers() const {

        return my_ptrs;

    }


    int get_num_threads() const {

        return my_num_threads;

    }


    bool get_column_major() const {

        return my_column_major;

    }


    const std::vector<Eigen::Index>& get_primary_boundaries() const {

        return my_primary_boundaries;

    }


public:

    template<typename EigenVector_>

    void indirect_multiply(const EigenVector_& rhs, std::vector<std::vector<typename EigenVector_::Scalar> >& thread_buffers, EigenVector_& output) const {

        if (my_num_threads == 1) {

            output.setZero();

            for (Eigen::Index c = 0; c < my_primary_dim; ++c) {

                const auto start = my_ptrs[c];

                const auto end = my_ptrs[c + 1];

                const auto val = rhs.coeff(c);

                for (PointerType s = start; s < end; ++s) {

                    output.coeffRef(my_indices[s]) += my_values[s] * val;

                }

            }

            return;

        }


        parallelize(my_num_threads, [&](int t) -> void {

            // Using a separate buffer for the other threads to avoid false

            // sharing. On first use, each buffer is allocated within each

            // thread to give malloc a chance of using thread-specific arenas.

            typename EigenVector_::Scalar* optr;

            if (t != 0) {

                auto& curbuffer = thread_buffers[t - 1];

                sanisizer::resize(curbuffer, my_secondary_dim);

                optr = curbuffer.data();

            } else {

                optr = output.data();

            }

            std::fill_n(optr, my_secondary_dim, static_cast<typename EigenVector_::Scalar>(0));


            // Using a thread-dependent reduction strategy. This results in slightly

            // different results depending on the number of threads, oh well.

            const auto curstart = my_primary_boundaries[t];

            const auto curend = my_primary_boundaries[t + 1];

            for (Eigen::Index p = curstart; p < curend; ++p) {

                const auto start = my_ptrs[p];

                const auto end = my_ptrs[p + 1];

                const auto val = rhs.coeff(p);

                for (PointerType s = start; s < end; ++s) {

                    optr[my_indices[s]] += my_values[s] * val;

                }

            }

        });


        assert(sanisizer::is_equal(thread_buffers.size(), my_num_threads - 1));

        for (const auto& buffer : thread_buffers) {

            for (Eigen::Index x = 0; x < my_secondary_dim; ++x) {

                output.coeffRef(x) += buffer[x];

            }

        }

    }


public:

    template<typename EigenVector_>

    void direct_multiply(const EigenVector_& rhs, EigenVector_& output) const {

        if (my_num_threads == 1) {

            for (Eigen::Index c = 0; c < my_primary_dim; ++c) {

                output.coeffRef(c) = column_dot_product<typename EigenVector_::Scalar>(c, rhs);

            }

            return;

        }


        parallelize(my_num_threads, [&](int t) -> void {

            const auto curstart = my_primary_boundaries[t];

            const auto curend = my_primary_boundaries[t + 1];

            for (auto c = curstart; c < curend; ++c) {

                output.coeffRef(c) = column_dot_product<typename EigenVector_::Scalar>(c, rhs);

            }

        });

    }


private:

    template<typename Scalar_, class EigenVector_>

    Scalar_ column_dot_product(Eigen::Index p, const EigenVector_& rhs) const {

        const PointerType primary_start = my_ptrs[p], primary_end = my_ptrs[p + 1];

        if (primary_start == primary_end) {

            return 0;

        }


        // Copying Eigen's use of two accumulators; effectively unrolls the loop a little for speed.

        Scalar_ dot1 = 0, dot2 = 0;


        PointerType s = primary_start;

        const PointerType primary_end_m1 = primary_end - 1;

        for (; s < primary_end_m1; s += 2) {

            dot1 += my_values[s] * rhs.coeff(my_indices[s]);

            dot2 += my_values[s + 1] * rhs.coeff(my_indices[s + 1]);

        }


        if (s < primary_end) {

            dot1 += my_values[s] * rhs.coeff(my_indices[s]);

        }


        return dot1 + dot2;

    }

};


template<class EigenVector_, class ValueArray_, class IndexArray_, class PointerArray_ >

class ParallelSparseWorkspace final : public Workspace<EigenVector_> {

public:

    ParallelSparseWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :

        my_core(core)

    {

        if (my_core.get_num_threads() > 1 && my_core.get_column_major()) {

            sanisizer::resize(my_thread_buffers, my_core.get_num_threads() - 1);

        }

    }


private:

    const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;

    std::vector<std::vector<typename EigenVector_::Scalar> > my_thread_buffers;


public:

    void multiply(const EigenVector_& right, EigenVector_& output) {

        if (my_core.get_column_major()) {

            my_core.indirect_multiply(right, my_thread_buffers, output);

        } else {

            my_core.direct_multiply(right, output);

        }

    }

};


template<class EigenVector_, class ValueArray_, class IndexArray_, class PointerArray_ >

class ParallelSparseAdjointWorkspace final : public AdjointWorkspace<EigenVector_> {

public:

    ParallelSparseAdjointWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :

        my_core(core)

    {

        if (my_core.get_num_threads() > 1 && !my_core.get_column_major()) {

            sanisizer::resize(my_thread_buffers, my_core.get_num_threads() - 1);

        }

    }


private:

    const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;

    std::vector<std::vector<typename EigenVector_::Scalar> > my_thread_buffers;


public:

    void multiply(const EigenVector_& right, EigenVector_& output) {

        if (my_core.get_column_major()) {

            my_core.direct_multiply(right, output);

        } else {

            my_core.indirect_multiply(right, my_thread_buffers, output);

        }

    }

};


template<class EigenMatrix_, class ValueArray_, class IndexArray_, class PointerArray_ >

class ParallelSparseRealizeWorkspace final : public RealizeWorkspace<EigenMatrix_> {

public:

    ParallelSparseRealizeWorkspace(const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& core) :

        my_core(core)

    {}


private:

    const ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_>& my_core;


public:

    const EigenMatrix_& realize(EigenMatrix_& buffer) {

        const auto nr = my_core.rows(), nc = my_core.cols();

        buffer.resize(nr, nc);

        buffer.setZero();


        const auto& ptrs = my_core.get_pointers();

        const auto& indices = my_core.get_indices();

        const auto& values = my_core.get_values();


        typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;

        if (my_core.get_column_major()) {

            for (Eigen::Index c = 0; c < nc; ++c) {

                PointerType col_start = ptrs[c], col_end = ptrs[c + 1];

                for (PointerType s = col_start; s < col_end; ++s) {

                    buffer.coeffRef(indices[s], c) = values[s];

                }

            }

        } else {

            for (Eigen::Index r = 0; r < nr; ++r) {

                PointerType row_start = ptrs[r], row_end = ptrs[r + 1];

                for (PointerType s = row_start; s < row_end; ++s) {

                    buffer.coeffRef(r, indices[s]) = values[s];

                }

            }

        }


        return buffer;

    }

};

template<

    class EigenVector_,

    class EigenMatrix_,

    class ValueArray_,

    class IndexArray_,

    class PointerArray_

>


class ParallelSparseMatrix final : public Matrix<EigenVector_, EigenMatrix_> {

public:

    ParallelSparseMatrix() {}


    ParallelSparseMatrix(Eigen::Index nrow, Eigen::Index ncol, ValueArray_ x, IndexArray_ i, PointerArray_ p, bool column_major, int num_threads) :

        my_core(nrow, ncol, std::move(x), std::move(i), std::move(p), column_major, num_threads)

    {}


private:

    ParallelSparseMatrixCore<ValueArray_, IndexArray_, PointerArray_> my_core;


public:


    Eigen::Index rows() const {

        return my_core.rows();

    }


    Eigen::Index cols() const {

        return my_core.cols();

    }


    const ValueArray_& get_values() const {

        return my_core.get_values();

    }


    const IndexArray_& get_indices() const {

        return my_core.get_indices();

    }


    const PointerArray_& get_pointers() const {

        return my_core.get_pointers();

    }


    typedef I<decltype(std::declval<PointerArray_>()[0])> PointerType;


    const std::vector<Eigen::Index>& get_primary_boundaries() const {

        return my_core.get_primary_boundaries();

    }


public:


    std::unique_ptr<Workspace<EigenVector_> > new_workspace() const {

        return new_known_workspace();

    }


    std::unique_ptr<AdjointWorkspace<EigenVector_> > new_adjoint_workspace() const {

        return new_known_adjoint_workspace();

    }


    std::unique_ptr<RealizeWorkspace<EigenMatrix_> > new_realize_workspace() const {

        return new_known_realize_workspace();

    }


public:


    auto new_known_workspace() const {

        return std::make_unique<ParallelSparseWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> >(my_core);

    }


    auto new_known_adjoint_workspace() const {

        return std::make_unique<ParallelSparseAdjointWorkspace<EigenVector_, ValueArray_, IndexArray_, PointerArray_> >(my_core);

    }


    auto new_known_realize_workspace() const {

        return std::make_unique<ParallelSparseRealizeWorkspace<EigenMatrix_, ValueArray_, IndexArray_, PointerArray_> >(my_core);

    }


};


}


#endif

irlba::Matrix
Interface for a matrix to use in compute().
Definition interface.hpp:142

irlba::ParallelSparseMatrix
Sparse matrix with customizable parallelization.
Definition sparse.hpp:359

irlba::ParallelSparseMatrix::ParallelSparseMatrix
ParallelSparseMatrix(Eigen::Index nrow, Eigen::Index ncol, ValueArray_ x, IndexArray_ i, PointerArray_ p, bool column_major, int num_threads)
Definition sparse.hpp:384

irlba::ParallelSparseMatrix::get_primary_boundaries
const std::vector< Eigen::Index > & get_primary_boundaries() const
Definition sparse.hpp:441

irlba::ParallelSparseMatrix::cols
Eigen::Index cols() const
Definition sparse.hpp:402

irlba::ParallelSparseMatrix::get_values
const ValueArray_ & get_values() const
Definition sparse.hpp:410

irlba::ParallelSparseMatrix::new_adjoint_workspace
std::unique_ptr< AdjointWorkspace< EigenVector_ > > new_adjoint_workspace() const
Definition sparse.hpp:450

irlba::ParallelSparseMatrix::rows
Eigen::Index rows() const
Definition sparse.hpp:395

irlba::ParallelSparseMatrix::PointerType
I< decltype(std::declval< PointerArray_ >()[0])> PointerType
Definition sparse.hpp:432

irlba::ParallelSparseMatrix::ParallelSparseMatrix
ParallelSparseMatrix()
Definition sparse.hpp:365

irlba::ParallelSparseMatrix::new_realize_workspace
std::unique_ptr< RealizeWorkspace< EigenMatrix_ > > new_realize_workspace() const
Definition sparse.hpp:454

irlba::ParallelSparseMatrix::new_known_adjoint_workspace
auto new_known_adjoint_workspace() const
Definition sparse.hpp:469

irlba::ParallelSparseMatrix::get_pointers
const PointerArray_ & get_pointers() const
Definition sparse.hpp:425

irlba::ParallelSparseMatrix::get_indices
const IndexArray_ & get_indices() const
Definition sparse.hpp:418

irlba::ParallelSparseMatrix::new_workspace
std::unique_ptr< Workspace< EigenVector_ > > new_workspace() const
Definition sparse.hpp:446

irlba::ParallelSparseMatrix::new_known_realize_workspace
auto new_known_realize_workspace() const
Definition sparse.hpp:476

irlba::ParallelSparseMatrix::new_known_workspace
auto new_known_workspace() const
Definition sparse.hpp:462

interface.hpp
Interfaces for matrix inputs.

irlba
Implements IRLBA for approximate SVD.
Definition compute.hpp:23

irlba::parallelize
void parallelize(Task_ num_tasks, Run_ run_task)
Definition parallel.hpp:33

parallel.hpp
Classes for parallelized multiplication.