scran
C++ library for basic single-cell RNA-seq analyses
MultiBatchPca.hpp
1#ifndef SCRAN_MULTI_BATCH_PCA
2#define SCRAN_MULTI_BATCH_PCA
3
4#include "tatami/tatami.hpp"
5
6#include "irlba/irlba.hpp"
7#include "Eigen/Dense"
8
9#include <vector>
10#include <cmath>
11
12#include "utils.hpp"
13#include "convert.hpp"
14#include "wrappers.hpp"
15#include "blocking.hpp"
16
23namespace scran {
24
44class MultiBatchPca {
45public:
49 struct Defaults {
53 static constexpr int rank = 10;
54
58 static constexpr bool scale = false;
59
63 static constexpr bool transpose = true;
64
68 static constexpr bool use_residuals = false;
69
73 static constexpr WeightPolicy block_weight_policy = WeightPolicy::VARIABLE;
74
78 static constexpr VariableBlockWeightParameters variable_block_weight_parameters = {};
79
83 static constexpr int num_threads = 1;
84
88 static constexpr bool return_rotation = false;
89
93 static constexpr bool return_center = false;
94
98 static constexpr bool return_scale = false;
99 };
100
101private:
102 bool scale = Defaults::scale;
103 bool transpose = Defaults::transpose;
104 int rank = Defaults::rank;
105
106 bool use_residuals = Defaults::use_residuals;
107 WeightPolicy block_weight_policy = Defaults::block_weight_policy;
108 VariableBlockWeightParameters variable_block_weight_parameters = Defaults::variable_block_weight_parameters;
109
110 bool return_rotation = Defaults::return_rotation;
111 bool return_center = Defaults::return_center;
112 bool return_scale = Defaults::return_scale;
113
114 int nthreads = Defaults::num_threads;
115
116public:
124 MultiBatchPca& set_rank(int r = Defaults::rank) {
125 rank = r;
126 return *this;
127 }
128
134 MultiBatchPca& set_scale(bool s = Defaults::scale) {
135 scale = s;
136 return *this;
137 }
138
145 MultiBatchPca& set_transpose(bool t = Defaults::transpose) {
146 transpose = t;
147 return *this;
148 }
149
155 MultiBatchPca& set_use_residuals(bool u = Defaults::use_residuals) {
156 use_residuals = u;
157 return *this;
158 }
159
165 MultiBatchPca& set_block_weight_policy(WeightPolicy w = Defaults::block_weight_policy) {
166 block_weight_policy = w;
167 return *this;
168 }
169
176 MultiBatchPca& set_variable_block_weight_parameters(VariableBlockWeightParameters v = Defaults::variable_block_weight_parameters) {
177 variable_block_weight_parameters = v;
178 return *this;
179 }
180
186 MultiBatchPca& set_return_rotation(bool r = Defaults::return_rotation) {
187 return_rotation = r;
188 return *this;
189 }
190
196 MultiBatchPca& set_return_center(bool r = Defaults::return_center) {
197 return_center = r;
198 return *this;
199 }
200
206 MultiBatchPca& set_return_scale(bool r = Defaults::return_scale) {
207 return_scale = r;
208 return *this;
209 }
210
215 MultiBatchPca& set_num_threads(int n = Defaults::num_threads) {
216 nthreads = n;
217 return *this;
218 }
219
220private:
221 template<typename Data_, typename Index_, typename Block_>
222 void run_sparse_simple(
223 const tatami::Matrix<Data_, Index_>* mat,
224 const Block_* block,
225 const pca_utils::BlockingDetails<true>& block_details,
226 const irlba::Irlba& irb,
227 Eigen::MatrixXd& pcs,
228 Eigen::MatrixXd& rotation,
229 Eigen::VectorXd& variance_explained,
230 Eigen::VectorXd& center_v,
231 Eigen::VectorXd& scale_v,
232 double& total_var)
233 const {
234 auto extracted = pca_utils::extract_sparse_for_pca(mat, nthreads); // row-major extraction.
235 pca_utils::SparseMatrix emat(mat->ncol(), mat->nrow(), std::move(extracted.values), std::move(extracted.indices), std::move(extracted.ptrs), nthreads); // CSC with genes in columns.
236
237 size_t ngenes = emat.cols();
238 center_v.resize(ngenes);
239 scale_v.resize(ngenes);
240
241 tatami::parallelize([&](size_t, size_t start, size_t length) -> void {
242 const auto& values = emat.get_values();
243 const auto& indices = emat.get_indices();
244 const auto& ptrs = emat.get_pointers();
245
246 const auto& block_size = block_details.block_size;
247 size_t nblocks = block_size.size();
248 std::vector<int> block_count(nblocks);
249 const auto& block_weight = block_details.per_element_weight;
250
251 for (size_t r = start, end = start + length; r < end; ++r) {
252 auto offset = ptrs[r];
253 size_t num_entries = ptrs[r+1] - offset;
254 auto value_ptr = values.data() + offset;
255 auto index_ptr = indices.data() + offset;
256
257 std::fill(block_count.begin(), block_count.end(), 0);
258
259 // Computing the grand mean across all blocks.
260 double& grand_mean = center_v[r];
261 grand_mean = 0;
262 for (size_t i = 0; i < num_entries; ++i) {
263 auto b = block[index_ptr[i]];
264 grand_mean += value_ptr[i] * block_weight[b];
265 ++(block_count[b]);
266 }
267 grand_mean /= block_details.total_block_weight;
268
269 // Computing pseudo-variances where each block's contribution
270 // is weighted in inverse proportion to its size. This aims to
271 // match the variances used in the PCA itself, rather than the
272 // variances of the output components (where no weighting is applied).
273 double& proxyvar = scale_v[r];
274 proxyvar = 0;
275 for (size_t b = 0; b < nblocks; ++b) {
276 double zero_sum = (block_size[b] - block_count[b]) * grand_mean * grand_mean;
277 proxyvar += zero_sum * block_weight[b];
278 }
279
280 for (size_t i = 0; i < num_entries; ++i) {
281 double diff = value_ptr[i] - grand_mean;
282 proxyvar += diff * diff * block_weight[block[index_ptr[i]]];
283 }
284
285 proxyvar /= emat.rows() - 1;
286 }
287 }, ngenes, nthreads);
288
289 total_var = pca_utils::process_scale_vector(scale, scale_v);
290
291 // Now actually performing the PCA.
292 irlba::Centered<decltype(emat)> centered(&emat, &center_v);
293 if (scale) {
294 irlba::Scaled<decltype(centered)> scaled(&centered, &scale_v);
295 pca_utils::SampleScaledWrapper<decltype(scaled)> weighted(&scaled, &(block_details.expanded_weights));
296 irb.run(weighted, pcs, rotation, variance_explained);
297 } else {
298 pca_utils::SampleScaledWrapper<decltype(centered)> weighted(&centered, &(block_details.expanded_weights));
299 irb.run(weighted, pcs, rotation, variance_explained);
300 }
301
302 // This transposes 'pcs' to be an NDIM * NCELLS matrix.
303 pca_utils::project_sparse_matrix(emat, pcs, rotation, scale, scale_v, nthreads);
304
305 pca_utils::clean_up_projected<true>(pcs, variance_explained);
306 if (!transpose) {
307 pcs.adjointInPlace();
308 }
309 }
310
311 template<typename Data_, typename Index_, typename Block_>
312 void run_dense_simple(
313 const tatami::Matrix<Data_, Index_>* mat,
314 const Block_* block,
315 const pca_utils::BlockingDetails<true>& block_details,
316 const irlba::Irlba& irb,
317 Eigen::MatrixXd& pcs,
318 Eigen::MatrixXd& rotation,
319 Eigen::VectorXd& variance_explained,
320 Eigen::VectorXd& center_v,
321 Eigen::VectorXd& scale_v,
322 double& total_var)
323 const {
324 auto emat = pca_utils::extract_dense_for_pca(mat, nthreads); // get a column-major matrix with genes in columns.
325
326 size_t ngenes = emat.cols();
327 center_v.resize(ngenes);
328 scale_v.resize(ngenes);
329
330 tatami::parallelize([&](size_t, size_t start, size_t length) -> void {
331 size_t nblocks = block_details.num_blocks();
332 std::vector<double> mean_buffer(nblocks);
333 const auto& block_weight = block_details.per_element_weight;
334 size_t ncells = emat.rows();
335
336 for (size_t c = start, end = start + length; c < end; ++c) {
337 auto ptr = emat.data() + c * ncells;
338
339 double& grand_mean = center_v[c];
340 grand_mean = 0;
341 for (size_t r = 0; r < ncells; ++r) {
342 grand_mean += ptr[r] * block_weight[block[r]];
343 }
344 grand_mean /= block_details.total_block_weight;
345
346 // We don't actually compute the batchwise variance, but instead
347 // the weighted sum of squared deltas, which is what PCA actually sees.
348 double& proxyvar = scale_v[c];
349 proxyvar = 0;
350 for (size_t r = 0; r < ncells; ++r) {
351 double diff = ptr[r] - grand_mean;
352 proxyvar += diff * diff * block_weight[block[r]];
353 }
354
355 proxyvar /= emat.rows() - 1;
356 }
357 }, ngenes, nthreads);
358
359 total_var = pca_utils::process_scale_vector(scale, scale_v);
360
361 // Applying the centering and scaling now so we can do the PCA with fewer wrappers.
362 pca_utils::apply_center_and_scale_to_dense_matrix(emat, center_v, scale, scale_v, nthreads);
363
364 pca_utils::SampleScaledWrapper<decltype(emat)> weighted(&emat, &(block_details.expanded_weights));
365 irb.run(weighted, pcs, rotation, variance_explained);
366
367 pcs.noalias() = emat * rotation;
368 pca_utils::clean_up_projected<false>(pcs, variance_explained);
369 if (transpose) {
370 pcs.adjointInPlace();
371 }
372 }
373
374private:
375 template<bool weight_, typename Matrix_, typename Block_>
376 void run_residuals_internal(
377 const Matrix_& emat,
378 const Block_* block,
379 const pca_utils::BlockingDetails<weight_>& block_details,
380 const Eigen::MatrixXd& center_m,
381 const Eigen::VectorXd& scale_v,
382 const irlba::Irlba& irb,
383 Eigen::MatrixXd& pcs,
384 Eigen::MatrixXd& rotation,
385 Eigen::VectorXd& variance_explained)
386 const {
387 pca_utils::RegressWrapper<Matrix_, Block_> centered(&emat, block, &center_m);
388
389 if constexpr(weight_) {
390 if (scale) {
391 irlba::Scaled<decltype(centered)> scaled(&centered, &scale_v);
392 pca_utils::SampleScaledWrapper<decltype(scaled)> weighted(&scaled, &(block_details.expanded_weights));
393 irb.run(weighted, pcs, rotation, variance_explained);
394 } else {
395 pca_utils::SampleScaledWrapper<decltype(centered)> weighted(&centered, &(block_details.expanded_weights));
396 irb.run(weighted, pcs, rotation, variance_explained);
397 }
398
399 } else {
400 if (scale) {
401 irlba::Scaled<decltype(centered)> scaled(&centered, &scale_v);
402 irb.run(scaled, pcs, rotation, variance_explained);
403 } else {
404 irb.run(centered, pcs, rotation, variance_explained);
405 }
406 }
407 }
408
409 template<bool weight_, typename Data_, typename Index_, typename Block_>
410 void run_sparse_residuals(
411 const tatami::Matrix<Data_, Index_>* mat,
412 const Block_* block,
413 const pca_utils::BlockingDetails<weight_>& block_details,
414 const irlba::Irlba& irb,
415 Eigen::MatrixXd& pcs,
416 Eigen::MatrixXd& rotation,
417 Eigen::VectorXd& variance_explained,
418 Eigen::MatrixXd& center_m,
419 Eigen::VectorXd& scale_v,
420 double& total_var)
421 const {
422 auto extracted = pca_utils::extract_sparse_for_pca(mat, nthreads); // row-major extraction.
423 pca_utils::SparseMatrix emat(mat->ncol(), mat->nrow(), std::move(extracted.values), std::move(extracted.indices), std::move(extracted.ptrs), nthreads); // CSC with genes in columns.
424
425 size_t ngenes = emat.cols();
426 center_m.resize(block_details.num_blocks(), ngenes);
427 scale_v.resize(ngenes);
428 pca_utils::compute_mean_and_variance_regress<weight_>(emat, block, block_details, center_m, scale_v, nthreads);
429 total_var = pca_utils::process_scale_vector(scale, scale_v);
430
431 run_residuals_internal<weight_>(
432 emat,
433 block,
434 block_details,
435 center_m,
436 scale_v,
437 irb,
438 pcs,
439 rotation,
440 variance_explained
441 );
442
443 // This transposes 'pcs' to be an NDIM * NCELLS matrix.
444 pca_utils::project_sparse_matrix(emat, pcs, rotation, scale, scale_v, nthreads);
445
446 pca_utils::clean_up_projected<true>(pcs, variance_explained);
447 if (!transpose) {
448 pcs.adjointInPlace();
449 }
450 }
451
452 template<bool weight_, typename Data_, typename Index_, typename Block_>
453 void run_dense_residuals(
454 const tatami::Matrix<Data_, Index_>* mat,
455 const Block_* block,
456 const pca_utils::BlockingDetails<weight_>& block_details,
457 const irlba::Irlba& irb,
458 Eigen::MatrixXd& pcs,
459 Eigen::MatrixXd& rotation,
460 Eigen::VectorXd& variance_explained,
461 Eigen::MatrixXd& center_m,
462 Eigen::VectorXd& scale_v,
463 double& total_var)
464 const {
465 auto emat = pca_utils::extract_dense_for_pca(mat, nthreads); // get a column-major matrix with genes in columns.
466
467 size_t ngenes = emat.cols();
468 center_m.resize(block_details.num_blocks(), ngenes);
469 scale_v.resize(ngenes);
470 pca_utils::compute_mean_and_variance_regress<weight_>(emat, block, block_details, center_m, scale_v, nthreads);
471 total_var = pca_utils::process_scale_vector(scale, scale_v);
472
473 // No choice but to use wrappers here, as we still need the original matrix for projection.
474 run_residuals_internal<weight_>(
475 emat,
476 block,
477 block_details,
478 center_m,
479 scale_v,
480 irb,
481 pcs,
482 rotation,
483 variance_explained
484 );
485
486 if (scale) {
487 pcs.noalias() = emat * (rotation.array().colwise() / scale_v.array()).matrix();
488 } else {
489 pcs.noalias() = emat * rotation;
490 }
491
492 pca_utils::clean_up_projected<false>(pcs, variance_explained);
493 if (transpose) {
494 pcs.adjointInPlace();
495 }
496 }
497
498private:
499 template<typename Data_, typename Index_, typename Block_>
500 void run_internal(
501 const tatami::Matrix<Data_, Index_>* mat,
502 const Block_* block,
503 Eigen::MatrixXd& pcs,
504 Eigen::MatrixXd& rotation,
505 Eigen::VectorXd& variance_explained,
506 Eigen::MatrixXd& center_m,
507 Eigen::VectorXd& scale_v,
508 double& total_var)
509 const {
510 irlba::EigenThreadScope t(nthreads);
511 irlba::Irlba irb;
512 irb.set_number(rank);
513 irb.set_cap_number(true);
514
515 if (use_residuals) {
516 if (block_weight_policy == WeightPolicy::NONE) {
517 auto bdetails = pca_utils::compute_blocking_details(mat->ncol(), block);
518 if (mat->sparse()) {
519 run_sparse_residuals<false>(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_m, scale_v, total_var);
520 } else {
521 run_dense_residuals<false>(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_m, scale_v, total_var);
522 }
523
524 } else {
525 auto bdetails = pca_utils::compute_blocking_details(mat->ncol(), block, block_weight_policy, variable_block_weight_parameters);
526 if (mat->sparse()) {
527 run_sparse_residuals<true>(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_m, scale_v, total_var);
528 } else {
529 run_dense_residuals<true>(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_m, scale_v, total_var);
530 }
531 }
532
533 } else {
534 if (block_weight_policy == WeightPolicy::NONE) {
535 throw std::runtime_error("block weight policy cannot be NONE when 'use_residuals = false', use SimplePca instead");
536 }
537
538 auto bdetails = pca_utils::compute_blocking_details(mat->ncol(), block, block_weight_policy, variable_block_weight_parameters);
539
540 Eigen::VectorXd center_v;
541 if (mat->sparse()) {
542 run_sparse_simple(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_v, scale_v, total_var);
543 } else {
544 run_dense_simple(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_v, scale_v, total_var);
545 }
546
547 if (return_center) {
548 center_m.resize(1, center_v.size());
549 center_m.row(0) = center_v;
550 }
551 }
552 }
553
554public:
560 struct Results {
567 Eigen::MatrixXd pcs;
568
576 Eigen::VectorXd variance_explained;
577
582 double total_variance = 0;
583
590 Eigen::MatrixXd rotation;
591
605 Eigen::MatrixXd center;
606
612 Eigen::VectorXd scale;
613 };
614
630 template<typename T, typename IDX, typename Batch>
631 Results run(const tatami::Matrix<T, IDX>* mat, const Batch* batch) const {
632 Results output;
633 Eigen::MatrixXd rotation, center_m;
634 Eigen::VectorXd scale_v;
635
636 run_internal(mat, batch, output.pcs, rotation, output.variance_explained, center_m, scale_v, output.total_variance);
637
638 // Moving them over if we want to keep them.
639 if (return_rotation) {
640 output.rotation = std::move(rotation);
641 }
642 if (return_center) {
643 output.center = center_m.adjoint();
644 }
645 if (return_scale) {
646 output.scale = std::move(scale_v);
647 }
648
649 return output;
650 }
651
672 template<typename T, typename IDX, typename Batch, typename X>
673 Results run(const tatami::Matrix<T, IDX>* mat, const Batch* batch, const X* features) const {
674 Results output;
675 if (!features) {
676 return run(mat, batch);
677 } else {
678 auto subsetted = pca_utils::subset_matrix_by_features(mat, features);
679 return run(subsetted.get(), batch);
680 }
681 }
682};
683
684}
685
686#endif
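For readers following the arithmetic in run_sparse_simple() and run_dense_simple() above, the per-gene statistics can be restated in plain C++. The sketch below is illustrative only and is not part of the library; the names expr, block, block_weight and total_block_weight are assumptions standing in for one gene's expression values, the per-cell block assignments, the per-element block weights and their weighted total.

#include <vector>
#include <cstddef>

// Weighted grand mean for one gene: each cell contributes in proportion to the
// weight of its block, and the sum is normalized by the total block weight
// (mirroring the grand-mean loops in the methods above).
double weighted_grand_mean(const std::vector<double>& expr,
                           const std::vector<int>& block,
                           const std::vector<double>& block_weight,
                           double total_block_weight) {
    double mean = 0;
    for (std::size_t c = 0; c < expr.size(); ++c) {
        mean += expr[c] * block_weight[block[c]];
    }
    return mean / total_block_weight;
}

// "Proxy variance" for the same gene: the weighted sum of squared differences
// from the grand mean, divided by (ncells - 1). This is the quantity that the
// weighted PCA effectively sees, not the variance of any individual block.
double proxy_variance(const std::vector<double>& expr,
                      const std::vector<int>& block,
                      const std::vector<double>& block_weight,
                      double grand_mean) {
    double var = 0;
    for (std::size_t c = 0; c < expr.size(); ++c) {
        double diff = expr[c] - grand_mean;
        var += diff * diff * block_weight[block[c]];
    }
    return var / (expr.size() - 1);
}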
Utilities for handling blocks of cells.
Compute PCA after adjusting for differences between batch sizes.
Definition MultiBatchPca.hpp:44
Results run(const tatami::Matrix< T, IDX > *mat, const Batch *batch) const
Definition MultiBatchPca.hpp:631
MultiBatchPca & set_rank(int r=Defaults::rank)
Definition MultiBatchPca.hpp:124
MultiBatchPca & set_return_rotation(bool r=Defaults::return_rotation)
Definition MultiBatchPca.hpp:186
MultiBatchPca & set_block_weight_policy(WeightPolicy w=Defaults::block_weight_policy)
Definition MultiBatchPca.hpp:165
MultiBatchPca & set_transpose(bool t=Defaults::transpose)
Definition MultiBatchPca.hpp:145
MultiBatchPca & set_use_residuals(bool u=Defaults::use_residuals)
Definition MultiBatchPca.hpp:155
MultiBatchPca & set_return_scale(bool r=Defaults::return_scale)
Definition MultiBatchPca.hpp:206
MultiBatchPca & set_num_threads(int n=Defaults::num_threads)
Definition MultiBatchPca.hpp:215
MultiBatchPca & set_return_center(bool r=Defaults::return_center)
Definition MultiBatchPca.hpp:196
Results run(const tatami::Matrix< T, IDX > *mat, const Batch *batch, const X *features) const
Definition MultiBatchPca.hpp:673
MultiBatchPca & set_variable_block_weight_parameters(VariableBlockWeightParameters v=Defaults::variable_block_weight_parameters)
Definition MultiBatchPca.hpp:176
MultiBatchPca & set_scale(bool s=Defaults::scale)
Definition MultiBatchPca.hpp:134
Functions for single-cell RNA-seq analyses.
Definition AggregateAcrossCells.hpp:18
WeightPolicy
Definition blocking.hpp:82
Default parameter settings.
Definition MultiBatchPca.hpp:49
static constexpr bool transpose
Definition MultiBatchPca.hpp:63
static constexpr int rank
Definition MultiBatchPca.hpp:53
static constexpr bool return_rotation
Definition MultiBatchPca.hpp:88
static constexpr WeightPolicy block_weight_policy
Definition MultiBatchPca.hpp:73
static constexpr int num_threads
Definition MultiBatchPca.hpp:83
static constexpr bool return_center
Definition MultiBatchPca.hpp:93
static constexpr bool scale
Definition MultiBatchPca.hpp:58
static constexpr bool return_scale
Definition MultiBatchPca.hpp:98
static constexpr bool use_residuals
Definition MultiBatchPca.hpp:68
static constexpr VariableBlockWeightParameters variable_block_weight_parameters
Definition MultiBatchPca.hpp:78
Container for the PCA results.
Definition MultiBatchPca.hpp:560
Eigen::MatrixXd center
Definition MultiBatchPca.hpp:605
Eigen::VectorXd variance_explained
Definition MultiBatchPca.hpp:576
double total_variance
Definition MultiBatchPca.hpp:582
Eigen::MatrixXd rotation
Definition MultiBatchPca.hpp:590
Eigen::VectorXd scale
Definition MultiBatchPca.hpp:612
Eigen::MatrixXd pcs
Definition MultiBatchPca.hpp:567
Parameters for variable_block_weight().
Definition blocking.hpp:87
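To show how the class documented above fits together, here is a minimal usage sketch. The include path and the mock tatami::DenseRowMatrix construction are assumptions made for illustration; in practice 'mat' would hold log-normalized expression values with genes in rows and cells in columns, and 'batch' one label per cell.

#include "MultiBatchPca.hpp" // adjust the include path to your scran installation.
#include "tatami/tatami.hpp"
#include <vector>
#include <memory>
#include <cstddef>

int main() {
    // Mock gene-by-cell matrix: 100 genes, 20 cells, arbitrary deterministic values.
    int ngenes = 100, ncells = 20;
    std::vector<double> values(static_cast<std::size_t>(ngenes) * ncells);
    for (std::size_t i = 0; i < values.size(); ++i) {
        values[i] = static_cast<double>(i % 13);
    }
    auto mat = std::make_shared<tatami::DenseRowMatrix<double, int> >(ngenes, ncells, std::move(values));

    // One batch assignment per cell (i.e., per column of 'mat').
    std::vector<int> batch(ncells);
    for (int c = 0; c < ncells; ++c) {
        batch[c] = (c < 10 ? 0 : 1);
    }

    scran::MultiBatchPca runner;
    runner.set_rank(5).set_num_threads(2); // setters are chainable, as defined above.

    auto res = runner.run(mat.get(), batch.data());
    // With the default transpose = true, res.pcs has one row per PC and one column per cell.
    // res.variance_explained[i] / res.total_variance gives the proportion of variance for PC i.
    return 0;
}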