scran
C++ library for basic single-cell RNA-seq analyses
MedianSizeFactors.hpp
#ifndef SCRAN_MEDIAN_SIZE_FACTORS_HPP
#define SCRAN_MEDIAN_SIZE_FACTORS_HPP

#include "../utils/macros.hpp"

#include <vector>
#include <limits>
#include <numeric>

#include "tatami/tatami.hpp"
#include "CenterSizeFactors.hpp"

namespace scran {

/**
 * @brief Compute median-based size factors to handle composition bias.
 */
class MedianSizeFactors {
public:
    /**
     * @brief Default parameter settings.
     */
    struct Defaults {
        static constexpr bool center = true;

        static constexpr double prior_count = 10;

        static constexpr int num_threads = 1;
    };

    // Set whether the computed size factors should be centered via CenterSizeFactors.
    MedianSizeFactors& set_center(bool c = Defaults::center) {
        center = c;
        return *this;
    }

    // Set the prior count used to shrink the median-based factors towards
    // library size-derived factors; larger values give stronger shrinkage.
    MedianSizeFactors& set_prior_count(double p = Defaults::prior_count) {
        prior_count = p;
        return *this;
    }

    // Set the number of threads to use.
    MedianSizeFactors& set_num_threads(int n = Defaults::num_threads) {
        num_threads = n;
        return *this;
    }

private:
    bool center = Defaults::center;
    double prior_count = Defaults::prior_count;
    int num_threads = Defaults::num_threads;

public:
    /**
     * @param mat Matrix of counts, with genes in rows and cells (columns) to be normalized.
     * @param ref Pointer to an array of length `mat->nrow()`, containing the reference profile.
     * @param[out] output Pointer to an array of length `mat->ncol()`, in which the size factors are stored.
     */
    template<typename T, typename IDX, typename Ref, typename Out>
    void run(const tatami::Matrix<T, IDX>* mat, const Ref* ref, Out* output) const {
        auto NR = mat->nrow(), NC = mat->ncol();

        std::vector<T> sums(NC);
        tatami::parallelize([&](size_t, IDX start, IDX length) -> void {
            auto ext = tatami::consecutive_extractor<false, false>(mat, start, length);
            std::vector<T> buffer(NR);

            for (IDX c = start, end = start + length; c < end; ++c) {
                auto ptr = ext->fetch(c, buffer.data());
                sums[c] = std::accumulate(ptr, ptr + NR, static_cast<T>(0));

                size_t sofar = 0;
                for (IDX r = 0; r < NR; ++r) {
                    if (ref[r] == 0 && ptr[r] == 0) {
                        continue;
                    }

                    // Potential overwriting of 'buffer' should be safe, as 'sofar' never gets ahead of 'r'.
                    if (ref[r] == 0) {
                        buffer[sofar] = std::numeric_limits<T>::infinity();
                    } else {
                        buffer[sofar] = ptr[r] / ref[r];
                    }

                    ++sofar;
                }

                // TODO: convince tatami maintainers to document this.
                output[c] = tatami::stats::compute_median<Out>(buffer.data(), sofar);
            }
        }, NC, num_threads);


        /* Mild squeezing towards library size-derived factors. Basically,
         * we're adding a scaled version of the reference profile to each
         * column, before normalizing against the reference profile. Given gene
         * i and column j:
         *
         * ratio_{ij} = (y_{ij} + ref_i * extra_j) / ref_i
         *            = (y_{ij} / ref_i) + extra_j
         *
         * which means that the "shrunken" size factor is:
         *
         * median(ratio_{ij}) = median(y_{ij} / ref_i) + extra_j
         *
         * This allows us to avoid the actual addition, as we can compute the
         * unshrunken size factor first and then add the j-specific scaling
         * later. This is important as otherwise we'd need to make two passes
         * over the matrix: once to get the mean library size to compute
         * extra_j, and then again to compute the shrunken factors.
         *
         * Incidentally, extra_j is defined as:
         *
         * extra_j = p * t_j / T / R
         *
         * where p is the constant prior count, t_j is the library size for j,
         * T is the mean library size across all j, and R is the library size
         * for the reference profile. Basically, p * ref_i / R is how we
         * "spread out" the prior count across all genes based on their
         * relative abundance in the reference profile, while t_j / T
         * represents the library size factor that we are shrinking towards.
         *
         * The addition of extra_j means that the shrunken size factor is
         * slightly too big to normalize against the reference. To adjust for
         * this, imagine that we have some unknown position-invariant function
         * f() such that E[f(y_{ij} / ref_i)] yields the true size factor x_j.
         * For example, f() would be the mean if there wasn't any differential
         * expression between columns; or the median, if the counts were large
         * enough. Our shrunken size factor can then be written as:
         *
         * f(ratio_{ij}) = x_j + extra_j
         *
         * To correct our shrunken size factor to x_j, we need to divide it by:
         *
         * (x_j + extra_j) / x_j = 1 + (extra_j / x_j)
         *
         * Of course, we don't actually know x_j, so we approximate it by
         * assuming x_j =~ t_j / R, i.e., library size normalization against
         * the reference. This simplifies the correction to:
         *
         * 1 + (p / T)
         *
         * which is what we use to divide our median-based shrunken factor.
         * It's not exactly correct, but it should be good enough when T >> p,
         * and it at least ensures that the normalization is accurate in the
         * simplest case where the only difference is due to library size.
         */
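
        /* As an illustrative check with made-up numbers (not taken from any
         * real dataset): suppose p = 10, T = 2000, t_j = 1000 and R = 500.
         * Then scaling = p / T = 0.005 and extra_j = t_j * scaling / R = 0.01.
         * If column j is exactly twice the reference, the unshrunken median
         * ratio is 2, so the corrected factor is (2 + 0.01) / (1 + 0.005) = 2,
         * i.e., we recover the library size-derived factor t_j / R = 2 when
         * library size is the only difference from the reference.
         */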
        if (prior_count && NR && NC) {
            double mean = std::accumulate(sums.begin(), sums.end(), static_cast<double>(0));
            mean /= NC;

            double reftotal = std::accumulate(ref, ref + NR, static_cast<double>(0));

            if (mean && reftotal) {
                double scaling = prior_count / mean;
                for (size_t i = 0; i < NC; ++i) {
                    output[i] += sums[i] * scaling / reftotal;
                    output[i] /= 1.0 + scaling;
                }
            }
        }

        // Throwing in some centering.
        if (center) {
            CenterSizeFactors centerer;
            centerer.run(NC, output);
        }

        return;
    }

    /**
     * Compute size factors using the mean expression profile across all columns as the reference.
     *
     * @param mat Matrix of counts, with genes in rows and cells in columns.
     * @param[out] output Pointer to an array of length `mat->ncol()`, in which the size factors are stored.
     */
    template<typename T, typename IDX, typename Out>
    void run_with_mean(const tatami::Matrix<T, IDX>* mat, Out* output) const {
        auto ref = tatami::row_sums(mat, num_threads);
        if (ref.size()) {
            double NC = mat->ncol();
            for (auto& r : ref) {
                r /= NC;
            }
        }
        run(mat, ref.data(), output);
        return;
    }

public:
    /**
     * @brief Result of the size factor calculation.
     */
    template<typename Out>
    struct Results {
        Results(size_t NC) : factors(NC) {}

        /**
         * Vector of size factors, one per column of the input matrix.
         */
        std::vector<Out> factors;
    };

    // Overload of run() that allocates and returns a Results object containing the size factors.
    template<typename Out = double, typename T, typename IDX, typename Ref>
    Results<Out> run(const tatami::Matrix<T, IDX>* mat, const Ref* ref) const {
        Results<Out> output(mat->ncol());
        run(mat, ref, output.factors.data());
        return output;
    }

    // Overload of run_with_mean() that allocates and returns a Results object containing the size factors.
    template<typename Out = double, typename T, typename IDX>
    Results<Out> run_with_mean(const tatami::Matrix<T, IDX>* mat) const {
        Results<Out> output(mat->ncol());
        run_with_mean(mat, output.factors.data());
        return output;
    }
};

}

#endif
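
For reference, a minimal usage sketch is shown below. This is not part of the header above; it assumes that tatami provides a DenseRowMatrix<double, int> constructible from the number of rows, the number of columns and a row-major vector of values, and that the header is reachable via the hypothetical include path shown. Consult the tatami and scran documentation for the exact interfaces.

#include "scran/MedianSizeFactors.hpp" // hypothetical include path
#include "tatami/tatami.hpp"

#include <iostream>
#include <vector>

int main() {
    // A made-up 3-gene x 4-cell count matrix, stored in row-major order.
    std::vector<double> counts {
        10, 20, 10, 40,
         5, 10,  5, 20,
         0,  1,  0,  2
    };
    tatami::DenseRowMatrix<double, int> mat(3, 4, std::move(counts));

    // Compute size factors against the row means of the matrix itself,
    // using the default prior count and centering.
    scran::MedianSizeFactors med;
    auto res = med.run_with_mean(&mat);

    // One factor per cell (column).
    for (auto f : res.factors) {
        std::cout << f << "\n";
    }
    return 0;
}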