scran
C++ library for basic single-cell RNA-seq analyses
MedianSizeFactors.hpp
#ifndef SCRAN_MEDIAN_SIZE_FACTORS_HPP
#define SCRAN_MEDIAN_SIZE_FACTORS_HPP

#include "../utils/macros.hpp"

#include <vector>
#include <limits>
#include <numeric>

#include "tatami/tatami.hpp"
#include "CenterSizeFactors.hpp"

namespace scran {

/**
 * @brief Compute median-based size factors to handle composition bias.
 */
class MedianSizeFactors {
public:
    /**
     * @brief Default parameter settings.
     */
    struct Defaults {
        static constexpr bool center = true;

        static constexpr double prior_count = 10;

        static constexpr int num_threads = 1;
    };

    // Set whether the computed size factors should be centered via CenterSizeFactors.
    MedianSizeFactors& set_center(bool c = Defaults::center) {
        center = c;
        return *this;
    }

    // Set the prior count used to shrink the median-based factors towards
    // library size-derived factors; larger values give stronger shrinkage.
    MedianSizeFactors& set_prior_count(double p = Defaults::prior_count) {
        prior_count = p;
        return *this;
    }

    // Set the number of threads to use.
    MedianSizeFactors& set_num_threads(int n = Defaults::num_threads) {
        num_threads = n;
        return *this;
    }

private:
    bool center = Defaults::center;
    double prior_count = Defaults::prior_count;
    int num_threads = Defaults::num_threads;

public:
    /**
     * @param mat Matrix of counts, with genes in rows and cells (columns) to be normalized.
     * @param ref Pointer to an array of length `mat->nrow()`, containing the reference profile.
     * @param[out] output Pointer to an array of length `mat->ncol()`, in which the size factors are stored.
     */
    template<typename T, typename IDX, typename Ref, typename Out>
    void run(const tatami::Matrix<T, IDX>* mat, const Ref* ref, Out* output) const {
        auto NR = mat->nrow(), NC = mat->ncol();

        std::vector<T> sums(NC);
        tatami::parallelize([&](size_t, IDX start, IDX length) -> void {
            auto ext = tatami::consecutive_extractor<false, false>(mat, start, length);
            std::vector<T> buffer(NR);

            for (IDX c = start, end = start + length; c < end; ++c) {
                auto ptr = ext->fetch(c, buffer.data());
                sums[c] = std::accumulate(ptr, ptr + NR, static_cast<T>(0));

                size_t sofar = 0;
                for (IDX r = 0; r < NR; ++r) {
                    if (ref[r] == 0 && ptr[r] == 0) {
                        continue;
                    }

                    // Potential overwriting of 'buffer' should be safe, as 'sofar' never gets ahead of 'r'.
                    if (ref[r] == 0) {
                        buffer[sofar] = std::numeric_limits<T>::infinity();
                    } else {
                        buffer[sofar] = ptr[r] / ref[r];
                    }

                    ++sofar;
                }

                // TODO: convince tatami maintainers to document this.
                output[c] = tatami::stats::compute_median<Out>(buffer.data(), sofar);
            }
        }, NC, num_threads);


        /* Mild squeezing towards library size-derived factors. Basically,
         * we're adding a scaled version of the reference profile to each
         * column, before normalizing against the reference profile. Given gene
         * i and column j:
         *
         * ratio_{ij} = (y_{ij} + ref_i * extra_j) / ref_i
         *            = (y_{ij} / ref_i) + extra_j
         *
         * which means that the "shrunken" size factor is:
         *
         * median(ratio_{ij}) = median(y_{ij} / ref_i) + extra_j
         *
         * This allows us to avoid the actual addition, as we can compute the
         * unshrunken size factor first and then add the j-specific scaling
         * later. This is important as otherwise we'd need to make two passes
         * over the matrix: once to get the mean library size to compute
         * extra_j, and then again to compute the shrunken factors.
         *
         * Incidentally, extra_j is defined as:
         *
         * extra_j = p * t_j / T / R
         *
         * where p is the constant prior count, t_j is the library size for j,
         * T is the mean library size across all j, and R is the library size
         * for the reference profile. Basically, p * ref_i / R is how we
         * "spread out" the prior count across all genes based on their
         * relative abundance in the reference profile, while t_j / T
         * represents the library size factor that we are shrinking towards.
         *
         * The addition of extra_j means that the shrunken size factor is
         * slightly too big to normalize against the reference. To adjust for
         * this, imagine that we have some unknown position-invariant function
         * f() such that E[f(y_{ij} / ref_i)] yields the true size factor x_j.
         * For example, f() would be the mean if there wasn't any differential
         * expression between columns; or the median, if the counts were large
         * enough. Our shrunken size factor can then be written as:
         *
         * f(ratio_{ij}) = x_j + extra_j
         *
         * To correct our shrunken size factor to x_j, we need to divide it by:
         *
         * (x_j + extra_j) / x_j = 1 + (extra_j / x_j)
         *
         * Of course, we don't actually know x_j, so we approximate it by
         * assuming x_j =~ t_j / R, i.e., library size normalization against
         * the reference. This simplifies the correction to:
         *
         * 1 + (p / T)
         *
         * which is what we use to divide our median-based shrunken factor.
         * It's not exactly correct, but it should be good enough when T >> p,
         * and it at least ensures that the normalization is accurate in the
         * simplest case where the only difference is due to library size.
         */
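
        /* As an illustrative check with made-up numbers (not taken from any
         * real dataset): suppose p = 10, T = 2000, t_j = 1000 and R = 500.
         * Then scaling = p / T = 0.005 and extra_j = t_j * scaling / R = 0.01.
         * If column j is exactly twice the reference, the unshrunken median
         * ratio is 2, so the corrected factor is (2 + 0.01) / (1 + 0.005) = 2,
         * i.e., we recover the library size-derived factor t_j / R = 2 when
         * library size is the only difference from the reference.
         */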
        if (prior_count && NR && NC) {
            double mean = std::accumulate(sums.begin(), sums.end(), static_cast<double>(0));
            mean /= NC;

            double reftotal = std::accumulate(ref, ref + NR, static_cast<double>(0));

            if (mean && reftotal) {
                double scaling = prior_count / mean;
                for (size_t i = 0; i < NC; ++i) {
                    output[i] += sums[i] * scaling / reftotal;
                    output[i] /= 1.0 + scaling;
                }
            }
        }

        // Throwing in some centering.
        if (center) {
            CenterSizeFactors centerer;
            centerer.run(NC, output);
        }

        return;
    }

    /**
     * Compute size factors using the mean expression profile across all columns as the reference.
     *
     * @param mat Matrix of counts, with genes in rows and cells in columns.
     * @param[out] output Pointer to an array of length `mat->ncol()`, in which the size factors are stored.
     */
    template<typename T, typename IDX, typename Out>
    void run_with_mean(const tatami::Matrix<T, IDX>* mat, Out* output) const {
        auto ref = tatami::row_sums(mat, num_threads);
        if (ref.size()) {
            double NC = mat->ncol();
            for (auto& r : ref) {
                r /= NC;
            }
        }
        run(mat, ref.data(), output);
        return;
    }

public:
    /**
     * @brief Result of the size factor calculation.
     */
    template<typename Out>
    struct Results {
        Results(size_t NC) : factors(NC) {}

        /**
         * Vector of size factors, one per column of the input matrix.
         */
        std::vector<Out> factors;
    };

    // Overload of run() that allocates and returns a Results object containing the size factors.
    template<typename Out = double, typename T, typename IDX, typename Ref>
    Results<Out> run(const tatami::Matrix<T, IDX>* mat, const Ref* ref) const {
        Results<Out> output(mat->ncol());
        run(mat, ref, output.factors.data());
        return output;
    }

    // Overload of run_with_mean() that allocates and returns a Results object containing the size factors.
    template<typename Out = double, typename T, typename IDX>
    Results<Out> run_with_mean(const tatami::Matrix<T, IDX>* mat) const {
        Results<Out> output(mat->ncol());
        run_with_mean(mat, output.factors.data());
        return output;
    }
};

}

#endif
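
For reference, a minimal usage sketch is shown below. This is not part of the header above; it assumes that tatami provides a DenseRowMatrix<double, int> constructible from the number of rows, the number of columns and a row-major vector of values, and that the header is reachable via the hypothetical include path shown. Consult the tatami and scran documentation for the exact interfaces.

#include "scran/MedianSizeFactors.hpp" // hypothetical include path
#include "tatami/tatami.hpp"

#include <iostream>
#include <vector>

int main() {
    // A made-up 3-gene x 4-cell count matrix, stored in row-major order.
    std::vector<double> counts {
        10, 20, 10, 40,
         5, 10,  5, 20,
         0,  1,  0,  2
    };
    tatami::DenseRowMatrix<double, int> mat(3, 4, std::move(counts));

    // Compute size factors against the row means of the matrix itself,
    // using the default prior count and centering.
    scran::MedianSizeFactors med;
    auto res = med.run_with_mean(&mat);

    // One factor per cell (column).
    for (auto f : res.factors) {
        std::cout << f << "\n";
    }
    return 0;
}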