scran
C++ library for basic single-cell RNA-seq analyses
Loading...
Searching...
No Matches
GroupedSizeFactors.hpp
Go to the documentation of this file.
1#ifndef SCRAN_CLUSTERED_SIZE_FACTORS_HPP
2#define SCRAN_CLUSTERED_SIZE_FACTORS_HPP
3
4#include "../utils/macros.hpp"
5
7#include "SanitizeSizeFactors.hpp"
9#include "../aggregation/AggregateAcrossCells.hpp"
10
11#include "tatami/tatami.hpp"
12
13#include "../utils/blocking.hpp"
14
15#include <memory>
16#include <vector>
17#include <algorithm>
18
25namespace scran {
26
44public:
48 struct Defaults {
52 static constexpr bool center = true;
53
57 static constexpr bool handle_zeros = false;
58
62 static constexpr bool handle_non_finite = false;
63
67 static constexpr int num_threads = 1;
68 };
69
70public:
80 center = c;
81 return *this;
82 }
83
90 prior_count = p;
91 return *this;
92 }
93
104 handle_zeros = z;
105 return *this;
106 }
107
118 handle_non_finite = n;
119 return *this;
120 }
121
127 num_threads = n;
128 return *this;
129 }
130
131private:
132 bool center = Defaults::center;
133 double prior_count = MedianSizeFactors::Defaults::prior_count;
134 int num_threads = Defaults::num_threads;
135 bool handle_zeros = Defaults::handle_zeros;
136 bool handle_non_finite = Defaults::handle_non_finite;
137
138public:
156 template<typename T, typename IDX, typename Group, typename Out>
157 void run(const tatami::Matrix<T, IDX>* mat, const Group* group, Out* output) const {
158 run_internal(mat, group, false, output);
159 }
160
178 template<typename T, typename IDX, typename Group, typename Out>
179 void run(const tatami::Matrix<T, IDX>* mat, const Group* group, size_t reference, Out* output) const {
180 run_internal(mat, group, reference, output);
181 }
182
183private:
184 template<typename T, typename IDX, typename Group, typename Ref, typename Out>
185 void run_internal(const tatami::Matrix<T, IDX>* mat, const Group* group, Ref reference, Out* output) const {
186 size_t NR = mat->nrow(), NC = mat->ncol();
187 if (!NC) {
188 return;
189 }
190
191 // Aggregating each group to get a pseudo-bulk sample.
192 auto ngroups = count_ids(NC, group);
193 std::vector<double> combined(ngroups * NR);
194 {
195 std::vector<double*> sums(ngroups);
196 for (size_t i = 0; i < ngroups; ++i) {
197 sums[i] = combined.data() + i * NR;
198 }
199 AggregateAcrossCells aggregator;
200 aggregator.set_num_threads(num_threads).run(mat, group, std::move(sums), std::vector<int*>());
201 }
202
203 size_t ref = 0;
204 if constexpr(std::is_same<Ref, size_t>::value) {
205 ref = reference;
206 } else {
207 // Choosing one of them to be the reference. Here, we borrow some logic
208 // from edgeR and use the one with the largest sum of square roots,
209 // which provides a compromise between transcriptome coverage and
210 // complexity. The root ensures that we don't pick a sample that just
211 // has very high expression in a small subset of genes, while still
212 // remaining responsive to the overall coverage level.
213 double best = 0;
214
215 for (size_t i = 0; i < ngroups; ++i) {
216 auto start = combined.data() + i * NR;
217 double current = 0;
218 for (size_t j = 0; j < NR; ++j) {
219 current += std::sqrt(start[j]);
220 }
221 if (current > best) {
222 ref = i;
223 best = current;
224 }
225 }
226 }
227
228 // Computing median-based size factors. No need to center
229 // here as we'll be recentering afterwards anyway.
230 tatami::ArrayView view(combined.data(), combined.size());
231 tatami::DenseColumnMatrix<T, IDX, decltype(view)> aggmat(NR, ngroups, std::move(view));
232
233 MedianSizeFactors med;
234 med.set_num_threads(num_threads).set_center(false).set_prior_count(prior_count);
235 auto mres = med.run(&aggmat, combined.data() + ref * NR);
236
237 // We need to rescue odd size factors here; if they're zero or infinite,
238 // we lose information about the relative scaling of cells within each group.
239 auto cres = validate_size_factors(mres.factors.size(), mres.factors.data());
240 SanitizeSizeFactors sanitizer;
241 sanitizer.set_handle_zero(handle_zeros ? SanitizeSizeFactors::HandlerAction::SANITIZE : SanitizeSizeFactors::HandlerAction::ERROR);
242 sanitizer.set_handle_non_finite(handle_non_finite ? SanitizeSizeFactors::HandlerAction::SANITIZE : SanitizeSizeFactors::HandlerAction::ERROR);
243 sanitizer.run(mres.factors.size(), mres.factors.data(), cres);
244
245 // Propagating to each cell via library size-based normalization.
246 auto aggcolsums = tatami::column_sums(&aggmat, num_threads);
247 auto colsums = tatami::column_sums(mat, num_threads);
248 for (size_t i = 0; i < NC; ++i) {
249 auto curgroup = group[i];
250 if (aggcolsums[curgroup]) {
251 auto scale = static_cast<double>(colsums[i])/static_cast<double>(aggcolsums[curgroup]);
252 output[i] = scale * mres.factors[curgroup];
253 } else {
254 // implies colsums[i] == 0, so instead of getting NaN size factors,
255 // we just set the factors to zero, given that everything in the group is also zero.
256 output[i] = 0;
257 }
258 }
259
260 if (center) {
261 CenterSizeFactors centerer;
262 centerer.run(NC, output);
263 }
264 }
265
266public:
271 template<typename Out>
272 struct Results {
276 Results(size_t NC) : factors(NC) {}
285 std::vector<Out> factors;
286 };
287
303 template<typename Out = double, typename T, typename IDX, typename Group>
304 Results<Out> run(const tatami::Matrix<T, IDX>* mat, const Group* group) const {
305 Results<Out> output(mat->ncol());
306 run(mat, group, output.factors.data());
307 return output;
308 }
309
326 template<typename Out = double, typename T, typename IDX, typename Group>
327 Results<Out> run(const tatami::Matrix<T, IDX>* mat, const Group* group, size_t reference) const {
328 Results<Out> output(mat->ncol());
329 run(mat, group, reference, output.factors.data());
330 return output;
331 }
332};
333
334}
335
336#endif
Center size factors prior to normalization.
Compute median-based size factors.
Compute grouped size factors to handle composition bias.
Definition GroupedSizeFactors.hpp:43
GroupedSizeFactors & set_handle_non_finite(bool n=Defaults::handle_non_finite)
Definition GroupedSizeFactors.hpp:117
void run(const tatami::Matrix< T, IDX > *mat, const Group *group, size_t reference, Out *output) const
Definition GroupedSizeFactors.hpp:179
Results< Out > run(const tatami::Matrix< T, IDX > *mat, const Group *group, size_t reference) const
Definition GroupedSizeFactors.hpp:327
GroupedSizeFactors & set_center(bool c=Defaults::center)
Definition GroupedSizeFactors.hpp:79
GroupedSizeFactors & set_handle_zeros(bool z=Defaults::handle_zeros)
Definition GroupedSizeFactors.hpp:103
GroupedSizeFactors & set_num_threads(int n=Defaults::num_threads)
Definition GroupedSizeFactors.hpp:126
Results< Out > run(const tatami::Matrix< T, IDX > *mat, const Group *group) const
Definition GroupedSizeFactors.hpp:304
GroupedSizeFactors & set_prior_count(double p=MedianSizeFactors::Defaults::prior_count)
Definition GroupedSizeFactors.hpp:89
void run(const tatami::Matrix< T, IDX > *mat, const Group *group, Out *output) const
Definition GroupedSizeFactors.hpp:157
Functions for single-cell RNA-seq analyses.
Definition AggregateAcrossCells.hpp:18
SizeFactorValidity validate_size_factors(size_t n, const T *size_factors)
Definition SanitizeSizeFactors.hpp:46
size_t count_ids(size_t length, const Id_ *ids)
Definition blocking.hpp:29
Default parameter settings.
Definition GroupedSizeFactors.hpp:48
static constexpr bool handle_non_finite
Definition GroupedSizeFactors.hpp:62
static constexpr int num_threads
Definition GroupedSizeFactors.hpp:67
static constexpr bool handle_zeros
Definition GroupedSizeFactors.hpp:57
static constexpr bool center
Definition GroupedSizeFactors.hpp:52
Result of the size factor calculations.
Definition GroupedSizeFactors.hpp:272
std::vector< Out > factors
Definition GroupedSizeFactors.hpp:285
static constexpr double prior_count
Definition MedianSizeFactors.hpp:48