scran
C++ library for basic single-cell RNA-seq analyses
Loading...
Searching...
No Matches
quick_grouped_size_factors.hpp
Go to the documentation of this file.
1#ifndef SCRAN_QUICK_GROUPED_SIZE_FACTORS_HPP
2#define SCRAN_QUICK_GROUPED_SIZE_FACTORS_HPP
3
4#include "../utils/macros.hpp"
5
6#include <algorithm>
7#include <vector>
8#include <cmath>
9#include <functional>
10#include <memory>
11
12#include "tatami/tatami.hpp"
13#include "kmeans/Kmeans.hpp"
14#include "kmeans/InitializePCAPartition.hpp"
15
16#include "../utils/blocking.hpp"
17#include "../dimensionality_reduction/SimplePca.hpp"
18#include "LogNormCounts.hpp"
20
26namespace scran {
27
32namespace quick_grouped_size_factors {
33
39template<
40 typename Block_ = int,
41 typename SizeFactor_ = double
42>
43struct Options {
47 int rank = 25;
48
53 std::function<size_t(size_t)> clusters;
54
61 const Block_* block = NULL;
62
68 const SizeFactor_* initial_factors = NULL;
69
73 int num_threads = 1;
74};
75
79namespace internal {
80
81template<
82 typename Value_,
83 typename Index_
84>
85auto cluster(const tatami::Matrix<Value_, Index_>* mat, int rank, size_t clusters, int num_threads) {
86 SimplePca pca_runner;
87 pca_runner.set_rank(rank);
88 pca_runner.set_num_threads(num_threads);
89 auto pc_out = pca_runner.run(mat);
90 const auto& pcs = pc_out.pcs;
91
92 kmeans::Kmeans kmeans_runner;
93 kmeans_runner.set_num_threads(num_threads);
94 kmeans::InitializePCAPartition<Value_, Index_, Index_> init;
95 return kmeans_runner.run(
96 pcs.rows(),
97 pcs.cols(),
98 pcs.data(),
99 clusters,
100 &init
101 );
102}
103
104}
134template<
135 typename Value_,
136 typename Index_,
137 typename OutputFactor_,
138 typename Block_,
139 typename SizeFactor_
140>
141void run(const tatami::Matrix<Value_, Index_>* mat, OutputFactor_* output, const Options<Block_, SizeFactor_>& opt) {
142 std::vector<Index_> clusters;
143 Index_ NC = mat->ncol();
144 auto ptr = tatami::wrap_shared_ptr(mat);
145
146 LogNormCounts logger;
147 logger.set_num_threads(opt.num_threads);
148
149 auto fun = opt.clusters;
150 if (!fun) {
151 fun = [](size_t n) -> size_t {
152 size_t candidate = std::sqrt(static_cast<double>(n));
153 return std::min(candidate, static_cast<size_t>(50));
154 };
155 }
156
157 if (opt.block) {
158 auto nblocks = count_ids(NC, opt.block);
159 std::vector<std::vector<Index_> > assignments(nblocks);
160 for (Index_ c = 0; c < NC; ++c) {
161 assignments[opt.block[c]].push_back(c);
162 }
163
164 clusters.resize(NC);
165 Index_ last_cluster = 0;
166
167 for (size_t b = 0; b < nblocks; ++b) {
168 const auto& inblock = assignments[b];
169 auto subptr = tatami::make_DelayedSubset<1>(ptr, tatami::ArrayView<Index_>(inblock.data(), inblock.size()));
170
171 std::shared_ptr<tatami::Matrix<Value_, Index_> > normalized;
172 if (opt.initial_factors) {
173 std::vector<SizeFactor_> fac;
174 fac.reserve(inblock.size());
175 for (auto i : inblock) {
176 fac.push_back(opt.initial_factors[i]);
177 }
178 normalized = logger.run(std::move(subptr), std::move(fac));
179 } else {
180 normalized = logger.run(std::move(subptr));
181 }
182
183 auto res = internal::cluster(normalized.get(), opt.rank, fun(inblock.size()), opt.num_threads);
184 auto cIt = res.clusters.begin();
185 for (auto i : inblock) {
186 clusters[i] = *cIt + last_cluster;
187 ++cIt;
188 }
189 last_cluster += *std::max_element(res.clusters.begin(), res.clusters.end()) + 1;
190 }
191
192 } else {
193 std::shared_ptr<const tatami::Matrix<Value_, Index_> > normalized; // TODO: avoid propagating const'ness from LogNormCounts.
194 if (opt.initial_factors) {
195 std::vector<SizeFactor_> fac(opt.initial_factors, opt.initial_factors + NC);
196 normalized = logger.run(std::move(ptr), std::move(fac));
197 } else {
198 normalized = logger.run(std::move(ptr));
199 }
200
201 auto res = internal::cluster(normalized.get(), opt.rank, fun(NC), opt.num_threads);
202 clusters = std::move(res.clusters);
203 }
204
205 GroupedSizeFactors group_runner;
206 group_runner.set_num_threads(opt.num_threads);
207 group_runner.run(mat, clusters.data(), output);
208 return;
209}
210
222template<
223 typename Value_,
224 typename Index_,
225 typename OutputFactor_
226>
227void run(const tatami::Matrix<Value_, Index_>* mat, OutputFactor_* output) {
228 run(mat, output, Options<>());
229}
230
245template<
246 typename OutputFactor_ = double,
247 typename Value_,
248 typename Index_,
249 typename Block_,
250 typename SizeFactor_
251>
252std::vector<OutputFactor_> run(const tatami::Matrix<Value_, Index_>* mat, const Options<Block_, SizeFactor_>& opt) {
253 std::vector<OutputFactor_> output(mat->ncol());
254 run(mat, output.data(), opt);
255 return output;
256}
257
269template<
270 typename OutputFactor_ = double,
271 typename Value_,
272 typename Index_
273>
274std::vector<OutputFactor_> run(const tatami::Matrix<Value_, Index_>* mat) {
275 std::vector<OutputFactor_> output(mat->ncol());
276 run(mat, output.data(), Options<>());
277 return output;
278}
279
280}
281
282}
283
284#endif
Compute size factors for groups of cells.
Compute log-normalized expression values.
Compute grouped size factors to handle composition bias.
Definition GroupedSizeFactors.hpp:43
GroupedSizeFactors & set_num_threads(int n=Defaults::num_threads)
Definition GroupedSizeFactors.hpp:126
void run(const tatami::Matrix< T, IDX > *mat, const Group *group, Out *output) const
Definition GroupedSizeFactors.hpp:157
Compute log-normalized expression values.
Definition LogNormCounts.hpp:33
LogNormCounts & set_num_threads(int n=Defaults::num_threads)
Definition LogNormCounts.hpp:186
std::shared_ptr< MAT > run(std::shared_ptr< MAT > mat, V size_factors) const
Definition LogNormCounts.hpp:247
Perform a simple PCA on a gene-cell matrix.
Definition SimplePca.hpp:33
SimplePca & set_rank(int r=Defaults::rank)
Definition SimplePca.hpp:93
SimplePca & set_num_threads(int n=Defaults::num_threads)
Definition SimplePca.hpp:153
Results run(const tatami::Matrix< T, IDX > *mat) const
Definition SimplePca.hpp:301
void run(const tatami::Matrix< Value_, Index_ > *mat, OutputFactor_ *output, const Options< Block_, SizeFactor_ > &opt)
Definition quick_grouped_size_factors.hpp:141
Functions for single-cell RNA-seq analyses.
Definition AggregateAcrossCells.hpp:18
size_t count_ids(size_t length, const Id_ *ids)
Definition blocking.hpp:29
Eigen::MatrixXd pcs
Definition SimplePca.hpp:252
Options for run().
Definition quick_grouped_size_factors.hpp:43
const Block_ * block
Definition quick_grouped_size_factors.hpp:61
const SizeFactor_ * initial_factors
Definition quick_grouped_size_factors.hpp:68
int num_threads
Definition quick_grouped_size_factors.hpp:73
std::function< size_t(size_t)> clusters
Definition quick_grouped_size_factors.hpp:53
int rank
Definition quick_grouped_size_factors.hpp:47