scran
C++ library for basic single-cell RNA-seq analyses
MultiBatchPca.hpp
1#ifndef SCRAN_MULTI_BATCH_PCA
2#define SCRAN_MULTI_BATCH_PCA
3
4#include "tatami/tatami.hpp"
5
6#include "irlba/irlba.hpp"
7#include "Eigen/Dense"
8
9#include <vector>
10#include <cmath>
11
12#include "utils.hpp"
13#include "convert.hpp"
14#include "wrappers.hpp"
15#include "blocking.hpp"
16
23namespace scran {
24
44class MultiBatchPca {
45public:
49 struct Defaults {
53 static constexpr int rank = 10;
54
58 static constexpr bool scale = false;
59
63 static constexpr bool transpose = true;
64
68 static constexpr bool use_residuals = false;
69
73 static constexpr WeightPolicy block_weight_policy = WeightPolicy::VARIABLE;
74
78 static constexpr VariableBlockWeightParameters variable_block_weight_parameters = {};
79
83 static constexpr int num_threads = 1;
84
88 static constexpr bool return_rotation = false;
89
93 static constexpr bool return_center = false;
94
98 static constexpr bool return_scale = false;
99 };
100
101private:
102 bool scale = Defaults::scale;
103 bool transpose = Defaults::transpose;
104 int rank = Defaults::rank;
105
106 bool use_residuals = Defaults::use_residuals;
107 WeightPolicy block_weight_policy = Defaults::block_weight_policy;
108 VariableBlockWeightParameters variable_block_weight_parameters = Defaults::variable_block_weight_parameters;
109
110 bool return_rotation = Defaults::return_rotation;
111 bool return_center = Defaults::return_center;
112 bool return_scale = Defaults::return_scale;
113
114 int nthreads = Defaults::num_threads;
115
116public:
124 MultiBatchPca& set_rank(int r = Defaults::rank) {
125 rank = r;
126 return *this;
127 }
128
134 MultiBatchPca& set_scale(bool s = Defaults::scale) {
135 scale = s;
136 return *this;
137 }
138
145 MultiBatchPca& set_transpose(bool t = Defaults::transpose) {
146 transpose = t;
147 return *this;
148 }
149
155 MultiBatchPca& set_use_residuals(bool u = Defaults::use_residuals) {
156 use_residuals = u;
157 return *this;
158 }
159
165 MultiBatchPca& set_block_weight_policy(WeightPolicy w = Defaults::block_weight_policy) {
166 block_weight_policy = w;
167 return *this;
168 }
169
176 MultiBatchPca& set_variable_block_weight_parameters(VariableBlockWeightParameters v = Defaults::variable_block_weight_parameters) {
177 variable_block_weight_parameters = v;
178 return *this;
179 }
180
186 MultiBatchPca& set_return_rotation(bool r = Defaults::return_rotation) {
187 return_rotation = r;
188 return *this;
189 }
190
196 MultiBatchPca& set_return_center(bool r = Defaults::return_center) {
197 return_center = r;
198 return *this;
199 }
200
206 MultiBatchPca& set_return_scale(bool r = Defaults::return_scale) {
207 return_scale = r;
208 return *this;
209 }
210
215 MultiBatchPca& set_num_threads(int n = Defaults::num_threads) {
216 nthreads = n;
217 return *this;
218 }
219
220private:
221 template<typename Data_, typename Index_, typename Block_>
222 void run_sparse_simple(
223 const tatami::Matrix<Data_, Index_>* mat,
224 const Block_* block,
225 const pca_utils::BlockingDetails<true>& block_details,
226 const irlba::Irlba& irb,
227 Eigen::MatrixXd& pcs,
228 Eigen::MatrixXd& rotation,
229 Eigen::VectorXd& variance_explained,
230 Eigen::VectorXd& center_v,
231 Eigen::VectorXd& scale_v,
232 double& total_var)
233 const {
234 auto extracted = pca_utils::extract_sparse_for_pca(mat, nthreads); // row-major extraction.
235 pca_utils::SparseMatrix emat(mat->ncol(), mat->nrow(), std::move(extracted.values), std::move(extracted.indices), std::move(extracted.ptrs), nthreads); // CSC with genes in columns.
236
237 size_t ngenes = emat.cols();
238 center_v.resize(ngenes);
239 scale_v.resize(ngenes);
240
241 tatami::parallelize([&](size_t, size_t start, size_t length) -> void {
242 const auto& values = emat.get_values();
243 const auto& indices = emat.get_indices();
244 const auto& ptrs = emat.get_pointers();
245
246 const auto& block_size = block_details.block_size;
247 size_t nblocks = block_size.size();
248 std::vector<int> block_count(nblocks);
249 const auto& block_weight = block_details.per_element_weight;
250
251 for (size_t r = start, end = start + length; r < end; ++r) {
252 auto offset = ptrs[r];
253 size_t num_entries = ptrs[r+1] - offset;
254 auto value_ptr = values.data() + offset;
255 auto index_ptr = indices.data() + offset;
256
257 std::fill(block_count.begin(), block_count.end(), 0);
258
259 // Computing the grand mean across all blocks.
260 double& grand_mean = center_v[r];
261 grand_mean = 0;
262 for (size_t i = 0; i < num_entries; ++i) {
263 auto b = block[index_ptr[i]];
264 grand_mean += value_ptr[i] * block_weight[b];
265 ++(block_count[b]);
266 }
267 grand_mean /= block_details.total_block_weight;
268
269 // Computing pseudo-variances where each block's contribution
270 // is weighted in inverse proportion to its size. This aims to
271 // match the variances used in the PCA itself, rather than the
272 // variances of the output components (where no weighting is applied).
273 double& proxyvar = scale_v[r];
274 proxyvar = 0;
275 for (size_t b = 0; b < nblocks; ++b) {
276 double zero_sum = (block_size[b] - block_count[b]) * grand_mean * grand_mean;
277 proxyvar += zero_sum * block_weight[b];
278 }
279
280 for (size_t i = 0; i < num_entries; ++i) {
281 double diff = value_ptr[i] - grand_mean;
282 proxyvar += diff * diff * block_weight[block[index_ptr[i]]];
283 }
284
285 proxyvar /= emat.rows() - 1;
286 }
287 }, ngenes, nthreads);
288
289 total_var = pca_utils::process_scale_vector(scale, scale_v);
290
291 // Now actually performing the PCA.
292 irlba::Centered<decltype(emat)> centered(&emat, &center_v);
293 if (scale) {
294 irlba::Scaled<decltype(centered)> scaled(&centered, &scale_v);
295 pca_utils::SampleScaledWrapper<decltype(scaled)> weighted(&scaled, &(block_details.expanded_weights));
296 irb.run(weighted, pcs, rotation, variance_explained);
297 } else {
298 pca_utils::SampleScaledWrapper<decltype(centered)> weighted(&centered, &(block_details.expanded_weights));
299 irb.run(weighted, pcs, rotation, variance_explained);
300 }
301
302 // This transposes 'pcs' to be an NDIM * NCELLS matrix.
303 pca_utils::project_sparse_matrix(emat, pcs, rotation, scale, scale_v, nthreads);
304
305 pca_utils::clean_up_projected<true>(pcs, variance_explained);
306 if (!transpose) {
307 pcs.adjointInPlace();
308 }
309 }
310
311 template<typename Data_, typename Index_, typename Block_>
312 void run_dense_simple(
313 const tatami::Matrix<Data_, Index_>* mat,
314 const Block_* block,
315 const pca_utils::BlockingDetails<true>& block_details,
316 const irlba::Irlba& irb,
317 Eigen::MatrixXd& pcs,
318 Eigen::MatrixXd& rotation,
319 Eigen::VectorXd& variance_explained,
320 Eigen::VectorXd& center_v,
321 Eigen::VectorXd& scale_v,
322 double& total_var)
323 const {
324 auto emat = pca_utils::extract_dense_for_pca(mat, nthreads); // get a column-major matrix with genes in columns.
325
326 size_t ngenes = emat.cols();
327 center_v.resize(ngenes);
328 scale_v.resize(ngenes);
329
330 tatami::parallelize([&](size_t, size_t start, size_t length) -> void {
331 size_t nblocks = block_details.num_blocks();
332 std::vector<double> mean_buffer(nblocks);
333 const auto& block_weight = block_details.per_element_weight;
334 size_t ncells = emat.rows();
335
336 for (size_t c = start, end = start + length; c < end; ++c) {
337 auto ptr = emat.data() + c * ncells;
338
339 double& grand_mean = center_v[c];
340 grand_mean = 0;
341 for (size_t r = 0; r < ncells; ++r) {
342 grand_mean += ptr[r] * block_weight[block[r]];
343 }
344 grand_mean /= block_details.total_block_weight;
345
346 // We don't actually compute the batchwise variance, but instead
347 // the weighted sum of squared deltas, which is what PCA actually sees.
348 double& proxyvar = scale_v[c];
349 proxyvar = 0;
350 for (size_t r = 0; r < ncells; ++r) {
351 double diff = ptr[r] - grand_mean;
352 proxyvar += diff * diff * block_weight[block[r]];
353 }
354
355 proxyvar /= emat.rows() - 1;
356 }
357 }, ngenes, nthreads);
358
359 total_var = pca_utils::process_scale_vector(scale, scale_v);
360
361 // Applying the centering and scaling now so we can do the PCA with fewer wrappers.
362 pca_utils::apply_center_and_scale_to_dense_matrix(emat, center_v, scale, scale_v, nthreads);
363
364 pca_utils::SampleScaledWrapper<decltype(emat)> weighted(&emat, &(block_details.expanded_weights));
365 irb.run(weighted, pcs, rotation, variance_explained);
366
367 pcs.noalias() = emat * rotation;
368 pca_utils::clean_up_projected<false>(pcs, variance_explained);
369 if (transpose) {
370 pcs.adjointInPlace();
371 }
372 }
373
374private:
375 template<bool weight_, typename Matrix_, typename Block_>
376 void run_residuals_internal(
377 const Matrix_& emat,
378 const Block_* block,
379 const pca_utils::BlockingDetails<weight_>& block_details,
380 const Eigen::MatrixXd& center_m,
381 const Eigen::VectorXd& scale_v,
382 const irlba::Irlba& irb,
383 Eigen::MatrixXd& pcs,
384 Eigen::MatrixXd& rotation,
385 Eigen::VectorXd& variance_explained)
386 const {
387 pca_utils::RegressWrapper<Matrix_, Block_> centered(&emat, block, &center_m);
388
389 if constexpr(weight_) {
390 if (scale) {
391 irlba::Scaled<decltype(centered)> scaled(&centered, &scale_v);
392 pca_utils::SampleScaledWrapper<decltype(scaled)> weighted(&scaled, &(block_details.expanded_weights));
393 irb.run(weighted, pcs, rotation, variance_explained);
394 } else {
395 pca_utils::SampleScaledWrapper<decltype(centered)> weighted(&centered, &(block_details.expanded_weights));
396 irb.run(weighted, pcs, rotation, variance_explained);
397 }
398
399 } else {
400 if (scale) {
401 irlba::Scaled<decltype(centered)> scaled(&centered, &scale_v);
402 irb.run(scaled, pcs, rotation, variance_explained);
403 } else {
404 irb.run(centered, pcs, rotation, variance_explained);
405 }
406 }
407 }
408
409 template<bool weight_, typename Data_, typename Index_, typename Block_>
410 void run_sparse_residuals(
411 const tatami::Matrix<Data_, Index_>* mat,
412 const Block_* block,
413 const pca_utils::BlockingDetails<weight_>& block_details,
414 const irlba::Irlba& irb,
415 Eigen::MatrixXd& pcs,
416 Eigen::MatrixXd& rotation,
417 Eigen::VectorXd& variance_explained,
418 Eigen::MatrixXd& center_m,
419 Eigen::VectorXd& scale_v,
420 double& total_var)
421 const {
422 auto extracted = pca_utils::extract_sparse_for_pca(mat, nthreads); // row-major extraction.
423 pca_utils::SparseMatrix emat(mat->ncol(), mat->nrow(), std::move(extracted.values), std::move(extracted.indices), std::move(extracted.ptrs), nthreads); // CSC with genes in columns.
424
425 size_t ngenes = emat.cols();
426 center_m.resize(block_details.num_blocks(), ngenes);
427 scale_v.resize(ngenes);
428 pca_utils::compute_mean_and_variance_regress<weight_>(emat, block, block_details, center_m, scale_v, nthreads);
429 total_var = pca_utils::process_scale_vector(scale, scale_v);
430
431 run_residuals_internal<weight_>(
432 emat,
433 block,
434 block_details,
435 center_m,
436 scale_v,
437 irb,
438 pcs,
439 rotation,
440 variance_explained
441 );
442
443 // This transposes 'pcs' to be an NDIM * NCELLS matrix.
444 pca_utils::project_sparse_matrix(emat, pcs, rotation, scale, scale_v, nthreads);
445
446 pca_utils::clean_up_projected<true>(pcs, variance_explained);
447 if (!transpose) {
448 pcs.adjointInPlace();
449 }
450 }
451
452 template<bool weight_, typename Data_, typename Index_, typename Block_>
453 void run_dense_residuals(
454 const tatami::Matrix<Data_, Index_>* mat,
455 const Block_* block,
456 const pca_utils::BlockingDetails<weight_>& block_details,
457 const irlba::Irlba& irb,
458 Eigen::MatrixXd& pcs,
459 Eigen::MatrixXd& rotation,
460 Eigen::VectorXd& variance_explained,
461 Eigen::MatrixXd& center_m,
462 Eigen::VectorXd& scale_v,
463 double& total_var)
464 const {
465 auto emat = pca_utils::extract_dense_for_pca(mat, nthreads); // get a column-major matrix with genes in columns.
466
467 size_t ngenes = emat.cols();
468 center_m.resize(block_details.num_blocks(), ngenes);
469 scale_v.resize(ngenes);
470 pca_utils::compute_mean_and_variance_regress<weight_>(emat, block, block_details, center_m, scale_v, nthreads);
471 total_var = pca_utils::process_scale_vector(scale, scale_v);
472
473 // No choice but to use wrappers here, as we still need the original matrix for projection.
474 run_residuals_internal<weight_>(
475 emat,
476 block,
477 block_details,
478 center_m,
479 scale_v,
480 irb,
481 pcs,
482 rotation,
483 variance_explained
484 );
485
486 if (scale) {
487 pcs.noalias() = emat * (rotation.array().colwise() / scale_v.array()).matrix();
488 } else {
489 pcs.noalias() = emat * rotation;
490 }
491
492 pca_utils::clean_up_projected<false>(pcs, variance_explained);
493 if (transpose) {
494 pcs.adjointInPlace();
495 }
496 }
497
498private:
499 template<typename Data_, typename Index_, typename Block_>
500 void run_internal(
501 const tatami::Matrix<Data_, Index_>* mat,
502 const Block_* block,
503 Eigen::MatrixXd& pcs,
504 Eigen::MatrixXd& rotation,
505 Eigen::VectorXd& variance_explained,
506 Eigen::MatrixXd& center_m,
507 Eigen::VectorXd& scale_v,
508 double& total_var)
509 const {
510 irlba::EigenThreadScope t(nthreads);
511 irlba::Irlba irb;
512 irb.set_number(rank);
513 irb.set_cap_number(true);
514
515 if (use_residuals) {
516 if (block_weight_policy == WeightPolicy::NONE) {
517 auto bdetails = pca_utils::compute_blocking_details(mat->ncol(), block);
518 if (mat->sparse()) {
519 run_sparse_residuals<false>(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_m, scale_v, total_var);
520 } else {
521 run_dense_residuals<false>(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_m, scale_v, total_var);
522 }
523
524 } else {
525 auto bdetails = pca_utils::compute_blocking_details(mat->ncol(), block, block_weight_policy, variable_block_weight_parameters);
526 if (mat->sparse()) {
527 run_sparse_residuals<true>(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_m, scale_v, total_var);
528 } else {
529 run_dense_residuals<true>(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_m, scale_v, total_var);
530 }
531 }
532
533 } else {
534 if (block_weight_policy == WeightPolicy::NONE) {
535 throw std::runtime_error("block weight policy cannot be NONE when 'use_residuals = false', use SimplePca instead");
536 }
537
538 auto bdetails = pca_utils::compute_blocking_details(mat->ncol(), block, block_weight_policy, variable_block_weight_parameters);
539
540 Eigen::VectorXd center_v;
541 if (mat->sparse()) {
542 run_sparse_simple(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_v, scale_v, total_var);
543 } else {
544 run_dense_simple(mat, block, bdetails, irb, pcs, rotation, variance_explained, center_v, scale_v, total_var);
545 }
546
547 if (return_center) {
548 center_m.resize(1, center_v.size());
549 center_m.row(0) = center_v;
550 }
551 }
552 }
553
554public:
560 struct Results {
567 Eigen::MatrixXd pcs;
568
576 Eigen::VectorXd variance_explained;
577
582 double total_variance = 0;
583
590 Eigen::MatrixXd rotation;
591
605 Eigen::MatrixXd center;
606
612 Eigen::VectorXd scale;
613 };
614
630 template<typename T, typename IDX, typename Batch>
631 Results run(const tatami::Matrix<T, IDX>* mat, const Batch* batch) const {
632 Results output;
633 Eigen::MatrixXd rotation, center_m;
634 Eigen::VectorXd scale_v;
635
636 run_internal(mat, batch, output.pcs, rotation, output.variance_explained, center_m, scale_v, output.total_variance);
637
638 // Moving them over if we want to keep them.
639 if (return_rotation) {
640 output.rotation = std::move(rotation);
641 }
642 if (return_center) {
643 output.center = center_m.adjoint();
644 }
645 if (return_scale) {
646 output.scale = std::move(scale_v);
647 }
648
649 return output;
650 }
651
672 template<typename T, typename IDX, typename Batch, typename X>
673 Results run(const tatami::Matrix<T, IDX>* mat, const Batch* batch, const X* features) const {
674 Results output;
675 if (!features) {
676 return run(mat, batch);
677 } else {
678 auto subsetted = pca_utils::subset_matrix_by_features(mat, features);
679 return run(subsetted.get(), batch);
680 }
681 }
682};
683
684}
685
686#endif
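For readers following the arithmetic in run_sparse_simple() and run_dense_simple() above, the per-gene statistics can be restated in plain C++. The sketch below is illustrative only and is not part of the library; the names expr, block, block_weight and total_block_weight are assumptions standing in for one gene's expression values, the per-cell block assignments, the per-element block weights and their weighted total.

#include <vector>
#include <cstddef>

// Weighted grand mean for one gene: each cell contributes in proportion to the
// weight of its block, and the sum is normalized by the total block weight
// (mirroring the grand-mean loops in the methods above).
double weighted_grand_mean(const std::vector<double>& expr,
                           const std::vector<int>& block,
                           const std::vector<double>& block_weight,
                           double total_block_weight) {
    double mean = 0;
    for (std::size_t c = 0; c < expr.size(); ++c) {
        mean += expr[c] * block_weight[block[c]];
    }
    return mean / total_block_weight;
}

// "Proxy variance" for the same gene: the weighted sum of squared differences
// from the grand mean, divided by (ncells - 1). This is the quantity that the
// weighted PCA effectively sees, not the variance of any individual block.
double proxy_variance(const std::vector<double>& expr,
                      const std::vector<int>& block,
                      const std::vector<double>& block_weight,
                      double grand_mean) {
    double var = 0;
    for (std::size_t c = 0; c < expr.size(); ++c) {
        double diff = expr[c] - grand_mean;
        var += diff * diff * block_weight[block[c]];
    }
    return var / (expr.size() - 1);
}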
Utilities for handling blocks of cells.
Compute PCA after adjusting for differences between batch sizes.
Definition MultiBatchPca.hpp:44
Results run(const tatami::Matrix< T, IDX > *mat, const Batch *batch) const
Definition MultiBatchPca.hpp:631
MultiBatchPca & set_rank(int r=Defaults::rank)
Definition MultiBatchPca.hpp:124
MultiBatchPca & set_return_rotation(bool r=Defaults::return_rotation)
Definition MultiBatchPca.hpp:186
MultiBatchPca & set_block_weight_policy(WeightPolicy w=Defaults::block_weight_policy)
Definition MultiBatchPca.hpp:165
MultiBatchPca & set_transpose(bool t=Defaults::transpose)
Definition MultiBatchPca.hpp:145
MultiBatchPca & set_use_residuals(bool u=Defaults::use_residuals)
Definition MultiBatchPca.hpp:155
MultiBatchPca & set_return_scale(bool r=Defaults::return_scale)
Definition MultiBatchPca.hpp:206
MultiBatchPca & set_num_threads(int n=Defaults::num_threads)
Definition MultiBatchPca.hpp:215
MultiBatchPca & set_return_center(bool r=Defaults::return_center)
Definition MultiBatchPca.hpp:196
Results run(const tatami::Matrix< T, IDX > *mat, const Batch *batch, const X *features) const
Definition MultiBatchPca.hpp:673
MultiBatchPca & set_variable_block_weight_parameters(VariableBlockWeightParameters v=Defaults::variable_block_weight_parameters)
Definition MultiBatchPca.hpp:176
MultiBatchPca & set_scale(bool s=Defaults::scale)
Definition MultiBatchPca.hpp:134
Functions for single-cell RNA-seq analyses.
Definition AggregateAcrossCells.hpp:18
WeightPolicy
Definition blocking.hpp:82
Default parameter settings.
Definition MultiBatchPca.hpp:49
static constexpr bool transpose
Definition MultiBatchPca.hpp:63
static constexpr int rank
Definition MultiBatchPca.hpp:53
static constexpr bool return_rotation
Definition MultiBatchPca.hpp:88
static constexpr WeightPolicy block_weight_policy
Definition MultiBatchPca.hpp:73
static constexpr int num_threads
Definition MultiBatchPca.hpp:83
static constexpr bool return_center
Definition MultiBatchPca.hpp:93
static constexpr bool scale
Definition MultiBatchPca.hpp:58
static constexpr bool return_scale
Definition MultiBatchPca.hpp:98
static constexpr bool use_residuals
Definition MultiBatchPca.hpp:68
static constexpr VariableBlockWeightParameters variable_block_weight_parameters
Definition MultiBatchPca.hpp:78
Container for the PCA results.
Definition MultiBatchPca.hpp:560
Eigen::MatrixXd center
Definition MultiBatchPca.hpp:605
Eigen::VectorXd variance_explained
Definition MultiBatchPca.hpp:576
double total_variance
Definition MultiBatchPca.hpp:582
Eigen::MatrixXd rotation
Definition MultiBatchPca.hpp:590
Eigen::VectorXd scale
Definition MultiBatchPca.hpp:612
Eigen::MatrixXd pcs
Definition MultiBatchPca.hpp:567
Parameters for variable_block_weight().
Definition blocking.hpp:87
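To show how the class documented above fits together, here is a minimal usage sketch. The include path and the mock tatami::DenseRowMatrix construction are assumptions made for illustration; in practice 'mat' would hold log-normalized expression values with genes in rows and cells in columns, and 'batch' one label per cell.

#include "MultiBatchPca.hpp" // adjust the include path to your scran installation.
#include "tatami/tatami.hpp"
#include <vector>
#include <memory>
#include <cstddef>

int main() {
    // Mock gene-by-cell matrix: 100 genes, 20 cells, arbitrary deterministic values.
    int ngenes = 100, ncells = 20;
    std::vector<double> values(static_cast<std::size_t>(ngenes) * ncells);
    for (std::size_t i = 0; i < values.size(); ++i) {
        values[i] = static_cast<double>(i % 13);
    }
    auto mat = std::make_shared<tatami::DenseRowMatrix<double, int> >(ngenes, ncells, std::move(values));

    // One batch assignment per cell (i.e., per column of 'mat').
    std::vector<int> batch(ncells);
    for (int c = 0; c < ncells; ++c) {
        batch[c] = (c < 10 ? 0 : 1);
    }

    scran::MultiBatchPca runner;
    runner.set_rank(5).set_num_threads(2); // setters are chainable, as defined above.

    auto res = runner.run(mat.get(), batch.data());
    // With the default transpose = true, res.pcs has one row per PC and one column per cell.
    // res.variance_explained[i] / res.total_variance gives the proportion of variance for PC i.
    return 0;
}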