scran
C++ library for basic single-cell RNA-seq analyses
Loading...
Searching...
No Matches
PerCellQcMetrics.hpp
Go to the documentation of this file.
1#ifndef SCRAN_PER_CELL_QC_METRICS_HPP
2#define SCRAN_PER_CELL_QC_METRICS_HPP
3
4#include "../utils/macros.hpp"
5
6#include <vector>
7#include <algorithm>
8#include <limits>
9#include <cstdint>
10
11#include "tatami/tatami.hpp"
12#include "../utils/vector_to_pointers.hpp"
13
20namespace scran {
21
46public:
53 struct Results {
57 Results() {}
58
59 Results(size_t nsubsets) : subset_total(nsubsets), subset_detected(nsubsets) {}
68 std::vector<double> total;
69
74 std::vector<int> detected;
75
81 std::vector<int> max_index;
82
87 std::vector<double> max_count;
88
94 std::vector<std::vector<double> > subset_total;
95
101 std::vector<std::vector<int> > subset_detected;
102 };
103
104public:
110 template<typename Float = double, typename Integer = int>
111 struct Buffers {
115 Buffers() {}
116
117 Buffers(size_t nsubsets) : subset_total(nsubsets, NULL), subset_detected(nsubsets, NULL) {}
126 Float* total = NULL;
127
132 Integer* detected = NULL;
133
138 Integer* max_index = NULL;
139
144 Float* max_count = NULL;
145
152 std::vector<Float*> subset_total;
153
160 std::vector<Integer*> subset_detected;
161
167 bool already_zeroed = false;
168 };
169
170public:
174 struct Defaults {
178 static constexpr bool compute_total = true;
179
183 static constexpr bool compute_detected = true;
184
188 static constexpr bool compute_max_count = true;
189
193 static constexpr bool compute_max_index = true;
194
198 static constexpr bool compute_subset_total = true;
199
203 static constexpr bool compute_subset_detected = true;
204
208 static constexpr int num_threads = 1;
209 };
210
211private:
212 bool compute_total = Defaults::compute_total;
213 bool compute_detected = Defaults::compute_detected;
214 bool compute_max_count = Defaults::compute_max_count;
215 bool compute_max_index = Defaults::compute_max_index;
216 bool compute_subset_total = Defaults::compute_subset_total;
217 bool compute_subset_detected = Defaults::compute_subset_detected;
218 int num_threads = Defaults::num_threads;
219
220public:
228 compute_total = s;
229 return *this;
230 }
231
239 compute_detected = s;
240 return *this;
241 }
242
250 compute_max_count = s;
251 return *this;
252 }
253
261 compute_max_index = s;
262 return *this;
263 }
264
272 compute_subset_total = s;
273 return *this;
274 }
275
283 compute_subset_detected = s;
284 return *this;
285 }
286
293 num_threads = n;
294 return *this;
295 }
296
297public:
312 template<class Matrix, typename Subset = const uint8_t*>
313 Results run(const Matrix* mat, const std::vector<Subset>& subsets) const {
314 Results output;
315 Buffers<> buffers;
316 auto ncells = mat->ncol();
317
318 if (compute_total) {
319 output.total.resize(ncells);
320 buffers.total = output.total.data();
321 }
322 if (compute_detected) {
323 output.detected.resize(ncells);
324 buffers.detected = output.detected.data();
325 }
326 if (compute_max_index) {
327 output.max_index.resize(ncells);
328 buffers.max_index = output.max_index.data();
329 }
330 if (compute_max_count) {
331 output.max_count.resize(ncells, pick_fill_value<double>());
332 buffers.max_count = output.max_count.data();
333 }
334
335 size_t nsubsets = subsets.size();
336
337 if (compute_subset_total) {
338 output.subset_total.resize(nsubsets);
339 buffers.subset_total.resize(nsubsets);
340 for (size_t s = 0; s < nsubsets; ++s) {
341 output.subset_total[s].resize(ncells);
342 buffers.subset_total[s] = output.subset_total[s].data();
343 }
344 }
345
346 if (compute_subset_detected) {
347 output.subset_detected.resize(nsubsets);
348 buffers.subset_detected.resize(nsubsets);
349 for (size_t s = 0; s < nsubsets; ++s) {
350 output.subset_detected[s].resize(ncells);
351 buffers.subset_detected[s] = output.subset_detected[s].data();
352 }
353 }
354
355 buffers.already_zeroed = true;
356 run(mat, subsets, buffers);
357 return output;
358 }
359
360private:
361 template<typename T>
362 static constexpr T pick_fill_value() {
363 if constexpr(std::numeric_limits<T>::has_infinity) {
364 return -std::numeric_limits<T>::infinity();
365 } else {
366 return std::numeric_limits<T>::lowest();
367 }
368 }
369
370private:
371 template<typename Data_, typename Index_, typename Subset_, typename Float_, typename Integer_>
372 void compute_direct_dense(const tatami::Matrix<Data_, Index_>* mat, const std::vector<Subset_>& subsets, Buffers<Float_, Integer_>& output) const {
373 std::vector<std::vector<int> > subset_indices;
374 if (!output.subset_total.empty() || !output.subset_detected.empty()) {
375 size_t nsubsets = subsets.size();
376 subset_indices.resize(nsubsets);
377 auto NR = mat->nrow();
378
379 for (size_t s = 0; s < nsubsets; ++s) {
380 auto& current = subset_indices[s];
381 const auto& source = subsets[s];
382
383 for (int i = 0; i < NR; ++i) {
384 if (source[i]) {
385 current.push_back(i);
386 }
387 }
388 }
389 }
390
391 tatami::parallelize([&](size_t, Index_ start, Index_ length) {
392 auto NR = mat->nrow();
393 auto ext = tatami::consecutive_extractor<false, false>(mat, start, length);
394 std::vector<Data_> vbuffer(NR);
395 bool do_max = output.max_index || output.max_count;
396
397 for (Index_ c = start, end = start + length; c < end; ++c) {
398 auto ptr = ext->fetch(c, vbuffer.data());
399
400 if (output.total) {
401 output.total[c] = std::accumulate(ptr, ptr + NR, static_cast<Float_>(0));
402 }
403
404 if (output.detected) {
405 Integer_ count = 0;
406 for (Index_ r = 0; r < NR; ++r) {
407 count += (ptr[r] != 0);
408 }
409 output.detected[c] = count;
410 }
411
412 if (do_max) {
413 auto max_count = PerCellQcMetrics::pick_fill_value<Float_>();
414 Integer_ max_index = 0;
415 for (Index_ r = 0; r < NR; ++r) {
416 if (max_count < ptr[r]) {
417 max_count = ptr[r];
418 max_index = r;
419 }
420 }
421
422 if (output.max_index) {
423 output.max_index[c] = max_index;
424 }
425 if (output.max_count) {
426 output.max_count[c] = max_count;
427 }
428 }
429
430 size_t nsubsets = subset_indices.size();
431 for (size_t s = 0; s < nsubsets; ++s) {
432 const auto& sub = subset_indices[s];
433
434 if (!output.subset_total.empty() && output.subset_total[s]) {
435 Float_ current = 0;
436 for (auto r : sub) {
437 current += ptr[r];
438 }
439 output.subset_total[s][c] = current;
440 }
441
442 if (!output.subset_detected.empty() && output.subset_detected[s]) {
443 Integer_ current = 0;
444 for (auto r : sub) {
445 current += ptr[r] != 0;
446 }
447 output.subset_detected[s][c] = current;
448 }
449 }
450 }
451 }, mat->ncol(), num_threads);
452 }
453
454 template<typename Data_, typename Index_, typename Subset_, typename Float_, typename Integer_>
455 void compute_direct_sparse(const tatami::Matrix<Data_, Index_>* mat, const std::vector<Subset_>& subsets, Buffers<Float_, Integer_>& output) const {
456 tatami::Options opt;
457 opt.sparse_ordered_index = false;
458
459 tatami::parallelize([&](size_t, Index_ start, Index_ length) {
460 auto NR = mat->nrow();
461 auto ext = tatami::consecutive_extractor<false, true>(mat, start, length, opt);
462 std::vector<Data_> vbuffer(NR);
463 std::vector<Index_> ibuffer(NR);
464
465 bool do_max = output.max_index || output.max_count;
466 std::vector<unsigned char> internal_is_nonzero(output.max_index ? NR : 0);
467
468 for (Index_ c = start, end = start + length; c < end; ++c) {
469 auto range = ext->fetch(c, vbuffer.data(), ibuffer.data());
470
471 if (output.total) {
472 output.total[c] = std::accumulate(range.value, range.value + range.number, static_cast<Float_>(0));
473 }
474
475 if (output.detected) {
476 Integer_ current = 0;
477 for (Index_ i = 0; i < range.number; ++i) {
478 current += (range.value[i] != 0);
479 }
480 output.detected[c] = current;
481 }
482
483 if (do_max) {
484 auto max_count = PerCellQcMetrics::pick_fill_value<Float_>();
485 Integer_ max_index = 0;
486 for (Index_ i = 0; i < range.number; ++i) {
487 if (max_count < range.value[i]) {
488 max_count = range.value[i];
489 max_index = range.index[i];
490 }
491 }
492
493 if (max_count <= 0 && range.number < NR) {
494 // Zero is the max.
495 max_count = 0;
496
497 // Finding the index of the first zero by tracking all
498 // indices with non-zero values. This isn't the fastest
499 // approach but it's simple and avoids assuming that
500 // indices are sorted. Hopefully we don't have to hit
501 // this section often.
502 if (output.max_index) {
503 for (Index_ i = 0; i < range.number; ++i) {
504 if (range.value[i]) {
505 internal_is_nonzero[range.index[i]] = 1;
506 }
507 }
508 for (Index_ r = 0; r < NR; ++r) {
509 if (internal_is_nonzero[r] == 0) {
510 max_index = r;
511 break;
512 }
513 }
514 for (Index_ i = 0; i < range.number; ++i) { // setting back to zero.
515 internal_is_nonzero[range.index[i]] = 0;
516 }
517 }
518 }
519
520 if (output.max_index) {
521 output.max_index[c] = max_index;
522 }
523 if (output.max_count) {
524 output.max_count[c] = max_count;
525 }
526 }
527
528 size_t nsubsets = subsets.size();
529 for (size_t s = 0; s < nsubsets; ++s) {
530 const auto& sub = subsets[s];
531
532 if (!output.subset_total.empty() && output.subset_total[s]) {
533 auto& current = output.subset_total[s][c];
534 for (Index_ i = 0; i < range.number; ++i) {
535 current += (sub[range.index[i]] != 0) * range.value[i];
536 }
537 }
538
539 if (!output.subset_detected.empty() && output.subset_detected[s]) {
540 auto& current = output.subset_detected[s][c];
541 for (Index_ i = 0; i < range.number; ++i) {
542 current += (sub[range.index[i]] != 0) * (range.value[i] != 0);
543 }
544 }
545 }
546 }
547 }, mat->ncol(), num_threads);
548 }
549
550 template<typename Data_, typename Index_, typename Subset_, typename Float_, typename Integer_>
551 void compute_running_dense(const tatami::Matrix<Data_, Index_>* mat, const std::vector<Subset_>& subsets, Buffers<Float_, Integer_>& output) const {
552 tatami::parallelize([&](size_t, Index_ start, Index_ len) {
553 auto NR = mat->nrow();
554 auto ext = tatami::consecutive_extractor<true, false>(mat, 0, NR, start, len);
555 std::vector<Data_> vbuffer(len);
556 bool do_max = output.max_index || output.max_count;
557 std::vector<Float_> internal_max_count(do_max && !output.max_count ? len : 0);
558
559 for (Index_ r = 0; r < NR; ++r) {
560 auto ptr = ext->fetch(r, vbuffer.data());
561
562 if (output.total) {
563 auto outt = output.total + start;
564 for (Index_ i = 0; i < len; ++i) {
565 outt[i] += ptr[i];
566 }
567 }
568
569 if (output.detected) {
570 auto outd = output.detected + start;
571 for (Index_ i = 0; i < len; ++i) {
572 outd[i] += (ptr[i] != 0);
573 }
574 }
575
576 if (do_max) {
577 auto outmc = (output.max_count ? output.max_count + start : internal_max_count.data());
578
579 if (r == 0) {
580 std::copy(ptr, ptr + len, outmc);
581 if (output.max_index) {
582 auto outmi = output.max_index + start;
583 std::fill(outmi, outmi + len, 0);
584 }
585
586 } else {
587 for (Index_ i = 0; i < len; ++i) {
588 auto& curmax = outmc[i];
589 if (curmax < ptr[i]) {
590 curmax = ptr[i];
591 if (output.max_index) {
592 output.max_index[i + start] = r;
593 }
594 }
595 }
596 }
597 }
598
599 size_t nsubsets = subsets.size();
600 for (size_t s = 0; s < nsubsets; ++s) {
601 const auto& sub = subsets[s];
602 if (sub[r] == 0) {
603 continue;
604 }
605
606 if (!output.subset_total.empty() && output.subset_total[s]) {
607 auto current = output.subset_total[s] + start;
608 for (Index_ i = 0; i < len; ++i) {
609 current[i] += ptr[i];
610 }
611 }
612
613 if (!output.subset_detected.empty() && output.subset_detected[s]) {
614 auto current = output.subset_detected[s] + start;
615 for (Index_ i = 0; i < len; ++i) {
616 current[i] += (ptr[i] != 0);
617 }
618 }
619 }
620 }
621 }, mat->ncol(), num_threads);
622 }
623
624 template<typename Data_, typename Index_, typename Subset_, typename Float_, typename Integer_>
625 void compute_running_sparse(const tatami::Matrix<Data_, Index_>* mat, const std::vector<Subset_>& subsets, Buffers<Float_, Integer_>& output) const {
626 tatami::Options opt;
627 opt.sparse_ordered_index = false;
628
629 Index_ NC = mat->ncol();
630 bool do_max = output.max_index || output.max_count;
631 std::vector<Float_> internal_max_count(do_max && !output.max_count ? NC : 0);
632 std::vector<Integer_> internal_last_consecutive_nonzero(do_max ? NC : 0);
633
634 tatami::parallelize([&](size_t t, Index_ start, Index_ len) {
635 auto NR = mat->nrow();
636 auto ext = tatami::consecutive_extractor<true, true>(mat, 0, NR, start, len, opt);
637 std::vector<Data_> vbuffer(len);
638 std::vector<Index_> ibuffer(len);
639
640 for (Index_ r = 0; r < NR; ++r) {
641 auto range = ext->fetch(r, vbuffer.data(), ibuffer.data());
642
643 if (output.total) {
644 for (Index_ i = 0; i < range.number; ++i) {
645 output.total[range.index[i]] += range.value[i];
646 }
647 }
648
649 if (output.detected) {
650 for (Index_ i = 0; i < range.number; ++i) {
651 output.detected[range.index[i]] += (range.value[i] != 0);
652 }
653 }
654
655 if (do_max) {
656 auto outmc = (output.max_count ? output.max_count : internal_max_count.data());
657
658 for (Index_ i = 0; i < range.number; ++i) {
659 auto& curmax = outmc[range.index[i]];
660 if (curmax < range.value[i]) {
661 curmax = range.value[i];
662 if (output.max_index) {
663 output.max_index[range.index[i]] = r;
664 }
665 }
666
667 // Getting the index of the last consecutive non-zero entry, so that
668 // we can check if zero is the max and gets its first occurrence, if necessary.
669 auto& last = internal_last_consecutive_nonzero[range.index[i]];
670 if (static_cast<Index_>(last) == r) {
671 if (range.value[i] != 0) {
672 ++last;
673 }
674 }
675 }
676 }
677
678 size_t nsubsets = subsets.size();
679 for (size_t s = 0; s < nsubsets; ++s) {
680 const auto& sub = subsets[s];
681 if (sub[r] == 0) {
682 continue;
683 }
684
685 if (!output.subset_total.empty() && output.subset_total[s]) {
686 auto& current = output.subset_total[s];
687 for (size_t i = 0; i < range.number; ++i) {
688 current[range.index[i]] += range.value[i];
689 }
690 }
691
692 if (!output.subset_detected.empty() && output.subset_detected[s]) {
693 auto& current = output.subset_detected[s];
694 for (size_t i = 0; i < range.number; ++i) {
695 current[range.index[i]] += (range.value[i] != 0);
696 }
697 }
698 }
699 }
700 }, NC, num_threads);
701
702 if (do_max) {
703 auto outmc = (output.max_count ? output.max_count : internal_max_count.data());
704 auto NR = mat->nrow();
705
706 // Checking anything with non-positive maximum, and replacing it with zero
707 // if there are any zeros (i.e., consecutive non-zeros is not equal to the number of rows).
708 for (Index_ c = 0; c < NC; ++c) {
709 auto& current = outmc[c];
710 if (current > 0) {
711 continue;
712 }
713
714 auto last_nz = internal_last_consecutive_nonzero[c];
715 if (last_nz == NR) {
716 continue;
717 }
718
719 current = 0;
720 if (output.max_index) {
721 output.max_index[c] = last_nz;
722 }
723 }
724 }
725 }
726
727public:
740 template<class Matrix, typename Subset = const uint8_t*, typename Float, typename Integer>
741 void run(const Matrix* mat, const std::vector<Subset>& subsets, Buffers<Float, Integer>& output) const {
742 if (!output.already_zeroed) {
743 size_t n = mat->ncol();
744 auto check_and_fill = [&](auto* ptr, auto value) -> void {
745 if (ptr) {
746 std::fill(ptr, ptr + n, value);
747 }
748 };
749
750 check_and_fill(output.total, static_cast<Float>(0));
751 check_and_fill(output.detected, static_cast<Integer>(0));
752 check_and_fill(output.max_count, pick_fill_value<Float>());
753 check_and_fill(output.max_index, static_cast<Integer>(0));
754
755 for (size_t s = 0; s < subsets.size(); ++s) {
756 if (!output.subset_total.empty()) {
757 check_and_fill(output.subset_total[s], static_cast<Float>(0));
758 }
759 if (!output.subset_detected.empty()) {
760 check_and_fill(output.subset_detected[s], static_cast<Integer>(0));
761 }
762 }
763 }
764
765 if (mat->sparse()) {
766 if (mat->prefer_rows()) {
767 compute_running_sparse(mat, subsets, output);
768 } else {
769 compute_direct_sparse(mat, subsets, output);
770 }
771 } else {
772 if (mat->prefer_rows()) {
773 compute_running_dense(mat, subsets, output);
774 } else {
775 compute_direct_dense(mat, subsets, output);
776 }
777 }
778 }
779};
780
781}
782
783#endif
Compute a variety of per-cell quality control metrics from a count matrix.
Definition PerCellQcMetrics.hpp:45
PerCellQcMetrics & set_compute_total(bool s=Defaults::compute_total)
Definition PerCellQcMetrics.hpp:227
PerCellQcMetrics & set_num_threads(int n=Defaults::num_threads)
Definition PerCellQcMetrics.hpp:292
PerCellQcMetrics & set_compute_subset_detected(bool s=Defaults::compute_subset_detected)
Definition PerCellQcMetrics.hpp:282
PerCellQcMetrics & set_compute_max_count(bool s=Defaults::compute_max_count)
Definition PerCellQcMetrics.hpp:249
PerCellQcMetrics & set_compute_subset_total(bool s=Defaults::compute_subset_total)
Definition PerCellQcMetrics.hpp:271
PerCellQcMetrics & set_compute_detected(bool s=Defaults::compute_detected)
Definition PerCellQcMetrics.hpp:238
PerCellQcMetrics & set_compute_max_index(bool s=Defaults::compute_max_index)
Definition PerCellQcMetrics.hpp:260
Results run(const Matrix *mat, const std::vector< Subset > &subsets) const
Definition PerCellQcMetrics.hpp:313
void run(const Matrix *mat, const std::vector< Subset > &subsets, Buffers< Float, Integer > &output) const
Definition PerCellQcMetrics.hpp:741
Functions for single-cell RNA-seq analyses.
Definition AggregateAcrossCells.hpp:18
Buffers for direct storage of the calculated statistics.
Definition PerCellQcMetrics.hpp:111
Float * max_count
Definition PerCellQcMetrics.hpp:144
std::vector< Integer * > subset_detected
Definition PerCellQcMetrics.hpp:160
Integer * max_index
Definition PerCellQcMetrics.hpp:138
bool already_zeroed
Definition PerCellQcMetrics.hpp:167
std::vector< Float * > subset_total
Definition PerCellQcMetrics.hpp:152
Integer * detected
Definition PerCellQcMetrics.hpp:132
Float * total
Definition PerCellQcMetrics.hpp:126
Default parameters.
Definition PerCellQcMetrics.hpp:174
static constexpr bool compute_subset_detected
Definition PerCellQcMetrics.hpp:203
static constexpr bool compute_max_index
Definition PerCellQcMetrics.hpp:193
static constexpr int num_threads
Definition PerCellQcMetrics.hpp:208
static constexpr bool compute_max_count
Definition PerCellQcMetrics.hpp:188
static constexpr bool compute_total
Definition PerCellQcMetrics.hpp:178
static constexpr bool compute_subset_total
Definition PerCellQcMetrics.hpp:198
static constexpr bool compute_detected
Definition PerCellQcMetrics.hpp:183
Result store for QC metric calculations.
Definition PerCellQcMetrics.hpp:53
std::vector< double > total
Definition PerCellQcMetrics.hpp:68
std::vector< double > max_count
Definition PerCellQcMetrics.hpp:87
std::vector< int > detected
Definition PerCellQcMetrics.hpp:74
std::vector< std::vector< int > > subset_detected
Definition PerCellQcMetrics.hpp:101
std::vector< std::vector< double > > subset_total
Definition PerCellQcMetrics.hpp:94
std::vector< int > max_index
Definition PerCellQcMetrics.hpp:81