scran
C++ library for basic single-cell RNA-seq analyses
Loading...
Searching...
No Matches
HypergeometricTail.hpp
Go to the documentation of this file.
1#ifndef SCRAN_HYPERGEOMETRIC_TAIL_HPP
2#define SCRAN_HYPERGEOMETRIC_TAIL_HPP
3
4#include "../utils/macros.hpp"
5
6#include <cmath>
7
14namespace scran {
15
25public:
29 struct Defaults {
33 static constexpr bool log = false;
34
38 static constexpr bool upper_tail = true;
39 };
40
41private:
42 bool log = Defaults::log;
43 bool upper_tail = Defaults::upper_tail;
44
45public:
51 log = l;
52 return *this;
53 }
54
63 upper_tail = u;
64 return *this;
65 }
66
67public: // only exported for testing.
71 static double lfactorial(int x) {
72 // Computing it exactly for small numbers, to avoid unnecessarily
73 // large relative inaccuracy from the approximation. Threshold of
74 // 12 is chosen more-or-less arbitrarily... but 12! is the largest
75 // value that can be represented by a 32-bit int, if that helps.
76 switch(x) {
77 case 0: case 1: return 0;
78 case 2: return std::log(2.0);
79 case 3: return std::log(6.0);
80 case 4: return std::log(24.0);
81 case 5: return std::log(120.0);
82 case 6: return std::log(720.0);
83 case 7: return std::log(5040.0);
84 case 8: return std::log(40320.0);
85 case 9: return std::log(362880.0);
86 case 10: return std::log(3628800.0);
87 case 11: return std::log(39916800.0);
88 case 12: return std::log(479001600.0);
89 }
90
91 // For large numbers, using Ramanujan's approximation rather than R's complicated thing.
92 // Check out https://www.johndcook.com/blog/2012/09/25/ramanujans-factorial-approximation/.
93 double y = x;
94 return 1.0/6.0 * std::log(y * (1 + 4 * y * (1 + 2 * y)) + 1.0/30.0) + y * std::log(y) - y + 0.5 * std::log(3.14159265358979323846);
95 }
100private:
101 /*
102 * Computing the cumulative sum after factorizing out the probability mass at drawn_inside.
103 * This allows us to do one pass from k to 0 to compute the probability.
104 *
105 * We can check the accuracy of our calculations with:
106 * sum(choose(num_inside, 0:drawn_inside) * choose(num_outside, num_drawn - 0:drawn_inside)) / max(choose(num_inside, num_drawn - num_outside), choose(num_outside, num_drawn)) - 1
107 */
108 static long double compute_cumulative(int drawn_inside, int num_inside, int num_outside, int num_drawn) {
109 // Improved precision for this step, possibly involving small probabilities.
110 long double probability = 1;
111
112 // We need to add 1 for the probability mass at drawn_inside,
113 // but we'll do this in compute_tail_probability() to make use of the more precise log1p.
114 long double cumulative = 0;
115
116 for (int k = drawn_inside; k > 0 && probability > 0; --k) {
117 probability *= static_cast<double>(k) * static_cast<double>(num_outside - num_drawn + k) / static_cast<double>(num_inside - k + 1) / static_cast<double>(num_drawn - k + 1);
118 cumulative += probability;
119 }
120
121 return cumulative;
122 }
123
124 static double compute_probability_mass(int drawn_inside, int num_inside, int num_outside, int num_drawn) {
125 int num_total = num_inside + num_outside;
126 return lfactorial(num_inside) - lfactorial(drawn_inside) - lfactorial(num_inside - drawn_inside) // lchoose(num_inside, drawn_inside)
127 + lfactorial(num_outside) - lfactorial(num_drawn - drawn_inside) - lfactorial(num_outside - num_drawn + drawn_inside) // lchoose(num_outside, num_drawn - drawn_inside)
128 - lfactorial(num_total) + lfactorial(num_drawn) + lfactorial(num_total - num_drawn); // -lchoose(num_total, num_drawn)
129 }
130
131private:
132 static double invert_tail_log(double val) {
133 // Logic from https://github.com/SurajGupta/r-source/blob/master/src/nmath/dpq.h;
134 // if 'lp' is close to zero, exp(lp) will be close to 1, and thus the precision of
135 // expm1 is more important. If 'lp' is large and negative, exp(lp) will be close to
136 // zero, and thus the precision of log1p is more important.
137 if (val > -std::log(2)) {
138 auto p = -std::expm1(val);
139 return (p > 0 ? std::log(p) : -std::numeric_limits<double>::infinity());
140 } else {
141 auto p = -std::exp(val);
142 return (p > -1 ? std::log1p(p) : -std::numeric_limits<double>::infinity());
143 }
144 }
145
146 double compute_tail_probability(long double cumulative, double scale, bool do_lower) const {
147 if (log) {
148 double lp = std::log1p(cumulative) + scale;
149 if (do_lower) {
150 return lp;
151 } else {
152 return invert_tail_log(lp);
153 }
154 } else {
155 auto p = (1 + cumulative) * std::exp(scale);
156 return (do_lower ? p : 1 - p);
157 }
158 }
159
160 double edge_handler(bool zero) const {
161 if (zero) {
162 return (log ? -std::numeric_limits<double>::infinity() : 0);
163 } else {
164 return (log ? 0 : 1);
165 }
166 }
167
168private:
169 double core(int drawn_inside, int num_inside, int num_outside, int num_drawn) const {
170 // Subtracting 1 to include the PMF of 'drawn_inside' in the upper tail calculations.
171 if (upper_tail) {
172 --drawn_inside;
173 }
174
175 if (drawn_inside <= 0 || drawn_inside < num_drawn - num_outside) {
176 return edge_handler(!upper_tail);
177 } else if (drawn_inside >= num_drawn || drawn_inside >= num_inside) {
178 return edge_handler(upper_tail);
179 }
180
181 // Flipping the tails to avoid having to calculate large summations.
182 // This it avoids having to subtract large sums from 1 when computing
183 // upper tails, which could result in catastrophic cancellation. It can
184 // also sometimes improve efficiency by summing over the smaller tail.
185 bool do_lower = !upper_tail;
186 if (drawn_inside * (num_inside + num_outside) > num_drawn * num_inside) {
187 auto tmp = num_inside;
188 num_inside = num_outside;
189 num_outside = tmp;
190 drawn_inside = num_drawn - drawn_inside - 1;
191 do_lower = !do_lower;
192 }
193
194 double logscale = compute_probability_mass(drawn_inside, num_inside, num_outside, num_drawn);
195 auto cum = compute_cumulative(drawn_inside, num_inside, num_outside, num_drawn);
196 return compute_tail_probability(cum, logscale, do_lower);
197 }
198
199public:
209 double run(int drawn_inside, int num_inside, int num_outside, int num_drawn) const {
210 return core(drawn_inside, num_inside, num_outside, num_drawn);
211 }
212};
213
214}
215
216#endif
Compute hypergeometric tail probabilities.
Definition HypergeometricTail.hpp:24
HypergeometricTail & set_log(bool l=Defaults::log)
Definition HypergeometricTail.hpp:50
double run(int drawn_inside, int num_inside, int num_outside, int num_drawn) const
Definition HypergeometricTail.hpp:209
HypergeometricTail & set_upper_tail(bool u=Defaults::upper_tail)
Definition HypergeometricTail.hpp:62
Functions for single-cell RNA-seq analyses.
Definition AggregateAcrossCells.hpp:18
Default parameters.
Definition HypergeometricTail.hpp:29
static constexpr bool upper_tail
Definition HypergeometricTail.hpp:38
static constexpr bool log
Definition HypergeometricTail.hpp:33