phyper
Hypergeometric tail calculations
Loading...
Searching...
No Matches
phyper.hpp
Go to the documentation of this file.
1#ifndef PHYPER_PHYPER_HPP
2#define PHYPER_PHYPER_HPP
3
4#include <cmath>
5
15namespace phyper {
16
20struct Options {
24 bool log = false;
25
31 bool upper_tail = true;
32};
33
37namespace internal {
38
39template<typename Count_>
40long double lfactorial(Count_ x) {
41 // Computing it exactly for small numbers, to avoid unnecessarily
42 // large relative inaccuracy from the approximation. Threshold of
43 // 12 is chosen more-or-less arbitrarily... but 12! is the largest
44 // value that can be represented by a 32-bit int, if that helps.
45 switch(x) {
46 case 0: case 1: return 0;
47 case 2: return std::log(2.0);
48 case 3: return std::log(6.0);
49 case 4: return std::log(24.0);
50 case 5: return std::log(120.0);
51 case 6: return std::log(720.0);
52 case 7: return std::log(5040.0);
53 case 8: return std::log(40320.0);
54 case 9: return std::log(362880.0);
55 case 10: return std::log(3628800.0);
56 case 11: return std::log(39916800.0);
57 case 12: return std::log(479001600.0);
58 }
59
60 // For large numbers, using Ramanujan's approximation rather than R's complicated thing.
61 // Check out https://www.johndcook.com/blog/2012/09/25/ramanujans-factorial-approximation/.
62 long double y = x;
63 return 1.0/6.0 * std::log(y * (1 + 4 * y * (1 + 2 * y)) + 1.0/30.0) + y * std::log(y) - y + 0.5 * std::log(3.14159265358979323846);
64}
65
66}
88template<typename Count_>
89double compute(Count_ drawn_inside, Count_ num_inside, Count_ num_outside, Count_ num_drawn, const Options& options) {
90 // Handling all the edge cases.
91 if (options.upper_tail) {
92 if (drawn_inside <= 0 || (num_drawn >= num_outside && drawn_inside <= num_drawn - num_outside)) {
93 return (options.log ? 0 : 1);
94 }
95 if (drawn_inside > num_drawn || drawn_inside > num_inside) {
96 return (options.log ? -std::numeric_limits<double>::infinity() : 0);
97 }
98 } else {
99 if (drawn_inside < 0 || (num_drawn >= num_outside && drawn_inside < num_drawn - num_outside)) {
100 return (options.log ? -std::numeric_limits<double>::infinity() : 0);
101 }
102 if (drawn_inside >= num_drawn || drawn_inside >= num_inside) {
103 return (options.log ? 0 : 1);
104 }
105 }
106
107 // Subtracting 1 to include the probably mass of 'drawn_inside' in the upper tail calculations.
108 if (options.upper_tail) {
109 --drawn_inside;
110 }
111
112 // We flip the tail to reduce the number of iterations during the
113 // cumulative sum. Note that 'alt_drawn_inside' is guaranteed to be
114 // non-negative due to edge case protection, keeping in mind we already
115 // decremented drawn_inside when upper_tail = true.
116 bool needs_upper = options.upper_tail;
117 Count_ alt_drawn_inside = num_drawn - drawn_inside - 1;
118 if (drawn_inside > alt_drawn_inside) {
119 std::swap(num_inside, num_outside);
120 drawn_inside = alt_drawn_inside;
121 needs_upper = !needs_upper;
122 }
123
124 /*
125 * Computing the cumulative sum after factorizing out the probability mass at drawn_inside.
126 * This allows us to do one pass from drawn_inside to 0 to compute the probability.
127 * We use long doubles to mitigate the loss of precision on these cumulative operations.
128 *
129 * We can check the accuracy of our calculations with:
130 * > sum(choose(num_inside, 0:drawn_inside) * choose(num_outside, num_drawn - 0:drawn_inside)) / max(choose(num_inside, num_drawn - num_outside), choose(num_outside, num_drawn)) - 1
131 *
132 * We start from the probability mass at drawn_inside observations, and we work our way downwards.
133 * This avoids problems with floating point overflow when computing the cumulative product.
134 */
135 Count_ denom1a = drawn_inside, denom1b = num_inside - denom1a;
136 Count_ denom2a = num_drawn - drawn_inside, denom2b = num_outside - denom2a; // be careful with the second subtraction to avoid underflow for unsigned Count_.
137 Count_ num_total = num_outside + num_inside;
138 long double log_probability =
139 + internal::lfactorial(num_inside) - internal::lfactorial(denom1a) - internal::lfactorial(denom1b) // lchoose(num_inside, num_inside)
140 + internal::lfactorial(num_outside) - internal::lfactorial(denom2a) - internal::lfactorial(denom2b) // lchoose(num_outside, num_drawn - drawn_inside)
141 - internal::lfactorial(num_total) + internal::lfactorial(num_drawn) + internal::lfactorial(num_total - num_drawn); // -lchoose(num_total, num_drawn)
142
143 long double cumulative = 0; // will add 1 via log1p.
144 long double partial_probability = 1;
145
146 for (Count_ k = drawn_inside; k > 0; --k) {
147 ++denom1b;
148 ++denom2a;
149
150 long double mult = (static_cast<long double>(denom1a) * static_cast<long double>(denom2b)) / (static_cast<long double>(denom1b) * static_cast<long double>(denom2a));
151 partial_probability *= mult;
152 if (partial_probability == 0) { // underflow to zero, no point continuing...
153 break;
154 }
155 cumulative += partial_probability;
156
157 --denom1a;
158 --denom2b;
159 }
160
161 long double log_cumulative = std::log1p(cumulative) + log_probability;
162 if (!needs_upper) {
163 if (options.log) {
164 return log_cumulative;
165 } else {
166 return std::exp(log_cumulative);
167 }
168 }
169
170 if (options.log) {
171 // Logic from https://github.com/SurajGupta/r-source/blob/master/src/nmath/dpq.h;
172 // if 'logcum' is close to zero, exp(logcum) will be close to 1, and thus the precision of
173 // expm1 is more important. If 'logcum' is large and negative, exp(logcum) will be close to
174 // zero, and thus the precision of log1p is more important.
175 if (log_cumulative > -std::log(2)) {
176 auto p = -std::expm1(log_cumulative);
177 return (p > 0 ? std::log(p) : -std::numeric_limits<double>::infinity());
178 } else {
179 auto p = -std::exp(log_cumulative);
180 return (p > -1 ? std::log1p(p) : -std::numeric_limits<double>::infinity());
181 }
182 } else {
183 return -std::expm1(log_cumulative);
184 }
185}
186
187}
188
189#endif
Compute hypergeometric tail probabilities.
double compute(Count_ drawn_inside, Count_ num_inside, Count_ num_outside, Count_ num_drawn, const Options &options)
Definition phyper.hpp:89
Options for compute().
Definition phyper.hpp:20
bool upper_tail
Definition phyper.hpp:31
bool log
Definition phyper.hpp:24