phyper
Hypergeometric tail calculations in C++
Loading...
Searching...
No Matches
phyper.hpp
Go to the documentation of this file.
1#ifndef PHYPER_PHYPER_HPP
2#define PHYPER_PHYPER_HPP
3
4#include <cmath>
5#include <limits>
6
16namespace phyper {
17
21struct Options {
25 bool log = false;
26
32 bool upper_tail = true;
33};
34
38namespace internal {
39
40template<typename Count_>
41long double lfactorial(const Count_ x) {
42 // Computing it exactly for small numbers, to avoid unnecessarily
43 // large relative inaccuracy from the approximation. Threshold of
44 // 12 is chosen more-or-less arbitrarily... but 12! is the largest
45 // value that can be represented by a 32-bit int, if that helps.
46 switch(x) {
47 case 0: case 1: return 0;
48 case 2: return std::log(2.0);
49 case 3: return std::log(6.0);
50 case 4: return std::log(24.0);
51 case 5: return std::log(120.0);
52 case 6: return std::log(720.0);
53 case 7: return std::log(5040.0);
54 case 8: return std::log(40320.0);
55 case 9: return std::log(362880.0);
56 case 10: return std::log(3628800.0);
57 case 11: return std::log(39916800.0);
58 case 12: return std::log(479001600.0);
59 }
60
61 // For large numbers, using Ramanujan's approximation rather than R's complicated thing.
62 // Check out https://www.johndcook.com/blog/2012/09/25/ramanujans-factorial-approximation/.
63 long double y = x;
64 return 1.0/6.0 * std::log(y * (1 + 4 * y * (1 + 2 * y)) + 1.0/30.0) + y * std::log(y) - y + 0.5 * std::log(3.14159265358979323846);
65}
66
67}
90template<typename Count_>
91double compute(Count_ drawn_inside, Count_ num_inside, Count_ num_outside, const Count_ num_drawn, const Options& options) {
92 // Handling all the edge cases.
93 if (options.upper_tail) {
94 if (drawn_inside <= 0 || (num_drawn >= num_outside && drawn_inside <= num_drawn - num_outside)) {
95 return (options.log ? 0 : 1);
96 }
97 if (drawn_inside > num_drawn || drawn_inside > num_inside) {
98 return (options.log ? -std::numeric_limits<double>::infinity() : 0);
99 }
100 } else {
101 if (drawn_inside < 0 || (num_drawn >= num_outside && drawn_inside < num_drawn - num_outside)) {
102 return (options.log ? -std::numeric_limits<double>::infinity() : 0);
103 }
104 if (drawn_inside >= num_drawn || drawn_inside >= num_inside) {
105 return (options.log ? 0 : 1);
106 }
107 }
108
109 if (std::numeric_limits<Count_>::max() - num_inside < num_outside) {
110 throw std::runtime_error("sum of 'num_inside' and 'num_outside' results in integer overflow");
111 }
112 const Count_ num_total = num_outside + num_inside;
113
114 // Subtracting 1 to include the probably mass of 'drawn_inside' in the upper tail calculations.
115 if (options.upper_tail) {
116 --drawn_inside;
117 }
118
119 // We flip the problem to ensure that we're always computing the smaller tail for accuracy.
120 // (Smaller in terms of the cumulative probability, and usually - but not necessarily - the number of iterations.)
121 // If that's the tail that we wanted, then great; we can compute it directly without worrying about loss of precision from '1 - [some larger tail]'.
122 // If it's not the tail we wanted, then we compute '1 - [this smaller tail]' and we don't have to worry about accumulation of errors from summation towards 1.
123 // In addition, the smaller tail is usually faster to compute but this is a secondary effect.
124 bool needs_upper = options.upper_tail;
125 if (static_cast<double>(drawn_inside) * static_cast<double>(num_total) > static_cast<double>(num_drawn) * static_cast<double>(num_inside)) {
126 std::swap(num_inside, num_outside);
127 drawn_inside = num_drawn - drawn_inside - 1; // Guaranteed to be non-negative due to edge case protection; we already decremented drawn_inside when upper_tail = true.
128 needs_upper = !needs_upper;
129 }
130
131 /*
132 * Computing the cumulative sum after factorizing out the probability mass at drawn_inside.
133 * This allows us to do one pass from drawn_inside to 0 to compute the probability.
134 * We use long doubles to mitigate the loss of precision on these cumulative operations.
135 *
136 * We can check the accuracy of our calculations with:
137 * > sum(choose(num_inside, 0:drawn_inside) * choose(num_outside, num_drawn - 0:drawn_inside)) / max(choose(num_inside, num_drawn - num_outside), choose(num_outside, num_drawn)) - 1
138 *
139 * We start from the probability mass at drawn_inside observations, and we work our way downwards.
140 * This avoids problems with floating point overflow when computing the cumulative product.
141 */
142 Count_ denom1a = drawn_inside, denom1b = num_inside - denom1a;
143 Count_ denom2a = num_drawn - drawn_inside, denom2b = num_outside - denom2a; // be careful with the second subtraction to avoid underflow for unsigned Count_.
144 const long double log_probability =
145 + internal::lfactorial(num_inside) - internal::lfactorial(denom1a) - internal::lfactorial(denom1b) // lchoose(num_inside, num_inside)
146 + internal::lfactorial(num_outside) - internal::lfactorial(denom2a) - internal::lfactorial(denom2b) // lchoose(num_outside, num_drawn - drawn_inside)
147 - internal::lfactorial(num_total) + internal::lfactorial(num_drawn) + internal::lfactorial(num_total - num_drawn); // -lchoose(num_total, num_drawn)
148
149 long double cumulative = 0; // will add 1 via log1p.
150 long double partial_probability = 1;
151
152 for (Count_ k = drawn_inside; k > 0; --k) {
153 ++denom1b;
154 ++denom2a;
155
156 long double mult = (static_cast<long double>(denom1a) * static_cast<long double>(denom2b)) / (static_cast<long double>(denom1b) * static_cast<long double>(denom2a));
157 partial_probability *= mult;
158 if (partial_probability == 0) { // underflow to zero, no point continuing...
159 break;
160 }
161 cumulative += partial_probability;
162
163 --denom1a;
164 --denom2b;
165 }
166
167 const long double log_cumulative = std::log1p(cumulative) + log_probability;
168 if (!needs_upper) {
169 if (options.log) {
170 return log_cumulative;
171 } else {
172 return std::exp(log_cumulative);
173 }
174 }
175
176 if (options.log) {
177 // Basically, we want to compute 'log(1 - exp(log_cumulative))', but need to be a bit smart about where the subtraction from 1 occurs.
178 // The logic is derived from https://github.com/SurajGupta/r-source/blob/master/src/nmath/dpq.h, specifically:
179 // - if 'log_cumulative' is close to zero, 'exp(log_cumulative)' will be close to 1, and thus the precision of 'expm1' is more important.
180 // - if 'log_cumulative' is large and negative, 'exp(log_cumulative)' will be close to zero, and thus the precision of 'log1p' is more important.
181 if (log_cumulative > -std::log(2)) {
182 const auto p = -std::expm1(log_cumulative);
183 return (p > 0 ? std::log(p) : -std::numeric_limits<double>::infinity());
184 } else {
185 const auto p = -std::exp(log_cumulative);
186 return (p > -1 ? std::log1p(p) : -std::numeric_limits<double>::infinity());
187 }
188 } else {
189 return -std::expm1(log_cumulative);
190 }
191}
192
193}
194
195#endif
Compute hypergeometric tail probabilities.
double compute(Count_ drawn_inside, Count_ num_inside, Count_ num_outside, const Count_ num_drawn, const Options &options)
Definition phyper.hpp:91
Options for compute().
Definition phyper.hpp:21
bool upper_tail
Definition phyper.hpp:32
bool log
Definition phyper.hpp:25