phyper
Hypergeometric tail calculations
Loading...
Searching...
No Matches
phyper.hpp
Go to the documentation of this file.
1#ifndef PHYPER_PHYPER_HPP
2#define PHYPER_PHYPER_HPP
3
4#include <cmath>
5
15namespace phyper {
16
20struct Options {
24 bool log = false;
25
31 bool upper_tail = true;
32};
33
37namespace internal {
38
39template<typename Count_>
40long double lfactorial(Count_ x) {
41 // Computing it exactly for small numbers, to avoid unnecessarily
42 // large relative inaccuracy from the approximation. Threshold of
43 // 12 is chosen more-or-less arbitrarily... but 12! is the largest
44 // value that can be represented by a 32-bit int, if that helps.
45 switch(x) {
46 case 0: case 1: return 0;
47 case 2: return std::log(2.0);
48 case 3: return std::log(6.0);
49 case 4: return std::log(24.0);
50 case 5: return std::log(120.0);
51 case 6: return std::log(720.0);
52 case 7: return std::log(5040.0);
53 case 8: return std::log(40320.0);
54 case 9: return std::log(362880.0);
55 case 10: return std::log(3628800.0);
56 case 11: return std::log(39916800.0);
57 case 12: return std::log(479001600.0);
58 }
59
60 // For large numbers, using Ramanujan's approximation rather than R's complicated thing.
61 // Check out https://www.johndcook.com/blog/2012/09/25/ramanujans-factorial-approximation/.
62 long double y = x;
63 return 1.0/6.0 * std::log(y * (1 + 4 * y * (1 + 2 * y)) + 1.0/30.0) + y * std::log(y) - y + 0.5 * std::log(3.14159265358979323846);
64}
65
66}
88template<typename Count_>
89double compute(Count_ drawn_inside, Count_ num_inside, Count_ num_outside, Count_ num_drawn, const Options& options) {
90 // Handling all the edge cases.
91 if (options.upper_tail) {
92 if (drawn_inside <= 0 || (num_drawn >= num_outside && drawn_inside <= num_drawn - num_outside)) {
93 return (options.log ? 0 : 1);
94 }
95 if (drawn_inside > num_drawn || drawn_inside > num_inside) {
96 return (options.log ? -std::numeric_limits<double>::infinity() : 0);
97 }
98 } else {
99 if (drawn_inside < 0 || (num_drawn >= num_outside && drawn_inside < num_drawn - num_outside)) {
100 return (options.log ? -std::numeric_limits<double>::infinity() : 0);
101 }
102 if (drawn_inside >= num_drawn || drawn_inside >= num_inside) {
103 return (options.log ? 0 : 1);
104 }
105 }
106
107 // Subtracting 1 to include the probably mass of 'drawn_inside' in the upper tail calculations.
108 if (options.upper_tail) {
109 --drawn_inside;
110 }
111
112 // We flip the problem to ensure that we're always computing the smaller tail for accuracy.
113 // If that's the tail that we wanted, then great; we can compute it directly without worrying about loss of precision from '1 - [some larger tail]'.
114 // If it's not the tail we wanted, then we compute '1 - [this smaller tail]' and we don't have to worry about accumulation of errors from summation.
115 // The smaller tail is usually also faster to compute but this is a secondary effect.
116 bool needs_upper = options.upper_tail;
117 if (static_cast<double>(drawn_inside) * static_cast<double>(num_inside + num_outside) > static_cast<double>(num_drawn) * static_cast<double>(num_inside)) {
118 std::swap(num_inside, num_outside);
119 drawn_inside = num_drawn - drawn_inside - 1; // Guaranteed to be non-negative due to edge case protection; we already decremented drawn_inside when upper_tail = true.
120 needs_upper = !needs_upper;
121 }
122
123 /*
124 * Computing the cumulative sum after factorizing out the probability mass at drawn_inside.
125 * This allows us to do one pass from drawn_inside to 0 to compute the probability.
126 * We use long doubles to mitigate the loss of precision on these cumulative operations.
127 *
128 * We can check the accuracy of our calculations with:
129 * > sum(choose(num_inside, 0:drawn_inside) * choose(num_outside, num_drawn - 0:drawn_inside)) / max(choose(num_inside, num_drawn - num_outside), choose(num_outside, num_drawn)) - 1
130 *
131 * We start from the probability mass at drawn_inside observations, and we work our way downwards.
132 * This avoids problems with floating point overflow when computing the cumulative product.
133 */
134 Count_ denom1a = drawn_inside, denom1b = num_inside - denom1a;
135 Count_ denom2a = num_drawn - drawn_inside, denom2b = num_outside - denom2a; // be careful with the second subtraction to avoid underflow for unsigned Count_.
136 Count_ num_total = num_outside + num_inside;
137 long double log_probability =
138 + internal::lfactorial(num_inside) - internal::lfactorial(denom1a) - internal::lfactorial(denom1b) // lchoose(num_inside, num_inside)
139 + internal::lfactorial(num_outside) - internal::lfactorial(denom2a) - internal::lfactorial(denom2b) // lchoose(num_outside, num_drawn - drawn_inside)
140 - internal::lfactorial(num_total) + internal::lfactorial(num_drawn) + internal::lfactorial(num_total - num_drawn); // -lchoose(num_total, num_drawn)
141
142 long double cumulative = 0; // will add 1 via log1p.
143 long double partial_probability = 1;
144
145 for (Count_ k = drawn_inside; k > 0; --k) {
146 ++denom1b;
147 ++denom2a;
148
149 long double mult = (static_cast<long double>(denom1a) * static_cast<long double>(denom2b)) / (static_cast<long double>(denom1b) * static_cast<long double>(denom2a));
150 partial_probability *= mult;
151 if (partial_probability == 0) { // underflow to zero, no point continuing...
152 break;
153 }
154 cumulative += partial_probability;
155
156 --denom1a;
157 --denom2b;
158 }
159
160 long double log_cumulative = std::log1p(cumulative) + log_probability;
161 if (!needs_upper) {
162 if (options.log) {
163 return log_cumulative;
164 } else {
165 return std::exp(log_cumulative);
166 }
167 }
168
169 if (options.log) {
170 // Logic from https://github.com/SurajGupta/r-source/blob/master/src/nmath/dpq.h;
171 // if 'logcum' is close to zero, exp(logcum) will be close to 1, and thus the precision of
172 // expm1 is more important. If 'logcum' is large and negative, exp(logcum) will be close to
173 // zero, and thus the precision of log1p is more important.
174 if (log_cumulative > -std::log(2)) {
175 auto p = -std::expm1(log_cumulative);
176 return (p > 0 ? std::log(p) : -std::numeric_limits<double>::infinity());
177 } else {
178 auto p = -std::exp(log_cumulative);
179 return (p > -1 ? std::log1p(p) : -std::numeric_limits<double>::infinity());
180 }
181 } else {
182 return -std::expm1(log_cumulative);
183 }
184}
185
186}
187
188#endif
Compute hypergeometric tail probabilities.
double compute(Count_ drawn_inside, Count_ num_inside, Count_ num_outside, Count_ num_drawn, const Options &options)
Definition phyper.hpp:89
Options for compute().
Definition phyper.hpp:20
bool upper_tail
Definition phyper.hpp:31
bool log
Definition phyper.hpp:24