topicks
Pick top genes for downstream analyses
Loading...
Searching...
No Matches
pick_top_genes.hpp
Go to the documentation of this file.
1#ifndef TOPICKS_PICK_TOP_GENES_HPP
2#define TOPICKS_PICK_TOP_GENES_HPP
3
4#include <vector>
5#include <algorithm>
6#include <numeric>
7#include <cstddef>
8#include <type_traits>
9#include <limits>
10#include <cmath>
11
12#include "sanisizer/sanisizer.hpp"
13
19namespace topicks {
20
25template<typename Stat_>
38 std::optional<Stat_> bound;
39
44 bool open_bound = true;
45
50 bool keep_ties = true;
51
56 bool check_nan = false;
57};
58
62namespace internal {
63
64template<typename Input_>
65std::remove_cv_t<std::remove_reference_t<Input_> > I(Input_ x) {
66 return x;
67}
68
69template<bool keep_index_, typename Index_, typename Stat_, class Output_, class Cmp_>
70void filter_genes_by_threshold(const Index_ n, const Stat_* statistic, Output_& output, const Cmp_ cmp, const Stat_ threshold) {
71 // This function is inherently safe as 'ok' will always be false for any comparison involving NaNs.
72 for (Index_ i = 0; i < n; ++i) {
73 const bool ok = cmp(statistic[i], threshold);
74 if constexpr(keep_index_) {
75 if (ok) {
76 output.push_back(i);
77 }
78 } else {
79 output[i] = ok;
80 }
81 }
82}
83
84template<bool keep_index_, typename Index_, typename Stat_, class Output_, class Cmp_>
85void select_top_genes_by_threshold(const Index_ top, const Stat_* statistic, Output_& output, const Cmp_ cmp, const Stat_ threshold, const std::vector<Index_>& semi_sorted) {
86 Index_ counter = top;
87 while (counter > 0) {
88 --counter;
89 const auto pos = semi_sorted[counter];
90 if (cmp(statistic[pos], threshold)) {
91 if constexpr(keep_index_) {
92 output.push_back(pos);
93 } else {
94 output[pos] = true;
95 }
96 }
97 }
98}
99
100template<bool keep_index_, typename Index_, typename Stat_, class Output_, class CmpNotEqual_, class CmpEqual_>
101void pick_top_genes(const Index_ n, const Stat_* statistic, const Index_ top, Output_& output, const CmpNotEqual_ cmpne, const CmpEqual_ cmpeq, const PickTopGenesOptions<Stat_>& options) {
102 if (top == 0) {
103 if constexpr(keep_index_) {
104 ; // no-op, we assume it's already empty.
105 } else {
106 std::fill_n(output, n, false);
107 }
108 return;
109 }
110
111 Index_ num_nan = 0;
112 if constexpr(std::numeric_limits<Stat_>::has_quiet_NaN) {
113 if (options.check_nan) {
114 for (Index_ i = 0; i < n; ++i) {
115 num_nan += std::isnan(statistic[i]);
116 }
117 }
118 }
119
120 if (top >= n - num_nan) {
121 if (options.bound.has_value()) {
122 if (options.open_bound) {
123 filter_genes_by_threshold<keep_index_>(n, statistic, output, cmpne, *(options.bound));
124 } else {
125 filter_genes_by_threshold<keep_index_>(n, statistic, output, cmpeq, *(options.bound));
126 }
127 } else if (num_nan == 0) {
128 if constexpr(keep_index_) {
129 sanisizer::resize(output, n);
130 std::iota(output.begin(), output.end(), static_cast<Index_>(0));
131 } else {
132 std::fill_n(output, n, true);
133 }
134 } else {
135 if constexpr(keep_index_) {
136 output.reserve(n - num_nan);
137 for (Index_ i = 0; i < n; ++i) {
138 if (!std::isnan(statistic[i])) {
139 output.push_back(i);
140 }
141 }
142 } else {
143 for (Index_ i = 0; i < n; ++i) {
144 output[i] = !std::isnan(statistic[i]);
145 }
146 }
147 }
148 return;
149 }
150
151 std::vector<Index_> semi_sorted;
152 if (num_nan == 0) {
153 sanisizer::resize(semi_sorted, n);
154 std::iota(semi_sorted.begin(), semi_sorted.end(), static_cast<Index_>(0));
155 } else {
156 semi_sorted.reserve(n - num_nan);
157 for (Index_ i = 0; i < n; ++i) {
158 if (!std::isnan(statistic[i])) {
159 semi_sorted.push_back(i);
160 }
161 }
162 }
163
164 const auto cBegin = semi_sorted.begin(), cMid = cBegin + top - 1, cEnd = semi_sorted.end();
165 std::nth_element(cBegin, cMid, cEnd, [&](const Index_ l, const Index_ r) -> bool {
166 const auto L = statistic[l], R = statistic[r];
167 if (L == R) {
168 return l < r; // always favor the earlier index for a stable sort, even if larger = false.
169 } else {
170 return cmpne(L, R);
171 }
172 });
173 const Stat_ threshold = statistic[*cMid];
174
175 if (options.keep_ties) {
176 if (options.bound.has_value()) {
177 const auto bound = *(options.bound);
178
179 if (options.open_bound) {
180 if (!cmpne(threshold, bound)) {
181 filter_genes_by_threshold<keep_index_>(n, statistic, output, cmpne, *(options.bound));
182 return;
183 }
184 } else {
185 if (!cmpeq(threshold, bound)) {
186 filter_genes_by_threshold<keep_index_>(n, statistic, output, cmpeq, *(options.bound));
187 return;
188 }
189 }
190 }
191
192 filter_genes_by_threshold<keep_index_>(n, statistic, output, cmpeq, threshold);
193 return;
194 }
195
196 if constexpr(keep_index_) {
197 output.reserve(sanisizer::cast<decltype(I(output.size()))>(top));
198 } else {
199 std::fill_n(output, n, false);
200 }
201
202 if (options.bound.has_value()) {
203 // cast of 'top' to Index_ is known safe as top <= n by this point.
204 if (options.open_bound) {
205 select_top_genes_by_threshold<keep_index_>(static_cast<Index_>(top), statistic, output, cmpne, *(options.bound), semi_sorted);
206 } else {
207 select_top_genes_by_threshold<keep_index_>(static_cast<Index_>(top), statistic, output, cmpeq, *(options.bound), semi_sorted);
208 }
209 } else {
210 if constexpr(keep_index_) {
211 output.insert(output.end(), semi_sorted.begin(), semi_sorted.begin() + top);
212 } else {
213 for (decltype(I(top)) i = 0; i < top; ++i) {
214 output[semi_sorted[i]] = true;
215 }
216 }
217 }
218
219 if constexpr(keep_index_) {
220 std::sort(output.begin(), output.end());
221 }
222}
223
224}
242template<typename Stat_, typename Bool_>
243void pick_top_genes(const std::size_t n, const Stat_* const statistic, const std::size_t top, const bool larger, Bool_* const output, const PickTopGenesOptions<Stat_>& options) {
244 if (larger) {
245 internal::pick_top_genes<false>(
246 n,
247 statistic,
248 top,
249 output,
250 [](Stat_ l, Stat_ r) -> bool { return l > r; },
251 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
252 options
253 );
254 } else {
255 internal::pick_top_genes<false>(
256 n,
257 statistic,
258 top,
259 output,
260 [](Stat_ l, Stat_ r) -> bool { return l < r; },
261 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
262 options
263 );
264 }
265}
266
280template<typename Bool_, typename Stat_>
281std::vector<Bool_> pick_top_genes(const std::size_t n, const Stat_* const statistic, const std::size_t top, const bool larger, const PickTopGenesOptions<Stat_>& options) {
282 auto output = sanisizer::create<std::vector<Bool_> >(n
283#ifdef SCRAN_VARIANCES_TEST_INIT
284 , SCRAN_VARIANCES_TEST_INIT
285#endif
286 );
287 pick_top_genes(n, statistic, top, larger, output.data(), options);
288 return output;
289}
290
305template<typename Index_, typename Stat_>
306std::vector<Index_> pick_top_genes_index(const Index_ n, const Stat_* const statistic, const Index_ top, const bool larger, const PickTopGenesOptions<Stat_>& options) {
307 std::vector<Index_> output;
308 if (larger) {
309 internal::pick_top_genes<true>(
310 n,
311 statistic,
312 top,
313 output,
314 [](Stat_ l, Stat_ r) -> bool { return l > r; },
315 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
316 options
317 );
318 } else {
319 internal::pick_top_genes<true>(
320 n,
321 statistic,
322 top,
323 output,
324 [](Stat_ l, Stat_ r) -> bool { return l < r; },
325 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
326 options
327 );
328 }
329 return output;
330}
331
332}
333
334#endif
Pick top genes from their statistics.
Definition pick_top_genes.hpp:19
std::vector< Index_ > pick_top_genes_index(const Index_ n, const Stat_ *const statistic, const Index_ top, const bool larger, const PickTopGenesOptions< Stat_ > &options)
Definition pick_top_genes.hpp:306
void pick_top_genes(const std::size_t n, const Stat_ *const statistic, const std::size_t top, const bool larger, Bool_ *const output, const PickTopGenesOptions< Stat_ > &options)
Definition pick_top_genes.hpp:243
Options for pick_top_genes().
Definition pick_top_genes.hpp:26
bool keep_ties
Definition pick_top_genes.hpp:50
bool check_nan
Definition pick_top_genes.hpp:56
bool open_bound
Definition pick_top_genes.hpp:44
std::optional< Stat_ > bound
Definition pick_top_genes.hpp:38