topicks
Pick top genes for downstream analyses
Loading...
Searching...
No Matches
pick_top_genes.hpp
Go to the documentation of this file.
1#ifndef TOPICKS_PICK_TOP_GENES_HPP
2#define TOPICKS_PICK_TOP_GENES_HPP
3
4#include <vector>
5#include <algorithm>
6#include <numeric>
7#include <cstddef>
8#include <type_traits>
9#include <limits>
10#include <cmath>
11
12#include "sanisizer/sanisizer.hpp"
13
19namespace topicks {
20
25template<typename Stat_>
38 std::optional<Stat_> bound;
39
44 bool open_bound = true;
45
50 bool keep_ties = true;
51
56 bool check_nan = false;
57};
58
62namespace internal {
63
64template<typename Input_>
65using I = typename std::remove_cv<typename std::remove_reference<Input_>::type>::type;
66
67template<bool keep_index_, typename Index_, typename Stat_, class Output_, class Cmp_>
68void filter_genes_by_threshold(const Index_ n, const Stat_* statistic, Output_& output, const Cmp_ cmp, const Stat_ threshold) {
69 // This function is inherently safe as 'ok' will always be false for any comparison involving NaNs.
70 for (Index_ i = 0; i < n; ++i) {
71 const bool ok = cmp(statistic[i], threshold);
72 if constexpr(keep_index_) {
73 if (ok) {
74 output.push_back(i);
75 }
76 } else {
77 output[i] = ok;
78 }
79 }
80}
81
82template<bool keep_index_, typename Index_, typename Stat_, class Output_, class Cmp_>
83void select_top_genes_by_threshold(const Index_ top, const Stat_* statistic, Output_& output, const Cmp_ cmp, const Stat_ threshold, const std::vector<Index_>& semi_sorted) {
84 Index_ counter = top;
85 while (counter > 0) {
86 --counter;
87 const auto pos = semi_sorted[counter];
88 if (cmp(statistic[pos], threshold)) {
89 if constexpr(keep_index_) {
90 output.push_back(pos);
91 } else {
92 output[pos] = true;
93 }
94 }
95 }
96}
97
98template<bool keep_index_, typename Index_, typename Stat_, class Output_, class CmpNotEqual_, class CmpEqual_>
99void pick_top_genes(const Index_ n, const Stat_* statistic, const Index_ top, Output_& output, const CmpNotEqual_ cmpne, const CmpEqual_ cmpeq, const PickTopGenesOptions<Stat_>& options) {
100 if (top == 0) {
101 if constexpr(keep_index_) {
102 ; // no-op, we assume it's already empty.
103 } else {
104 std::fill_n(output, n, false);
105 }
106 return;
107 }
108
109 Index_ num_nan = 0;
110 if constexpr(std::numeric_limits<Stat_>::has_quiet_NaN) {
111 if (options.check_nan) {
112 for (Index_ i = 0; i < n; ++i) {
113 num_nan += std::isnan(statistic[i]);
114 }
115 }
116 }
117
118 if (top >= n - num_nan) {
119 if (options.bound.has_value()) {
120 if (options.open_bound) {
121 filter_genes_by_threshold<keep_index_>(n, statistic, output, cmpne, *(options.bound));
122 } else {
123 filter_genes_by_threshold<keep_index_>(n, statistic, output, cmpeq, *(options.bound));
124 }
125 } else if (num_nan == 0) {
126 if constexpr(keep_index_) {
127 sanisizer::resize(output, n);
128 std::iota(output.begin(), output.end(), static_cast<Index_>(0));
129 } else {
130 std::fill_n(output, n, true);
131 }
132 } else {
133 if constexpr(keep_index_) {
134 output.reserve(n - num_nan);
135 for (Index_ i = 0; i < n; ++i) {
136 if (!std::isnan(statistic[i])) {
137 output.push_back(i);
138 }
139 }
140 } else {
141 for (Index_ i = 0; i < n; ++i) {
142 output[i] = !std::isnan(statistic[i]);
143 }
144 }
145 }
146 return;
147 }
148
149 std::vector<Index_> semi_sorted;
150 if (num_nan == 0) {
151 sanisizer::resize(semi_sorted, n);
152 std::iota(semi_sorted.begin(), semi_sorted.end(), static_cast<Index_>(0));
153 } else {
154 semi_sorted.reserve(n - num_nan);
155 for (Index_ i = 0; i < n; ++i) {
156 if (!std::isnan(statistic[i])) {
157 semi_sorted.push_back(i);
158 }
159 }
160 }
161
162 const auto cBegin = semi_sorted.begin(), cMid = cBegin + top - 1, cEnd = semi_sorted.end();
163 std::nth_element(cBegin, cMid, cEnd, [&](const Index_ l, const Index_ r) -> bool {
164 const auto L = statistic[l], R = statistic[r];
165 if (L == R) {
166 return l < r; // always favor the earlier index for a stable sort, even if larger = false.
167 } else {
168 return cmpne(L, R);
169 }
170 });
171 const Stat_ threshold = statistic[*cMid];
172
173 if (options.keep_ties) {
174 if (options.bound.has_value()) {
175 const auto bound = *(options.bound);
176
177 if (options.open_bound) {
178 if (!cmpne(threshold, bound)) {
179 filter_genes_by_threshold<keep_index_>(n, statistic, output, cmpne, *(options.bound));
180 return;
181 }
182 } else {
183 if (!cmpeq(threshold, bound)) {
184 filter_genes_by_threshold<keep_index_>(n, statistic, output, cmpeq, *(options.bound));
185 return;
186 }
187 }
188 }
189
190 filter_genes_by_threshold<keep_index_>(n, statistic, output, cmpeq, threshold);
191 return;
192 }
193
194 if constexpr(keep_index_) {
195 output.reserve(sanisizer::cast<I<decltype(output.size())> >(top));
196 } else {
197 std::fill_n(output, n, false);
198 }
199
200 if (options.bound.has_value()) {
201 // cast of 'top' to Index_ is known safe as top <= n by this point.
202 if (options.open_bound) {
203 select_top_genes_by_threshold<keep_index_>(static_cast<Index_>(top), statistic, output, cmpne, *(options.bound), semi_sorted);
204 } else {
205 select_top_genes_by_threshold<keep_index_>(static_cast<Index_>(top), statistic, output, cmpeq, *(options.bound), semi_sorted);
206 }
207 } else {
208 if constexpr(keep_index_) {
209 output.insert(output.end(), semi_sorted.begin(), semi_sorted.begin() + top);
210 } else {
211 for (I<decltype(top)> i = 0; i < top; ++i) {
212 output[semi_sorted[i]] = true;
213 }
214 }
215 }
216
217 if constexpr(keep_index_) {
218 std::sort(output.begin(), output.end());
219 }
220}
221
222}
240template<typename Stat_, typename Bool_>
241void pick_top_genes(const std::size_t n, const Stat_* const statistic, const std::size_t top, const bool larger, Bool_* const output, const PickTopGenesOptions<Stat_>& options) {
242 if (larger) {
243 internal::pick_top_genes<false>(
244 n,
245 statistic,
246 top,
247 output,
248 [](Stat_ l, Stat_ r) -> bool { return l > r; },
249 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
250 options
251 );
252 } else {
253 internal::pick_top_genes<false>(
254 n,
255 statistic,
256 top,
257 output,
258 [](Stat_ l, Stat_ r) -> bool { return l < r; },
259 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
260 options
261 );
262 }
263}
264
278template<typename Bool_, typename Stat_>
279std::vector<Bool_> pick_top_genes(const std::size_t n, const Stat_* const statistic, const std::size_t top, const bool larger, const PickTopGenesOptions<Stat_>& options) {
280 auto output = sanisizer::create<std::vector<Bool_> >(n
281#ifdef SCRAN_VARIANCES_TEST_INIT
282 , SCRAN_VARIANCES_TEST_INIT
283#endif
284 );
285 pick_top_genes(n, statistic, top, larger, output.data(), options);
286 return output;
287}
288
303template<typename Index_, typename Stat_>
304std::vector<Index_> pick_top_genes_index(const Index_ n, const Stat_* const statistic, const Index_ top, const bool larger, const PickTopGenesOptions<Stat_>& options) {
305 std::vector<Index_> output;
306 if (larger) {
307 internal::pick_top_genes<true>(
308 n,
309 statistic,
310 top,
311 output,
312 [](Stat_ l, Stat_ r) -> bool { return l > r; },
313 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
314 options
315 );
316 } else {
317 internal::pick_top_genes<true>(
318 n,
319 statistic,
320 top,
321 output,
322 [](Stat_ l, Stat_ r) -> bool { return l < r; },
323 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
324 options
325 );
326 }
327 return output;
328}
329
330}
331
332#endif
Pick top genes from their statistics.
Definition pick_top_genes.hpp:19
std::vector< Index_ > pick_top_genes_index(const Index_ n, const Stat_ *const statistic, const Index_ top, const bool larger, const PickTopGenesOptions< Stat_ > &options)
Definition pick_top_genes.hpp:304
void pick_top_genes(const std::size_t n, const Stat_ *const statistic, const std::size_t top, const bool larger, Bool_ *const output, const PickTopGenesOptions< Stat_ > &options)
Definition pick_top_genes.hpp:241
Options for pick_top_genes().
Definition pick_top_genes.hpp:26
bool keep_ties
Definition pick_top_genes.hpp:50
bool check_nan
Definition pick_top_genes.hpp:56
bool open_bound
Definition pick_top_genes.hpp:44
std::optional< Stat_ > bound
Definition pick_top_genes.hpp:38