topicks
Pick top genes for downstream analyses
Loading...
Searching...
No Matches
topicks.hpp
Go to the documentation of this file.
1#ifndef TOPICKER_TOPICKER_HPP
2#define TOPICKER_TOPICKER_HPP
3
4#include <vector>
5#include <algorithm>
6#include <numeric>
7#include <cstddef>
8
9#include "sanisizer/sanisizer.hpp"
10
16namespace topicks {
17
22template<typename Stat_>
29 std::optional<Stat_> bound;
30
35 bool keep_ties = true;
36};
37
41namespace internal {
42
43template<bool keep_index_, typename Index_, typename Stat_, typename Top_, class Output_, class Cmp_, class CmpEqual_>
44void pick_top_genes(Index_ n, const Stat_* statistic, Top_ top, Output_& output, Cmp_ cmp, CmpEqual_ cmpeq, const PickTopGenesOptions<Stat_>& options) {
45 if (top == 0) {
46 if constexpr(keep_index_) {
47 ; // no-op, we assume it's already empty.
48 } else {
49 std::fill_n(output, n, false);
50 }
51 return;
52 }
53
54 if (sanisizer::is_greater_than(top, n)) {
55 if (options.bound.has_value()) {
56 auto bound = *(options.bound);
57 for (Index_ i = 0; i < n; ++i) {
58 bool ok = cmp(statistic[i], bound);
59 if constexpr(keep_index_) {
60 if (ok) {
61 output.push_back(i);
62 }
63 } else {
64 output[i] = ok;
65 }
66 }
67 } else {
68 if constexpr(keep_index_) {
69 output.resize(sanisizer::cast<decltype(output.size())>(n));
70 std::iota(output.begin(), output.end(), static_cast<Index_>(0));
71 } else {
72 std::fill_n(output, n, true);
73 }
74 }
75 return;
76 }
77
78 auto semi_sorted = sanisizer::create<std::vector<Index_> >(n);
79 std::iota(semi_sorted.begin(), semi_sorted.end(), static_cast<Index_>(0));
80 auto cBegin = semi_sorted.begin(), cMid = cBegin + top - 1, cEnd = semi_sorted.end();
81 std::nth_element(cBegin, cMid, cEnd, [&](Index_ l, Index_ r) -> bool {
82 auto L = statistic[l], R = statistic[r];
83 if (L == R) {
84 return l < r; // always favor the earlier index for a stable sort, even if larger = false.
85 } else {
86 return cmp(L, R);
87 }
88 });
89 Stat_ threshold = statistic[*cMid];
90
91 if (options.keep_ties) {
92 if (options.bound.has_value() && !cmp(threshold, *(options.bound))) {
93 auto bound = *(options.bound);
94 for (Index_ i = 0; i < n; ++i) {
95 bool ok = cmp(statistic[i], bound);
96 if constexpr(keep_index_) {
97 if (ok) {
98 output.push_back(i);
99 }
100 } else {
101 output[i] = ok;
102 }
103 }
104 } else {
105 for (Index_ i = 0; i < n; ++i) {
106 bool ok = cmpeq(statistic[i], threshold);
107 if constexpr(keep_index_) {
108 if (ok) {
109 output.push_back(i);
110 }
111 } else {
112 output[i] = ok;
113 }
114 }
115 }
116 return;
117 }
118
119 if constexpr(keep_index_) {
120 output.reserve(sanisizer::cast<decltype(output.size())>(top));
121 } else {
122 std::fill_n(output, n, false);
123 }
124
125 if (options.bound.has_value()) {
126 auto bound = *(options.bound);
127 Index_ counter = top; // cast is known safe as top <= n.
128 while (counter > 0) {
129 --counter;
130 auto pos = semi_sorted[counter];
131 if (cmp(statistic[pos], bound)) {
132 if constexpr(keep_index_) {
133 output.push_back(pos);
134 } else {
135 output[pos] = true;
136 }
137 }
138 }
139 } else {
140 if constexpr(keep_index_) {
141 output.insert(output.end(), semi_sorted.begin(), semi_sorted.begin() + top);
142 } else {
143 for (decltype(top) i = 0; i < top; ++i) {
144 output[semi_sorted[i]] = true;
145 }
146 }
147 }
148
149 if constexpr(keep_index_) {
150 std::sort(output.begin(), output.end());
151 }
152}
153
154}
178template<typename Stat_, typename Top_, typename Bool_>
179void pick_top_genes(std::size_t n, const Stat_* statistic, Top_ top, bool larger, Bool_* output, const PickTopGenesOptions<Stat_>& options) {
180 if (larger) {
181 internal::pick_top_genes<false>(
182 n,
183 statistic,
184 top,
185 output,
186 [](Stat_ l, Stat_ r) -> bool { return l > r; },
187 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
188 options
189 );
190 } else {
191 internal::pick_top_genes<false>(
192 n,
193 statistic,
194 top,
195 output,
196 [](Stat_ l, Stat_ r) -> bool { return l < r; },
197 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
198 options
199 );
200 }
201}
202
216template<typename Bool_, typename Stat_, typename Top_>
217std::vector<Bool_> pick_top_genes(std::size_t n, const Stat_* statistic, Top_ top, bool larger, const PickTopGenesOptions<Stat_>& options) {
218 auto output = sanisizer::create<std::vector<Bool_> >(n
219#ifdef SCRAN_VARIANCES_TEST_INIT
220 , SCRAN_VARIANCES_TEST_INIT
221#endif
222 );
223 pick_top_genes(n, statistic, top, larger, output.data(), options);
224 return output;
225}
226
241template<typename Index_, typename Stat_, typename Top_>
242std::vector<Index_> pick_top_genes_index(Index_ n, const Stat_* statistic, Top_ top, bool larger, const PickTopGenesOptions<Stat_>& options) {
243 std::vector<Index_> output;
244 if (larger) {
245 internal::pick_top_genes<true>(
246 n,
247 statistic,
248 top,
249 output,
250 [](Stat_ l, Stat_ r) -> bool { return l > r; },
251 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
252 options
253 );
254 } else {
255 internal::pick_top_genes<true>(
256 n,
257 statistic,
258 top,
259 output,
260 [](Stat_ l, Stat_ r) -> bool { return l < r; },
261 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
262 options
263 );
264 }
265 return output;
266}
267
268}
269
270#endif
Options for pick_top_genes().
Definition topicks.hpp:23
bool keep_ties
Definition topicks.hpp:35
std::optional< Stat_ > bound
Definition topicks.hpp:29
std::vector< Index_ > pick_top_genes_index(Index_ n, const Stat_ *statistic, Top_ top, bool larger, const PickTopGenesOptions< Stat_ > &options)
Definition topicks.hpp:242
void pick_top_genes(std::size_t n, const Stat_ *statistic, Top_ top, bool larger, Bool_ *output, const PickTopGenesOptions< Stat_ > &options)
Definition topicks.hpp:179