topicks
Pick top genes for downstream analyses
Loading...
Searching...
No Matches
topicks.hpp
Go to the documentation of this file.
1#ifndef TOPICKER_TOPICKER_HPP
2#define TOPICKER_TOPICKER_HPP
3
4#include <vector>
5#include <algorithm>
6#include <numeric>
7#include <cstddef>
8#include <type_traits>
9
10#include "sanisizer/sanisizer.hpp"
11
17namespace topicks {
18
23template<typename Stat_>
30 std::optional<Stat_> bound;
31
36 bool keep_ties = true;
37};
38
42namespace internal {
43
44template<typename Input_>
45std::remove_cv_t<std::remove_reference_t<Input_> > I(Input_ x) {
46 return x;
47}
48
49template<bool keep_index_, typename Index_, typename Stat_, typename Top_, class Output_, class Cmp_, class CmpEqual_>
50void pick_top_genes(const Index_ n, const Stat_* statistic, const Top_ top, Output_& output, const Cmp_ cmp, const CmpEqual_ cmpeq, const PickTopGenesOptions<Stat_>& options) {
51 if (top == 0) {
52 if constexpr(keep_index_) {
53 ; // no-op, we assume it's already empty.
54 } else {
55 std::fill_n(output, n, false);
56 }
57 return;
58 }
59
60 if (sanisizer::is_greater_than(top, n)) {
61 if (options.bound.has_value()) {
62 const auto bound = *(options.bound);
63 for (Index_ i = 0; i < n; ++i) {
64 const bool ok = cmp(statistic[i], bound);
65 if constexpr(keep_index_) {
66 if (ok) {
67 output.push_back(i);
68 }
69 } else {
70 output[i] = ok;
71 }
72 }
73 } else {
74 if constexpr(keep_index_) {
75 output.resize(sanisizer::cast<decltype(I(output.size()))>(n));
76 std::iota(output.begin(), output.end(), static_cast<Index_>(0));
77 } else {
78 std::fill_n(output, n, true);
79 }
80 }
81 return;
82 }
83
84 auto semi_sorted = sanisizer::create<std::vector<Index_> >(n);
85 std::iota(semi_sorted.begin(), semi_sorted.end(), static_cast<Index_>(0));
86 const auto cBegin = semi_sorted.begin(), cMid = cBegin + top - 1, cEnd = semi_sorted.end();
87 std::nth_element(cBegin, cMid, cEnd, [&](Index_ l, Index_ r) -> bool {
88 auto L = statistic[l], R = statistic[r];
89 if (L == R) {
90 return l < r; // always favor the earlier index for a stable sort, even if larger = false.
91 } else {
92 return cmp(L, R);
93 }
94 });
95 const Stat_ threshold = statistic[*cMid];
96
97 if (options.keep_ties) {
98 if (options.bound.has_value() && !cmp(threshold, *(options.bound))) {
99 const auto bound = *(options.bound);
100 for (Index_ i = 0; i < n; ++i) {
101 const bool ok = cmp(statistic[i], bound);
102 if constexpr(keep_index_) {
103 if (ok) {
104 output.push_back(i);
105 }
106 } else {
107 output[i] = ok;
108 }
109 }
110 } else {
111 for (Index_ i = 0; i < n; ++i) {
112 const bool ok = cmpeq(statistic[i], threshold);
113 if constexpr(keep_index_) {
114 if (ok) {
115 output.push_back(i);
116 }
117 } else {
118 output[i] = ok;
119 }
120 }
121 }
122 return;
123 }
124
125 if constexpr(keep_index_) {
126 output.reserve(sanisizer::cast<decltype(I(output.size()))>(top));
127 } else {
128 std::fill_n(output, n, false);
129 }
130
131 if (options.bound.has_value()) {
132 const auto bound = *(options.bound);
133 Index_ counter = top; // cast is known safe as top <= n.
134 while (counter > 0) {
135 --counter;
136 auto pos = semi_sorted[counter];
137 if (cmp(statistic[pos], bound)) {
138 if constexpr(keep_index_) {
139 output.push_back(pos);
140 } else {
141 output[pos] = true;
142 }
143 }
144 }
145 } else {
146 if constexpr(keep_index_) {
147 output.insert(output.end(), semi_sorted.begin(), semi_sorted.begin() + top);
148 } else {
149 for (decltype(I(top)) i = 0; i < top; ++i) {
150 output[semi_sorted[i]] = true;
151 }
152 }
153 }
154
155 if constexpr(keep_index_) {
156 std::sort(output.begin(), output.end());
157 }
158}
159
160}
184template<typename Stat_, typename Top_, typename Bool_>
185void pick_top_genes(const std::size_t n, const Stat_* const statistic, const Top_ top, const bool larger, Bool_* const output, const PickTopGenesOptions<Stat_>& options) {
186 if (larger) {
187 internal::pick_top_genes<false>(
188 n,
189 statistic,
190 top,
191 output,
192 [](Stat_ l, Stat_ r) -> bool { return l > r; },
193 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
194 options
195 );
196 } else {
197 internal::pick_top_genes<false>(
198 n,
199 statistic,
200 top,
201 output,
202 [](Stat_ l, Stat_ r) -> bool { return l < r; },
203 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
204 options
205 );
206 }
207}
208
222template<typename Bool_, typename Stat_, typename Top_>
223std::vector<Bool_> pick_top_genes(const std::size_t n, const Stat_* const statistic, const Top_ top, const bool larger, const PickTopGenesOptions<Stat_>& options) {
224 auto output = sanisizer::create<std::vector<Bool_> >(n
225#ifdef SCRAN_VARIANCES_TEST_INIT
226 , SCRAN_VARIANCES_TEST_INIT
227#endif
228 );
229 pick_top_genes(n, statistic, top, larger, output.data(), options);
230 return output;
231}
232
247template<typename Index_, typename Stat_, typename Top_>
248std::vector<Index_> pick_top_genes_index(const Index_ n, const Stat_* const statistic, const Top_ top, const bool larger, const PickTopGenesOptions<Stat_>& options) {
249 std::vector<Index_> output;
250 if (larger) {
251 internal::pick_top_genes<true>(
252 n,
253 statistic,
254 top,
255 output,
256 [](Stat_ l, Stat_ r) -> bool { return l > r; },
257 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
258 options
259 );
260 } else {
261 internal::pick_top_genes<true>(
262 n,
263 statistic,
264 top,
265 output,
266 [](Stat_ l, Stat_ r) -> bool { return l < r; },
267 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
268 options
269 );
270 }
271 return output;
272}
273
274}
275
276#endif
Options for pick_top_genes().
Definition topicks.hpp:24
bool keep_ties
Definition topicks.hpp:36
std::optional< Stat_ > bound
Definition topicks.hpp:30
void pick_top_genes(const std::size_t n, const Stat_ *const statistic, const Top_ top, const bool larger, Bool_ *const output, const PickTopGenesOptions< Stat_ > &options)
Definition topicks.hpp:185
std::vector< Index_ > pick_top_genes_index(const Index_ n, const Stat_ *const statistic, const Top_ top, const bool larger, const PickTopGenesOptions< Stat_ > &options)
Definition topicks.hpp:248