scran_variances
Model per-gene variance in expression
Loading...
Searching...
No Matches
choose_highly_variable_genes.hpp
Go to the documentation of this file.
1#ifndef SCRAN_VARIANCES_CHOOSE_HIGHLY_VARIABLE_GENES_HPP
2#define SCRAN_VARIANCES_CHOOSE_HIGHLY_VARIABLE_GENES_HPP
3
4#include <vector>
5#include <algorithm>
6#include <numeric>
7#include <cstdint>
8
14namespace scran_variances {
15
30 size_t top = 4000;
31
35 bool larger = true;
36
41 bool use_bound = false;
42
48 double bound = 0;
49
54 bool keep_ties = true;
55};
56
60namespace internal {
61
62template<bool keep_index_, typename Index_, typename Stat_, class Output_, class Cmp_, class CmpEqual_>
63void choose_highly_variable_genes(Index_ n, const Stat_* statistic, Output_& output, Cmp_ cmp, CmpEqual_ cmpeq, const ChooseHighlyVariableGenesOptions& options) {
64 if (options.top == 0) {
65 if constexpr(keep_index_) {
66 ; // no-op, we assume it's already empty.
67 } else {
68 std::fill_n(output, n, false);
69 }
70 return;
71 }
72
73 Stat_ bound = options.bound;
74 if (static_cast<size_t>(options.top) >= static_cast<size_t>(n)) {
75 if (options.use_bound) {
76 for (Index_ i = 0; i < n; ++i) {
77 bool ok = cmp(statistic[i], bound);
78 if constexpr(keep_index_) {
79 if (ok) {
80 output.push_back(i);
81 }
82 } else {
83 output[i] = ok;
84 }
85 }
86 } else {
87 if constexpr(keep_index_) {
88 output.resize(n);
89 std::iota(output.begin(), output.end(), static_cast<Index_>(0));
90 } else {
91 std::fill_n(output, n, true);
92 }
93 }
94 return;
95 }
96
97 std::vector<Index_> semi_sorted(n);
98 std::iota(semi_sorted.begin(), semi_sorted.end(), static_cast<Index_>(0));
99 auto cBegin = semi_sorted.begin(), cMid = cBegin + options.top - 1, cEnd = semi_sorted.end();
100 std::nth_element(cBegin, cMid, cEnd, [&](Index_ l, Index_ r) -> bool {
101 auto L = statistic[l], R = statistic[r];
102 if (L == R) {
103 return l < r; // always favor the earlier index for a stable sort, even if options.larger = false.
104 } else {
105 return cmp(L, R);
106 }
107 });
108
109 Stat_ threshold = statistic[semi_sorted[options.top - 1]];
110
111 if (options.keep_ties) {
112 if (options.use_bound && !cmp(threshold, bound)) {
113 for (Index_ i = 0; i < n; ++i) {
114 bool ok = cmp(statistic[i], bound);
115 if constexpr(keep_index_) {
116 if (ok) {
117 output.push_back(i);
118 }
119 } else {
120 output[i] = ok;
121 }
122 }
123 } else {
124 for (Index_ i = 0; i < n; ++i) {
125 bool ok = cmpeq(statistic[i], threshold);
126 if constexpr(keep_index_) {
127 if (ok) {
128 output.push_back(i);
129 }
130 } else {
131 output[i] = ok;
132 }
133 }
134 }
135 return;
136 }
137
138 if constexpr(keep_index_) {
139 output.reserve(options.top);
140 } else {
141 std::fill_n(output, n, false);
142 }
143
144 if (options.use_bound) {
145 Index_ counter = options.top;
146 while (counter > 0) {
147 --counter;
148 auto pos = semi_sorted[counter];
149 if (cmp(statistic[pos], bound)) {
150 if constexpr(keep_index_) {
151 output.push_back(pos);
152 } else {
153 output[pos] = true;
154 }
155 }
156 }
157 } else {
158 if constexpr(keep_index_) {
159 output.insert(output.end(), semi_sorted.begin(), semi_sorted.begin() + options.top);
160 } else {
161 for (Index_ i = 0, end = options.top; i < end; ++i) {
162 output[semi_sorted[i]] = true;
163 }
164 }
165 }
166
167 if constexpr(keep_index_) {
168 std::sort(output.begin(), output.end());
169 }
170}
171
172}
187template<typename Stat_, typename Bool_>
188void choose_highly_variable_genes(size_t n, const Stat_* statistic, Bool_* output, const ChooseHighlyVariableGenesOptions& options) {
189 if (options.larger) {
190 internal::choose_highly_variable_genes<false>(
191 n,
192 statistic,
193 output,
194 [](Stat_ l, Stat_ r) -> bool { return l > r; },
195 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
196 options
197 );
198 } else {
199 internal::choose_highly_variable_genes<false>(
200 n,
201 statistic,
202 output,
203 [](Stat_ l, Stat_ r) -> bool { return l < r; },
204 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
205 options
206 );
207 }
208}
209
220template<typename Bool_ = uint8_t, typename Stat_>
222 std::vector<Bool_> output(n
225#endif
226 );
228 return output;
229}
230
242template<typename Index_, typename Stat_>
244 std::vector<Index_> output;
245 if (options.larger) {
246 internal::choose_highly_variable_genes<true>(
247 n,
248 statistic,
249 output,
250 [](Stat_ l, Stat_ r) -> bool { return l > r; },
251 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
252 options
253 );
254 } else {
255 internal::choose_highly_variable_genes<true>(
256 n,
257 statistic,
258 output,
259 [](Stat_ l, Stat_ r) -> bool { return l < r; },
260 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
261 options
262 );
263 }
264 return output;
265}
266
267}
268
269#endif
Variance modelling for single-cell expression data.
Definition choose_highly_variable_genes.hpp:14
std::vector< Index_ > choose_highly_variable_genes_index(Index_ n, const Stat_ *statistic, const ChooseHighlyVariableGenesOptions &options)
Definition choose_highly_variable_genes.hpp:243
void choose_highly_variable_genes(size_t n, const Stat_ *statistic, Bool_ *output, const ChooseHighlyVariableGenesOptions &options)
Definition choose_highly_variable_genes.hpp:188
Options for choose_highly_variable_genes().
Definition choose_highly_variable_genes.hpp:19
bool use_bound
Definition choose_highly_variable_genes.hpp:41
double bound
Definition choose_highly_variable_genes.hpp:48
bool keep_ties
Definition choose_highly_variable_genes.hpp:54
bool larger
Definition choose_highly_variable_genes.hpp:35
size_t top
Definition choose_highly_variable_genes.hpp:30