scran_variances
Model per-gene variance in expression
Loading...
Searching...
No Matches
choose_highly_variable_genes.hpp
Go to the documentation of this file.
1#ifndef SCRAN_VARIANCES_CHOOSE_HIGHLY_VARIABLE_GENES_HPP
2#define SCRAN_VARIANCES_CHOOSE_HIGHLY_VARIABLE_GENES_HPP
3
4#include <vector>
5#include <algorithm>
6#include <numeric>
7#include <cstdint>
8
14namespace scran_variances {
15
28 size_t top = 4000;
29
33 bool larger = true;
34
39 bool use_bound = false;
40
46 double bound = 0;
47
52 bool keep_ties = true;
53};
54
58namespace internal {
59
60template<typename Index_, typename Stat_, class Cmp_>
61std::vector<Index_> create_semisorted_indices(size_t n, const Stat_* statistic, Cmp_ cmp, size_t top) {
62 std::vector<Index_> collected(n);
63 std::iota(collected.begin(), collected.end(), static_cast<Index_>(0));
64 auto cBegin = collected.begin(), cMid = cBegin + top - 1, cEnd = collected.end();
65 std::nth_element(cBegin, cMid, cEnd, [&](Index_ l, Index_ r) -> bool {
66 auto L = statistic[l], R = statistic[r];
67 if (L == R) {
68 return l < r; // always favor the earlier index for a stable sort, even if options.larger = false.
69 } else {
70 return cmp(L, R);
71 }
72 });
73 return collected;
74}
75
76template<typename Stat_, class Output_, class Cmp_, class CmpEqual_>
77void choose_highly_variable_genes(size_t n, const Stat_* statistic, Output_* output, Cmp_ cmp, CmpEqual_ cmpeq, const ChooseHighlyVariableGenesOptions& options) {
78 if (options.top == 0) {
79 std::fill_n(output, n, false);
80 return;
81 }
82
83 Stat_ bound = options.bound;
84 if (options.top >= n) {
85 if (options.use_bound) {
86 for (size_t i = 0; i < n; ++i) {
87 output[i] = cmp(statistic[i], bound);
88 }
89 } else {
90 std::fill_n(output, n, true);
91 }
92 return;
93 }
94
95 auto collected = create_semisorted_indices<size_t>(n, statistic, cmp, options.top);
96 Stat_ threshold = statistic[collected[options.top - 1]];
97
98 if (options.keep_ties) {
99 if (options.use_bound && !cmp(threshold, bound)) {
100 for (size_t i = 0; i < n; ++i) {
101 output[i] = cmp(statistic[i], bound);
102 }
103 } else {
104 for (size_t i = 0; i < n; ++i) {
105 output[i] = cmpeq(statistic[i], threshold);
106 }
107 }
108 return;
109 }
110
111 std::fill_n(output, n, false);
112 size_t counter = options.top;
113 if (options.use_bound && !cmp(threshold, bound)) {
114 --counter;
115 while (counter > 0) {
116 --counter;
117 if (cmp(statistic[collected[counter]], bound)) {
118 ++counter;
119 break;
120 }
121 }
122 }
123
124 for (size_t i = 0; i < counter; ++i) {
125 output[collected[i]] = true;
126 }
127}
128
129template<typename Index_, typename Stat_, class Cmp_, class CmpEqual_>
130std::vector<Index_> choose_highly_variable_genes_index(size_t n, const Stat_* statistic, Cmp_ cmp, CmpEqual_ cmpeq, const ChooseHighlyVariableGenesOptions& options) {
131 std::vector<Index_> output;
132 if (options.top == 0) {
133 return output;
134 }
135
136 Stat_ bound = options.bound;
137 if (options.top >= n) {
138 if (options.use_bound) {
139 for (size_t i = 0; i < n; ++i) {
140 if (options.use_bound && cmp(statistic[i], bound)) {
141 output.push_back(i);
142 }
143 }
144 } else {
145 output.resize(n);
146 std::iota(output.begin(), output.end(), static_cast<Index_>(0));
147 }
148 return output;
149 }
150
151 output = create_semisorted_indices<Index_>(n, statistic, cmp, options.top);
152 Stat_ threshold = statistic[output[options.top - 1]];
153
154 if (options.keep_ties) {
155 output.clear();
156 if (options.use_bound && !cmp(threshold, bound)) {
157 for (size_t i = 0; i < n; ++i) {
158 if (cmp(statistic[i], bound)) {
159 output.push_back(i);
160 }
161 }
162 } else {
163 for (size_t i = 0; i < n; ++i) {
164 if (cmpeq(statistic[i], threshold)) {
165 output.push_back(i);
166 }
167 }
168 }
169 return output;
170 }
171
172 size_t counter = options.top;
173 if (options.use_bound && !cmp(threshold, bound)) {
174 --counter;
175 while (counter > 0) {
176 --counter;
177 if (cmp(statistic[output[counter]], bound)) {
178 ++counter;
179 break;
180 }
181 }
182 }
183
184 output.resize(counter);
185 std::sort(output.begin(), output.end());
186 return output;
187}
188
189}
204template<typename Stat_, typename Bool_>
205void choose_highly_variable_genes(size_t n, const Stat_* statistic, Bool_* output, const ChooseHighlyVariableGenesOptions& options) {
206 if (options.larger) {
207 internal::choose_highly_variable_genes(
208 n,
209 statistic,
210 output,
211 [](Stat_ l, Stat_ r) -> bool { return l > r; },
212 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
213 options
214 );
215 } else {
216 internal::choose_highly_variable_genes(
217 n,
218 statistic,
219 output,
220 [](Stat_ l, Stat_ r) -> bool { return l < r; },
221 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
222 options
223 );
224 }
225}
226
237template<typename Bool_ = uint8_t, typename Stat_>
238std::vector<Bool_> choose_highly_variable_genes(size_t n, const Stat_* statistic, const ChooseHighlyVariableGenesOptions& options) {
239 std::vector<Bool_> output(n);
240 choose_highly_variable_genes(n, statistic, output.data(), options);
241 return output;
242}
243
255template<typename Index_, typename Stat_>
256std::vector<Index_> choose_highly_variable_genes_index(Index_ n, const Stat_* statistic, const ChooseHighlyVariableGenesOptions& options) {
257 if (options.larger) {
258 return internal::choose_highly_variable_genes_index<Index_>(
259 n,
260 statistic,
261 [](Stat_ l, Stat_ r) -> bool { return l > r; },
262 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
263 options
264 );
265 } else {
266 return internal::choose_highly_variable_genes_index<Index_>(
267 n,
268 statistic,
269 [](Stat_ l, Stat_ r) -> bool { return l < r; },
270 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
271 options
272 );
273 }
274}
275
276}
277
278#endif
Variance modelling for single-cell expression data.
Definition choose_highly_variable_genes.hpp:14
std::vector< Index_ > choose_highly_variable_genes_index(Index_ n, const Stat_ *statistic, const ChooseHighlyVariableGenesOptions &options)
Definition choose_highly_variable_genes.hpp:256
void choose_highly_variable_genes(size_t n, const Stat_ *statistic, Bool_ *output, const ChooseHighlyVariableGenesOptions &options)
Definition choose_highly_variable_genes.hpp:205
Options for choose_highly_variable_genes().
Definition choose_highly_variable_genes.hpp:19
bool use_bound
Definition choose_highly_variable_genes.hpp:39
double bound
Definition choose_highly_variable_genes.hpp:46
bool keep_ties
Definition choose_highly_variable_genes.hpp:52
bool larger
Definition choose_highly_variable_genes.hpp:33
size_t top
Definition choose_highly_variable_genes.hpp:28