scran_variances
Model per-gene variance in expression
Loading...
Searching...
No Matches
choose_highly_variable_genes.hpp
Go to the documentation of this file.
1#ifndef SCRAN_VARIANCES_CHOOSE_HIGHLY_VARIABLE_GENES_HPP
2#define SCRAN_VARIANCES_CHOOSE_HIGHLY_VARIABLE_GENES_HPP
3
4#include <vector>
5#include <algorithm>
6#include <numeric>
7#include <cstdint>
8
14namespace scran_variances {
15
25 size_t top = 4000;
26
30 bool larger = true;
31
37 std::pair<bool, double> bound = std::make_pair<bool, double>(false, 0);
38
43 bool keep_ties = true;
44};
45
49namespace internal {
50
51template<typename Index_, typename Stat_, class Cmp_>
52std::vector<Index_> create_semisorted_indices(size_t n, const Stat_* statistic, Cmp_ cmp, size_t top) {
53 std::vector<Index_> collected(n);
54 std::iota(collected.begin(), collected.end(), static_cast<Index_>(0));
55 auto cBegin = collected.begin(), cMid = cBegin + top - 1, cEnd = collected.end();
56 std::nth_element(cBegin, cMid, cEnd, [&](Index_ l, Index_ r) -> bool {
57 auto L = statistic[l], R = statistic[r];
58 if (L == R) {
59 return l < r; // always favor the earlier index for a stable sort, even if options.larger = false.
60 } else {
61 return cmp(L, R);
62 }
63 });
64 return collected;
65}
66
67template<typename Stat_, class Output_, class Cmp_, class CmpEqual_>
68void choose_highly_variable_genes(size_t n, const Stat_* statistic, Output_* output, Cmp_ cmp, CmpEqual_ cmpeq, const ChooseHighlyVariableGenesOptions& options) {
69 if (options.top == 0) {
70 std::fill_n(output, n, false);
71 return;
72 }
73
74 Stat_ bound = options.bound.second;
75 if (options.top >= n) {
76 if (options.bound.first) {
77 for (size_t i = 0; i < n; ++i) {
78 output[i] = cmp(statistic[i], bound);
79 }
80 } else {
81 std::fill_n(output, n, true);
82 }
83 return;
84 }
85
86 auto collected = create_semisorted_indices<size_t>(n, statistic, cmp, options.top);
87 Stat_ threshold = statistic[collected[options.top - 1]];
88
89 if (options.keep_ties) {
90 if (options.bound.first && !cmp(threshold, bound)) {
91 for (size_t i = 0; i < n; ++i) {
92 output[i] = cmp(statistic[i], bound);
93 }
94 } else {
95 for (size_t i = 0; i < n; ++i) {
96 output[i] = cmpeq(statistic[i], threshold);
97 }
98 }
99 return;
100 }
101
102 std::fill_n(output, n, false);
103 size_t counter = options.top;
104 if (options.bound.first && !cmp(threshold, bound)) {
105 --counter;
106 while (counter > 0) {
107 --counter;
108 if (cmp(statistic[collected[counter]], bound)) {
109 ++counter;
110 break;
111 }
112 }
113 }
114
115 for (size_t i = 0; i < counter; ++i) {
116 output[collected[i]] = true;
117 }
118}
119
120template<typename Index_, typename Stat_, class Cmp_, class CmpEqual_>
121std::vector<Index_> choose_highly_variable_genes_index(size_t n, const Stat_* statistic, Cmp_ cmp, CmpEqual_ cmpeq, const ChooseHighlyVariableGenesOptions& options) {
122 std::vector<Index_> output;
123 if (options.top == 0) {
124 return output;
125 }
126
127 Stat_ bound = options.bound.second;
128 if (options.top >= n) {
129 if (options.bound.first) {
130 for (size_t i = 0; i < n; ++i) {
131 if (options.bound.first && cmp(statistic[i], bound)) {
132 output.push_back(i);
133 }
134 }
135 } else {
136 output.resize(n);
137 std::iota(output.begin(), output.end(), static_cast<Index_>(0));
138 }
139 return output;
140 }
141
142 output = create_semisorted_indices<Index_>(n, statistic, cmp, options.top);
143 Stat_ threshold = statistic[output[options.top - 1]];
144
145 if (options.keep_ties) {
146 output.clear();
147 if (options.bound.first && !cmp(threshold, bound)) {
148 for (size_t i = 0; i < n; ++i) {
149 if (cmp(statistic[i], bound)) {
150 output.push_back(i);
151 }
152 }
153 } else {
154 for (size_t i = 0; i < n; ++i) {
155 if (cmpeq(statistic[i], threshold)) {
156 output.push_back(i);
157 }
158 }
159 }
160 return output;
161 }
162
163 size_t counter = options.top;
164 if (options.bound.first && !cmp(threshold, bound)) {
165 --counter;
166 while (counter > 0) {
167 --counter;
168 if (cmp(statistic[output[counter]], bound)) {
169 ++counter;
170 break;
171 }
172 }
173 }
174
175 output.resize(counter);
176 std::sort(output.begin(), output.end());
177 return output;
178}
179
180}
195template<typename Stat_, typename Bool_>
196void choose_highly_variable_genes(size_t n, const Stat_* statistic, Bool_* output, const ChooseHighlyVariableGenesOptions& options) {
197 if (options.larger) {
198 internal::choose_highly_variable_genes(
199 n,
200 statistic,
201 output,
202 [](Stat_ l, Stat_ r) -> bool { return l > r; },
203 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
204 options
205 );
206 } else {
207 internal::choose_highly_variable_genes(
208 n,
209 statistic,
210 output,
211 [](Stat_ l, Stat_ r) -> bool { return l < r; },
212 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
213 options
214 );
215 }
216}
217
228template<typename Bool_ = uint8_t, typename Stat_>
229std::vector<Bool_> choose_highly_variable_genes(size_t n, const Stat_* statistic, const ChooseHighlyVariableGenesOptions& options) {
230 std::vector<Bool_> output(n);
231 choose_highly_variable_genes(n, statistic, output.data(), options);
232 return output;
233}
234
246template<typename Index_, typename Stat_>
247std::vector<Index_> choose_highly_variable_genes_index(Index_ n, const Stat_* statistic, const ChooseHighlyVariableGenesOptions& options) {
248 if (options.larger) {
249 return internal::choose_highly_variable_genes_index<Index_>(
250 n,
251 statistic,
252 [](Stat_ l, Stat_ r) -> bool { return l > r; },
253 [](Stat_ l, Stat_ r) -> bool { return l >= r; },
254 options
255 );
256 } else {
257 return internal::choose_highly_variable_genes_index<Index_>(
258 n,
259 statistic,
260 [](Stat_ l, Stat_ r) -> bool { return l < r; },
261 [](Stat_ l, Stat_ r) -> bool { return l <= r; },
262 options
263 );
264 }
265}
266
267}
268
269#endif
Variance modelling for single-cell expression data.
Definition choose_highly_variable_genes.hpp:14
std::vector< Index_ > choose_highly_variable_genes_index(Index_ n, const Stat_ *statistic, const ChooseHighlyVariableGenesOptions &options)
Definition choose_highly_variable_genes.hpp:247
void choose_highly_variable_genes(size_t n, const Stat_ *statistic, Bool_ *output, const ChooseHighlyVariableGenesOptions &options)
Definition choose_highly_variable_genes.hpp:196
Options for choose_highly_variable_genes().
Definition choose_highly_variable_genes.hpp:19
bool keep_ties
Definition choose_highly_variable_genes.hpp:43
bool larger
Definition choose_highly_variable_genes.hpp:30
size_t top
Definition choose_highly_variable_genes.hpp:25
std::pair< bool, double > bound
Definition choose_highly_variable_genes.hpp:37