scran_markers
Marker detection for single-cell data
Loading...
Searching...
No Matches
summarize_comparisons.hpp
Go to the documentation of this file.
1#ifndef SCRAN_MARKERS_SUMMARIZE_COMPARISONS_HPP
2#define SCRAN_MARKERS_SUMMARIZE_COMPARISONS_HPP
3
4#include <algorithm>
5#include <numeric>
6#include <vector>
7#include <cmath>
8#include <cstddef>
9#include <optional>
10#include <cassert>
11
12#include "tatami_stats/tatami_stats.hpp"
13#include "sanisizer/sanisizer.hpp"
15#include "quickstats/quickstats.hpp"
16
17#include "utils.hpp"
18
24namespace scran_markers {
25
32template<typename Stat_ = double, typename Rank_ = int>
39 Stat_* min = NULL;
40
46 Stat_* mean = NULL;
47
53 Stat_* median = NULL;
54
60 Stat_* max = NULL;
61
75 std::optional<std::vector<Stat_*> > quantiles;
76
82 Rank_* min_rank = NULL;
83};
84
91template<typename Stat_ = double, typename Rank_ = int>
97 std::vector<Stat_> min;
98
103 std::vector<Stat_> mean;
104
109 std::vector<Stat_> median;
110
115 std::vector<Stat_> max;
116
123 std::optional<std::vector<std::vector<Stat_> > > quantiles;
124
129 std::vector<Rank_> min_rank;
130};
131
135namespace internal {
136
137inline void validate_quantiles(const std::optional<std::vector<double> >& probs) {
138 if (!probs.has_value()) {
139 return;
140 }
141
142 const auto val = probs->front();
143 if (val < 0 || val > 1) {
144 throw std::runtime_error("quantile probabilities should be in [0, 1]");
145 }
146
147 const auto nprobs = probs->size();
148 for (I<decltype(nprobs)> i = 1; i < nprobs; ++i) {
149 const auto val = (*probs)[i];
150 if (val < 0 || val > 1) {
151 throw std::runtime_error("quantile probabilities should be in [0, 1]");
152 }
153 if (val < (*probs)[i - 1]) {
154 throw std::runtime_error("quantile probabilities should be sorted");
155 }
156 }
157}
158
159template<typename Stat_>
160using MaybeMultipleQuantiles = std::optional<quickstats::MultipleQuantilesVariableNumber<Stat_, std::size_t, const std::vector<double>*> >;
161
162template<typename Stat_>
163MaybeMultipleQuantiles<Stat_> setup_multiple_quantiles(const std::optional<std::vector<double> >& requested, const std::size_t ngroups) {
164 MaybeMultipleQuantiles<Stat_> output;
165 if (requested.has_value()) {
166 output.emplace(ngroups, &(*requested));
167 }
168 return output;
169}
170
171template<typename Stat_, typename Gene_, typename Rank_>
172void summarize_comparisons(
173 const std::size_t ngroups,
174 const Stat_* const effects,
175 const std::size_t group,
176 const Gene_ gene,
177 const SummaryBuffers<Stat_, Rank_>& output,
178 MaybeMultipleQuantiles<Stat_>& quantile_calculators,
179 std::vector<Stat_>& buffer
180) {
181 // Ignoring the self comparison and pruning out NaNs.
182 std::size_t ncomps = 0;
183 for (I<decltype(ngroups)> r = 0; r < ngroups; ++r) {
184 if (r == group || std::isnan(effects[r])) {
185 continue;
186 }
187 buffer[ncomps] = effects[r];
188 ++ncomps;
189 }
190
191 if (ncomps <= 1) {
192 Stat_ val = (ncomps == 0 ? std::numeric_limits<Stat_>::quiet_NaN() : buffer[0]);
193 if (output.min) {
194 output.min[gene] = val;
195 }
196 if (output.mean) {
197 output.mean[gene] = val;
198 }
199 if (output.max) {
200 output.max[gene] = val;
201 }
202 if (output.median) {
203 output.median[gene] = val;
204 }
205 if (output.quantiles.has_value()) {
206 for (const auto& quan : *(output.quantiles)) {
207 quan[gene] = val;
208 }
209 }
210
211 } else {
212 const auto ebegin = buffer.data(), elast = ebegin + ncomps;
213 if (output.min) {
214 output.min[gene] = *std::min_element(ebegin, elast);
215 }
216 if (output.mean) {
217 output.mean[gene] = std::accumulate(ebegin, elast, static_cast<Stat_>(0)) / ncomps;
218 }
219 if (output.max) {
220 output.max[gene] = *std::max_element(ebegin, elast);
221 }
222 // This following calculations mutate the buffer, so we put this last to avoid surprises.
223 if (output.median) {
224 output.median[gene] = tatami_stats::medians::direct(ebegin, ncomps, /* skip_nan = */ false);
225 }
226 if (output.quantiles.has_value()) {
227 (*quantile_calculators)(
228 ncomps,
229 ebegin,
230 [&](const std::size_t i, const Stat_ value) -> void {
231 (*output.quantiles)[i][gene] = value;
232 }
233 );
234 }
235 }
236}
237
238template<typename Gene_, typename Stat_, typename Rank_>
239void summarize_comparisons(
240 const Gene_ ngenes,
241 const std::size_t ngroups,
242 const Stat_* const effects,
243 const std::optional<std::vector<double> >& compute_quantiles,
244 const std::vector<SummaryBuffers<Stat_, Rank_> >& output,
245 const int threads
246) {
247 tatami::parallelize([&](const int, const Gene_ start, const Gene_ length) -> void {
248 auto summary_qcalcs = setup_multiple_quantiles<Stat_>(compute_quantiles, ngroups);
249 auto buffer = sanisizer::create<std::vector<Stat_> >(ngroups);
250
251 for (Gene_ gene = start, end = start + length; gene < end; ++gene) {
252 for (I<decltype(ngroups)> l = 0; l < ngroups; ++l) {
253 const auto current_effects = effects + sanisizer::nd_offset<std::size_t>(0, ngroups, l, ngroups, gene);
254 summarize_comparisons(ngroups, current_effects, l, gene, output[l], summary_qcalcs, buffer);
255 }
256 }
257 }, ngenes, threads);
258}
259
260template<typename Stat_, typename Gene_>
261Gene_ fill_and_sort_rank_buffer(const Stat_* const effects, const std::size_t stride, std::vector<std::pair<Stat_, Gene_> >& buffer) {
262 Gene_ counter = 0;
263 for (Gene_ i = 0, end = buffer.size(); i < end; ++i) {
264 const auto cureffect = effects[sanisizer::product_unsafe<std::size_t>(i, stride)];
265 if (!std::isnan(cureffect)) {
266 auto& current = buffer[counter];
267 current.first = cureffect;
268 current.second = i;
269 ++counter;
270 }
271 }
272
273 std::sort(
274 buffer.begin(),
275 buffer.begin() + counter,
276 [&](const std::pair<Stat_, Gene_>& left, const std::pair<Stat_, Gene_>& right) -> bool {
277 // Sort by decreasing first element, then break ties by increasing second element.
278 if (left.first == right.first) {
279 return left.second < right.second;
280 } else {
281 return left.first > right.first;
282 }
283 }
284 );
285
286 return counter;
287}
288
289template<typename Stat_, typename Gene_, typename Rank_>
290void compute_min_rank_pairwise(
291 const Gene_ ngenes,
292 const std::size_t ngroups,
293 const Stat_* const effects,
294 const std::vector<SummaryBuffers<Stat_, Rank_> >& output,
295 const bool preserve_ties,
296 const int threads
297) {
298 const auto ngroups2 = sanisizer::product_unsafe<std::size_t>(ngroups, ngroups);
299 const auto maxrank_placeholder = sanisizer::cast<Rank_>(ngenes); // using the maximum possible rank (i.e., 'ngenes') as the default.
300
301 tatami::parallelize([&](const int, const std::size_t start, const std::size_t length) -> void {
302 auto buffer = sanisizer::create<std::vector<std::pair<Stat_, Gene_> > >(ngenes);
303 for (I<decltype(start)> g = start, end = start + length; g < end; ++g) {
304 const auto target = output[g].min_rank;
305 if (target == NULL) {
306 continue;
307 }
308 std::fill_n(target, ngenes, maxrank_placeholder);
309
310 for (I<decltype(ngroups)> g2 = 0; g2 < ngroups; ++g2) {
311 if (g == g2) {
312 continue;
313 }
314 const auto offset = sanisizer::nd_offset<std::size_t>(g2, ngroups, g);
315 const auto used = fill_and_sort_rank_buffer(effects + offset, ngroups2, buffer);
316
317 if (!preserve_ties) {
318 Rank_ counter = 1;
319 for (Gene_ i = 0; i < used; ++i) {
320 auto& current = target[buffer[i].second];
321 if (counter < current) {
322 current = counter;
323 }
324 ++counter;
325 }
326 } else {
327 Rank_ counter = 1;
328 Gene_ i = 0;
329 while (i < used) {
330 const auto original = i;
331 const auto val = buffer[i].first;
332
333 auto& current = target[buffer[i].second];
334 if (counter < current) {
335 current = counter;
336 }
337
338 while (++i < used && buffer[i].first == val) {
339 auto& current = target[buffer[i].second];
340 if (counter < current) {
341 current = counter;
342 }
343 }
344
345 counter += i - original;
346 }
347 }
348 }
349 }
350 }, ngroups, threads);
351}
352
353template<typename Gene_, typename Stat_, typename Rank_>
354SummaryBuffers<Stat_, Rank_> fill_summary_results(
355 const Gene_ ngenes,
356 SummaryResults<Stat_, Rank_>& out,
357 const bool compute_min,
358 const bool compute_mean,
359 const bool compute_median,
360 const bool compute_max,
361 const std::optional<std::vector<double> >& compute_quantiles,
362 const bool compute_min_rank
363) {
364 SummaryBuffers<Stat_, Rank_> ptr;
365 const auto out_len = sanisizer::cast<typename std::vector<Stat_>::size_type>(ngenes);
366
367 if (compute_min) {
368 out.min.resize(out_len
369#ifdef SCRAN_MARKERS_TEST_INIT
370 , SCRAN_MARKERS_TEST_INIT
371#endif
372 );
373 ptr.min = out.min.data();
374 }
375
376 if (compute_mean) {
377 out.mean.resize(out_len
378#ifdef SCRAN_MARKERS_TEST_INIT
379 , SCRAN_MARKERS_TEST_INIT
380#endif
381 );
382 ptr.mean = out.mean.data();
383 }
384
385 if (compute_median) {
386 out.median.resize(out_len
387#ifdef SCRAN_MARKERS_TEST_INIT
388 , SCRAN_MARKERS_TEST_INIT
389#endif
390 );
391 ptr.median = out.median.data();
392 }
393
394 if (compute_max) {
395 out.max.resize(out_len
396#ifdef SCRAN_MARKERS_TEST_INIT
397 , SCRAN_MARKERS_TEST_INIT
398#endif
399 );
400 ptr.max = out.max.data();
401 }
402
403 if (compute_quantiles.has_value()) {
404 out.quantiles.emplace();
405 ptr.quantiles.emplace();
406 out.quantiles->reserve(compute_quantiles->size());
407 ptr.quantiles->reserve(compute_quantiles->size());
408 for ([[maybe_unused]] const auto quan : *compute_quantiles) {
409 out.quantiles->emplace_back(out_len
410#ifdef SCRAN_MARKERS_TEST_INIT
411 , SCRAN_MARKERS_TEST_INIT
412#endif
413 );
414 ptr.quantiles->push_back(out.quantiles->back().data());
415 }
416 }
417
418 if (compute_min_rank) {
419 out.min_rank.resize(out_len
420#ifdef SCRAN_MARKERS_TEST_INIT
421 , SCRAN_MARKERS_TEST_INIT
422#endif
423 );
424 ptr.min_rank = out.min_rank.data();
425 }
426
427 return ptr;
428}
429
430template<typename Gene_, typename Stat_, typename Rank_>
431std::vector<SummaryBuffers<Stat_, Rank_> > fill_summary_results(
432 Gene_ ngenes,
433 const std::size_t ngroups,
434 std::vector<SummaryResults<Stat_, Rank_> >& outputs,
435 const bool compute_min,
436 const bool compute_mean,
437 const bool compute_median,
438 const bool compute_max,
439 const std::optional<std::vector<double> >& compute_quantiles,
440 const bool compute_min_rank
441) {
442 sanisizer::resize(outputs, ngroups);
443 std::vector<SummaryBuffers<Stat_, Rank_> > ptrs;
444 ptrs.reserve(ngroups);
445 for (I<decltype(ngroups)> g = 0; g < ngroups; ++g) {
446 ptrs.emplace_back(
447 fill_summary_results(
448 ngenes,
449 outputs[g],
450 compute_min,
451 compute_mean,
452 compute_median,
453 compute_max,
454 compute_quantiles,
455 compute_min_rank
456 )
457 );
458 }
459 return ptrs;
460}
461
462}
467}
468
469#endif
Marker detection for single-cell data.
Definition score_markers_pairwise.hpp:27
int parallelize(Function_ fun, const Index_ tasks, const int workers)
Pointers to arrays to hold the summary statistics.
Definition summarize_comparisons.hpp:33
Stat_ * mean
Definition summarize_comparisons.hpp:46
Stat_ * median
Definition summarize_comparisons.hpp:53
Stat_ * min
Definition summarize_comparisons.hpp:39
Stat_ * max
Definition summarize_comparisons.hpp:60
Rank_ * min_rank
Definition summarize_comparisons.hpp:82
std::optional< std::vector< Stat_ * > > quantiles
Definition summarize_comparisons.hpp:75
Container for the summary statistics.
Definition summarize_comparisons.hpp:92
std::vector< Stat_ > mean
Definition summarize_comparisons.hpp:103
std::vector< Stat_ > median
Definition summarize_comparisons.hpp:109
std::optional< std::vector< std::vector< Stat_ > > > quantiles
Definition summarize_comparisons.hpp:123
std::vector< Stat_ > min
Definition summarize_comparisons.hpp:97
std::vector< Rank_ > min_rank
Definition summarize_comparisons.hpp:129
std::vector< Stat_ > max
Definition summarize_comparisons.hpp:115