scran_qc
Simple quality control on single-cell data
Loading...
Searching...
No Matches
find_median_mad.hpp
Go to the documentation of this file.
1#ifndef SCRAN_QC_FIND_MEDIAN_MAD_H
2#define SCRAN_QC_FIND_MEDIAN_MAD_H
3
4#include <vector>
5#include <limits>
6#include <cmath>
7#include <algorithm>
8#include <cstdint>
9
10#include "tatami_stats/tatami_stats.hpp"
11
17namespace scran_qc {
18
28 bool log = false;
29
34 bool median_only = false;
35};
36
41template<typename Float_>
62
78template<typename Index_, typename Float_>
80 static_assert(std::is_floating_point<Float_>::value);
81
82 // Rotate all the NaNs to the front of the buffer and ignore them.
83 Index_ lost = 0;
84 for (Index_ i = 0; i < num; ++i) {
85 if (std::isnan(metrics[i])) {
86 std::swap(metrics[i], metrics[lost]);
87 ++lost;
88 }
89 }
90 metrics += lost;
91 num -= lost;
92
93 if (options.log) {
94 auto copy = metrics;
95 for (Index_ i = 0; i < num; ++i, ++copy) {
96 if (*copy > 0) {
97 *copy = std::log(*copy);
98 } else if (*copy == 0) {
99 *copy = -std::numeric_limits<double>::infinity();
100 } else {
101 throw std::runtime_error("cannot log-transform negative values");
102 }
103 }
104 }
105
106 // No need to skip the NaNs again.
107 auto median = tatami_stats::medians::direct<Float_>(metrics, num, /* skip_nan = */ false);
108
109 if (options.median_only || std::isnan(median)) {
110 // Giving up.
111 return FindMedianMadResults<Float_>(median, std::numeric_limits<Float_>::quiet_NaN());
112 } else if (std::isinf(median)) {
113 // MADs should be no-ops when added/subtracted from infinity. Any
114 // finite value will do here, so might as well keep it simple.
115 return FindMedianMadResults<Float_>(median, static_cast<Float_>(0));
116 }
117
118 // As an aside, there's no way to avoid passing in 'metrics' as a Float_,
119 // even if the original values were integers, because we need to do this
120 // subtraction here that could cast integers to floats. So at some point we
121 // will need a floating-point buffer, and so we might as well just pass the
122 // metrics in as floats in the first place. Technically the first sort
123 // could be done with an integer buffer but then we'd need an extra argument.
124
125 auto copy = metrics;
126 for (Index_ i = 0; i < num; ++i, ++copy) {
127 *copy = std::abs(*copy - median);
128 }
129 auto mad = tatami_stats::medians::direct<Float_>(metrics, num, /* skip_nan = */ false);
130 mad *= 1.4826; // for equivalence with the standard deviation under normality.
131
132 return FindMedianMadResults<Float_>(median, mad);
133}
134
152template<typename Float_ = double, typename Index_, typename Value_>
154 std::unique_ptr<std::vector<Float_> > xbuffer;
155 if (buffer == NULL) {
156 xbuffer = std::make_unique<std::vector<Float_> >(num);
157 buffer = xbuffer->data();
158 }
159 std::copy_n(metrics, num, buffer);
161}
162
171template<typename Float_, typename Index_>
173public:
181 template<typename Block_>
185
190
198 template<typename Block_>
199 void set(Index_ num, const Block_* block) {
200 my_block_starts.clear();
201
202 if (block) {
203 for (Index_ i = 0; i < num; ++i) {
204 size_t candidate = block[i];
205 if (candidate >= my_block_starts.size()) {
206 my_block_starts.resize(candidate + 1);
207 }
209 }
210
211 Index_ sofar = 0;
212 for (auto& s : my_block_starts) {
213 Index_ last = sofar;
214 sofar += s;
215 s = last;
216 }
217 }
218
219 my_buffer.resize(num);
220 my_block_ends.resize(my_block_starts.size());
221 }
222
226public:
227 // Can't figure out how to make compute_blocked() a friend,
228 // so these puppies are public for simplicity.
229 std::vector<Float_> my_buffer;
230 std::vector<Index_> my_block_starts;
231 std::vector<Index_> my_block_ends;
235};
236
260template<typename Output_ = double, typename Index_, typename Value_, typename Block_>
261std::vector<FindMedianMadResults<Output_> > find_median_mad_blocked(
262 Index_ num,
263 const Value_* metrics,
264 const Block_* block,
267{
268 std::unique_ptr<FindMedianMadWorkspace<Output_, Index_> > xworkspace;
269 if (workspace == NULL) {
270 xworkspace = std::make_unique<FindMedianMadWorkspace<Output_, Index_> >(num, block);
271 workspace = xworkspace.get();
272 }
273
274 std::vector<FindMedianMadResults<Output_> > output;
275
276 auto& buffer = workspace->my_buffer;
277 if (!block) {
278 std::copy_n(metrics, num, buffer.begin());
279 output.push_back(find_median_mad(num, buffer.data(), options));
280 return output;
281 }
282
283 const auto& starts = workspace->my_block_starts;
284 auto& ends = workspace->my_block_ends;
285 std::copy(starts.begin(), starts.end(), ends.begin());
286 for (Index_ i = 0; i < num; ++i) {
287 auto& pos = ends[block[i]];
288 buffer[pos] = metrics[i];
289 ++pos;
290 }
291
292 // Using the ranges on the buffer.
293 size_t nblocks = starts.size();
294 output.reserve(nblocks);
295 for (size_t g = 0; g < nblocks; ++g) {
296 output.push_back(find_median_mad(ends[g] - starts[g], buffer.data() + starts[g], options));
297 }
298
299 return output;
300}
301
302}
303
304#endif
Temporary data structures for find_median_mad_blocked().
Definition find_median_mad.hpp:172
void set(Index_ num, const Block_ *block)
Definition find_median_mad.hpp:199
FindMedianMadWorkspace(Index_ num, const Block_ *block)
Definition find_median_mad.hpp:182
Simple quality control for single-cell data.
Definition adt_quality_control.hpp:20
FindMedianMadResults< Float_ > find_median_mad(Index_ num, Float_ *metrics, const FindMedianMadOptions &options)
Definition find_median_mad.hpp:79
std::vector< FindMedianMadResults< Output_ > > find_median_mad_blocked(Index_ num, const Value_ *metrics, const Block_ *block, FindMedianMadWorkspace< Output_, Index_ > *workspace, const FindMedianMadOptions &options)
Definition find_median_mad.hpp:261
Options for find_median_mad().
Definition find_median_mad.hpp:22
bool log
Definition find_median_mad.hpp:28
bool median_only
Definition find_median_mad.hpp:34
Results of find_median_mad().
Definition find_median_mad.hpp:42
Float_ median
Definition find_median_mad.hpp:55
Float_ mad
Definition find_median_mad.hpp:60