scran_qc
Simple quality control on single-cell data
Loading...
Searching...
No Matches
find_median_mad.hpp
Go to the documentation of this file.
1#ifndef SCRAN_QC_FIND_MEDIAN_MAD_H
2#define SCRAN_QC_FIND_MEDIAN_MAD_H
3
4#include <vector>
5#include <limits>
6#include <cmath>
7#include <algorithm>
8#include <cstdint>
9
10#include "tatami_stats/tatami_stats.hpp"
11
17namespace scran_qc {
18
28 bool log = false;
29
34 bool median_only = false;
35};
36
41template<typename Float_>
62
78template<typename Index_, typename Float_>
80 static_assert(std::is_floating_point<Float_>::value);
81
82 // Rotate all the NaNs to the front of the buffer and ignore them.
83 Index_ lost = 0;
84 for (Index_ i = 0; i < num; ++i) {
85 if (std::isnan(metrics[i])) {
86 std::swap(metrics[i], metrics[lost]);
87 ++lost;
88 }
89 }
90 metrics += lost;
91 num -= lost;
92
93 if (options.log) {
94 auto copy = metrics;
95 for (Index_ i = 0; i < num; ++i, ++copy) {
96 if (*copy > 0) {
97 *copy = std::log(*copy);
98 } else if (*copy == 0) {
99 *copy = -std::numeric_limits<double>::infinity();
100 } else {
101 throw std::runtime_error("cannot log-transform negative values");
102 }
103 }
104 }
105
106 // No need to skip the NaNs again.
107 auto median = tatami_stats::medians::direct<Float_>(metrics, num, /* skip_nan = */ false);
108
109 if (options.median_only || std::isnan(median)) {
110 // Giving up.
111 return FindMedianMadResults<Float_>(median, std::numeric_limits<Float_>::quiet_NaN());
112 } else if (std::isinf(median)) {
113 // MADs should be no-ops when added/subtracted from infinity. Any
114 // finite value will do here, so might as well keep it simple.
115 return FindMedianMadResults<Float_>(median, static_cast<Float_>(0));
116 }
117
118 // As an aside, there's no way to avoid passing in 'metrics' as a Float_,
119 // even if the original values were integers, because we need to do this
120 // subtraction here that could cast integers to floats. So at some point we
121 // will need a floating-point buffer, and so we might as well just pass the
122 // metrics in as floats in the first place. Technically the first sort
123 // could be done with an integer buffer but then we'd need an extra argument.
124
125 auto copy = metrics;
126 for (Index_ i = 0; i < num; ++i, ++copy) {
127 *copy = std::abs(*copy - median);
128 }
129 auto mad = tatami_stats::medians::direct<Float_>(metrics, num, /* skip_nan = */ false);
130 mad *= 1.4826; // for equivalence with the standard deviation under normality.
131
132 return FindMedianMadResults<Float_>(median, mad);
133}
134
152template<typename Float_ = double, typename Index_, typename Value_>
154 std::unique_ptr<std::vector<Float_> > xbuffer;
155 if (buffer == NULL) {
156 xbuffer = std::make_unique<std::vector<Float_> >(num
157#ifdef SCRAN_QC_TEST_INIT
159#endif
160 );
161 buffer = xbuffer->data();
162 }
163 std::copy_n(metrics, num, buffer);
165}
166
175template<typename Float_, typename Index_>
177public:
185 template<typename Block_>
189
194
202 template<typename Block_>
203 void set(Index_ num, const Block_* block) {
204 my_block_starts.clear();
205
206 if (block) {
207 for (Index_ i = 0; i < num; ++i) {
208 size_t candidate = block[i];
209 if (candidate >= my_block_starts.size()) {
210 my_block_starts.resize(candidate + 1);
211 }
213 }
214
215 Index_ sofar = 0;
216 for (auto& s : my_block_starts) {
217 Index_ last = sofar;
218 sofar += s;
219 s = last;
220 }
221 }
222
223 my_buffer.resize(num
226#endif
227 );
228 my_block_ends.resize(my_block_starts.size());
229 }
230
234public:
235 // Can't figure out how to make compute_blocked() a friend,
236 // so these puppies are public for simplicity.
237 std::vector<Float_> my_buffer;
238 std::vector<Index_> my_block_starts;
239 std::vector<Index_> my_block_ends;
243};
244
268template<typename Output_ = double, typename Index_, typename Value_, typename Block_>
269std::vector<FindMedianMadResults<Output_> > find_median_mad_blocked(
270 Index_ num,
271 const Value_* metrics,
272 const Block_* block,
275{
276 std::unique_ptr<FindMedianMadWorkspace<Output_, Index_> > xworkspace;
277 if (workspace == NULL) {
278 xworkspace = std::make_unique<FindMedianMadWorkspace<Output_, Index_> >(num, block);
279 workspace = xworkspace.get();
280 }
281
282 std::vector<FindMedianMadResults<Output_> > output;
283
284 auto& buffer = workspace->my_buffer;
285 if (!block) {
286 std::copy_n(metrics, num, buffer.begin());
287 output.push_back(find_median_mad(num, buffer.data(), options));
288 return output;
289 }
290
291 const auto& starts = workspace->my_block_starts;
292 auto& ends = workspace->my_block_ends;
293 std::copy(starts.begin(), starts.end(), ends.begin());
294 for (Index_ i = 0; i < num; ++i) {
295 auto& pos = ends[block[i]];
296 buffer[pos] = metrics[i];
297 ++pos;
298 }
299
300 // Using the ranges on the buffer.
301 size_t nblocks = starts.size();
302 output.reserve(nblocks);
303 for (size_t g = 0; g < nblocks; ++g) {
304 output.push_back(find_median_mad(ends[g] - starts[g], buffer.data() + starts[g], options));
305 }
306
307 return output;
308}
309
310}
311
312#endif
Temporary data structures for find_median_mad_blocked().
Definition find_median_mad.hpp:176
void set(Index_ num, const Block_ *block)
Definition find_median_mad.hpp:203
FindMedianMadWorkspace(Index_ num, const Block_ *block)
Definition find_median_mad.hpp:186
Simple quality control for single-cell data.
Definition adt_quality_control.hpp:20
FindMedianMadResults< Float_ > find_median_mad(Index_ num, Float_ *metrics, const FindMedianMadOptions &options)
Definition find_median_mad.hpp:79
std::vector< FindMedianMadResults< Output_ > > find_median_mad_blocked(Index_ num, const Value_ *metrics, const Block_ *block, FindMedianMadWorkspace< Output_, Index_ > *workspace, const FindMedianMadOptions &options)
Definition find_median_mad.hpp:269
Options for find_median_mad().
Definition find_median_mad.hpp:22
bool log
Definition find_median_mad.hpp:28
bool median_only
Definition find_median_mad.hpp:34
Results of find_median_mad().
Definition find_median_mad.hpp:42
Float_ median
Definition find_median_mad.hpp:55
Float_ mad
Definition find_median_mad.hpp:60