scran_qc
Simple quality control on single-cell data
Loading...
Searching...
No Matches
find_median_mad.hpp
Go to the documentation of this file.
1#ifndef SCRAN_QC_FIND_MEDIAN_MAD_H
2#define SCRAN_QC_FIND_MEDIAN_MAD_H
3
4#include <vector>
5#include <limits>
6#include <cmath>
7#include <algorithm>
8#include <cstddef>
9
10#include "tatami_stats/tatami_stats.hpp"
11#include "sanisizer/sanisizer.hpp"
12
13#include "utils.hpp"
14
20namespace scran_qc {
21
31 bool log = false;
32
37 bool median_only = false;
38};
39
44template<typename Float_>
49 FindMedianMadResults(Float_ m1, Float_ m2) : median(m1), mad(m2) {}
50 FindMedianMadResults() = default;
58 Float_ median = 0;
59
63 Float_ mad = 0;
64};
65
80template<typename Float_>
81FindMedianMadResults<Float_> find_median_mad(std::size_t num, Float_* metrics, const FindMedianMadOptions& options) {
82 static_assert(std::is_floating_point<Float_>::value);
83
84 // Rotate all the NaNs to the front of the buffer and ignore them.
85 decltype(I(num)) lost = 0;
86 for (decltype(I(num)) i = 0; i < num; ++i) {
87 if (std::isnan(metrics[i])) {
88 std::swap(metrics[i], metrics[lost]);
89 ++lost;
90 }
91 }
92 metrics += lost;
93 num -= lost;
94
95 if (options.log) {
96 for (decltype(I(num)) i = 0; i < num; ++i) {
97 auto& val = metrics[i];
98 if (val > 0) {
99 val = std::log(val);
100 } else if (val == 0) {
101 val = -std::numeric_limits<double>::infinity();
102 } else {
103 throw std::runtime_error("cannot log-transform negative values");
104 }
105 }
106 }
107
108 // No need to skip the NaNs again.
109 const auto median = tatami_stats::medians::direct<Float_>(metrics, num, /* skip_nan = */ false);
110
111 if (options.median_only || std::isnan(median)) {
112 // Giving up.
113 return FindMedianMadResults<Float_>(median, std::numeric_limits<Float_>::quiet_NaN());
114 } else if (std::isinf(median)) {
115 // MADs should be no-ops when added/subtracted from infinity. Any
116 // finite value will do here, so might as well keep it simple.
117 return FindMedianMadResults<Float_>(median, static_cast<Float_>(0));
118 }
119
120 // As an aside, there's no way to avoid passing in 'metrics' as a Float_,
121 // even if the original values were integers, because we need to do this
122 // subtraction here that could cast integers to floats. So at some point we
123 // will need a floating-point buffer, and so we might as well just pass the
124 // metrics in as floats in the first place. Technically the first sort
125 // could be done with an integer buffer but then we'd need an extra argument.
126
127 for (decltype(I(num)) i = 0; i < num; ++i) {
128 metrics[i] = std::abs(metrics[i] - median);
129 }
130 auto mad = tatami_stats::medians::direct<Float_>(metrics, num, /* skip_nan = */ false);
131 mad *= 1.4826; // for equivalence with the standard deviation under normality.
132
133 return FindMedianMadResults<Float_>(median, mad);
134}
135
152template<typename Float_ = double, typename Value_>
153FindMedianMadResults<Float_> find_median_mad(const std::size_t num, const Value_* const metrics, Float_* buffer, const FindMedianMadOptions& options) {
154 std::vector<Float_> xbuffer;
155 if (buffer == NULL) {
156 sanisizer::resize(xbuffer, num
157#ifdef SCRAN_QC_TEST_INIT
158 , SCRAN_QC_TEST_INIT
159#endif
160 );
161 buffer = xbuffer.data();
162 }
163 std::copy_n(metrics, num, buffer);
164 return find_median_mad(num, buffer, options);
165}
166
174template<typename Float_>
176public:
184 template<typename Block_>
185 FindMedianMadWorkspace(const std::size_t num, const Block_* const block) :
186 my_buffer(sanisizer::cast<decltype(I(my_buffer.size()))>(num))
187 {
188 set(num, block);
189 }
190
195
203 template<typename Block_>
204 void set(const std::size_t num, const Block_* const block) {
205 my_block_starts.clear();
206
207 if (block) {
208 for (decltype(I(num)) i = 0; i < num; ++i) {
209 const auto candidate = block[i];
210 if (sanisizer::is_greater_than_or_equal(candidate, my_block_starts.size())) {
211 my_block_starts.resize(sanisizer::sum<decltype(I(my_block_starts.size()))>(candidate, 1));
212 }
213 ++my_block_starts[candidate];
214 }
215
216 std::size_t sofar = 0;
217 for (auto& s : my_block_starts) {
218 const auto last = sofar;
219 sofar += s;
220 s = last;
221 }
222 }
223
224 sanisizer::resize(my_buffer, num
225#ifdef SCRAN_QC_TEST_INIT
226 , SCRAN_QC_TEST_INIT
227#endif
228 );
229 sanisizer::resize(my_block_ends, num
230#ifdef SCRAN_QC_TEST_INIT
231 , SCRAN_QC_TEST_INIT
232#endif
233 );
234 }
235
239public:
240 // Can't figure out how to make compute_blocked() a friend,
241 // so these puppies are public for simplicity.
242 std::vector<Float_> my_buffer;
243 std::vector<std::size_t> my_block_starts;
244 std::vector<std::size_t> my_block_ends;
248};
249
272template<typename Output_ = double, typename Value_, typename Block_>
273std::vector<FindMedianMadResults<Output_> > find_median_mad_blocked(
274 const std::size_t num,
275 const Value_* const metrics,
276 const Block_* const block,
278 const FindMedianMadOptions& options)
279{
280 std::unique_ptr<FindMedianMadWorkspace<Output_> > xworkspace;
281 if (workspace == NULL) {
282 xworkspace = std::make_unique<FindMedianMadWorkspace<Output_> >(num, block);
283 workspace = xworkspace.get();
284 }
285
286 std::vector<FindMedianMadResults<Output_> > output;
287
288 auto& buffer = workspace->my_buffer;
289 if (!block) {
290 std::copy_n(metrics, num, buffer.begin());
291 output.push_back(find_median_mad(num, buffer.data(), options));
292 return output;
293 }
294
295 const auto& starts = workspace->my_block_starts;
296 auto& ends = workspace->my_block_ends;
297 std::copy(starts.begin(), starts.end(), ends.begin());
298 for (decltype(I(num)) i = 0; i < num; ++i) {
299 auto& pos = ends[block[i]];
300 buffer[pos] = metrics[i];
301 ++pos;
302 }
303
304 // Using the ranges on the buffer.
305 const auto nblocks = starts.size();
306 output.reserve(nblocks);
307 for (decltype(I(nblocks)) g = 0; g < nblocks; ++g) {
308 output.push_back(find_median_mad(ends[g] - starts[g], buffer.data() + starts[g], options));
309 }
310
311 return output;
312}
313
314}
315
316#endif
Temporary data structures for find_median_mad_blocked().
Definition find_median_mad.hpp:175
void set(const std::size_t num, const Block_ *const block)
Definition find_median_mad.hpp:204
FindMedianMadWorkspace(const std::size_t num, const Block_ *const block)
Definition find_median_mad.hpp:185
Simple quality control for single-cell data.
Definition adt_quality_control.hpp:23
FindMedianMadResults< Float_ > find_median_mad(std::size_t num, Float_ *metrics, const FindMedianMadOptions &options)
Definition find_median_mad.hpp:81
std::vector< FindMedianMadResults< Output_ > > find_median_mad_blocked(const std::size_t num, const Value_ *const metrics, const Block_ *const block, FindMedianMadWorkspace< Output_ > *workspace, const FindMedianMadOptions &options)
Definition find_median_mad.hpp:273
Options for find_median_mad().
Definition find_median_mad.hpp:25
bool log
Definition find_median_mad.hpp:31
bool median_only
Definition find_median_mad.hpp:37
Results of find_median_mad().
Definition find_median_mad.hpp:45
Float_ median
Definition find_median_mad.hpp:58
Float_ mad
Definition find_median_mad.hpp:63