scran_qc
Simple quality control on single-cell data
Loading...
Searching...
No Matches
per_cell_qc_metrics.hpp
Go to the documentation of this file.
1#ifndef SCRAN_QC_PER_CELL_QC_METRICS_HPP
2#define SCRAN_QC_PER_CELL_QC_METRICS_HPP
3
4#include <vector>
5#include <algorithm>
6#include <limits>
7#include <cstdint>
8
9#include "tatami/tatami.hpp"
10#include "tatami_stats/tatami_stats.hpp"
11
17namespace scran_qc {
18
27 bool compute_sum = true;
28
33 bool compute_detected = true;
34
39 bool compute_max_value = true;
40
45 bool compute_max_index = true;
46
51 bool compute_subset_sum = true;
52
58
63 int num_threads = 1;
64};
65
74template<typename Sum_, typename Detected_, typename Value_, typename Index_>
126
130namespace internal {
131
132template<typename Value_, typename Index_, typename Subset_, typename Sum_, typename Detected_>
135 const std::vector<Subset_>& subsets,
137 int num_threads)
138{
139 std::vector<std::vector<Index_> > subset_indices;
140 if (!output.subset_sum.empty() || !output.subset_detected.empty()) {
141 if constexpr(std::is_pointer<Subset_>::value) {
142 size_t nsubsets = subsets.size();
143 subset_indices.resize(nsubsets);
144 Index_ NR = mat.nrow();
145
146 for (size_t s = 0; s < nsubsets; ++s) {
147 auto& current = subset_indices[s];
148 const auto& source = subsets[s];
149 for (Index_ i = 0; i < NR; ++i) {
150 if (source[i]) {
151 current.push_back(i);
152 }
153 }
154 }
155 }
156 }
157
159 auto NR = mat.nrow();
160 auto ext = tatami::consecutive_extractor<false>(&mat, false, start, length);
161 std::vector<Value_> vbuffer(NR);
162
163 bool do_max = output.max_index || output.max_value;
164
165 size_t nsubsets = subsets.size();
166
167 for (Index_ c = start, end = start + length; c < end; ++c) {
168 auto ptr = ext->fetch(c, vbuffer.data());
169
170 if (output.sum) {
171 output.sum[c] = std::accumulate(ptr, ptr + NR, static_cast<Sum_>(0));
172 }
173
174 if (output.detected) {
175 Detected_ count = 0;
176 for (Index_ r = 0; r < NR; ++r) {
177 count += (ptr[r] != 0);
178 }
179 output.detected[c] = count;
180 }
181
182 if (do_max) {
183 Index_ max_index = 0;
184 Value_ max_value = 0;
185
186 if (NR) {
187 max_value = ptr[0];
188 for (Index_ r = 1; r < NR; ++r) {
189 if (max_value < ptr[r]) {
190 max_value = ptr[r];
191 max_index = r;
192 }
193 }
194 }
195
196 if (output.max_index) {
197 output.max_index[c] = max_index;
198 }
199 if (output.max_value) {
200 output.max_value[c] = max_value;
201 }
202 }
203
204 if (!output.subset_sum.empty() || !output.subset_detected.empty()) { // protect against accessing an empty subset_indices.
205 for (size_t s = 0; s < nsubsets; ++s) {
206 const auto& sub = [&]() {
207 if constexpr(std::is_pointer<Subset_>::value) {
208 return subset_indices[s];
209 } else {
210 return subsets[s];
211 }
212 }();
213
214 if (!output.subset_sum.empty() && output.subset_sum[s]) {
215 Sum_ current = 0;
216 for (auto r : sub) {
217 current += ptr[r];
218 }
219 output.subset_sum[s][c] = current;
220 }
221
222 if (!output.subset_detected.empty() && output.subset_detected[s]) {
223 Detected_ current = 0;
224 for (auto r : sub) {
225 current += ptr[r] != 0;
226 }
227 output.subset_detected[s][c] = current;
228 }
229 }
230 }
231 }
232 }, mat.ncol(), num_threads);
233}
234
235template<typename Index_, typename Subset_, typename Sum_, typename Detected_, typename Value_>
236std::vector<std::vector<uint8_t> > boolify_subsets(Index_ NR, const std::vector<Subset_>& subsets, const PerCellQcMetricsBuffers<Sum_, Detected_, Value_, Index_>& output) {
237 std::vector<std::vector<uint8_t> > is_in_subset;
238
239 if (!output.subset_sum.empty() || !output.subset_detected.empty()) {
240 if constexpr(!std::is_pointer<Subset_>::value) {
241 size_t nsubsets = subsets.size();
242 for (size_t s = 0; s < nsubsets; ++s) {
243 is_in_subset.emplace_back(NR);
244 auto& last = is_in_subset.back();
245 for (auto i : subsets[s]) {
246 last[i] = 1;
247 }
248 }
249 }
250 }
251
252 return is_in_subset;
253}
254
255template<typename Value_, typename Index_, typename Subset_, typename Sum_, typename Detected_>
258 const std::vector<Subset_>& subsets,
260 int num_threads)
261{
263
265 auto NR = mat.nrow();
266 auto ext = tatami::consecutive_extractor<true>(&mat, false, start, length);
267 std::vector<Value_> vbuffer(NR);
268 std::vector<Index_> ibuffer(NR);
269
270 bool do_max = output.max_index || output.max_value;
271
272 size_t nsubsets = subsets.size();
273
274 for (Index_ c = start, end = start + length; c < end; ++c) {
275 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
276
277 if (output.sum) {
278 output.sum[c] = std::accumulate(range.value, range.value + range.number, static_cast<Sum_>(0));
279 }
280
281 if (output.detected) {
282 Detected_ current = 0;
283 for (Index_ i = 0; i < range.number; ++i) {
284 current += (range.value[i] != 0);
285 }
286 output.detected[c] = current;
287 }
288
289 if (do_max) {
290 Index_ max_index = 0;
291 Value_ max_value = 0;
292
293 if (range.number) {
294 max_value = range.value[0];
295 max_index = range.index[0];
296 for (Index_ i = 1; i < range.number; ++i) {
297 if (max_value < range.value[i]) {
298 max_value = range.value[i];
299 max_index = range.index[i];
300 }
301 }
302
303 if (max_value <= 0 && range.number < NR) {
304 if (output.max_index) {
305 // Figuring out the index of the first zero, assuming range.index is sorted.
306 Index_ last = 0;
307 for (Index_ i = 0; i < range.number; ++i) {
308 if (range.index[i] > last) { // must be at least one intervening structural zero.
309 break;
310 } else if (range.value[i] == 0) { // appears earlier than any structural zero, so it's already the maximum.
311 break;
312 }
313 last = range.index[i] + 1;
314 }
315 max_index = last;
316 }
317 max_value = 0;
318 }
319 } else if (NR) {
320 max_value = 0;
321 }
322
323 if (output.max_index) {
324 output.max_index[c] = max_index;
325 }
326 if (output.max_value) {
327 output.max_value[c] = max_value;
328 }
329 }
330
331 if (!output.subset_sum.empty() || !output.subset_detected.empty()) { // protect against accessing an empty is_in_subset.
332 for (size_t s = 0; s < nsubsets; ++s) {
333 const auto& sub = [&]() {
334 if constexpr(std::is_pointer<Subset_>::value) {
335 return subsets[s];
336 } else {
337 return is_in_subset[s];
338 }
339 }();
340
341 if (!output.subset_sum.empty() && output.subset_sum[s]) {
342 Sum_ current = 0;
343 for (Index_ i = 0; i < range.number; ++i) {
344 current += (sub[range.index[i]] != 0) * range.value[i];
345 }
346 output.subset_sum[s][c] = current;
347 }
348
349 if (!output.subset_detected.empty() && output.subset_detected[s]) {
350 Detected_ current = 0;
351 for (Index_ i = 0; i < range.number; ++i) {
352 current += (sub[range.index[i]] != 0) * (range.value[i] != 0);
353 }
354 output.subset_detected[s][c] = current;
355 }
356 }
357 }
358 }
359 }, mat.ncol(), num_threads);
360}
361
362template<typename Sum_, typename Detected_, typename Value_, typename Index_>
364public:
365 PerCellQcMetricsRunningBuffers(const PerCellQcMetricsBuffers<Sum_, Detected_, Value_, Index_>& output, int thread, Index_ start, Index_ len) {
366 if (output.sum) {
367 my_sum = tatami_stats::LocalOutputBuffer<Sum_>(thread, start, len, output.sum);
368 }
369
370 if (output.detected) {
371 my_detected = tatami_stats::LocalOutputBuffer<Detected_>(thread, start, len, output.detected);
372 }
373
374 if (output.max_value) {
375 my_max_value = tatami_stats::LocalOutputBuffer<Value_>(thread, start, len, output.max_value);
376 } else if (output.max_index) {
377 my_holding_max_value.resize(len);
378 }
379
380 if (output.max_index) {
381 my_max_index = tatami_stats::LocalOutputBuffer<Index_>(thread, start, len, output.max_index);
382 }
383
384 my_subset_sum.resize(output.subset_sum.size());
385 for (size_t s = 0, send = output.subset_sum.size(); s < send; ++s) {
386 if (output.subset_sum[s]) {
387 my_subset_sum[s] = tatami_stats::LocalOutputBuffer<Sum_>(thread, start, len, output.subset_sum[s]);
388 }
389 }
390
391 my_subset_detected.resize(output.subset_detected.size());
392 for (size_t s = 0, send = output.subset_detected.size(); s < send; ++s) {
393 if (output.subset_detected[s]) {
394 my_subset_detected[s] = tatami_stats::LocalOutputBuffer<Detected_>(thread, start, len, output.subset_detected[s]);
395 }
396 }
397 }
398
399private:
400 tatami_stats::LocalOutputBuffer<Sum_> my_sum;
401 tatami_stats::LocalOutputBuffer<Detected_> my_detected;
402
403 tatami_stats::LocalOutputBuffer<Value_> my_max_value;
404 std::vector<Value_> my_holding_max_value;
405 tatami_stats::LocalOutputBuffer<Index_> my_max_index;
406
407 std::vector<tatami_stats::LocalOutputBuffer<Sum_> > my_subset_sum;
408 std::vector<tatami_stats::LocalOutputBuffer<Detected_> > my_subset_detected;
409
410public:
411 Sum_* sum_data() {
412 return my_sum.data();
413 }
414
415 Detected_* detected_data() {
416 return my_detected.data();
417 }
418
419 Value_* max_value_data() {
420 auto dptr = my_max_value.data();
421 return (dptr ? dptr : my_holding_max_value.data());
422 }
423
424 Index_* max_index_data() {
425 return my_max_index.data();
426 }
427
428 std::vector<Sum_*> subset_sum_data() {
429 std::vector<Sum_*> output;
430 output.reserve(my_subset_sum.size());
431 for (auto& s : my_subset_sum) {
432 output.push_back(s.data());
433 }
434 return output;
435 }
436
437 std::vector<Detected_*> subset_detected_data() {
438 std::vector<Detected_*> output;
439 output.reserve(my_subset_detected.size());
440 for (auto& s : my_subset_detected) {
441 output.push_back(s.data());
442 }
443 return output;
444 }
445
446 void transfer() {
447 if (my_sum.data()) {
448 my_sum.transfer();
449 }
450 if (my_detected.data()) {
451 my_detected.transfer();
452 }
453
454 if (my_max_value.data()) {
455 my_max_value.transfer();
456 }
457 if (my_max_index.data()) {
458 my_max_index.transfer();
459 }
460
461 for (auto& s : my_subset_sum) {
462 if (s.data()) {
463 s.transfer();
464 }
465 }
466
467 for (auto& s : my_subset_detected) {
468 if (s.data()) {
469 s.transfer();
470 }
471 }
472 }
473};
474
475template<typename Value_, typename Index_, typename Subset_, typename Sum_, typename Detected_>
478 const std::vector<Subset_>& subsets,
480 int num_threads)
481{
483
485 auto NR = mat.nrow();
486 auto ext = tatami::consecutive_extractor<false>(&mat, true, static_cast<Index_>(0), NR, start, len);
487 std::vector<Value_> vbuffer(len);
488
490 auto outt = locals.sum_data();
491 auto outd = locals.detected_data();
492 auto outmi = locals.max_index_data();
493 auto outmc = locals.max_value_data();
494 bool do_max = (outmi || outmc);
495 auto outst = locals.subset_sum_data();
496 auto outsd = locals.subset_detected_data();
497
498 size_t nsubsets = subsets.size();
499
500 for (Index_ r = 0; r < NR; ++r) {
501 auto ptr = ext->fetch(vbuffer.data());
502
503 if (outt) {
504 for (Index_ i = 0; i < len; ++i) {
505 outt[i] += ptr[i];
506 }
507 }
508
509 if (outd) {
510 for (Index_ i = 0; i < len; ++i) {
511 outd[i] += (ptr[i] != 0);
512 }
513 }
514
515 if (do_max) {
516 if (r == 0) {
517 std::copy_n(ptr, len, outmc);
518 if (outmi) {
519 std::fill_n(outmi, len, 0);
520 }
521 } else {
522 for (Index_ i = 0; i < len; ++i) {
523 auto& curmax = outmc[i];
524 if (curmax < ptr[i]) {
525 curmax = ptr[i];
526 if (outmi) {
527 outmi[i] = r;
528 }
529 }
530 }
531 }
532 }
533
534 if (!outst.empty() || !outsd.empty()) { // protect against accessing an empty is_in_subset.
535 for (size_t s = 0; s < nsubsets; ++s) {
536 const auto& sub = [&]() {
537 if constexpr(std::is_pointer<Subset_>::value) {
538 return subsets[s];
539 } else {
540 return is_in_subset[s];
541 }
542 }();
543 if (sub[r] == 0) {
544 continue;
545 }
546
547 if (!outst.empty() && outst[s]) {
548 auto current = outst[s];
549 for (Index_ i = 0; i < len; ++i) {
550 current[i] += ptr[i];
551 }
552 }
553
554 if (!outsd.empty() && outsd[s]) {
555 auto current = outsd[s];
556 for (Index_ i = 0; i < len; ++i) {
557 current[i] += (ptr[i] != 0);
558 }
559 }
560 }
561 }
562 }
563
564 locals.transfer();
565 }, mat.ncol(), num_threads);
566}
567
568template<typename Value_, typename Index_, typename Subset_, typename Sum_, typename Detected_>
571 const std::vector<Subset_>& subsets,
573 int num_threads)
574{
578
580 auto NR = mat.nrow();
581 auto ext = tatami::consecutive_extractor<true>(&mat, true, static_cast<Index_>(0), NR, start, len, opt);
582 std::vector<Value_> vbuffer(len);
583 std::vector<Index_> ibuffer(len);
584
586 auto outt = locals.sum_data();
587 auto outd = locals.detected_data();
588 auto outmi = locals.max_index_data();
589 auto outmc = locals.max_value_data();
590 bool do_max = (outmi || outmc);
591 auto outst = locals.subset_sum_data();
592 auto outsd = locals.subset_detected_data();
593
594 size_t nsubsets = subsets.size();
595
596 std::vector<Index_> last_consecutive_nonzero(do_max ? len : 0);
597
598 for (Index_ r = 0; r < NR; ++r) {
599 auto range = ext->fetch(vbuffer.data(), ibuffer.data());
600
601 if (outt) {
602 for (Index_ i = 0; i < range.number; ++i) {
603 outt[range.index[i] - start] += range.value[i];
604 }
605 }
606
607 if (outd) {
608 for (Index_ i = 0; i < range.number; ++i) {
609 outd[range.index[i] - start] += (range.value[i] != 0);
610 }
611 }
612
613 if (do_max) {
614 if (r == 0) {
615 std::fill_n(outmc, len, 0);
616 for (Index_ i = 0; i < range.number; ++i) {
617 auto j = range.index[i] - start;
618 outmc[j] = range.value[i];
619 last_consecutive_nonzero[j] = 1; // see below
620 }
621 if (outmi) {
622 std::fill_n(outmi, len, 0);
623 }
624
625 } else {
626 for (Index_ i = 0; i < range.number; ++i) {
627 auto j = range.index[i] - start;
628 auto& curmax = outmc[j];
629
630 auto val = range.value[i];
631 if (curmax < val) {
632 curmax = val;
633 if (outmi) {
634 outmi[j] = r;
635 }
636 }
637
638 // Getting the index of the last consecutive structural
639 // non-zero, so that we can check if zero is the max
640 // and gets its first occurrence, if necessary.
642 if (last == r) {
643 ++last;
644 }
645 }
646 }
647 }
648
649 if (!outst.empty() || !outsd.empty()) { // protect against accessing an empty is_in_subset.
650 for (size_t s = 0; s < nsubsets; ++s) {
651 const auto& sub = [&]() {
652 if constexpr(std::is_pointer<Subset_>::value) {
653 return subsets[s];
654 } else {
655 return is_in_subset[s];
656 }
657 }();
658 if (sub[r] == 0) {
659 continue;
660 }
661
662 if (!outst.empty() && outst[s]) {
663 auto current = outst[s];
664 for (Index_ i = 0; i < range.number; ++i) {
665 current[range.index[i] - start] += range.value[i];
666 }
667 }
668
669 if (!outsd.empty() && outsd[s]) {
670 auto current = outsd[s];
671 for (Index_ i = 0; i < range.number; ++i) {
672 current[range.index[i] - start] += (range.value[i] != 0);
673 }
674 }
675 }
676 }
677 }
678
679 if (do_max) {
680 auto NR = mat.nrow();
681
682 // Checking anything with non-positive maximum, and replacing it
683 // with zero if there are any structural zeros.
684 for (Index_ c = 0; c < len; ++c) {
686 if (last_nz == NR) { // i.e., no structural zeros.
687 continue;
688 }
689
690 auto& current = outmc[c];
691 if (current > 0) { // doesn't defeat the current maximum.
692 continue;
693 }
694
695 current = 0;
696 if (outmi) {
697 outmi[c] = last_nz;
698 }
699 }
700 }
701
702 locals.transfer();
703 }, mat.ncol(), num_threads);
704}
705
706}
722template<typename Sum_, typename Detected_, typename Value_, typename Index_>
727 PerCellQcMetricsResults() = default;
728
738 std::vector<Sum_> sum;
739
744 std::vector<Detected_> detected;
745
751 std::vector<Index_> max_index;
752
757 std::vector<Value_> max_value;
758
764 std::vector<std::vector<Sum_> > subset_sum;
765
771 std::vector<std::vector<Detected_> > subset_detected;
772};
773
809template<typename Value_, typename Index_, typename Subset_, typename Sum_, typename Detected_>
812 const std::vector<Subset_>& subsets,
815{
816 if (mat.sparse()) {
817 if (mat.prefer_rows()) {
818 internal::compute_qc_running_sparse(mat, subsets, output, options.num_threads);
819 } else {
820 internal::compute_qc_direct_sparse(mat, subsets, output, options.num_threads);
821 }
822 } else {
823 if (mat.prefer_rows()) {
824 internal::compute_qc_running_dense(mat, subsets, output, options.num_threads);
825 } else {
826 internal::compute_qc_direct_dense(mat, subsets, output, options.num_threads);
827 }
828 }
829}
830
850template<typename Sum_ = double, typename Detected_ = int, typename Value_, typename Index_, typename Subset_>
853 const std::vector<Subset_>& subsets,
855{
858 auto ncells = mat.ncol();
859
860 if (options.compute_sum) {
861 output.sum.resize(ncells
864#endif
865 );
866 buffers.sum = output.sum.data();
867 }
868 if (options.compute_detected) {
869 output.detected.resize(ncells
872#endif
873 );
874 buffers.detected = output.detected.data();
875 }
876 if (options.compute_max_index) {
877 output.max_index.resize(ncells
880#endif
881 );
882 buffers.max_index = output.max_index.data();
883 }
884 if (options.compute_max_value) {
885 output.max_value.resize(ncells
888#endif
889 );
890 buffers.max_value = output.max_value.data();
891 }
892
893 size_t nsubsets = subsets.size();
894
895 if (options.compute_subset_sum) {
896 output.subset_sum.resize(nsubsets);
897 buffers.subset_sum.resize(nsubsets);
898 for (size_t s = 0; s < nsubsets; ++s) {
899 output.subset_sum[s].resize(ncells
902#endif
903 );
904 buffers.subset_sum[s] = output.subset_sum[s].data();
905 }
906 }
907
908 if (options.compute_subset_detected) {
909 output.subset_detected.resize(nsubsets);
910 buffers.subset_detected.resize(nsubsets);
911 for (size_t s = 0; s < nsubsets; ++s) {
912 output.subset_detected[s].resize(ncells
915#endif
916 );
917 buffers.subset_detected[s] = output.subset_detected[s].data();
918 }
919 }
920
922 return output;
923}
924
925}
926
927#endif
Temporary data structures for find_median_mad_blocked().
Definition find_median_mad.hpp:176
Simple quality control for single-cell data.
Definition adt_quality_control.hpp:20
void per_cell_qc_metrics(const tatami::Matrix< Value_, Index_ > &mat, const std::vector< Subset_ > &subsets, const PerCellQcMetricsBuffers< Sum_, Detected_, Value_, Index_ > &output, const PerCellQcMetricsOptions &options)
Definition per_cell_qc_metrics.hpp:810
void parallelize(Function_ fun, Index_ tasks, int threads)
Buffers for per_cell_qc_metrics().
Definition per_cell_qc_metrics.hpp:75
Value_ * max_value
Definition per_cell_qc_metrics.hpp:108
Index_ * max_index
Definition per_cell_qc_metrics.hpp:102
Sum_ * sum
Definition per_cell_qc_metrics.hpp:90
Detected_ * detected
Definition per_cell_qc_metrics.hpp:96
std::vector< Detected_ * > subset_detected
Definition per_cell_qc_metrics.hpp:124
std::vector< Sum_ * > subset_sum
Definition per_cell_qc_metrics.hpp:116
Options for per_cell_qc_metrics().
Definition per_cell_qc_metrics.hpp:22
bool compute_sum
Definition per_cell_qc_metrics.hpp:27
bool compute_subset_sum
Definition per_cell_qc_metrics.hpp:51
bool compute_max_value
Definition per_cell_qc_metrics.hpp:39
bool compute_subset_detected
Definition per_cell_qc_metrics.hpp:57
bool compute_detected
Definition per_cell_qc_metrics.hpp:33
bool compute_max_index
Definition per_cell_qc_metrics.hpp:45
int num_threads
Definition per_cell_qc_metrics.hpp:63
Result store for QC metric calculations.
Definition per_cell_qc_metrics.hpp:723
std::vector< Index_ > max_index
Definition per_cell_qc_metrics.hpp:751
std::vector< Value_ > max_value
Definition per_cell_qc_metrics.hpp:757
std::vector< std::vector< Detected_ > > subset_detected
Definition per_cell_qc_metrics.hpp:771
std::vector< Detected_ > detected
Definition per_cell_qc_metrics.hpp:744
std::vector< Sum_ > sum
Definition per_cell_qc_metrics.hpp:738
std::vector< std::vector< Sum_ > > subset_sum
Definition per_cell_qc_metrics.hpp:764
bool sparse_ordered_index