factorize
Create factors from categorical variables
Loading...
Searching...
No Matches
combine_to_factor.hpp
Go to the documentation of this file.
1#ifndef FACTORIZE_COMBINE_TO_FACTOR_HPP
2#define FACTORIZE_COMBINE_TO_FACTOR_HPP
3
4#include <algorithm>
5#include <vector>
6#include <map>
7#include <unordered_map>
8#include <cstddef>
9
10#include "sanisizer/sanisizer.hpp"
11
12#include "create_factor.hpp"
13#include "utils.hpp"
14
20namespace factorize {
21
42template<typename Input_, typename Code_>
43std::vector<std::vector<Input_> > combine_to_factor(const std::size_t n, const std::vector<const Input_*>& inputs, Code_* const codes) {
44 const auto ninputs = inputs.size();
45 auto output = sanisizer::create<std::vector<std::vector<Input_> > >(ninputs);
46
47 // Handling the special cases.
48 if (ninputs == 0) {
49 std::fill_n(codes, n, 0);
50 return output;
51 }
52 if (ninputs == 1) {
53 output[0] = create_factor(n, inputs.front(), codes);
54 return output;
55 }
56
57 // Creating a hashmap on the combinations of each factor.
58 struct Combination {
59 Combination(const std::size_t i) : index(i) {}
60 std::size_t index;
61 };
62
63 auto unique = [&]{ // scoping this in an IIFE to release map memory sooner.
64 // Using a map with a custom comparator that uses the index
65 // of first occurrence of each factor as the key. Currently using a map
66 // to (i) avoid issues with collisions of combined hashes and (ii)
67 // avoid having to write more code for sorting a vector of arrays.
68 auto cmp = [&](const Combination& left, const Combination& right) -> bool {
69 for (auto curf : inputs) {
70 if (curf[left.index] < curf[right.index]) {
71 return true;
72 } else if (curf[left.index] > curf[right.index]) {
73 return false;
74 }
75 }
76 return false;
77 };
78 std::map<Combination, Code_, I<decltype(cmp)> > mapping(std::move(cmp));
79
80 const auto eq = [&](const Combination& left, const Combination& right) -> bool {
81 for (auto curf : inputs) {
82 if (curf[left.index] != curf[right.index]) {
83 return false;
84 }
85 }
86 return true;
87 };
88
89 for (I<decltype(n)> i = 0; i < n; ++i) {
90 Combination current(i);
91 const auto mIt = mapping.find(current);
92 if (mIt == mapping.end() || !eq(mIt->first, current)) {
93 Code_ alt = mapping.size();
94 mapping.insert(mIt, std::make_pair(current, alt));
95 codes[i] = alt;
96 } else {
97 codes[i] = mIt->second;
98 }
99 }
100
101 return std::vector<std::pair<Combination, Code_> >(mapping.begin(), mapping.end());
102 }();
103
104 // Remapping to a sorted set.
105 const auto nuniq = unique.size();
106 for (auto& ofac : output) {
107 ofac.reserve(nuniq);
108 }
109 auto remapping = sanisizer::create<std::vector<Code_> >(nuniq);
110 for (I<decltype(nuniq)> u = 0; u < nuniq; ++u) {
111 const auto ix = unique[u].first.index;
112 for (I<decltype(ninputs)> f = 0; f < ninputs; ++f) {
113 output[f].push_back(inputs[f][ix]);
114 }
115 remapping[unique[u].second] = u;
116 }
117
118 // Mapping each cell to its sorted combination.
119 for (I<decltype(n)> i = 0; i < n; ++i) {
120 codes[i] = remapping[codes[i]];
121 }
122
123 return output;
124}
125
148template<typename Input_, typename Number_, typename Code_>
149std::vector<std::vector<Input_> > combine_to_factor_unused(const std::size_t n, const std::vector<std::pair<const Input_*, Number_> >& inputs, Code_* const codes) {
150 const auto ninputs = inputs.size();
151 auto output = sanisizer::create<std::vector<std::vector<Input_> > >(ninputs);
152
153 // Handling the special cases.
154 if (ninputs == 0) {
155 std::fill_n(codes, n, 0);
156 return output;
157 }
158 if (ninputs == 1) {
159 sanisizer::resize(output[0], inputs[0].second);
160 std::iota(output[0].begin(), output[0].end(), static_cast<Code_>(0));
161 std::copy_n(inputs[0].first, n, codes);
162 return output;
163 }
164
165 // We iterate from back to front, where the first factor is the slowest changing.
166 std::copy_n(inputs[ninputs - 1].first, n, codes);
167 Code_ ncombos = inputs[ninputs - 1].second;
168 for (I<decltype(ninputs)> f = ninputs - 1; f > 0; --f) {
169 const auto& finfo = inputs[f - 1];
170 const auto next_ncombos = sanisizer::product<Code_>(ncombos, finfo.second);
171 const auto ff = finfo.first;
172 for (I<decltype(n)> i = 0; i < n; ++i) {
173 // Product is safe as it is obviously less than 'next_combos' for 'ff[i] < finfo.second'.
174 // Addition is also safe as it will be less than 'next_combos', though this is less obvious.
175 codes[i] += sanisizer::product_unsafe<Code_>(ncombos, ff[i]);
176 }
177 ncombos = next_ncombos;
178 }
179
180 sanisizer::cast<I<decltype(output[0].size())> >(ncombos); // check that we can actually make the output vectors.
181 Code_ outer_repeats = ncombos;
182 Code_ inner_repeats = 1;
183 for (I<decltype(ninputs)> f = ninputs; f > 0; --f) {
184 auto& out = output[f - 1];
185 out.reserve(ncombos);
186
187 const auto& finfo = inputs[f - 1];
188 const Code_ initial_size = inner_repeats * finfo.second;
189 out.resize(initial_size);
190
191 if (inner_repeats == 1) {
192 std::iota(out.begin(), out.end(), static_cast<Code_>(0));
193 } else {
194 auto oIt = out.begin();
195 for (Number_ l = 0; l < finfo.second; ++l) {
196 std::fill_n(oIt, inner_repeats, l);
197 oIt += inner_repeats;
198 }
199 }
200 inner_repeats = initial_size;
201
202 outer_repeats /= finfo.second;
203 for (Code_ r = 1; r < outer_repeats; ++r) {
204 // Referencing to 'begin()' while inserting is safe, as we reserved the full length at the start.
205 // Thus, there shouldn't be any allocations.
206 out.insert(out.end(), out.begin(), out.begin() + initial_size);
207 }
208 }
209
210 return output;
211}
212
213}
214
215#endif
Create a factor from a categorical variable.
Create factors from categorical variables.
Definition combine_to_factor.hpp:20
std::vector< Input_ > create_factor(const std::size_t n, const Input_ *const input, Code_ *const codes)
Definition create_factor.hpp:39
std::vector< std::vector< Input_ > > combine_to_factor(const std::size_t n, const std::vector< const Input_ * > &inputs, Code_ *const codes)
Definition combine_to_factor.hpp:43
std::vector< std::vector< Input_ > > combine_to_factor_unused(const std::size_t n, const std::vector< std::pair< const Input_ *, Number_ > > &inputs, Code_ *const codes)
Definition combine_to_factor.hpp:149