nenesub
Nearest neighbors subsampling
Loading...
Searching...
No Matches
nenesub.hpp
Go to the documentation of this file.
1#ifndef NENESUB_HPP
2#define NENESUB_HPP
3
4#include <vector>
5#include <queue>
6#include <cstddef>
7#include <algorithm>
8#include <type_traits>
9
10#include "knncolle/knncolle.hpp"
11#include "sanisizer/sanisizer.hpp"
12
22namespace nenesub {
23
27struct Options {
33 int num_neighbors = 20;
34
40 int min_remaining = 10;
41
47 int num_threads = 10;
48};
49
53template<typename Input_>
54std::remove_cv_t<std::remove_reference_t<Input_> > I(Input_ x) {
55 return x;
56}
105template<typename Index_, class GetNeighbors_, class GetIndex_, class GetMaxDistance_>
106void compute(const Index_ num_obs, const GetNeighbors_ get_neighbors, const GetIndex_ get_index, const GetMaxDistance_ get_max_distance, const Options& options, std::vector<Index_>& selected) {
107 typedef decltype(I(get_max_distance(0))) Distance;
108 struct Payload {
109 Payload(const Index_ identity, const Index_ remaining, const Distance max_distance) : remaining(remaining), identity(identity), max_distance(max_distance) {}
110 Index_ remaining;
111 Index_ identity;
112 Distance max_distance;
113 };
114
115 const auto cmp = [](const Payload& left, const Payload& right) -> bool {
116 if (left.remaining == right.remaining) {
117 if (left.max_distance == right.max_distance) {
118 return left.identity > right.identity; // smallest identities show up first.
119 }
120 return left.max_distance > right.max_distance; // smallest distances show up first.
121 }
122 return left.remaining < right.remaining; // largest remaining show up first.
123 };
124 std::priority_queue<Payload, std::vector<Payload>, decltype(I(cmp))> store(
125 cmp,
126 [&]{
127 std::vector<Payload> container;
128 container.reserve(num_obs);
129 return container;
130 }()
131 );
132
133 auto reverse_map = sanisizer::create<std::vector<std::vector<Index_> > >(num_obs);
134 auto remaining = sanisizer::create<std::vector<Index_> >(num_obs);
135 for (Index_ c = 0; c < num_obs; ++c) {
136 const auto& neighbors = get_neighbors(c);
137 const Index_ nneighbors = neighbors.size();
138
139 if (nneighbors) { // protect get_max_distance just in case there are no neighbors.
140 store.emplace(c, nneighbors, get_max_distance(c));
141 for (Index_ n = 0; n < nneighbors; ++n) {
142 reverse_map[get_index(neighbors, n)].push_back(c);
143 }
144 remaining[c] = nneighbors;
145 }
146 }
147
148 selected.clear();
149 auto tainted = sanisizer::create<std::vector<unsigned char> >(num_obs);
150 Index_ min_remaining = options.min_remaining;
151 while (!store.empty()) {
152 auto payload = store.top();
153 store.pop();
154 if (tainted[payload.identity]) { // it's already in another selected point's local neighborhood.
155 continue;
156 }
157
158 const Index_ new_remaining = remaining[payload.identity];
159 if (new_remaining < min_remaining) { // it can't be valid so we skip it.
160 continue;
161 }
162
163 payload.remaining = new_remaining;
164 if (!store.empty() && cmp(payload, store.top())) { // cycling it back into the queue with an updated remaining count, if it's not better than the queue's best.
165 store.push(payload);
166 continue;
167 }
168
169 selected.push_back(payload.identity);
170 tainted[payload.identity] = 1;
171 for (const auto x : reverse_map[payload.identity]) {
172 --remaining[x];
173 }
174
175 const auto& neighbors = get_neighbors(payload.identity);
176 const Index_ nneighbors = neighbors.size();
177 for (Index_ n = 0; n < nneighbors; ++n) {
178 const auto current = get_index(neighbors, n);
179 if (tainted[current]) {
180 continue;
181 }
182 tainted[current] = 1;
183 for (const auto x : reverse_map[current]) {
184 --remaining[x];
185 }
186 }
187 }
188
189 std::sort(selected.begin(), selected.end());
190}
191
206template<typename Index_, typename Distance_>
207std::vector<Index_> compute(const knncolle::NeighborList<Index_, Distance_>& neighbors, const Options& options) {
208 std::vector<Index_> output;
209 compute(
210 static_cast<Index_>(neighbors.size()),
211 [&](const Index_ i) -> const auto& { return neighbors[i]; },
212 [](const std::vector<std::pair<Index_, Distance_> >& x, const Index_ n) -> Index_ { return x[n].first; },
213 [&](const Index_ i) -> Distance_ { return neighbors[i].back().second; },
214 options,
215 output
216 );
217 return output;
218}
219
234template<typename Index_, typename Input_, typename Distance_>
235std::vector<Index_> compute(const knncolle::Prebuilt<Index_, Input_, Distance_>& prebuilt, const Options& options) {
236 const int k = options.num_neighbors;
237 if (k < options.min_remaining) {
238 throw std::runtime_error("number of neighbors is less than 'min_remaining'");
239 }
240
241 const Index_ nobs = prebuilt.num_observations();
242 const auto capped_k = knncolle::cap_k(k, nobs);
243 auto nn_indices = sanisizer::create<std::vector<std::vector<Index_> > >(nobs);
244 auto max_distance = sanisizer::create<std::vector<Distance_> >(nobs);
245
246 knncolle::parallelize(options.num_threads, nobs, [&](const int, const Index_ start, const Index_ length) -> void {
247 const auto sptr = prebuilt.initialize();
248 std::vector<Distance_> nn_distances;
249 for (Index_ i = start, end = start + length; i < end; ++i) {
250 sptr->search(i, capped_k, &(nn_indices[i]), &nn_distances);
251 max_distance[i] = (capped_k ? nn_distances.back() : 0);
252 }
253 });
254
255 std::vector<Index_> output;
256 compute(
257 nobs,
258 [&](const Index_ i) -> const std::vector<Index_>& { return nn_indices[i]; },
259 [](const std::vector<Index_>& x, const Index_ n) -> Index_ { return x[n]; },
260 [&](const Index_ i) -> Distance_ { return max_distance[i]; },
261 options,
262 output
263 );
264 return output;
265}
266
285template<typename Index_, typename Input_, typename Distance_, class Matrix_ = knncolle::Matrix<Index_, Input_> >
286std::vector<Index_> compute(
287 const std::size_t num_dims,
288 const Index_ num_obs,
289 const Input_* data,
291 const Options& options)
292{
293 const auto prebuilt = knn_method.build_unique(knncolle::SimpleMatrix<Index_, Input_>(num_dims, num_obs, data));
294 return compute(*prebuilt, options);
295}
296
297}
298
299#endif
std::unique_ptr< Prebuilt< Index_, Data_, Distance_ > > build_unique(const Matrix_ &data) const
virtual Index_ num_observations() const=0
void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range)
int cap_k(int k, Index_ num_observations)
std::vector< std::vector< std::pair< Index_, Distance_ > > > NeighborList
Nearest-neighbors subsampling.
void compute(const Index_ num_obs, const GetNeighbors_ get_neighbors, const GetIndex_ get_index, const GetMaxDistance_ get_max_distance, const Options &options, std::vector< Index_ > &selected)
Definition nenesub.hpp:106
Options for compute().
Definition nenesub.hpp:27
int num_neighbors
Definition nenesub.hpp:33
int num_threads
Definition nenesub.hpp:47
int min_remaining
Definition nenesub.hpp:40