nenesub
Nearest neighbors subsampling
Loading...
Searching...
No Matches
nenesub.hpp
Go to the documentation of this file.
1#ifndef NENESUB_HPP
2#define NENESUB_HPP
3
4#include <vector>
5#include <queue>
6#include <cstdint>
7#include <algorithm>
9
19namespace nenesub {
20
24struct Options {
29 int num_neighbors = 20;
30
35 int min_remaining = 10;
36
42 int num_threads = 10;
43};
44
86template<typename Index_, class GetNeighbors_, class GetIndex_, class GetMaxDistance_>
87void compute(Index_ num_obs, GetNeighbors_ get_neighbors, GetIndex_ get_index, GetMaxDistance_ get_max_distance, const Options& options, std::vector<Index_>& selected) {
88 typedef decltype(get_max_distance(0)) Distance_;
89 struct Payload {
90 Payload(Index_ identity, Index_ remaining, Distance_ max_distance) : remaining(remaining), identity(identity), max_distance(max_distance) {}
91 Index_ remaining;
92 Index_ identity;
93 Distance_ max_distance;
94 };
95
96 auto cmp = [](const Payload& left, const Payload& right) -> bool {
97 if (left.remaining == right.remaining) {
98 if (left.max_distance == right.max_distance) {
99 return left.identity > right.identity; // smallest identities show up first.
100 }
101 return left.max_distance > right.max_distance; // smallest distances show up first.
102 }
103 return left.remaining < right.remaining; // largest remaining show up first.
104 };
105 std::priority_queue<Payload, std::vector<Payload>, decltype(cmp)> store(
106 cmp,
107 [&]{
108 std::vector<Payload> container;
109 container.reserve(num_obs);
110 return container;
111 }()
112 );
113
114 std::vector<std::vector<Index_> > reverse_map(num_obs);
115 std::vector<Index_> remaining(num_obs);
116 for (Index_ c = 0; c < num_obs; ++c) {
117 const auto& neighbors = get_neighbors(c);
118 Index_ nneighbors = neighbors.size();
119
120 if (nneighbors) { // protect get_max_distance just in case there are no neighbors.
121 store.emplace(c, nneighbors, get_max_distance(c));
122 for (Index_ n = 0; n < nneighbors; ++n) {
123 reverse_map[get_index(neighbors, n)].push_back(c);
124 }
125 remaining[c] = nneighbors;
126 }
127 }
128
129 selected.clear();
130 std::vector<uint8_t> tainted(num_obs);
131 Index_ min_remaining = options.min_remaining;
132 while (!store.empty()) {
133 auto payload = store.top();
134 store.pop();
135 if (tainted[payload.identity]) {
136 continue;
137 }
138
139 const auto& neighbors = get_neighbors(payload.identity);
140 Index_ new_remaining = remaining[payload.identity];
141
142 if (new_remaining >= min_remaining) {
143 payload.remaining = new_remaining;
144 if (!store.empty() && cmp(payload, store.top())) {
145 store.push(payload);
146 } else {
147 selected.push_back(payload.identity);
148 tainted[payload.identity] = 1;
149 for (auto x : reverse_map[payload.identity]) {
150 --remaining[x];
151 }
152
153 Index_ nneighbors = neighbors.size();
154 for (Index_ n = 0; n < nneighbors; ++n) {
155 auto current = get_index(neighbors, n);
156 tainted[current] = 1;
157 for (auto x : reverse_map[current]) {
158 --remaining[x];
159 }
160 }
161 }
162 }
163 }
164
165 std::sort(selected.begin(), selected.end());
166}
167
183template<typename Index_, typename Distance_>
184std::vector<Index_> compute(const knncolle::NeighborList<Index_, Distance_>& neighbors, const Options& options) {
185 std::vector<Index_> output;
186 compute(
187 static_cast<Index_>(neighbors.size()),
188 [&](size_t i) -> const auto& { return neighbors[i]; },
189 [](const auto& x, Index_ n) -> Index_ { return x[n].first; },
190 [&](size_t i) -> Distance_ { return neighbors[i].back().second; },
191 options,
192 output
193 );
194 return output;
195}
196
209template<typename Dim_, typename Index_, typename Float_>
210std::vector<Index_> compute(const knncolle::Prebuilt<Dim_, Index_, Float_>& prebuilt, const Options& options) {
211 int k = options.num_neighbors;
212 if (k < options.min_remaining) {
213 throw std::runtime_error("number of neighbors is less than 'min_remaining'");
214 }
215
216 Index_ nobs = prebuilt.num_observations();
217 std::vector<std::vector<Index_> > nn_indices(nobs);
218 std::vector<Float_> max_distance(nobs);
219
220 knncolle::parallelize(options.num_threads, nobs, [&](int, Index_ start, Index_ length) -> void {
221 auto sptr = prebuilt.initialize();
222 std::vector<Float_> nn_distances;
223 for (Index_ i = start, end = start + length; i < end; ++i) {
224 sptr->search(i, k, &(nn_indices[i]), &nn_distances);
225 max_distance[i] = (k ? 0 : nn_distances.back());
226 }
227 });
228
229 std::vector<Index_> output;
230 compute(
231 nobs,
232 [&](size_t i) -> const std::vector<Index_>& { return nn_indices[i]; },
233 [](const std::vector<Index_>& x, Index_ n) -> Index_ { return x[n]; },
234 [&](size_t i) -> Float_ { return max_distance[i]; },
235 options,
236 output
237 );
238 return output;
239}
240
257template<typename Dim_, typename Index_, typename Value_, typename Float_>
258std::vector<Index_> compute(
259 Dim_ num_dims,
260 Index_ num_obs,
261 const Value_* data,
263 const Options& options)
264{
265 auto prebuilt = knn_method.build_unique(knncolle::SimpleMatrix<Dim_, Index_, Value_>(num_dims, num_obs, data));
266 return compute(*prebuilt, options);
267}
268
269}
270
271#endif
virtual Index_ num_observations() const=0
std::vector< std::vector< std::pair< Index_, Float_ > > > NeighborList
void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range)
Nearest-neighbors subsampling.
void compute(Index_ num_obs, GetNeighbors_ get_neighbors, GetIndex_ get_index, GetMaxDistance_ get_max_distance, const Options &options, std::vector< Index_ > &selected)
Definition nenesub.hpp:87
Options for compute().
Definition nenesub.hpp:24
int num_neighbors
Definition nenesub.hpp:29
int num_threads
Definition nenesub.hpp:42
int min_remaining
Definition nenesub.hpp:35