nenesub
Nearest neighbors subsampling
Loading...
Searching...
No Matches
nenesub.hpp
Go to the documentation of this file.
1#ifndef NENESUB_HPP
2#define NENESUB_HPP
3
4#include <vector>
5#include <queue>
6#include <cstddef>
7#include <algorithm>
8
10
20namespace nenesub {
21
25struct Options {
30 int num_neighbors = 20;
31
36 int min_remaining = 10;
37
43 int num_threads = 10;
44};
45
87template<typename Index_, class GetNeighbors_, class GetIndex_, class GetMaxDistance_>
88void compute(Index_ num_obs, GetNeighbors_ get_neighbors, GetIndex_ get_index, GetMaxDistance_ get_max_distance, const Options& options, std::vector<Index_>& selected) {
89 typedef decltype(get_max_distance(0)) Distance;
90 struct Payload {
91 Payload(Index_ identity, Index_ remaining, Distance max_distance) : remaining(remaining), identity(identity), max_distance(max_distance) {}
92 Index_ remaining;
93 Index_ identity;
94 Distance max_distance;
95 };
96
97 auto cmp = [](const Payload& left, const Payload& right) -> bool {
98 if (left.remaining == right.remaining) {
99 if (left.max_distance == right.max_distance) {
100 return left.identity > right.identity; // smallest identities show up first.
101 }
102 return left.max_distance > right.max_distance; // smallest distances show up first.
103 }
104 return left.remaining < right.remaining; // largest remaining show up first.
105 };
106 std::priority_queue<Payload, std::vector<Payload>, decltype(cmp)> store(
107 cmp,
108 [&]{
109 std::vector<Payload> container;
110 container.reserve(num_obs);
111 return container;
112 }()
113 );
114
115 std::vector<std::vector<Index_> > reverse_map(num_obs);
116 std::vector<Index_> remaining(num_obs);
117 for (Index_ c = 0; c < num_obs; ++c) {
118 const auto& neighbors = get_neighbors(c);
119 Index_ nneighbors = neighbors.size();
120
121 if (nneighbors) { // protect get_max_distance just in case there are no neighbors.
122 store.emplace(c, nneighbors, get_max_distance(c));
123 for (Index_ n = 0; n < nneighbors; ++n) {
124 reverse_map[get_index(neighbors, n)].push_back(c);
125 }
126 remaining[c] = nneighbors;
127 }
128 }
129
130 selected.clear();
131 std::vector<unsigned char> tainted(num_obs);
132 Index_ min_remaining = options.min_remaining;
133 while (!store.empty()) {
134 auto payload = store.top();
135 store.pop();
136 if (tainted[payload.identity]) {
137 continue;
138 }
139
140 const auto& neighbors = get_neighbors(payload.identity);
141 Index_ new_remaining = remaining[payload.identity];
142
143 if (new_remaining >= min_remaining) {
144 payload.remaining = new_remaining;
145 if (!store.empty() && cmp(payload, store.top())) {
146 store.push(payload);
147 } else {
148 selected.push_back(payload.identity);
149 tainted[payload.identity] = 1;
150 for (auto x : reverse_map[payload.identity]) {
151 --remaining[x];
152 }
153
154 Index_ nneighbors = neighbors.size();
155 for (Index_ n = 0; n < nneighbors; ++n) {
156 auto current = get_index(neighbors, n);
157 tainted[current] = 1;
158 for (auto x : reverse_map[current]) {
159 --remaining[x];
160 }
161 }
162 }
163 }
164 }
165
166 std::sort(selected.begin(), selected.end());
167}
168
184template<typename Index_, typename Distance_>
185std::vector<Index_> compute(const knncolle::NeighborList<Index_, Distance_>& neighbors, const Options& options) {
186 std::vector<Index_> output;
187 compute(
188 static_cast<Index_>(neighbors.size()),
189 [&](Index_ i) -> const auto& { return neighbors[i]; },
190 [](const std::vector<std::pair<Index_, Distance_> >& x, Index_ n) -> Index_ { return x[n].first; },
191 [&](Index_ i) -> Distance_ { return neighbors[i].back().second; },
192 options,
193 output
194 );
195 return output;
196}
197
212template<typename Index_, typename Input_, typename Distance_>
213std::vector<Index_> compute(const knncolle::Prebuilt<Index_, Input_, Distance_>& prebuilt, const Options& options) {
214 int k = options.num_neighbors;
215 if (k < options.min_remaining) {
216 throw std::runtime_error("number of neighbors is less than 'min_remaining'");
217 }
218
219 Index_ nobs = prebuilt.num_observations();
220 auto capped_k = knncolle::cap_k(k, nobs);
221 std::vector<std::vector<Index_> > nn_indices(nobs);
222 std::vector<Distance_> max_distance(nobs);
223
224 knncolle::parallelize(options.num_threads, nobs, [&](int, Index_ start, Index_ length) -> void {
225 auto sptr = prebuilt.initialize();
226 std::vector<Distance_> nn_distances;
227 for (Index_ i = start, end = start + length; i < end; ++i) {
228 sptr->search(i, capped_k, &(nn_indices[i]), &nn_distances);
229 max_distance[i] = (capped_k ? 0 : nn_distances.back());
230 }
231 });
232
233 std::vector<Index_> output;
234 compute(
235 nobs,
236 [&](Index_ i) -> const std::vector<Index_>& { return nn_indices[i]; },
237 [](const std::vector<Index_>& x, Index_ n) -> Index_ { return x[n]; },
238 [&](Index_ i) -> Distance_ { return max_distance[i]; },
239 options,
240 output
241 );
242 return output;
243}
244
263template<typename Index_, typename Input_, typename Distance_, class Matrix_ = knncolle::Matrix<Index_, Input_> >
264std::vector<Index_> compute(
265 std::size_t num_dims,
266 Index_ num_obs,
267 const Input_* data,
269 const Options& options)
270{
271 auto prebuilt = knn_method.build_unique(knncolle::SimpleMatrix<Index_, Input_>(num_dims, num_obs, data));
272 return compute(*prebuilt, options);
273}
274
275}
276
277#endif
std::unique_ptr< Prebuilt< Index_, Data_, Distance_ > > build_unique(const Matrix_ &data) const
virtual Index_ num_observations() const=0
void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range)
int cap_k(int k, Index_ num_observations)
std::vector< std::vector< std::pair< Index_, Distance_ > > > NeighborList
Nearest-neighbors subsampling.
void compute(Index_ num_obs, GetNeighbors_ get_neighbors, GetIndex_ get_index, GetMaxDistance_ get_max_distance, const Options &options, std::vector< Index_ > &selected)
Definition nenesub.hpp:88
Options for compute().
Definition nenesub.hpp:25
int num_neighbors
Definition nenesub.hpp:30
int num_threads
Definition nenesub.hpp:43
int min_remaining
Definition nenesub.hpp:36