kmeans
k-means clustering in C++
Loading...
Searching...
No Matches
InitializeVariancePartition.hpp
Go to the documentation of this file.
1#ifndef KMEANS_INITIALIZE_VARIANCE_PARTITION_HPP
2#define KMEANS_INITIALIZE_VARIANCE_PARTITION_HPP
3
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <queue>
#include <utility>
#include <vector>

#include "sanisizer/sanisizer.hpp"

#include "Initialize.hpp"
#include "Matrix.hpp"
14
20namespace kmeans {
21
38
42namespace InitializeVariancePartition_internal {
43
// One step of Welford's online algorithm: fold a new value into the running
// mean ('center') and running sum of squared deviations ('ss'), where 'count'
// is the number of values seen so far including 'val'.
template<typename Float_>
void compute_welford(const Float_ val, Float_& center, Float_& ss, const Float_ count) {
    const Float_ old_mean = center;
    const Float_ diff = val - old_mean;
    center = old_mean + diff / count;
    ss += diff * (val - center);
}

// Vectorized form of the above: apply one Welford update per dimension,
// reading the d-th coordinate of the observation from 'dptr[d]'.
template<typename Data_, typename Float_>
void compute_welford(const std::size_t ndim, const Data_* const dptr, Float_* const center, Float_* const dim_ss, const Float_ count) {
    for (std::size_t d = 0; d < ndim; ++d) {
        compute_welford<Float_>(dptr[d], center[d], dim_ss[d], count);
    }
}
59
/**
 * Find the boundary value that best splits a cluster's observations along one
 * dimension, i.e., minimizes the summed within-partition sum of squares (SS)
 * over all possible split points along that dimension.
 *
 * @param data Matrix of observations; only those indexed by `current` are used.
 * @param current Indices of the observations in the cluster being split.
 * @param top_dim Dimension along which to split.
 * @param value_buffer Scratch buffer for the `top_dim` values, reused across calls.
 * @param stat_buffer Scratch buffer for the per-split SS, reused across calls.
 * @return Midpoint between the two values straddling the optimal split.
 */
template<typename Matrix_, typename Index_, typename Float_>
Float_ optimize_partition(
    const Matrix_& data,
    const std::vector<Index_>& current,
    const std::size_t top_dim,
    std::vector<Float_>& value_buffer,
    std::vector<Float_>& stat_buffer)
{
    const auto N = current.size();
    auto work = data.new_known_extractor(current.data(), static_cast<std::size_t>(N)); // safety of the cast is already checked in InitializeVariancePartition::run().
    value_buffer.clear();
    value_buffer.reserve(N);
    for (I<decltype(N)> i = 0; i < N; ++i) {
        const auto dptr = work->get_observation();
        value_buffer.push_back(dptr[top_dim]);
    }
    std::sort(value_buffer.begin(), value_buffer.end());

    // stat_buffer[i] represents the SS when {0, 1, 2, ..., i} of values_buffer goes in the left partition and {i + 1, ..., N-1} goes in the right partition.
    // This implies that stat_buffer will have length N - 1, such that i can span from 0 to N - 2.
    // It can also be guaranteed that N >= 2 as a cluster with one point will have zero variance and thus never be selected for partition optimization.
    stat_buffer.clear();
    const auto N_m1 = N - 1;
    stat_buffer.reserve(N_m1);

    // Forward and backward iterations to get the sum of squares for the left and right partitions, respectively, at every possible partition point.
    stat_buffer.push_back(0); // the singleton left partition {0} contributes zero SS.
    Float_ mean = value_buffer.front(), ss = 0, count = 2;
    for (I<decltype(N_m1)> i = 1; i < N_m1; ++i) { // skip i == 0 as the left partition only has one point.
        compute_welford<Float_>(value_buffer[i], mean, ss, count);
        stat_buffer.push_back(ss);
        ++count;
    }

    mean = value_buffer.back(), ss = 0, count = 2;
    for (I<decltype(N_m1)> i_p1 = N_m1 - 1; i_p1 > 0; --i_p1) { // skip i + 1 == N - 1 (i.e., i_p1 == N_m1) as the right partition only has one point.
        compute_welford<Float_>(value_buffer[i_p1], mean, ss, count);
        stat_buffer[i_p1 - 1] += ss; // accumulate the right-partition SS onto the left-partition SS from the forward pass.
        ++count;
    }

    const auto sbBegin = stat_buffer.begin();
    const I<decltype(value_buffer.size())> which_min = std::min_element(sbBegin, stat_buffer.end()) - sbBegin;

    // To avoid issues with ties, we use the average of the two edge points as the partition boundary.
    const auto left = value_buffer[which_min];
    const auto right = value_buffer[which_min + 1];
    return left + (right - left) / 2; // avoid FP overflow.
}
123
124}
165template<typename Index_, typename Data_, typename Cluster_, typename Float_, class Matrix_ = Matrix<Index_, Data_> >
166class InitializeVariancePartition final : public Initialize<Index_, Data_, Cluster_, Float_, Matrix_> {
167public:
    /**
     * @param options Options for variance partitioning; moved into the instance.
     */
    InitializeVariancePartition(InitializeVariancePartitionOptions options) : my_options(std::move(options)) {}
172
177
178private:
180
181public:
187 return my_options;
188 }
189
190public:
    /**
     * Compute initial centers by repeatedly splitting the highest-variance cluster.
     *
     * @param data Matrix of observations.
     * @param ncenters Number of centers requested.
     * @param centers Output array where each cluster's center occupies `ndim`
     * contiguous values starting at `centers + cluster * ndim`.
     * @return Number of centers actually produced; this is less than `ncenters`
     * when no further split is possible (zero variance or an empty partition).
     */
    Cluster_ run(const Matrix_& data, const Cluster_ ncenters, Float_* const centers) const {
        const auto nobs = data.num_observations();
        const auto ndim = data.num_dimensions();
        if (nobs == 0 || ndim == 0) {
            return 0; // nothing to partition without observations or dimensions.
        }

        // Start with every observation assigned to cluster 0.
        auto assignments = sanisizer::create<std::vector<std::vector<Index_> > >(ncenters);
        sanisizer::resize(assignments[0], nobs);
        std::iota(assignments.front().begin(), assignments.front().end(), static_cast<Index_>(0));
        sanisizer::cast<std::size_t>(nobs); // Checking that the maximum size of each 'assignments' can fit into a std::size_t, for optimal_partition().

        // Compute cluster 0's center and per-dimension sum of squares with Welford's method.
        auto dim_ss = sanisizer::create<std::vector<std::vector<Float_> > >(ncenters);
        {
            auto& cur_ss = dim_ss[0];
            sanisizer::resize(cur_ss, ndim);
            std::fill_n(centers, ndim, 0);
            auto matwork = data.new_known_extractor(static_cast<Index_>(0), nobs);
            for (I<decltype(nobs)> i = 0; i < nobs; ++i) {
                const auto dptr = matwork->get_observation();
                InitializeVariancePartition_internal::compute_welford(ndim, dptr, centers, cur_ss.data(), static_cast<Float_>(i + 1));
            }
        }

        // Priority queue of clusters keyed by their size-adjusted total sum of squares,
        // so the most variable cluster is always split next.
        std::priority_queue<std::pair<Float_, Cluster_> > highest;
        auto add_to_queue = [&](const Cluster_ i) -> void {
            const auto& cur_ss = dim_ss[i];
            Float_ sum_ss = std::accumulate(cur_ss.begin(), cur_ss.end(), static_cast<Float_>(0));

            // Instead of dividing by N and then remultiplying by pow(N, adjustment), we just
            // divide by pow(N, 1 - adjustment) to save some time and precision.
            sum_ss /= std::pow(assignments[i].size(), 1.0 - my_options.size_adjustment);

            highest.emplace(sum_ss, i);
        };
        add_to_queue(0);

        std::vector<Index_> cur_assignments_copy;
        std::vector<Float_> opt_partition_values, opt_partition_stats;

        for (Cluster_ cluster = 1; cluster < ncenters; ++cluster) {
            const auto chosen = highest.top();
            if (chosen.first == 0) {
                return cluster; // no point continuing, we're at zero variance already.
            }
            highest.pop();

            const auto cur_center = centers + sanisizer::product_unsafe<std::size_t>(chosen.second, ndim);
            auto& cur_ss = dim_ss[chosen.second];
            auto& cur_assignments = assignments[chosen.second];

            // Split along the dimension with the largest sum of squares.
            const I<decltype(ndim)> top_dim = std::max_element(cur_ss.begin(), cur_ss.end()) - cur_ss.begin();
            Float_ partition_boundary;
            if (my_options.optimize_partition) {
                partition_boundary = InitializeVariancePartition_internal::optimize_partition(data, cur_assignments, top_dim, opt_partition_values, opt_partition_stats);
            } else {
                partition_boundary = cur_center[top_dim]; // otherwise, split at the mean along the chosen dimension.
            }

            const auto next_center = centers + sanisizer::product_unsafe<std::size_t>(cluster, ndim);
            std::fill_n(next_center, ndim, 0);
            auto& next_ss = dim_ss[cluster];
            next_ss.resize(ndim); // resize() is safe from overflow as we already tested it outside the loop with dim_ss[0].
            auto& next_assignments = assignments[cluster];

            // Reset the chosen cluster's statistics; they are recomputed below from
            // the points that fall on the left side of the boundary.
            cur_assignments_copy.clear();
            std::fill_n(cur_center, ndim, 0);
            std::fill(cur_ss.begin(), cur_ss.end(), 0);
            auto work = data.new_known_extractor(cur_assignments.data(), cur_assignments.size());

            for (const auto i : cur_assignments) {
                const auto dptr = work->get_observation(); // make sure this is outside the if(), as it must always be called in each loop iteration to match 'cur_assignments' properly.
                if (dptr[top_dim] < partition_boundary) {
                    cur_assignments_copy.push_back(i);
                    InitializeVariancePartition_internal::compute_welford(ndim, dptr, cur_center, cur_ss.data(), static_cast<Float_>(cur_assignments_copy.size()));
                } else {
                    next_assignments.push_back(i);
                    InitializeVariancePartition_internal::compute_welford(ndim, dptr, next_center, next_ss.data(), static_cast<Float_>(next_assignments.size()));
                }
            }

            // If one or the other is empty, then this entire procedure short-circuits as all future iterations will just re-select this cluster (which won't get partitioned properly anyway).
            // In the bigger picture, the quick exit out of the iterations is correct as we should only fail to partition in this manner if all points within each remaining cluster are identical.
            if (next_assignments.empty() || cur_assignments_copy.empty()) {
                return cluster;
            }

            cur_assignments.swap(cur_assignments_copy);
            add_to_queue(chosen.second);
            add_to_queue(cluster);
        }

        return ncenters;
    }
291};
292
293}
294
295#endif
Interface for k-means initialization.
Interface for matrix inputs.
Implements the variance partitioning method of Su and Dy (2007).
Definition InitializeVariancePartition.hpp:166
InitializeVariancePartitionOptions & get_options()
Definition InitializeVariancePartition.hpp:186
InitializeVariancePartition(InitializeVariancePartitionOptions options)
Definition InitializeVariancePartition.hpp:171
Interface for k-means initialization algorithms.
Definition Initialize.hpp:29
virtual Cluster_ run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers) const =0
Perform k-means clustering.
Definition compute_wcss.hpp:16
Options for InitializeVariancePartition.
Definition InitializeVariancePartition.hpp:25
bool optimize_partition
Definition InitializeVariancePartition.hpp:36
double size_adjustment
Definition InitializeVariancePartition.hpp:30