1#ifndef KMEANS_INITIALIZE_KMEANSPP_HPP
2#define KMEANS_INITIALIZE_KMEANSPP_HPP
9#include "sanisizer/sanisizer.hpp"
10#include "aarand/aarand.hpp"
14#include "copy_into_array.hpp"
38 typename InitializeKmeansppRng::result_type
seed = sanisizer::cap<typename InitializeKmeansppRng::result_type>(6523);
50namespace InitializeKmeanspp_internal {
52template<
typename Float_,
typename Index_,
class Engine_>
53Index_ weighted_sample(
const std::vector<Float_>& cumulative,
const std::vector<Float_>& mindist, Index_ nobs, Engine_& eng) {
54 const auto total = cumulative.back();
58 const Float_ sampled_weight = total * aarand::standard_uniform<Float_>(eng);
61 chosen_id = std::lower_bound(cumulative.begin(), cumulative.end(), sampled_weight) - cumulative.begin();
70 }
while (chosen_id == nobs || mindist[chosen_id] == 0);
75template<
typename Index_,
typename Float_,
class Matrix_,
typename Cluster_>
76std::vector<Index_> run_kmeanspp(
const Matrix_& data, Cluster_ ncenters,
const typename InitializeKmeansppRng::result_type seed,
const int nthreads) {
77 const auto nobs = data.num_observations();
78 const auto ndim = data.num_dimensions();
79 auto mindist = sanisizer::create<std::vector<Float_> >(nobs, 1);
80 auto cumulative = sanisizer::create<std::vector<Float_> >(nobs);
82 std::vector<Index_> sofar;
83 sofar.reserve(ncenters);
84 InitializeKmeansppRng eng(seed);
86 auto last_work = data.new_known_extractor();
87 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
89 const auto last_ptr = last_work->get_observation(sofar.back());
91 parallelize(nthreads, nobs, [&](
const int,
const Index_ start,
const Index_ length) ->
void {
92 auto curwork = data.new_known_extractor(start, length);
93 for (Index_ obs = start, end = start + length; obs < end; ++obs) {
94 const auto current = curwork->get_observation();
96 if (mindist[obs] == 0) {
101 for (I<
decltype(ndim)> d = 0; d < ndim; ++d) {
102 const Float_ delta =
static_cast<Float_
>(current[d]) -
static_cast<Float_
>(last_ptr[d]);
106 if (cen == 1 || r2 < mindist[obs]) {
113 cumulative[0] = mindist[0];
114 for (Index_ i = 1; i < nobs; ++i) {
115 cumulative[i] = cumulative[i-1] + mindist[i];
118 const auto total = cumulative.back();
123 const auto chosen_id = weighted_sample(cumulative, mindist, nobs, eng);
124 mindist[chosen_id] = 0;
125 sofar.push_back(chosen_id);
156template<
typename Index_,
typename Data_,
typename Cluster_,
typename Float_,
class Matrix_ = Matrix<Index_, Data_> >
185 Cluster_
run(
const Matrix_& matrix,
const Cluster_ ncenters, Float_*
const centers)
const {
186 const auto nobs = matrix.num_observations();
191 const auto sofar = InitializeKmeanspp_internal::run_kmeanspp<Index_, Float_>(matrix, ncenters, my_options.
seed, my_options.
num_threads);
192 internal::copy_into_array(matrix, sofar, centers);
Interface for k-means initialization.
Interface for matrix inputs.
k-means++ initialization of Arthur and Vassilvitskii (2007).
Definition InitializeKmeanspp.hpp:157
InitializeKmeanspp()=default
InitializeKmeanspp(InitializeKmeansppOptions options)
Definition InitializeKmeanspp.hpp:165
InitializeKmeansppOptions & get_options()
Definition InitializeKmeanspp.hpp:177
Interface for k-means initialization algorithms.
Definition Initialize.hpp:29
virtual Cluster_ run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers) const =0
Perform k-means clustering.
Definition compute_wcss.hpp:16
std::mt19937_64 InitializeKmeansppRng
Definition InitializeKmeanspp.hpp:29
Utilities for parallelization.
Options for InitializeKmeanspp.
Definition InitializeKmeanspp.hpp:34
InitializeKmeansppRng::result_type seed
Definition InitializeKmeanspp.hpp:38
int num_threads
Definition InitializeKmeanspp.hpp:44