kmeans/RefineMiniBatch_8hpp_source.html

#ifndef KMEANS_REFINE_MINIBATCH_HPP

#define KMEANS_REFINE_MINIBATCH_HPP


#include <vector>

#include <algorithm>

#include <cstddef>

#include <random>


#include "sanisizer/sanisizer.hpp"

#include "aarand/aarand.hpp"


#include "Refine.hpp"

#include "Details.hpp"

#include "QuickSearch.hpp"

#include "is_edge_case.hpp"

#include "parallelize.hpp"


namespace kmeans {


/**
 * Pseudo-random number generator type used by `RefineMiniBatch` to draw mini-batches.
 */
using RefineMiniBatchRng = std::mt19937_64;


struct RefineMiniBatchOptions {

    int max_iterations = 100;


    int batch_size = 500;


    double max_change_proportion = 0.01;


    int convergence_history = 10;


    typename RefineMiniBatchRng::result_type seed = sanisizer::cap<typename RefineMiniBatchRng::result_type>(1234567890);


    int num_threads = 1;

};


/**
 * @brief Implements the mini-batch algorithm for k-means clustering.
 *
 * Each iteration samples a batch of observations, assigns each sampled observation to its
 * closest center, and nudges that center towards the observation using a per-cluster running mean.
 * Convergence is assessed periodically from the proportion of sampled observations that changed cluster.
 * After the loop, every observation is reassigned once against the final centers and the
 * centroids are recomputed from those assignments.
 *
 * @tparam Index_ Integer type of the observation indices.
 * @tparam Data_ Numeric type of the input data.
 * @tparam Cluster_ Integer type of the cluster assignments.
 * @tparam Float_ Floating-point type of the centers.
 * @tparam Matrix_ Class of the input data matrix.
 */
template<typename Index_, typename Data_, typename Cluster_, typename Float_, typename Matrix_ = Matrix<Index_, Data_> >
class RefineMiniBatch : public Refine<Index_, Data_, Cluster_, Float_, Matrix_> {
public:
    /**
     * @param options Options for the mini-batch algorithm.
     */
    RefineMiniBatch(RefineMiniBatchOptions options) : my_options(std::move(options)) {}

    /**
     * Default constructor, using the default-initialized `RefineMiniBatchOptions`.
     */
    RefineMiniBatch() = default;

public:
    /**
     * @return Mutable reference to the options, so that they can be adjusted after construction.
     */
    RefineMiniBatchOptions& get_options() {
        return my_options;
    }

private:
    RefineMiniBatchOptions my_options;

public:
    /**
     * Run the mini-batch refinement.
     *
     * @param data Matrix of observations.
     * @param ncenters Number of cluster centers.
     * @param[in,out] centers Array of center coordinates, indexed as (dimension, center) with the dimension varying fastest.
     * On input, the initial centers; on output, the centroids of the final clusters.
     * @param[out] clusters Array with one entry per observation.
     * On output, the cluster assigned to each observation.
     * Any contents on input are ignored.
     *
     * @return Cluster sizes, the iteration count and a status code.
     * The status is 2 if `max_iterations` was reached without passing a convergence check, and 0 otherwise.
     * On convergence, the reported iteration count is 1-based.
     */
    Details<Index_> run(const Matrix_& data, const Cluster_ ncenters, Float_* const centers, Cluster_* const clusters) const {
        const auto nobs = data.num_observations();
        if (internal::is_edge_case(nobs, ncenters)) {
            return internal::process_edge_case(data, ncenters, centers, clusters);
        }

        auto total_sampled = sanisizer::create<std::vector<unsigned long long> >(ncenters); // holds the number of sampled observations across iterations, so we need a large integer.
        auto last_changed = sanisizer::create<std::vector<unsigned long long> >(ncenters); // holds the number of sampled/changed observation for the last few iterations.
        auto last_sampled = sanisizer::create<std::vector<unsigned long long> >(ncenters);
        auto previous = sanisizer::create<std::vector<Cluster_> >(nobs);

        // From the second iteration onwards, 'clusters[o]' is read for observations that might never have been sampled
        // before, and the value is then used to index 'last_sampled' and 'last_changed'. We zero the buffer up-front so
        // that stale or uninitialized contents supplied by the caller cannot cause out-of-bounds accesses, and so that
        // the convergence bookkeeping does not depend on whatever the buffer happened to contain.
        std::fill_n(clusters, nobs, static_cast<Cluster_>(0));

        const I<decltype(nobs)> actual_batch_size = sanisizer::min(nobs, my_options.batch_size);
        sanisizer::cast<std::size_t>(actual_batch_size); // check that static_cast for new_known_extractor() calls will be safe.
        auto chosen = sanisizer::create<std::vector<Index_> >(actual_batch_size);
        RefineMiniBatchRng eng(my_options.seed);

        const auto ndim = data.num_dimensions();
        internal::QuickSearch<Float_, Cluster_> index(ndim, ncenters);

        // A non-positive history would make the modulo below undefined (division by zero) or meaningless,
        // so we fall back to checking for convergence at every iteration.
        const int history = std::max(1, my_options.convergence_history);

        I<decltype(my_options.max_iterations)> iter = 0;
        for (; iter < my_options.max_iterations; ++iter) {
            aarand::sample(nobs, actual_batch_size, chosen.data(), eng);

            // Remember the assignments prior to this iteration so that we can count the changes afterwards.
            if (iter > 0) {
                for (const auto o : chosen) {
                    previous[o] = clusters[o];
                }
            }

            // Assigning each sampled observation to its closest center.
            index.reset(centers);
            parallelize(my_options.num_threads, actual_batch_size, [&](const int, const Index_ start, const Index_ length) -> void {
                auto matwork = data.new_known_extractor(chosen.data() + start, static_cast<std::size_t>(length));
                auto qswork = index.new_workspace();
                for (Index_ s = start, end = start + length; s < end; ++s) {
                    const auto ptr = matwork->get_observation();
                    clusters[chosen[s]] = index.find(ptr, qswork);
                }
            });

            // Updating the means for each cluster.
            auto work = data.new_known_extractor(chosen.data(), static_cast<std::size_t>(chosen.size()));
            for (const auto o : chosen) {
                const auto c = clusters[o];
                auto& n = total_sampled[c];
                ++n;

                // Running-mean update: the step size shrinks as more observations are assigned to this cluster.
                const auto ocopy = work->get_observation();
                for (I<decltype(ndim)> d = 0; d < ndim; ++d) {
                    auto& curcenter = centers[sanisizer::nd_offset<std::size_t>(d, ndim, c)];
                    curcenter += (static_cast<Float_>(ocopy[d]) - curcenter) / n; // cast to ensure consistent precision regardless of Matrix_::data_type.
                }
            }

            // Checking for updates.
            if (iter != 0) {
                for (const auto o : chosen) {
                    const auto p = previous[o];
                    ++(last_sampled[p]);
                    const auto c = clusters[o];
                    if (p != c) {
                        // A reassignment is counted against both the old and the new cluster.
                        ++(last_sampled[c]);
                        ++(last_changed[p]);
                        ++(last_changed[c]);
                    }
                }

                if (iter % history == 0) {
                    // Note that a cluster without any recorded samples also fails this check (0 >= 0).
                    bool too_many_changes = false;
                    for (Cluster_ c = 0; c < ncenters; ++c) {
                        if (static_cast<double>(last_changed[c]) >= static_cast<double>(last_sampled[c]) * my_options.max_change_proportion) {
                            too_many_changes = true;
                            break;
                        }
                    }

                    if (!too_many_changes) {
                        break;
                    }

                    // Starting a fresh window of counts for the next check.
                    std::fill(last_sampled.begin(), last_sampled.end(), 0);
                    std::fill(last_changed.begin(), last_changed.end(), 0);
                }
            }
        }

        // Run through all observations to make sure they have the latest cluster assignments.
        index.reset(centers);
        parallelize(my_options.num_threads, nobs, [&](const int, const Index_ start, const Index_ length) -> void {
            auto matwork = data.new_known_extractor(start, length);
            auto qswork = index.new_workspace();
            for (Index_ s = start, end = start + length; s < end; ++s) {
                const auto ptr = matwork->get_observation();
                clusters[s] = index.find(ptr, qswork);
            }
        });

        // Recomputing the centroids from the final assignments.
        auto cluster_sizes = sanisizer::create<std::vector<Index_> >(ncenters);
        for (Index_ o = 0; o < nobs; ++o) {
            ++cluster_sizes[clusters[o]];
        }
        internal::compute_centroids(data, ncenters, centers, clusters, cluster_sizes);

        int status = 0;
        if (iter == my_options.max_iterations) {
            status = 2; // ran out of iterations before a convergence check passed.
        } else {
            ++iter; // make it 1-based.
        }
        return Details<Index_>(std::move(cluster_sizes), iter, status);
    }
};


}


#endif

Details.hpp
Report detailed clustering statistics.

Refine.hpp
Interface for k-means refinement.

kmeans::RefineMiniBatch
Implements the mini-batch algorithm for k-means clustering.
Definition RefineMiniBatch.hpp:100

kmeans::RefineMiniBatch::RefineMiniBatch
RefineMiniBatch()=default

kmeans::RefineMiniBatch::get_options
RefineMiniBatchOptions & get_options()
Definition RefineMiniBatch.hpp:117

kmeans::RefineMiniBatch::RefineMiniBatch
RefineMiniBatch(RefineMiniBatchOptions options)
Definition RefineMiniBatch.hpp:105

kmeans::Refine
Interface for k-means refinement algorithms.
Definition Refine.hpp:30

kmeans::Refine::run
virtual Details< Index_ > run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers, Cluster_ *clusters) const =0

kmeans
Perform k-means clustering.
Definition compute_wcss.hpp:16

kmeans::RefineMiniBatchRng
std::mt19937_64 RefineMiniBatchRng
Definition RefineMiniBatch.hpp:29

kmeans::parallelize
void parallelize(const int num_workers, const Task_ num_tasks, Run_ run_task_range)
Definition parallelize.hpp:28

parallelize.hpp
Utilities for parallelization.

kmeans::Details
Additional statistics from the k-means algorithm.
Definition Details.hpp:20

kmeans::RefineMiniBatchOptions
Options for RefineMiniBatch.
Definition RefineMiniBatch.hpp:34

kmeans::RefineMiniBatchOptions::seed
RefineMiniBatchRng::result_type seed
Definition RefineMiniBatch.hpp:62

kmeans::RefineMiniBatchOptions::max_iterations
int max_iterations
Definition RefineMiniBatch.hpp:39

kmeans::RefineMiniBatchOptions::max_change_proportion
double max_change_proportion
Definition RefineMiniBatch.hpp:51

kmeans::RefineMiniBatchOptions::convergence_history
int convergence_history
Definition RefineMiniBatch.hpp:57

kmeans::RefineMiniBatchOptions::num_threads
int num_threads
Definition RefineMiniBatch.hpp:68

kmeans::RefineMiniBatchOptions::batch_size
int batch_size
Definition RefineMiniBatch.hpp:45