194 Cluster_
run(
const Matrix_& data,
const Cluster_ ncenters, Float_*
const centers)
const {
195 const auto nobs = data.num_observations();
196 const auto ndim = data.num_dimensions();
197 if (nobs == 0 || ndim == 0) {
201 auto assignments = sanisizer::create<std::vector<std::vector<Index_> > >(ncenters);
202 sanisizer::resize(assignments[0], nobs);
203 std::iota(assignments.front().begin(), assignments.front().end(),
static_cast<Index_
>(0));
204 sanisizer::cast<std::size_t>(nobs);
206 auto dim_ss = sanisizer::create<std::vector<std::vector<Float_> > >(ncenters);
208 auto& cur_ss = dim_ss[0];
209 sanisizer::resize(cur_ss, ndim);
210 std::fill_n(centers, ndim, 0);
211 auto matwork = data.new_known_extractor(
static_cast<Index_
>(0), nobs);
212 for (I<
decltype(nobs)> i = 0; i < nobs; ++i) {
213 const auto dptr = matwork->get_observation();
214 InitializeVariancePartition_internal::compute_welford(ndim, dptr, centers, cur_ss.data(),
static_cast<Float_
>(i + 1));
218 std::priority_queue<std::pair<Float_, Cluster_> > highest;
219 auto add_to_queue = [&](
const Cluster_ i) ->
void {
220 const auto& cur_ss = dim_ss[i];
221 Float_ sum_ss = std::accumulate(cur_ss.begin(), cur_ss.end(),
static_cast<Float_
>(0));
225 sum_ss /= std::pow(assignments[i].size(), 1.0 - my_options.
size_adjustment);
227 highest.emplace(sum_ss, i);
231 std::vector<Index_> cur_assignments_copy;
232 std::vector<Float_> opt_partition_values, opt_partition_stats;
234 for (Cluster_ cluster = 1; cluster < ncenters; ++cluster) {
235 const auto chosen = highest.top();
236 if (chosen.first == 0) {
241 const auto cur_center = centers + sanisizer::product_unsafe<std::size_t>(chosen.second, ndim);
242 auto& cur_ss = dim_ss[chosen.second];
243 auto& cur_assignments = assignments[chosen.second];
245 const I<
decltype(ndim)> top_dim = std::max_element(cur_ss.begin(), cur_ss.end()) - cur_ss.begin();
246 Float_ partition_boundary;
248 partition_boundary = InitializeVariancePartition_internal::optimize_partition(data, cur_assignments, top_dim, opt_partition_values, opt_partition_stats);
250 partition_boundary = cur_center[top_dim];
253 const auto next_center = centers + sanisizer::product_unsafe<std::size_t>(cluster, ndim);
254 std::fill_n(next_center, ndim, 0);
255 auto& next_ss = dim_ss[cluster];
256 next_ss.resize(ndim);
257 auto& next_assignments = assignments[cluster];
259 cur_assignments_copy.clear();
260 std::fill_n(cur_center, ndim, 0);
261 std::fill(cur_ss.begin(), cur_ss.end(), 0);
262 auto work = data.new_known_extractor(cur_assignments.data(), cur_assignments.size());
264 for (
const auto i : cur_assignments) {
265 const auto dptr = work->get_observation();
266 if (dptr[top_dim] < partition_boundary) {
267 cur_assignments_copy.push_back(i);
268 InitializeVariancePartition_internal::compute_welford(ndim, dptr, cur_center, cur_ss.data(),
static_cast<Float_
>(cur_assignments_copy.size()));
270 next_assignments.push_back(i);
271 InitializeVariancePartition_internal::compute_welford(ndim, dptr, next_center, next_ss.data(),
static_cast<Float_
>(next_assignments.size()));
277 if (next_assignments.empty() || cur_assignments_copy.empty()) {
281 cur_assignments.swap(cur_assignments_copy);
282 add_to_queue(chosen.second);
283 add_to_queue(cluster);