1#ifndef KMEANS_HARTIGAN_WONG_HPP
2#define KMEANS_HARTIGAN_WONG_HPP
11#include "QuickSearch.hpp"
13#include "compute_centroids.hpp"
14#include "is_edge_case.hpp"
56namespace RefineHartiganWong_internal {
130template<
typename Index_>
133 Index_ my_last_observation = 0;
135 static constexpr int current_optimal_transfer = -1;
136 static constexpr int previous_optimal_transfer = -2;
137 static constexpr int ancient_history = -3;
141 int my_last_iteration = ancient_history;
144 void reset(Index_ total_obs) {
145 if (my_last_iteration > current_optimal_transfer) {
146 my_last_observation = total_obs;
147 my_last_iteration = previous_optimal_transfer;
148 }
else if (my_last_iteration == current_optimal_transfer) {
150 my_last_iteration = previous_optimal_transfer;
152 my_last_iteration = ancient_history;
156 void set_optimal(Index_ obs) {
157 my_last_observation = obs;
158 my_last_iteration = current_optimal_transfer;
162 void set_quick(
int iter, Index_ obs) {
163 my_last_iteration = iter;
164 my_last_observation = obs;
168 bool changed_after(
int iter, Index_ obs)
const {
169 if (my_last_iteration == iter) {
170 return my_last_observation > obs;
172 return my_last_iteration > iter;
176 bool changed_after_or_at(
int iter, Index_ obs)
const {
177 if (my_last_iteration == iter) {
178 return my_last_observation >= obs;
180 return my_last_iteration > iter;
184 bool is_live(Index_ obs)
const {
185 return changed_after(previous_optimal_transfer, obs);
189template<
typename Float_,
typename Index_,
typename Cluster_>
192 std::vector<Cluster_> best_destination_cluster;
193 std::vector<Index_> cluster_sizes;
195 std::vector<Float_> loss_multiplier;
196 std::vector<Float_> gain_multiplier;
197 std::vector<Float_> wcss_loss;
199 std::vector<UpdateHistory<Index_> > update_history;
201 Index_ optra_steps_since_last_transfer = 0;
204 Workspace(Index_ nobs, Cluster_ ncenters) :
206 best_destination_cluster(nobs),
207 cluster_sizes(ncenters),
208 loss_multiplier(ncenters),
209 gain_multiplier(ncenters),
211 update_history(ncenters)
215template<
typename Data_,
typename Float_,
typename Dim_>
216Float_ squared_distance_from_cluster(
const Data_* data,
const Float_* center, Dim_ ndim) {
218 for (
decltype(ndim) dim = 0; dim < ndim; ++dim, ++data, ++center) {
219 Float_ delta =
static_cast<Float_
>(*data) - *center;
220 output += delta * delta;
225template<
class Matrix_,
typename Cluster_,
typename Float_>
226void find_closest_two_centers(
const Matrix_& data, Cluster_ ncenters,
const Float_* centers, Cluster_* best_cluster, std::vector<Cluster_>& best_destination_cluster,
int nthreads) {
227 auto ndim = data.num_dimensions();
231 internal::QuickSearch<Float_, Cluster_,
decltype(ndim)> index(ndim, ncenters, centers);
233 auto nobs = data.num_observations();
234 typedef typename Matrix_::index_type Index_;
235 parallelize(nthreads, nobs, [&](
int, Index_ start, Index_ length) ->
void {
236 auto matwork = data.create_workspace(start, length);
237 for (Index_ obs = start, end = start + length; obs < end; ++obs) {
238 auto optr = data.get_observation(matwork);
239 auto res2 = index.find2(optr);
240 best_cluster[obs] = res2.first;
241 best_destination_cluster[obs] = res2.second;
246template<
typename Float_>
247constexpr Float_ big_number() {
251template<
typename Dim_,
typename Data_,
typename Index_,
typename Cluster_,
typename Float_>
252void transfer_point(Dim_ ndim,
const Data_* obs_ptr, Index_ obs_id, Cluster_ l1, Cluster_ l2, Float_* centers, Cluster_* best_cluster, Workspace<Float_, Index_, Cluster_>& work) {
255 Float_ al1 = work.cluster_sizes[l1], alw = al1 - 1;
256 Float_ al2 = work.cluster_sizes[l2], alt = al2 + 1;
258 size_t long_ndim = ndim;
259 auto copy1 = centers +
static_cast<size_t>(l1) * long_ndim;
260 auto copy2 = centers +
static_cast<size_t>(l2) * long_ndim;
261 for (
decltype(ndim) dim = 0; dim < ndim; ++dim, ++copy1, ++copy2, ++obs_ptr) {
262 Float_ oval = *obs_ptr;
263 *copy1 = (*copy1 * al1 - oval) / alw;
264 *copy2 = (*copy2 * al2 + oval) / alt;
267 --work.cluster_sizes[l1];
268 ++work.cluster_sizes[l2];
270 work.gain_multiplier[l1] = alw / al1;
271 work.loss_multiplier[l1] = (alw > 1 ? alw / (alw - 1) : big_number<Float_>());
272 work.loss_multiplier[l2] = alt / al2;
273 work.gain_multiplier[l2] = alt / (alt + 1);
275 best_cluster[obs_id] = l2;
276 work.best_destination_cluster[obs_id] = l1;
286template<
class Matrix_,
typename Cluster_,
typename Float_>
287bool optimal_transfer(
const Matrix_& data, Workspace<Float_, typename Matrix_::index_type, Cluster_>& work, Cluster_ ncenters, Float_* centers, Cluster_* best_cluster,
bool all_live) {
288 auto nobs = data.num_observations();
289 auto ndim = data.num_dimensions();
290 auto matwork = data.create_workspace();
291 size_t long_ndim = ndim;
293 for (
decltype(nobs) obs = 0; obs < nobs; ++obs) {
294 ++work.optra_steps_since_last_transfer;
296 auto l1 = best_cluster[obs];
297 if (work.cluster_sizes[l1] != 1) {
298 auto obs_ptr = data.get_observation(obs, matwork);
310 auto& wcss_loss = work.wcss_loss[obs];
311 auto l1_ptr = centers + long_ndim *
static_cast<size_t>(l1);
312 wcss_loss = squared_distance_from_cluster(obs_ptr, l1_ptr, ndim) * work.loss_multiplier[l1];
315 auto l2 = work.best_destination_cluster[obs];
316 auto original_l2 = l2;
317 auto l2_ptr = centers + long_ndim *
static_cast<size_t>(l2);
318 auto wcss_gain = squared_distance_from_cluster(obs_ptr, l2_ptr, ndim) * work.gain_multiplier[l2];
320 auto update_destination_cluster = [&](Cluster_ cen) ->
void {
321 auto cen_ptr = centers + long_ndim *
static_cast<size_t>(cen);
322 auto candidate = squared_distance_from_cluster(obs_ptr, cen_ptr, ndim) * work.gain_multiplier[cen];
323 if (candidate < wcss_gain) {
324 wcss_gain = candidate;
342 if (all_live || work.update_history[l1].is_live(obs)) {
343 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
344 if (cen != l1 && cen != original_l2) {
345 update_destination_cluster(cen);
349 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
350 if (cen != l1 && cen != original_l2 && work.update_history[cen].is_live(obs)) {
351 update_destination_cluster(cen);
357 if (wcss_gain >= wcss_loss) {
358 work.best_destination_cluster[obs] = l2;
360 work.optra_steps_since_last_transfer = 0;
361 work.update_history[l1].set_optimal(obs);
362 work.update_history[l2].set_optimal(obs);
363 transfer_point(ndim, obs_ptr, obs, l1, l2, centers, best_cluster, work);
369 if (work.optra_steps_since_last_transfer == nobs) {
388template<
class Matrix_,
typename Cluster_,
typename Float_>
389std::pair<bool, bool> quick_transfer(
391 Workspace<Float_, typename Matrix_::index_type, Cluster_>& work,
393 Cluster_* best_cluster,
394 int quick_iterations)
396 bool had_transfer =
false;
398 auto nobs = data.num_observations();
399 auto matwork = data.create_workspace();
400 auto ndim = data.num_dimensions();
401 size_t long_ndim = data.num_dimensions();
403 typedef decltype(nobs) Index_;
404 Index_ steps_since_last_quick_transfer = 0;
406 for (
int it = 0; it < quick_iterations; ++it) {
407 int prev_it = it - 1;
409 for (
decltype(nobs) obs = 0; obs < nobs; ++obs) {
410 ++steps_since_last_quick_transfer;
411 auto l1 = best_cluster[obs];
413 if (work.cluster_sizes[l1] != 1) {
414 const typename Matrix_::data_type* obs_ptr = NULL;
423 auto& history1 = work.update_history[l1];
424 if (history1.changed_after_or_at(prev_it, obs)) {
425 auto l1_ptr = centers +
static_cast<size_t>(l1) * long_ndim;
426 obs_ptr = data.get_observation(obs, matwork);
427 work.wcss_loss[obs] = squared_distance_from_cluster(obs_ptr, l1_ptr, ndim) * work.loss_multiplier[l1];
434 auto l2 = work.best_destination_cluster[obs];
435 auto& history2 = work.update_history[l2];
436 if (history1.changed_after(prev_it, obs) || history2.changed_after(prev_it, obs)) {
437 if (obs_ptr == NULL) {
438 obs_ptr = data.get_observation(obs, matwork);
440 auto l2_ptr = centers +
static_cast<size_t>(l2) * long_ndim;
441 auto wcss_gain = squared_distance_from_cluster(obs_ptr, l2_ptr, ndim) * work.gain_multiplier[l2];
443 if (wcss_gain < work.wcss_loss[obs]) {
445 steps_since_last_quick_transfer = 0;
446 history1.set_quick(it, obs);
447 history2.set_quick(it, obs);
448 transfer_point(ndim, obs_ptr, obs, l1, l2, centers, best_cluster, work);
453 if (steps_since_last_quick_transfer == nobs) {
456 return std::make_pair(had_transfer,
false);
461 return std::make_pair(had_transfer,
true);
498template<
typename Matrix_ = SimpleMatrix<
double,
int>,
typename Cluster_ =
int,
typename Float_ =
double>
513 typedef typename Matrix_::index_type Index_;
525 Details<Index_> run(
const Matrix_& data, Cluster_ ncenters, Float_* centers, Cluster_* clusters)
const {
526 auto nobs = data.num_observations();
527 if (internal::is_edge_case(nobs, ncenters)) {
528 return internal::process_edge_case(data, ncenters, centers, clusters);
531 RefineHartiganWong_internal::Workspace<Float_, Index_, Cluster_> work(nobs, ncenters);
533 RefineHartiganWong_internal::find_closest_two_centers(data, ncenters, centers, clusters, work.best_destination_cluster, my_options.
num_threads);
534 for (Index_ obs = 0; obs < nobs; ++obs) {
535 ++work.cluster_sizes[clusters[obs]];
537 internal::compute_centroids(data, ncenters, centers, clusters, work.cluster_sizes);
539 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
540 Float_ num = work.cluster_sizes[cen];
541 work.gain_multiplier[cen] = num / (num + 1);
542 work.loss_multiplier[cen] = (num > 1 ? num / (num - 1) : RefineHartiganWong_internal::big_number<Float_>());
548 bool finished = RefineHartiganWong_internal::optimal_transfer(data, work, ncenters, centers, clusters, (iter == 1));
553 auto quick_status = RefineHartiganWong_internal::quick_transfer(
567 internal::compute_centroids(data, ncenters, centers, clusters, work.cluster_sizes);
569 if (quick_status.second) {
582 if (quick_status.first) {
583 work.optra_steps_since_last_transfer = 0;
586 for (Cluster_ c = 0; c < ncenters; ++c) {
587 work.update_history[c].reset(nobs);
595 return Details(std::move(work.cluster_sizes), iter, ifault);
Report detailed clustering statistics.
Interface for k-means refinement.
Implements the Hartigan-Wong algorithm for k-means clustering.
Definition RefineHartiganWong.hpp:499
RefineHartiganWong(RefineHartiganWongOptions options)
Definition RefineHartiganWong.hpp:504
RefineHartiganWongOptions & get_options()
Definition RefineHartiganWong.hpp:520
Details< Index_ > run(const Matrix_ &data, Cluster_ ncenters, Float_ *centers, Cluster_ *clusters) const
Definition RefineHartiganWong.hpp:525
RefineHartiganWong()=default
Interface for all k-means refinement algorithms.
Definition Refine.hpp:23
Namespace for k-means clustering.
Definition compute_wcss.hpp:12
void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range)
Definition parallelize.hpp:28
Utilities for parallelization.
Additional statistics from the k-means algorithm.
Definition Details.hpp:20
Options for RefineHartiganWong.
Definition RefineHartiganWong.hpp:27
bool quit_on_quick_transfer_convergence_failure
Definition RefineHartiganWong.hpp:44
int max_iterations
Definition RefineHartiganWong.hpp:32
int num_threads
Definition RefineHartiganWong.hpp:50
int max_quick_transfer_iterations
Definition RefineHartiganWong.hpp:38