1#ifndef KMEANS_HARTIGAN_WONG_HPP
2#define KMEANS_HARTIGAN_WONG_HPP
11#include "QuickSearch.hpp"
13#include "compute_centroids.hpp"
14#include "is_edge_case.hpp"
56namespace RefineHartiganWong_internal {
130template<
typename Index_>
133 Index_ my_last_observation = 0;
135 static constexpr int current_optimal_transfer = -1;
136 static constexpr int previous_optimal_transfer = -2;
137 static constexpr int ancient_history = -3;
141 int my_last_iteration = ancient_history;
144 void reset(Index_ total_obs) {
145 if (my_last_iteration > current_optimal_transfer) {
146 my_last_observation = total_obs;
147 my_last_iteration = previous_optimal_transfer;
148 }
else if (my_last_iteration == current_optimal_transfer) {
150 my_last_iteration = previous_optimal_transfer;
152 my_last_iteration = ancient_history;
156 void set_optimal(Index_ obs) {
157 my_last_observation = obs;
158 my_last_iteration = current_optimal_transfer;
162 void set_quick(
int iter, Index_ obs) {
163 my_last_iteration = iter;
164 my_last_observation = obs;
168 bool changed_after(
int iter, Index_ obs)
const {
169 if (my_last_iteration == iter) {
170 return my_last_observation > obs;
172 return my_last_iteration > iter;
176 bool changed_after_or_at(
int iter, Index_ obs)
const {
177 if (my_last_iteration == iter) {
178 return my_last_observation >= obs;
180 return my_last_iteration > iter;
184 bool is_live(Index_ obs)
const {
185 return changed_after(previous_optimal_transfer, obs);
189template<
typename Float_,
typename Index_,
typename Cluster_>
192 std::vector<Cluster_> best_destination_cluster;
193 std::vector<Index_> cluster_sizes;
195 std::vector<Float_> loss_multiplier;
196 std::vector<Float_> gain_multiplier;
197 std::vector<Float_> wcss_loss;
199 std::vector<UpdateHistory<Index_> > update_history;
201 Index_ optra_steps_since_last_transfer = 0;
204 Workspace(Index_ nobs, Cluster_ ncenters) :
206 best_destination_cluster(nobs),
207 cluster_sizes(ncenters),
208 loss_multiplier(ncenters),
209 gain_multiplier(ncenters),
211 update_history(ncenters)
215template<
typename Data_,
typename Float_>
216Float_ squared_distance_from_cluster(
const Data_* data,
const Float_* center,
size_t ndim) {
218 for (
size_t d = 0; d < ndim; ++d) {
219 Float_ delta =
static_cast<Float_
>(data[d]) - center[d];
220 output += delta * delta;
225template<
class Matrix_,
typename Cluster_,
typename Float_>
226void find_closest_two_centers(
const Matrix_& data, Cluster_ ncenters,
const Float_* centers, Cluster_* best_cluster, std::vector<Cluster_>& best_destination_cluster,
int nthreads) {
227 auto ndim = data.num_dimensions();
231 internal::QuickSearch<Float_, Cluster_> index(ndim, ncenters, centers);
233 auto nobs = data.num_observations();
234 parallelize(nthreads, nobs, [&](
int, Index<Matrix_> start, Index<Matrix_> length) ->
void {
235 auto matwork = data.new_extractor(start, length);
236 for (Index<Matrix_> obs = start, end = start + length; obs < end; ++obs) {
237 auto optr = matwork->get_observation();
238 auto res2 = index.find2(optr);
239 best_cluster[obs] = res2.first;
240 best_destination_cluster[obs] = res2.second;
245template<
typename Float_>
246constexpr Float_ big_number() {
250template<
typename Data_,
typename Index_,
typename Cluster_,
typename Float_>
251void transfer_point(
size_t ndim,
const Data_* obs_ptr, Index_ obs_id, Cluster_ l1, Cluster_ l2, Float_* centers, Cluster_* best_cluster, Workspace<Float_, Index_, Cluster_>& work) {
254 Float_ al1 = work.cluster_sizes[l1], alw = al1 - 1;
255 Float_ al2 = work.cluster_sizes[l2], alt = al2 + 1;
257 auto copy1 = centers +
static_cast<size_t>(l1) * ndim;
258 auto copy2 = centers +
static_cast<size_t>(l2) * ndim;
259 for (
decltype(ndim) dim = 0; dim < ndim; ++dim, ++copy1, ++copy2, ++obs_ptr) {
260 Float_ oval = *obs_ptr;
261 *copy1 = (*copy1 * al1 - oval) / alw;
262 *copy2 = (*copy2 * al2 + oval) / alt;
265 --work.cluster_sizes[l1];
266 ++work.cluster_sizes[l2];
268 work.gain_multiplier[l1] = alw / al1;
269 work.loss_multiplier[l1] = (alw > 1 ? alw / (alw - 1) : big_number<Float_>());
270 work.loss_multiplier[l2] = alt / al2;
271 work.gain_multiplier[l2] = alt / (alt + 1);
273 best_cluster[obs_id] = l2;
274 work.best_destination_cluster[obs_id] = l1;
284template<
class Matrix_,
typename Cluster_,
typename Float_>
285bool optimal_transfer(
const Matrix_& data, Workspace<Float_, Index<Matrix_>, Cluster_>& work, Cluster_ ncenters, Float_* centers, Cluster_* best_cluster,
bool all_live) {
286 auto nobs = data.num_observations();
287 auto ndim = data.num_dimensions();
288 auto matwork = data.new_extractor();
290 for (
decltype(nobs) obs = 0; obs < nobs; ++obs) {
291 ++work.optra_steps_since_last_transfer;
293 auto l1 = best_cluster[obs];
294 if (work.cluster_sizes[l1] != 1) {
295 auto obs_ptr = matwork->get_observation(obs);
307 auto& wcss_loss = work.wcss_loss[obs];
308 auto l1_ptr = centers + ndim *
static_cast<size_t>(l1);
309 wcss_loss = squared_distance_from_cluster(obs_ptr, l1_ptr, ndim) * work.loss_multiplier[l1];
312 auto l2 = work.best_destination_cluster[obs];
313 auto original_l2 = l2;
314 auto l2_ptr = centers + ndim *
static_cast<size_t>(l2);
315 auto wcss_gain = squared_distance_from_cluster(obs_ptr, l2_ptr, ndim) * work.gain_multiplier[l2];
317 auto update_destination_cluster = [&](Cluster_ cen) ->
void {
318 auto cen_ptr = centers + ndim *
static_cast<size_t>(cen);
319 auto candidate = squared_distance_from_cluster(obs_ptr, cen_ptr, ndim) * work.gain_multiplier[cen];
320 if (candidate < wcss_gain) {
321 wcss_gain = candidate;
339 if (all_live || work.update_history[l1].is_live(obs)) {
340 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
341 if (cen != l1 && cen != original_l2) {
342 update_destination_cluster(cen);
346 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
347 if (cen != l1 && cen != original_l2 && work.update_history[cen].is_live(obs)) {
348 update_destination_cluster(cen);
354 if (wcss_gain >= wcss_loss) {
355 work.best_destination_cluster[obs] = l2;
357 work.optra_steps_since_last_transfer = 0;
358 work.update_history[l1].set_optimal(obs);
359 work.update_history[l2].set_optimal(obs);
360 transfer_point(ndim, obs_ptr, obs, l1, l2, centers, best_cluster, work);
366 if (work.optra_steps_since_last_transfer == nobs) {
385template<
class Matrix_,
typename Cluster_,
typename Float_>
386std::pair<bool, bool> quick_transfer(
388 Workspace<Float_, Index<Matrix_>, Cluster_>& work,
390 Cluster_* best_cluster,
391 int quick_iterations)
393 bool had_transfer =
false;
395 auto nobs = data.num_observations();
396 auto matwork = data.new_extractor();
397 size_t ndim = data.num_dimensions();
399 typedef decltype(nobs) Index_;
400 Index_ steps_since_last_quick_transfer = 0;
402 for (
int it = 0; it < quick_iterations; ++it) {
403 int prev_it = it - 1;
405 for (
decltype(nobs) obs = 0; obs < nobs; ++obs) {
406 ++steps_since_last_quick_transfer;
407 auto l1 = best_cluster[obs];
409 if (work.cluster_sizes[l1] != 1) {
410 decltype(matwork->get_observation(obs)) obs_ptr = NULL;
419 auto& history1 = work.update_history[l1];
420 if (history1.changed_after_or_at(prev_it, obs)) {
421 auto l1_ptr = centers +
static_cast<size_t>(l1) * ndim;
422 obs_ptr = matwork->get_observation(obs);
423 work.wcss_loss[obs] = squared_distance_from_cluster(obs_ptr, l1_ptr, ndim) * work.loss_multiplier[l1];
430 auto l2 = work.best_destination_cluster[obs];
431 auto& history2 = work.update_history[l2];
432 if (history1.changed_after(prev_it, obs) || history2.changed_after(prev_it, obs)) {
433 if (obs_ptr == NULL) {
434 obs_ptr = matwork->get_observation(obs);
436 auto l2_ptr = centers +
static_cast<size_t>(l2) * ndim;
437 auto wcss_gain = squared_distance_from_cluster(obs_ptr, l2_ptr, ndim) * work.gain_multiplier[l2];
439 if (wcss_gain < work.wcss_loss[obs]) {
441 steps_since_last_quick_transfer = 0;
442 history1.set_quick(it, obs);
443 history2.set_quick(it, obs);
444 transfer_point(ndim, obs_ptr, obs, l1, l2, centers, best_cluster, work);
449 if (steps_since_last_quick_transfer == nobs) {
452 return std::make_pair(had_transfer,
false);
457 return std::make_pair(had_transfer,
true);
497template<
typename Index_,
typename Data_,
typename Cluster_,
typename Float_,
class Matrix_ = Matrix<Index_, Data_> >
526 Details<Index_> run(
const Matrix_& data, Cluster_ ncenters, Float_* centers, Cluster_* clusters)
const {
527 auto nobs = data.num_observations();
528 if (internal::is_edge_case(nobs, ncenters)) {
529 return internal::process_edge_case(data, ncenters, centers, clusters);
532 RefineHartiganWong_internal::Workspace<Float_, Index_, Cluster_> work(nobs, ncenters);
534 RefineHartiganWong_internal::find_closest_two_centers(data, ncenters, centers, clusters, work.best_destination_cluster, my_options.
num_threads);
535 for (Index_ obs = 0; obs < nobs; ++obs) {
536 ++work.cluster_sizes[clusters[obs]];
538 internal::compute_centroids(data, ncenters, centers, clusters, work.cluster_sizes);
540 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
541 Float_ num = work.cluster_sizes[cen];
542 work.gain_multiplier[cen] = num / (num + 1);
543 work.loss_multiplier[cen] = (num > 1 ? num / (num - 1) : RefineHartiganWong_internal::big_number<Float_>());
549 bool finished = RefineHartiganWong_internal::optimal_transfer(data, work, ncenters, centers, clusters, (iter == 1));
554 auto quick_status = RefineHartiganWong_internal::quick_transfer(
568 internal::compute_centroids(data, ncenters, centers, clusters, work.cluster_sizes);
570 if (quick_status.second) {
583 if (quick_status.first) {
584 work.optra_steps_since_last_transfer = 0;
587 for (Cluster_ c = 0; c < ncenters; ++c) {
588 work.update_history[c].reset(nobs);
596 return Details(std::move(work.cluster_sizes), iter, ifault);
Report detailed clustering statistics.
Interface for k-means refinement.
Implements the Hartigan-Wong algorithm for k-means clustering.
Definition RefineHartiganWong.hpp:498
RefineHartiganWongOptions & get_options()
Definition RefineHartiganWong.hpp:518
RefineHartiganWong(RefineHartiganWongOptions options)
Definition RefineHartiganWong.hpp:503
RefineHartiganWong()=default
Interface for k-means refinement algorithms.
Definition Refine.hpp:26
virtual Details< Index_ > run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers, Cluster_ *clusters) const =0
Namespace for k-means clustering.
Definition compute_wcss.hpp:12
void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range)
Definition parallelize.hpp:28
Utilities for parallelization.
Additional statistics from the k-means algorithm.
Definition Details.hpp:20
Options for RefineHartiganWong.
Definition RefineHartiganWong.hpp:27
bool quit_on_quick_transfer_convergence_failure
Definition RefineHartiganWong.hpp:44
int max_iterations
Definition RefineHartiganWong.hpp:32
int num_threads
Definition RefineHartiganWong.hpp:50
int max_quick_transfer_iterations
Definition RefineHartiganWong.hpp:38