1#ifndef KMEANS_HARTIGAN_WONG_HPP
2#define KMEANS_HARTIGAN_WONG_HPP
11#include "QuickSearch.hpp"
13#include "compute_centroids.hpp"
14#include "is_edge_case.hpp"
80template<
typename Index_>
94 static constexpr int init = -3;
150template<
typename Index_>
153 enum class Event : uint8_t { NONE, PAST_OPT, CURRENT_OPT, QUICK, INIT };
177 Event my_had_recent_transfer = Event::INIT;
178 Index_ my_last_optimal_transfer = 0;
181 bool is_live(Index_ obs)
const {
182 if (my_had_recent_transfer == Event::PAST_OPT) {
183 return my_last_optimal_transfer > obs;
185 return my_had_recent_transfer > Event::PAST_OPT;
189 void mark_current(Index_ obs) {
190 my_had_recent_transfer = Event::CURRENT_OPT;
191 my_last_optimal_transfer = obs;
194 void reset(
bool was_quick_transferred) {
195 if (was_quick_transferred) {
196 my_had_recent_transfer = Event::QUICK;
197 }
else if (my_had_recent_transfer == Event::CURRENT_OPT) {
198 my_had_recent_transfer = Event::PAST_OPT;
200 my_had_recent_transfer = Event::NONE;
205template<
typename Float_,
typename Index_,
typename Cluster_>
208 std::vector<Cluster_> second_best_cluster;
209 std::vector<Index_> cluster_sizes;
211 std::vector<Float_> loss_multiplier;
212 std::vector<Float_> gain_multiplier;
213 std::vector<Float_> wcss_loss;
215 std::vector<UpdateHistory<Index_> > update_history;
216 std::vector<uint8_t> was_quick_transferred;
217 std::vector<LiveStatus<Index_> > live_set;
219 Index_ optra_steps_since_last_transfer = 0;
222 Workspace(Index_ nobs, Cluster_ ncenters) :
224 second_best_cluster(nobs),
225 cluster_sizes(ncenters),
226 loss_multiplier(ncenters),
227 gain_multiplier(ncenters),
231 update_history(ncenters),
232 was_quick_transferred(ncenters),
237template<
typename Data_,
typename Float_,
typename Dim_>
247template<
class Matrix_,
typename Cluster_,
typename Float_>
255 auto nobs =
data.num_observations();
256 typedef typename Matrix_::index_type Index_;
268template<
typename Float_>
273template<
typename Dim_,
typename Data_,
typename Index_,
typename Cluster_,
typename Float_>
308template<
class Matrix_,
typename Cluster_,
typename Float_>
310 auto nobs =
data.num_observations();
316 ++
work.optra_steps_since_last_transfer;
319 if (
work.cluster_sizes[
l1] != 1) {
372 work.optra_steps_since_last_transfer = 0;
376 work.update_history[
l1].set_optimal(
obs);
377 work.update_history[
l2].set_optimal(
obs);
385 if (
work.optra_steps_since_last_transfer ==
nobs) {
404template<
class Matrix_,
typename Cluster_,
typename Float_>
413 std::fill(
work.was_quick_transferred.begin(),
work.was_quick_transferred.end(), 0);
415 auto nobs =
data.num_observations();
420 typedef decltype(
nobs) Index_;
430 if (
work.cluster_sizes[
l1] != 1) {
464 work.was_quick_transferred[
l1] =
true;
465 work.was_quick_transferred[
l2] =
true;
520template<
typename Matrix_ = SimpleMatrix<
double,
int>,
typename Cluster_ =
int,
typename Float_ =
double>
535 typedef typename Matrix_::index_type Index_;
548 auto nobs =
data.num_observations();
550 return internal::process_edge_case(
data,
ncenters, centers, clusters);
553 RefineHartiganWong_internal::Workspace<Float_, Index_, Cluster_>
work(
nobs,
ncenters);
555 RefineHartiganWong_internal::find_closest_two_centers(
data,
ncenters, centers, clusters,
work.second_best_cluster, my_options.
num_threads);
557 ++
work.cluster_sizes[clusters[
obs]];
559 internal::compute_centroids(
data,
ncenters, centers, clusters,
work.cluster_sizes);
564 work.loss_multiplier[
cen] = (
num > 1 ?
num / (
num - 1) : RefineHartiganWong_internal::big_number<Float_>());
575 auto quick_status = RefineHartiganWong_internal::quick_transfer(
589 internal::compute_centroids(
data,
ncenters, centers, clusters,
work.cluster_sizes);
605 work.optra_steps_since_last_transfer = 0;
608 for (
auto&
u :
work.update_history) {
613 work.live_set[
c].reset(
work.was_quick_transferred[
c]);
Report detailed clustering statistics.
Interface for k-means refinement.
Implements the variance partitioning method of Su and Dy (2007).
Definition InitializeVariancePartition.hpp:164
InitializeVariancePartition()=default
Implements the Hartigan-Wong algorithm for k-means clustering.
Definition RefineHartiganWong.hpp:521
RefineHartiganWong(RefineHartiganWongOptions options)
Definition RefineHartiganWong.hpp:526
RefineHartiganWongOptions & get_options()
Definition RefineHartiganWong.hpp:542
Details< Index_ > run(const Matrix_ &data, Cluster_ ncenters, Float_ *centers, Cluster_ *clusters) const
Definition RefineHartiganWong.hpp:547
RefineHartiganWong()=default
Interface for all k-means refinement algorithms.
Definition Refine.hpp:23
Namespace for k-means clustering.
Definition compute_wcss.hpp:12
void parallelize(int num_workers, Task_ num_tasks, Run_ run_task_range)
Definition parallelize.hpp:28
Utilities for parallelization.
Additional statistics from the k-means algorithm.
Definition Details.hpp:20
Options for RefineHartiganWong.
Definition RefineHartiganWong.hpp:27
bool quit_on_quick_transfer_convergence_failure
Definition RefineHartiganWong.hpp:44
int max_iterations
Definition RefineHartiganWong.hpp:32
int num_threads
Definition RefineHartiganWong.hpp:50
int max_quick_transfer_iterations
Definition RefineHartiganWong.hpp:38