1#ifndef KMEANS_INITIALIZE_KMEANSPP_HPP
2#define KMEANS_INITIALIZE_KMEANSPP_HPP
9#include "sanisizer/sanisizer.hpp"
10#include "aarand/aarand.hpp"
14#include "copy_into_array.hpp"
38 typename InitializeKmeansppRng::result_type
seed = sanisizer::cap<typename InitializeKmeansppRng::result_type>(6523);
50namespace InitializeKmeanspp_internal {
52template<
typename Float_,
typename Index_,
class Engine_>
53Index_ weighted_sample(
const std::vector<Float_>& cumulative,
const std::vector<Float_>& mindist, Index_ nobs, Engine_& eng) {
54 const auto total = cumulative.back();
58 const Float_ sampled_weight = total * aarand::standard_uniform<Float_>(eng);
61 chosen_id = std::lower_bound(cumulative.begin(), cumulative.end(), sampled_weight) - cumulative.begin();
70 }
while (chosen_id == nobs || mindist[chosen_id] == 0);
75template<
typename Index_,
typename Float_,
class Matrix_,
typename Cluster_>
76std::vector<Index_> run_kmeanspp(
const Matrix_& data, Cluster_ ncenters,
const typename InitializeKmeansppRng::result_type seed,
const int nthreads) {
77 const auto nobs = data.num_observations();
78 const auto ndim = data.num_dimensions();
79 auto mindist = sanisizer::create<std::vector<Float_> >(nobs, 1);
81 auto cumulative = sanisizer::create<std::vector<Float_> >(nobs);
82 sanisizer::can_ptrdiff<
decltype(I(cumulative.begin()))>(nobs);
84 std::vector<Index_> sofar;
85 sofar.reserve(ncenters);
86 InitializeKmeansppRng eng(seed);
88 auto last_work = data.new_extractor();
89 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
91 const auto last_ptr = last_work->get_observation(sofar.back());
93 parallelize(nthreads, nobs, [&](
const int,
const Index_ start,
const Index_ length) ->
void {
94 auto curwork = data.new_extractor(start, length);
95 for (Index_ obs = start, end = start + length; obs < end; ++obs) {
96 const auto current = curwork->get_observation();
98 if (mindist[obs] == 0) {
103 for (
decltype(I(ndim)) d = 0; d < ndim; ++d) {
104 const Float_ delta =
static_cast<Float_
>(current[d]) -
static_cast<Float_
>(last_ptr[d]);
108 if (cen == 1 || r2 < mindist[obs]) {
115 cumulative[0] = mindist[0];
116 for (Index_ i = 1; i < nobs; ++i) {
117 cumulative[i] = cumulative[i-1] + mindist[i];
120 const auto total = cumulative.back();
125 const auto chosen_id = weighted_sample(cumulative, mindist, nobs, eng);
126 mindist[chosen_id] = 0;
127 sofar.push_back(chosen_id);
158template<
typename Index_,
typename Data_,
typename Cluster_,
typename Float_,
class Matrix_ = Matrix<Index_, Data_> >
187 Cluster_
run(
const Matrix_& matrix,
const Cluster_ ncenters, Float_*
const centers)
const {
188 const auto nobs = matrix.num_observations();
193 const auto sofar = InitializeKmeanspp_internal::run_kmeanspp<Index_, Float_>(matrix, ncenters, my_options.
seed, my_options.
num_threads);
194 internal::copy_into_array(matrix, sofar, centers);
Interface for k-means initialization.
Interface for matrix inputs.
k-means++ initialization of Arthur and Vassilvitskii (2007).
Definition InitializeKmeanspp.hpp:159
InitializeKmeanspp()=default
InitializeKmeanspp(InitializeKmeansppOptions options)
Definition InitializeKmeanspp.hpp:167
InitializeKmeansppOptions & get_options()
Definition InitializeKmeanspp.hpp:179
Interface for k-means initialization algorithms.
Definition Initialize.hpp:29
virtual Cluster_ run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers) const =0
Perform k-means clustering.
Definition compute_wcss.hpp:16
std::mt19937_64 InitializeKmeansppRng
Definition InitializeKmeanspp.hpp:29
Utilities for parallelization.
Options for InitializeKmeanspp.
Definition InitializeKmeanspp.hpp:34
InitializeKmeansppRng::result_type seed
Definition InitializeKmeanspp.hpp:38
int num_threads
Definition InitializeKmeanspp.hpp:44