1#ifndef KMEANS_INITIALIZE_KMEANSPP_HPP
2#define KMEANS_INITIALIZE_KMEANSPP_HPP
9#include "aarand/aarand.hpp"
13#include "copy_into_array.hpp"
43namespace InitializeKmeanspp_internal {
45template<
typename Float_,
typename Index_,
class Engine_>
46Index_ weighted_sample(
const std::vector<Float_>& cumulative,
const std::vector<Float_>& mindist, Index_ nobs, Engine_& eng) {
47 auto total = cumulative.back();
51 const Float_ sampled_weight = total * aarand::standard_uniform<Float_>(eng);
52 chosen_id = std::lower_bound(cumulative.begin(), cumulative.end(), sampled_weight) - cumulative.begin();
61 }
while (chosen_id == nobs || mindist[chosen_id] == 0);
66template<
typename Float_,
class Matrix_,
typename Cluster_>
67std::vector<typename Matrix_::index_type> run_kmeanspp(
const Matrix_& data, Cluster_ ncenters, uint64_t seed,
int nthreads) {
68 typedef typename Matrix_::index_type Index_;
69 typedef typename Matrix_::dimension_type Dim_;
71 auto nobs = data.num_observations();
72 auto ndim = data.num_dimensions();
73 std::vector<Float_> mindist(nobs, 1);
74 std::vector<Float_> cumulative(nobs);
75 std::vector<Index_> sofar;
76 sofar.reserve(ncenters);
77 std::mt19937_64 eng(seed);
79 auto last_work = data.create_workspace();
80 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
82 auto last_ptr = data.get_observation(sofar.back(), last_work);
84 parallelize(nthreads, nobs, [&](
int, Index_ start, Index_ length) {
85 auto curwork = data.create_workspace();
86 for (Index_ obs = start, end = start + length; obs < end; ++obs) {
88 auto acopy = data.get_observation(obs, curwork);
89 auto scopy = last_ptr;
92 for (Dim_ dim = 0; dim < ndim; ++dim, ++acopy, ++scopy) {
93 Float_ delta =
static_cast<Float_
>(*acopy) -
static_cast<Float_
>(*scopy);
97 if (cen == 1 || r2 < mindist[obs]) {
105 cumulative[0] = mindist[0];
106 for (Index_ i = 1; i < nobs; ++i) {
107 cumulative[i] = cumulative[i-1] + mindist[i];
110 const auto total = cumulative.back();
115 auto chosen_id = weighted_sample(cumulative, mindist, nobs, eng);
116 mindist[chosen_id] = 0;
117 sofar.push_back(chosen_id);
145template<
typename Matrix_ = SimpleMatrix<
double,
int>,
typename Cluster_ =
int,
typename Float_ =
double>
171 Cluster_
run(
const Matrix_& matrix, Cluster_ ncenters, Float_* centers)
const {
172 size_t nobs = matrix.num_observations();
177 auto sofar = InitializeKmeanspp_internal::run_kmeanspp<Float_>(matrix, ncenters, my_options.
seed, my_options.
num_threads);
178 internal::copy_into_array(matrix, sofar, centers);
Interface for k-means initialization.
Wrapper for a simple dense matrix.
k-means++ initialization of Arthur and Vassilvitskii (2007).
Definition InitializeKmeanspp.hpp:146
InitializeKmeanspp()=default
InitializeKmeanspp(InitializeKmeansppOptions options)
Definition InitializeKmeanspp.hpp:154
Cluster_ run(const Matrix_ &matrix, Cluster_ ncenters, Float_ *centers) const
Definition InitializeKmeanspp.hpp:171
InitializeKmeansppOptions & get_options()
Definition InitializeKmeanspp.hpp:166
Base class for initialization algorithms.
Definition Initialize.hpp:22
Namespace for k-means clustering.
Definition compute_wcss.hpp:12
Utilities for parallelization.
Options for k-means++ initialization.
Definition InitializeKmeanspp.hpp:27
int num_threads
Definition InitializeKmeanspp.hpp:37
uint64_t seed
Definition InitializeKmeanspp.hpp:31