1#ifndef KMEANS_INITIALIZE_KMEANSPP_HPP
2#define KMEANS_INITIALIZE_KMEANSPP_HPP
9#include "aarand/aarand.hpp"
13#include "copy_into_array.hpp"
43namespace InitializeKmeanspp_internal {
45template<
typename Float_,
typename Index_,
class Engine_>
46Index_ weighted_sample(
const std::vector<Float_>& cumulative,
const std::vector<Float_>& mindist, Index_ nobs, Engine_& eng) {
47 auto total = cumulative.back();
51 const Float_ sampled_weight = total * aarand::standard_uniform<Float_>(eng);
52 chosen_id = std::lower_bound(cumulative.begin(), cumulative.end(), sampled_weight) - cumulative.begin();
61 }
while (chosen_id == nobs || mindist[chosen_id] == 0);
66template<
typename Index_,
typename Float_,
class Matrix_,
typename Cluster_>
67std::vector<Index_> run_kmeanspp(
const Matrix_& data, Cluster_ ncenters, uint64_t seed,
int nthreads) {
68 Index_ nobs = data.num_observations();
69 size_t ndim = data.num_dimensions();
70 std::vector<Float_> mindist(nobs, 1);
71 std::vector<Float_> cumulative(nobs);
72 std::vector<Index_> sofar;
73 sofar.reserve(ncenters);
74 std::mt19937_64 eng(seed);
76 auto last_work = data.new_extractor();
77 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
79 auto last_ptr = last_work->get_observation(sofar.back());
81 parallelize(nthreads, nobs, [&](
int, Index_ start, Index_ length) ->
void {
82 auto curwork = data.new_extractor(start, length);
83 for (Index_ obs = start, end = start + length; obs < end; ++obs) {
84 auto current = curwork->get_observation();
88 for (
size_t d = 0; d < ndim; ++d) {
89 Float_ delta =
static_cast<Float_
>(current[d]) -
static_cast<Float_
>(last_ptr[d]);
93 if (cen == 1 || r2 < mindist[obs]) {
101 cumulative[0] = mindist[0];
102 for (Index_ i = 1; i < nobs; ++i) {
103 cumulative[i] = cumulative[i-1] + mindist[i];
106 const auto total = cumulative.back();
111 auto chosen_id = weighted_sample(cumulative, mindist, nobs, eng);
112 mindist[chosen_id] = 0;
113 sofar.push_back(chosen_id);
143template<
typename Index_,
typename Data_,
typename Cluster_,
typename Float_,
class Matrix_ = Matrix<Index_, Data_> >
171 Cluster_
run(
const Matrix_& matrix, Cluster_ ncenters, Float_* centers)
const {
172 Index_ nobs = matrix.num_observations();
177 auto sofar = InitializeKmeanspp_internal::run_kmeanspp<Index_, Float_>(matrix, ncenters, my_options.
seed, my_options.
num_threads);
178 internal::copy_into_array(matrix, sofar, centers);
Interface for k-means initialization.
Interface for matrix inputs.
k-means++ initialization of Arthur and Vassilvitskii (2007).
Definition InitializeKmeanspp.hpp:144
InitializeKmeanspp()=default
InitializeKmeanspp(InitializeKmeansppOptions options)
Definition InitializeKmeanspp.hpp:152
InitializeKmeansppOptions & get_options()
Definition InitializeKmeanspp.hpp:163
Interface for k-means initialization algorithms.
Definition Initialize.hpp:27
virtual Cluster_ run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers) const =0
Namespace for k-means clustering.
Definition compute_wcss.hpp:12
Utilities for parallelization.
Options for k-means++ initialization.
Definition InitializeKmeanspp.hpp:27
int num_threads
Definition InitializeKmeanspp.hpp:37
uint64_t seed
Definition InitializeKmeanspp.hpp:31