kmeans
A C++ library for k-means
Loading...
Searching...
No Matches
InitializeKmeanspp.hpp
Go to the documentation of this file.
1#ifndef KMEANS_INITIALIZE_KMEANSPP_HPP
2#define KMEANS_INITIALIZE_KMEANSPP_HPP
3
4#include <vector>
5#include <random>
6#include <algorithm>
7#include <cstdint>
8
9#include "aarand/aarand.hpp"
10
11#include "Initialize.hpp"
12#include "Matrix.hpp"
13#include "copy_into_array.hpp"
14#include "parallelize.hpp"
15
22namespace kmeans {
23
31 uint64_t seed = 6523u;
32
37 int num_threads = 1;
38};
39
43namespace InitializeKmeanspp_internal {
44
45template<typename Float_, typename Index_, class Engine_>
46Index_ weighted_sample(const std::vector<Float_>& cumulative, const std::vector<Float_>& mindist, Index_ nobs, Engine_& eng) {
47 auto total = cumulative.back();
48 Index_ chosen_id = 0;
49
50 do {
51 const Float_ sampled_weight = total * aarand::standard_uniform<Float_>(eng);
52 chosen_id = std::lower_bound(cumulative.begin(), cumulative.end(), sampled_weight) - cumulative.begin();
53
54 // We wrap this in a do/while to defend against edge cases where
55 // ties are chosen. The most obvious of these is when you get a
56 // `sampled_weight` of zero _and_ there exists a bunch of zeros at
57 // the start of `cumulative`. One could also get unexpected ties
58 // from limited precision in floating point comparisons, so we'll
59 // just be safe and implement a loop here, in the same vein as
60 // uniform01.
61 } while (chosen_id == nobs || mindist[chosen_id] == 0);
62
63 return chosen_id;
64}
65
66template<typename Index_, typename Float_, class Matrix_, typename Cluster_>
67std::vector<Index_> run_kmeanspp(const Matrix_& data, Cluster_ ncenters, uint64_t seed, int nthreads) {
68 Index_ nobs = data.num_observations();
69 size_t ndim = data.num_dimensions();
70 std::vector<Float_> mindist(nobs, 1);
71 std::vector<Float_> cumulative(nobs);
72 std::vector<Index_> sofar;
73 sofar.reserve(ncenters);
74 std::mt19937_64 eng(seed);
75
76 auto last_work = data.new_extractor();
77 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
78 if (!sofar.empty()) {
79 auto last_ptr = last_work->get_observation(sofar.back());
80
81 parallelize(nthreads, nobs, [&](int, Index_ start, Index_ length) -> void {
82 auto curwork = data.new_extractor(start, length);
83 for (Index_ obs = start, end = start + length; obs < end; ++obs) {
84 auto current = curwork->get_observation(); // make sure this is outside the if(), as we MUST call this in every loop iteration to fulfill consecutive access.
85
86 if (mindist[obs]) {
87 Float_ r2 = 0;
88 for (size_t d = 0; d < ndim; ++d) {
89 Float_ delta = static_cast<Float_>(current[d]) - static_cast<Float_>(last_ptr[d]); // cast to ensure consistent precision regardless of Data_.
90 r2 += delta * delta;
91 }
92
93 if (cen == 1 || r2 < mindist[obs]) {
94 mindist[obs] = r2;
95 }
96 }
97 }
98 });
99 }
100
101 cumulative[0] = mindist[0];
102 for (Index_ i = 1; i < nobs; ++i) {
103 cumulative[i] = cumulative[i-1] + mindist[i];
104 }
105
106 const auto total = cumulative.back();
107 if (total == 0) { // a.k.a. only duplicates left.
108 break;
109 }
110
111 auto chosen_id = weighted_sample(cumulative, mindist, nobs, eng);
112 mindist[chosen_id] = 0;
113 sofar.push_back(chosen_id);
114 }
115
116 return sofar;
117}
118
119}
143template<typename Index_, typename Data_, typename Cluster_, typename Float_, class Matrix_ = Matrix<Index_, Data_> >
144class InitializeKmeanspp final : public Initialize<Index_, Data_, Cluster_, Float_, Matrix_> {
145private:
146 InitializeKmeansppOptions my_options;
147
148public:
152 InitializeKmeanspp(InitializeKmeansppOptions options) : my_options(std::move(options)) {}
153
158
159public:
164 return my_options;
165 }
166
167public:
171 Cluster_ run(const Matrix_& matrix, Cluster_ ncenters, Float_* centers) const {
172 Index_ nobs = matrix.num_observations();
173 if (!nobs) {
174 return 0;
175 }
176
177 auto sofar = InitializeKmeanspp_internal::run_kmeanspp<Index_, Float_>(matrix, ncenters, my_options.seed, my_options.num_threads);
178 internal::copy_into_array(matrix, sofar, centers);
179 return sofar.size();
180 }
184};
185
186}
187
188#endif
Interface for k-means initialization.
Interface for matrix inputs.
k-means++ initialization of Arthur and Vassilvitskii (2007).
Definition InitializeKmeanspp.hpp:144
InitializeKmeanspp(InitializeKmeansppOptions options)
Definition InitializeKmeanspp.hpp:152
InitializeKmeansppOptions & get_options()
Definition InitializeKmeanspp.hpp:163
Interface for k-means initialization algorithms.
Definition Initialize.hpp:27
virtual Cluster_ run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers) const =0
Namespace for k-means clustering.
Definition compute_wcss.hpp:12
Utilities for parallelization.
Options for k-means++ initialization.
Definition InitializeKmeanspp.hpp:27
int num_threads
Definition InitializeKmeanspp.hpp:37
uint64_t seed
Definition InitializeKmeanspp.hpp:31