kmeans
k-means clustering in C++
InitializeKmeanspp.hpp
1#ifndef KMEANS_INITIALIZE_KMEANSPP_HPP
2#define KMEANS_INITIALIZE_KMEANSPP_HPP
3
4#include <vector>
5#include <random>
6#include <algorithm>
7#include <cstdint>
8
9#include "sanisizer/sanisizer.hpp"
10#include "aarand/aarand.hpp"
11
12#include "Initialize.hpp"
13#include "Matrix.hpp"
14#include "copy_into_array.hpp"
15#include "parallelize.hpp"
16#include "utils.hpp"
17
24namespace kmeans {
25
29typedef std::mt19937_64 InitializeKmeansppRng;
30
34struct InitializeKmeansppOptions {
38 typename InitializeKmeansppRng::result_type seed = sanisizer::cap<typename InitializeKmeansppRng::result_type>(6523);
39
44 int num_threads = 1;
45};
46
50namespace InitializeKmeanspp_internal {
51
52template<typename Float_, typename Index_, class Engine_>
53Index_ weighted_sample(const std::vector<Float_>& cumulative, const std::vector<Float_>& mindist, Index_ nobs, Engine_& eng) {
54 const auto total = cumulative.back();
55 Index_ chosen_id = 0;
56
57 do {
58 const Float_ sampled_weight = total * aarand::standard_uniform<Float_>(eng);
59
60 // Subtraction is safe as we already checked for valid ptrdiff in run_kmeanspp().
61 chosen_id = std::lower_bound(cumulative.begin(), cumulative.end(), sampled_weight) - cumulative.begin();
62
63 // We wrap this in a do/while to defend against edge cases where
64 // ties are chosen. The most obvious of these is a `sampled_weight`
65 // of zero combined with a run of zeros (i.e., zero-weight
66 // observations) at the start of `cumulative`. Limited precision in
67 // floating-point comparisons can also produce unexpected ties, so
68 // we'll just be safe and implement a loop here, in the same vein
69 // as uniform01.
70 } while (chosen_id == nobs || mindist[chosen_id] == 0);
71
72 return chosen_id;
73}
74
75template<typename Index_, typename Float_, class Matrix_, typename Cluster_>
76std::vector<Index_> run_kmeanspp(const Matrix_& data, Cluster_ ncenters, const typename InitializeKmeansppRng::result_type seed, const int nthreads) {
77 const auto nobs = data.num_observations();
78 const auto ndim = data.num_dimensions();
79 auto mindist = sanisizer::create<std::vector<Float_> >(nobs, 1); // start at 1 so that the first center is sampled uniformly.
80
81 auto cumulative = sanisizer::create<std::vector<Float_> >(nobs);
82 sanisizer::can_ptrdiff<decltype(I(cumulative.begin()))>(nobs); // check that we can compute a ptrdiff for weighted_sample().
83
84 std::vector<Index_> sofar;
85 sofar.reserve(ncenters);
86 InitializeKmeansppRng eng(seed);
87
88 auto last_work = data.new_extractor();
89 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
90 if (!sofar.empty()) {
91 const auto last_ptr = last_work->get_observation(sofar.back());
92
93 parallelize(nthreads, nobs, [&](const int, const Index_ start, const Index_ length) -> void {
94 auto curwork = data.new_extractor(start, length);
95 for (Index_ obs = start, end = start + length; obs < end; ++obs) {
96 const auto current = curwork->get_observation(); // make sure this is before the 'continue', as we MUST call this in every loop iteration to fulfill consecutive access.
97
98 if (mindist[obs] == 0) {
99 continue;
100 }
101
102 Float_ r2 = 0;
103 for (decltype(I(ndim)) d = 0; d < ndim; ++d) {
104 const Float_ delta = static_cast<Float_>(current[d]) - static_cast<Float_>(last_ptr[d]); // cast to ensure consistent precision regardless of Data_.
105 r2 += delta * delta;
106 }
107
108 if (cen == 1 || r2 < mindist[obs]) {
109 mindist[obs] = r2;
110 }
111 }
112 });
113 }
114
115 cumulative[0] = mindist[0]; // running sum of the weights, i.e., the unnormalized CDF searched in weighted_sample().
116 for (Index_ i = 1; i < nobs; ++i) {
117 cumulative[i] = cumulative[i-1] + mindist[i];
118 }
119
120 const auto total = cumulative.back();
121 if (total == 0) { // a.k.a. only duplicates left.
122 break;
123 }
124
125 const auto chosen_id = weighted_sample(cumulative, mindist, nobs, eng);
126 mindist[chosen_id] = 0;
127 sofar.push_back(chosen_id);
128 }
129
130 return sofar;
131}
132
133}
158template<typename Index_, typename Data_, typename Cluster_, typename Float_, class Matrix_ = Matrix<Index_, Data_> >
159class InitializeKmeanspp final : public Initialize<Index_, Data_, Cluster_, Float_, Matrix_> {
160private:
161 InitializeKmeansppOptions my_options;
162
163public:
167 InitializeKmeanspp(InitializeKmeansppOptions options) : my_options(std::move(options)) {}
168
173
174public:
179 InitializeKmeansppOptions& get_options() {
180 return my_options;
181 }
182
183public:
187 Cluster_ run(const Matrix_& matrix, const Cluster_ ncenters, Float_* const centers) const {
188 const auto nobs = matrix.num_observations();
189 if (!nobs) {
190 return 0;
191 }
192
193 const auto sofar = InitializeKmeanspp_internal::run_kmeanspp<Index_, Float_>(matrix, ncenters, my_options.seed, my_options.num_threads);
194 internal::copy_into_array(matrix, sofar, centers);
195 return sofar.size();
196 }
200};
201
202}
203
204#endif
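To make the control flow in run_kmeanspp() and weighted_sample() easier to follow, here is a self-contained sketch of the same D^2 ("k-means++") sampling scheme on a plain vector of points. It is a simplified illustration only: it drops the library's Matrix extractors, sanisizer overflow checks and parallelize() machinery, uses the standard <random> facilities in place of aarand, and every name in it is hypothetical rather than part of the kmeans API.

// Minimal k-means++ (D^2) sampling sketch; names are illustrative, not library API.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <random>
#include <vector>

std::vector<std::size_t> kmeanspp_sketch(const std::vector<std::vector<double> >& points, std::size_t k, std::uint64_t seed) {
    const std::size_t n = points.size();
    std::vector<std::size_t> chosen;
    if (n == 0) {
        return chosen;
    }

    std::vector<double> mindist(n, 1); // uniform weights, so the first center is drawn uniformly.
    std::vector<double> cumulative(n);
    chosen.reserve(k);
    std::mt19937_64 eng(seed);

    for (std::size_t c = 0; c < k; ++c) {
        if (!chosen.empty()) {
            // Refresh each point's squared distance to its closest chosen center;
            // this only requires comparing against the most recently added center.
            const auto& last = points[chosen.back()];
            for (std::size_t i = 0; i < n; ++i) {
                if (mindist[i] == 0) {
                    continue; // already a center, or an exact duplicate of one.
                }
                double r2 = 0;
                for (std::size_t d = 0; d < last.size(); ++d) {
                    const double delta = points[i][d] - last[d];
                    r2 += delta * delta;
                }
                if (c == 1 || r2 < mindist[i]) { // first update overwrites the uniform weights.
                    mindist[i] = r2;
                }
            }
        }

        // Inverse-CDF sampling: draw a value in [0, total) and find the first point
        // whose cumulative weight reaches it, i.e., sample proportionally to mindist.
        std::partial_sum(mindist.begin(), mindist.end(), cumulative.begin());
        const double total = cumulative.back();
        if (total == 0) {
            break; // only duplicates of existing centers remain.
        }

        std::size_t id;
        do {
            const double target = total * std::generate_canonical<double, 64>(eng);
            id = static_cast<std::size_t>(std::lower_bound(cumulative.begin(), cumulative.end(), target) - cumulative.begin());
        } while (id == n || mindist[id] == 0); // guard against ties at zero weight, as in weighted_sample().

        mindist[id] = 0; // exclude the chosen point from future draws.
        chosen.push_back(id);
    }

    return chosen;
}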
Cross-references for the entities used in this file:

Initialize.hpp: Interface for k-means initialization.
Matrix.hpp: Interface for matrix inputs.
parallelize.hpp: Utilities for parallelization.
kmeans namespace: Perform k-means clustering. Definition compute_wcss.hpp:16
kmeans::InitializeKmeansppRng: std::mt19937_64 InitializeKmeansppRng. Definition InitializeKmeanspp.hpp:29
kmeans::InitializeKmeansppOptions: Options for InitializeKmeanspp. Definition InitializeKmeanspp.hpp:34
kmeans::InitializeKmeansppOptions::seed: InitializeKmeansppRng::result_type seed. Definition InitializeKmeanspp.hpp:38
kmeans::InitializeKmeansppOptions::num_threads: int num_threads. Definition InitializeKmeanspp.hpp:44
kmeans::InitializeKmeanspp: k-means++ initialization of Arthur and Vassilvitskii (2007). Definition InitializeKmeanspp.hpp:159
kmeans::InitializeKmeanspp::InitializeKmeanspp(InitializeKmeansppOptions options): Definition InitializeKmeanspp.hpp:167
kmeans::InitializeKmeanspp::get_options(): InitializeKmeansppOptions & get_options(). Definition InitializeKmeanspp.hpp:179
kmeans::Initialize: Interface for k-means initialization algorithms. Definition Initialize.hpp:29
kmeans::Initialize::run(): virtual Cluster_ run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers) const = 0
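As a usage illustration, the sketch below shows how this class might be driven from user code. It is a hypothetical example, not part of the library's documentation: the helper name, the chosen template arguments, the include path, and the assumption that the caller supplies some concrete implementation of the kmeans::Matrix<int, double> interface are all placeholders.

// Hedged usage sketch; the helper name, include path and template arguments are
// illustrative assumptions, and 'data' must be some concrete kmeans::Matrix
// implementation supplied by the caller.
#include <cstddef>
#include <utility>
#include <vector>

#include "kmeans/InitializeKmeanspp.hpp" // assumed include path

// Returns the number of centers actually chosen plus the buffer they were written into.
std::pair<int, std::vector<double> > pick_kmeanspp_starts(const kmeans::Matrix<int, double>& data, const int k) {
    kmeans::InitializeKmeansppOptions opt;
    opt.seed = 42;        // fix the seed for reproducible sampling.
    opt.num_threads = 4;  // parallelizes the per-observation distance updates.

    // Index_ = int, Data_ = double, Cluster_ = int, Float_ = double;
    // Matrix_ defaults to kmeans::Matrix<int, double>.
    kmeans::InitializeKmeanspp<int, double, int, double> init(opt);

    std::vector<double> centers(static_cast<std::size_t>(data.num_dimensions()) * k);
    const int found = init.run(data, k, centers.data());
    // 'found' may be less than k if the dataset contains too few distinct points;
    // see the 'total == 0' early exit in run_kmeanspp() above.
    return std::make_pair(found, std::move(centers));
}

The filled centers buffer would then typically be handed to the refinement stage of the k-means procedure.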