kmeans
A C++ library for k-means
Loading...
Searching...
No Matches
InitializeKmeanspp.hpp
Go to the documentation of this file.
1#ifndef KMEANS_INITIALIZE_KMEANSPP_HPP
2#define KMEANS_INITIALIZE_KMEANSPP_HPP
3
4#include <vector>
5#include <random>
6#include <algorithm>
7#include <cstdint>
8
9#include "aarand/aarand.hpp"
10
11#include "Initialize.hpp"
12#include "SimpleMatrix.hpp"
13#include "copy_into_array.hpp"
14#include "parallelize.hpp"
15
22namespace kmeans {
23
31 uint64_t seed = 6523u;
32
37 int num_threads = 1;
38};
39
43namespace InitializeKmeanspp_internal {
44
45template<typename Float_, typename Index_, class Engine_>
46Index_ weighted_sample(const std::vector<Float_>& cumulative, const std::vector<Float_>& mindist, Index_ nobs, Engine_& eng) {
47 auto total = cumulative.back();
48 Index_ chosen_id = 0;
49
50 do {
51 const Float_ sampled_weight = total * aarand::standard_uniform<Float_>(eng);
52 chosen_id = std::lower_bound(cumulative.begin(), cumulative.end(), sampled_weight) - cumulative.begin();
53
54 // We wrap this in a do/while to defend against edge cases where
55 // ties are chosen. The most obvious of these is when you get a
56 // `sampled_weight` of zero _and_ there exists a bunch of zeros at
57 // the start of `cumulative`. One could also get unexpected ties
58 // from limited precision in floating point comparisons, so we'll
59 // just be safe and implement a loop here, in the same vein as
60 // uniform01.
61 } while (chosen_id == nobs || mindist[chosen_id] == 0);
62
63 return chosen_id;
64}
65
66template<typename Float_, class Matrix_, typename Cluster_>
67std::vector<typename Matrix_::index_type> run_kmeanspp(const Matrix_& data, Cluster_ ncenters, uint64_t seed, int nthreads) {
68 typedef typename Matrix_::index_type Index_;
69 typedef typename Matrix_::dimension_type Dim_;
70
71 auto nobs = data.num_observations();
72 auto ndim = data.num_dimensions();
73 std::vector<Float_> mindist(nobs, 1);
74 std::vector<Float_> cumulative(nobs);
75 std::vector<Index_> sofar;
76 sofar.reserve(ncenters);
77 std::mt19937_64 eng(seed);
78
79 auto last_work = data.create_workspace();
80 for (Cluster_ cen = 0; cen < ncenters; ++cen) {
81 if (!sofar.empty()) {
82 auto last_ptr = data.get_observation(sofar.back(), last_work);
83
84 parallelize(nthreads, nobs, [&](int, Index_ start, Index_ length) {
85 auto curwork = data.create_workspace();
86 for (Index_ obs = start, end = start + length; obs < end; ++obs) {
87 if (mindist[obs]) {
88 auto acopy = data.get_observation(obs, curwork);
89 auto scopy = last_ptr;
90
91 Float_ r2 = 0;
92 for (Dim_ dim = 0; dim < ndim; ++dim, ++acopy, ++scopy) {
93 Float_ delta = static_cast<Float_>(*acopy) - static_cast<Float_>(*scopy); // cast to ensure consistent precision regardless of Data_.
94 r2 += delta * delta;
95 }
96
97 if (cen == 1 || r2 < mindist[obs]) {
98 mindist[obs] = r2;
99 }
100 }
101 }
102 });
103 }
104
105 cumulative[0] = mindist[0];
106 for (Index_ i = 1; i < nobs; ++i) {
107 cumulative[i] = cumulative[i-1] + mindist[i];
108 }
109
110 const auto total = cumulative.back();
111 if (total == 0) { // a.k.a. only duplicates left.
112 break;
113 }
114
115 auto chosen_id = weighted_sample(cumulative, mindist, nobs, eng);
116 mindist[chosen_id] = 0;
117 sofar.push_back(chosen_id);
118 }
119
120 return sofar;
121}
122
123}
145template<typename Matrix_ = SimpleMatrix<double, int>, typename Cluster_ = int, typename Float_ = double>
146class InitializeKmeanspp : public Initialize<Matrix_, Cluster_, Float_> {
147private:
148 InitializeKmeansppOptions my_options;
149
150public:
154 InitializeKmeanspp(InitializeKmeansppOptions options) : my_options(std::move(options)) {}
155
160
161public:
167 return my_options;
168 }
169
170public:
171 Cluster_ run(const Matrix_& matrix, Cluster_ ncenters, Float_* centers) const {
172 size_t nobs = matrix.num_observations();
173 if (!nobs) {
174 return 0;
175 }
176
177 auto sofar = InitializeKmeanspp_internal::run_kmeanspp<Float_>(matrix, ncenters, my_options.seed, my_options.num_threads);
178 internal::copy_into_array(matrix, sofar, centers);
179 return sofar.size();
180 }
181};
182
183}
184
185#endif
Interface for k-means initialization.
Wrapper for a simple dense matrix.
k-means++ initialization of Arthur and Vassilvitskii (2007).
Definition InitializeKmeanspp.hpp:146
InitializeKmeanspp(InitializeKmeansppOptions options)
Definition InitializeKmeanspp.hpp:154
Cluster_ run(const Matrix_ &matrix, Cluster_ ncenters, Float_ *centers) const
Definition InitializeKmeanspp.hpp:171
InitializeKmeansppOptions & get_options()
Definition InitializeKmeanspp.hpp:166
Base class for initialization algorithms.
Definition Initialize.hpp:22
Namespace for k-means clustering.
Definition compute_wcss.hpp:12
Utilities for parallelization.
Options for k-means++ initialization.
Definition InitializeKmeanspp.hpp:27
int num_threads
Definition InitializeKmeanspp.hpp:37
uint64_t seed
Definition InitializeKmeanspp.hpp:31