kmeans
A C++ library for k-means
InitializeVariancePartition.hpp
#ifndef KMEANS_INITIALIZE_VARIANCE_PARTITION_HPP
#define KMEANS_INITIALIZE_VARIANCE_PARTITION_HPP

#include <vector>
#include <algorithm>
#include <numeric>
#include <queue>
#include <cstdint>
#include <cstddef>
#include <cmath>
#include <utility>

#include "Initialize.hpp"

/**
 * @file InitializeVariancePartition.hpp
 * @brief Initialize k-means centers by variance partitioning.
 */

namespace kmeans {

/**
 * @brief Options for InitializeVariancePartition.
 */
struct InitializeVariancePartitionOptions {
    /**
     * Size adjustment when prioritizing the next cluster to partition.
     * The within-cluster sum of squares is divided by the cluster size raised to the power of (1 - size_adjustment),
     * so a value of 1 prioritizes by the total sum of squares while a value of 0 prioritizes by the variance.
     */
    double size_adjustment = 1;

    /**
     * Whether to optimize the partition boundary to minimize the sum of squares within each child partition.
     * If false, the boundary is set to the cluster center along the most variable dimension.
     */
    bool optimize_partition = true;
};

namespace InitializeVariancePartition_internal {

// Welford's online update: 'center' accumulates the running mean and 'ss' the running
// sum of squared differences from the mean, where 'count' is the number of values seen
// so far (including 'val').
template<typename Float_>
void compute_welford(Float_ val, Float_& center, Float_& ss, Float_ count) {
    auto cur_center = center;
    Float_ delta = val - cur_center;
    Float_ new_center = cur_center + delta / count;
    center = new_center;
    ss += (val - new_center) * delta;
}

// Apply the Welford update across all dimensions of a single observation.
template<class Dim_, typename Value_, typename Float_>
void compute_welford(Dim_ ndim, const Value_* dptr, Float_* center, Float_* dim_ss, Float_ count) {
    for (Dim_ j = 0; j < ndim; ++j) {
        compute_welford<Float_>(dptr[j], center[j], dim_ss[j], count);
    }
}
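// For intuition: feeding the values 1, 2, 3 through the scalar update above, starting from
// center = 0 and ss = 0 with count incrementing from 1, gives:
//   val = 1: delta = 1.0, center -> 1.0, ss -> 0.0
//   val = 2: delta = 1.0, center -> 1.5, ss -> 0.5
//   val = 3: delta = 1.5, center -> 2.0, ss -> 2.0
// i.e., the mean (2) and the sum of squared deviations from it ((1-2)^2 + (2-2)^2 + (3-2)^2 = 2),
// computed in a single pass.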

// Given the observations in 'current' and the dimension 'top_dim' with the largest sum of squares,
// find the split point along 'top_dim' that minimizes the total sum of squares of the two resulting
// partitions. 'value_buffer' and 'stat_buffer' are scratch space that is reused across calls.
template<typename Matrix_, typename Float_>
Float_ optimize_partition(
    const Matrix_& data,
    const std::vector<typename Matrix_::index_type>& current,
    size_t top_dim,
    std::vector<Float_>& value_buffer,
    std::vector<Float_>& stat_buffer)
{
    size_t N = current.size();
    auto work = data.create_workspace(current.data(), N);
    value_buffer.clear();
    for (size_t i = 0; i < N; ++i) {
        auto dptr = data.get_observation(work);
        value_buffer.push_back(dptr[top_dim]);
    }

    std::sort(value_buffer.begin(), value_buffer.end());

    stat_buffer.clear();
    stat_buffer.reserve(value_buffer.size() + 1);
    stat_buffer.push_back(0); // first and last entries of stat_buffer are 'nonsense' partitions,
                              // i.e., all points in one partition or the other.

    // Forward and backward iterations to get the sum of squares at every possible partition point.
    Float_ mean = 0, ss = 0, count = 1;
    for (auto val : value_buffer) {
        compute_welford<Float_>(val, mean, ss, count);
        stat_buffer.push_back(ss);
        ++count;
    }

    mean = 0, ss = 0, count = 1;
    auto sbIt = stat_buffer.rbegin() + 1; // skipping the last entry, as it's a nonsense partition.
    for (auto vIt = value_buffer.rbegin(), vEnd = value_buffer.rend(); vIt != vEnd; ++vIt) {
        compute_welford<Float_>(*vIt, mean, ss, count);
        *sbIt += ss;
        ++count;
        ++sbIt;
    }

    auto sbBegin = stat_buffer.begin(), sbEnd = stat_buffer.end();
    auto which_min = std::min_element(sbBegin, sbEnd) - sbBegin;
    if (which_min == 0) {
        // Should never get to this point as current.size() > 1,
        // but we'll just add this in for safety's sake.
        return value_buffer[0];
    } else {
        // stat_buffer[i] represents the SS when {0, 1, 2, ..., i-1} goes in
        // the left partition and {i, ..., N-1} goes in the right partition.
        // To avoid issues with ties, we use the average of the two edge
        // points as the partition boundary.
        return (value_buffer[which_min - 1] + value_buffer[which_min]) / 2;
    }
}
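// For intuition: if the sorted values along 'top_dim' were {0, 1, 10, 11}, stat_buffer would hold
// total sums of squares of 101, ~60.7, 1, ~60.7 and 101 for split indices 0 through 4, so the
// minimum lies at index 2 and the returned boundary is (1 + 10) / 2 = 5.5.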

}

/**
 * @brief Implements the variance partitioning method of Su and Dy (2007).
 *
 * This approach repeatedly splits the cluster with the largest (size-adjusted) within-cluster sum of squares,
 * using its most variable dimension as the partition axis, until the requested number of centers is obtained.
 * See Su, T. and Dy, J. G. (2007). In search of deterministic methods for initializing K-means
 * and Gaussian mixture clustering. Intelligent Data Analysis 11, 319-338.
 *
 * @tparam Matrix_ Matrix type for the input data.
 * @tparam Cluster_ Integer type for the cluster assignments.
 * @tparam Float_ Floating-point type for the centroids.
 */
template<typename Matrix_ = SimpleMatrix<double, int>, typename Cluster_ = int, typename Float_ = double>
class InitializeVariancePartition : public Initialize<Matrix_, Cluster_, Float_> {
public:
    /**
     * @param options Options for variance partitioning.
     */
    InitializeVariancePartition(InitializeVariancePartitionOptions options) : my_options(std::move(options)) {}

    /**
     * Default constructor.
     */
    InitializeVariancePartition() = default;

private:
    InitializeVariancePartitionOptions my_options;

public:
    /**
     * @return Options for variance partitioning, to be modified prior to calling run().
     */
    InitializeVariancePartitionOptions& get_options() {
        return my_options;
    }

public:
    Cluster_ run(const Matrix_& data, Cluster_ ncenters, Float_* centers) const {
        auto nobs = data.num_observations();
        auto ndim = data.num_dimensions();
        if (nobs == 0 || ndim == 0) {
            return 0;
        }

        std::vector<std::vector<typename Matrix_::index_type> > assignments(ncenters);
        assignments[0].resize(nobs);
        std::iota(assignments.front().begin(), assignments.front().end(), 0);

        std::vector<std::vector<Float_> > dim_ss(ncenters);
        {
            auto& cur_ss = dim_ss[0];
            cur_ss.resize(ndim);
            std::fill_n(centers, ndim, 0);
            auto matwork = data.create_workspace(0, nobs);
            for (decltype(nobs) i = 0; i < nobs; ++i) {
                auto dptr = data.get_observation(matwork);
                InitializeVariancePartition_internal::compute_welford(ndim, dptr, centers, cur_ss.data(), static_cast<Float_>(i + 1));
            }
        }
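        // At this point, the first center holds the grand mean across all observations and
        // dim_ss[0] holds the per-dimension sum of squares, i.e., everything starts as one big cluster.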

        std::priority_queue<std::pair<Float_, Cluster_> > highest;
        auto add_to_queue = [&](Cluster_ i) {
            const auto& cur_ss = dim_ss[i];
            Float_ sum_ss = std::accumulate(cur_ss.begin(), cur_ss.end(), static_cast<Float_>(0));

            // Instead of dividing by N and then remultiplying by pow(N, adjustment), we just
            // divide by pow(N, 1 - adjustment) to save some time and precision.
            sum_ss /= std::pow(assignments[i].size(), 1.0 - my_options.size_adjustment);

            highest.emplace(sum_ss, i);
        };
        add_to_queue(0);
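        // The queue thus ranks clusters by SS / N^(1 - size_adjustment): a size_adjustment of 0
        // prioritizes the cluster with the largest variance, while a value of 1 prioritizes the
        // largest total sum of squares, which favors splitting bigger clusters.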

        std::vector<typename Matrix_::index_type> cur_assignments_copy;
        size_t long_ndim = ndim;
        std::vector<Float_> opt_partition_values, opt_partition_stats;

        for (Cluster_ cluster = 1; cluster < ncenters; ++cluster) {
            auto chosen = highest.top();
            if (chosen.first == 0) {
                return cluster; // no point continuing, we're at zero variance already.
            }
            highest.pop();

            auto* cur_center = centers + static_cast<size_t>(chosen.second) * long_ndim; // cast to size_t to avoid overflow issues.
            auto& cur_ss = dim_ss[chosen.second];
            auto& cur_assignments = assignments[chosen.second];

            size_t top_dim = std::max_element(cur_ss.begin(), cur_ss.end()) - cur_ss.begin();
            Float_ partition_boundary;
            if (my_options.optimize_partition) {
                partition_boundary = InitializeVariancePartition_internal::optimize_partition(data, cur_assignments, top_dim, opt_partition_values, opt_partition_stats);
            } else {
                partition_boundary = cur_center[top_dim];
            }

            auto* next_center = centers + static_cast<size_t>(cluster) * long_ndim; // again, size_t to avoid overflow.
            std::fill_n(next_center, ndim, 0);
            auto& next_ss = dim_ss[cluster];
            next_ss.resize(ndim);
            auto& next_assignments = assignments[cluster];

            auto work = data.create_workspace(cur_assignments.data(), cur_assignments.size());
            cur_assignments_copy.clear();
            std::fill_n(cur_center, ndim, 0);
            std::fill(cur_ss.begin(), cur_ss.end(), 0);

            for (auto i : cur_assignments) {
                auto dptr = data.get_observation(work);
                if (dptr[top_dim] < partition_boundary) {
                    cur_assignments_copy.push_back(i);
                    InitializeVariancePartition_internal::compute_welford(ndim, dptr, cur_center, cur_ss.data(), static_cast<Float_>(cur_assignments_copy.size()));
                } else {
                    next_assignments.push_back(i);
                    InitializeVariancePartition_internal::compute_welford(ndim, dptr, next_center, next_ss.data(), static_cast<Float_>(next_assignments.size()));
                }
            }
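            // Both children now have their centers and per-dimension sums of squares computed
            // incrementally during the partitioning pass, ready for re-prioritization below.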

            // If one or the other is empty, then this entire procedure short
            // circuits as all future iterations will just re-select this
            // cluster (which won't get partitioned properly anyway). In the
            // bigger picture, the quick exit out of the iterations is correct
            // as we should only fail to partition in this manner if all points
            // within each remaining cluster are identical.
            if (next_assignments.empty() || cur_assignments_copy.empty()) {
                return cluster;
            }

            cur_assignments.swap(cur_assignments_copy);
            add_to_queue(chosen.second);
            add_to_queue(cluster);
        }

        return ncenters;
    }
};

// Back-compatibility.
template<typename Matrix_ = SimpleMatrix<double, int>, typename Cluster_ = int, typename Float_ = double>
using InitializePcaPartition = InitializeVariancePartition<Matrix_, Cluster_, Float_>;

typedef InitializeVariancePartitionOptions InitializePcaPartitionOptions;

}

#endif
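
A rough usage sketch follows. The include path, and the SimpleMatrix constructor taking the number of dimensions, the number of observations and a pointer to data with each observation stored contiguously, are assumptions rather than facts taken from this header; only run(), the options struct and the class template itself are defined above. In a full clustering run the initializer would normally be combined with a refinement step, but it can also be called directly:

#include "kmeans/InitializeVariancePartition.hpp" // assumed install path of this header
#include <vector>
#include <iostream>

int main() {
    // Five observations in two dimensions, with each observation stored contiguously.
    int ndim = 2, nobs = 5;
    std::vector<double> data { 0, 0,   0.1, 0.1,   10, 10,   10.1, 9.9,   20, 20 };

    // Assumed constructor order: (number of dimensions, number of observations, pointer to data).
    kmeans::SimpleMatrix<double, int> mat(ndim, nobs, data.data());

    kmeans::InitializeVariancePartitionOptions opt;
    opt.optimize_partition = true;
    kmeans::InitializeVariancePartition<> init(opt); // default template arguments

    // run() fills 'centers' with up to 3 starting centroids and reports how many were actually created.
    std::vector<double> centers(ndim * 3);
    int found = init.run(mat, 3, centers.data());

    std::cout << "created " << found << " centers:" << std::endl;
    for (int c = 0; c < found; ++c) {
        std::cout << "  (" << centers[c * ndim] << ", " << centers[c * ndim + 1] << ")" << std::endl;
    }
    return 0;
}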