kmeans
A C++ library for k-means
Loading...
Searching...
No Matches
InitializeVariancePartition.hpp
Go to the documentation of this file.
1#ifndef KMEANS_INITIALIZE_VARIANCE_PARTITION_HPP
2#define KMEANS_INITIALIZE_VARIANCE_PARTITION_HPP
3
#include <vector>
#include <algorithm>
#include <numeric>
#include <queue>
#include <cstdint>
#include <cstddef>
#include <cmath>
#include <utility>

#include "Initialize.hpp"
#include "Matrix.hpp"
12
18namespace kmeans {
19
36
40namespace InitializeVariancePartition_internal {
41
/**
 * One step of Welford's online algorithm: fold 'val' (the 'count'-th sample)
 * into the running mean and the running sum of squared deviations.
 *
 * @param val New sample value.
 * @param center Running mean, updated in place.
 * @param ss Running sum of squares, updated in place.
 * @param count Number of samples seen so far, including 'val'.
 */
template<typename Float_>
void compute_welford(Float_ val, Float_& center, Float_& ss, Float_ count) {
    // Numerically stable update: use the deviation from the old mean and the
    // deviation from the new mean when accumulating the sum of squares.
    const Float_ delta = val - center;
    center += delta / count;
    ss += (val - center) * delta;
}
50
/**
 * Apply the scalar Welford update independently along every dimension of an
 * observation, folding it into per-dimension running means and sums of squares.
 *
 * @param ndim Number of dimensions.
 * @param dptr Pointer to the observation, of length 'ndim'.
 * @param center Pointer to the running means, of length 'ndim'; updated in place.
 * @param dim_ss Pointer to the running sums of squares, of length 'ndim'; updated in place.
 * @param count Number of observations folded in so far, including this one.
 */
template<typename Data_, typename Float_>
void compute_welford(size_t ndim, const Data_* dptr, Float_* center, Float_* dim_ss, Float_ count) {
    for (size_t d = 0; d < ndim; ++d) {
        // Scalar Welford step, inlined per dimension.
        const Float_ val = dptr[d];
        const Float_ delta = val - center[d];
        center[d] += delta / count;
        dim_ss[d] += (val - center[d]) * delta;
    }
}
57
/**
 * Find the boundary along 'top_dim' that splits the observations in 'current'
 * into two partitions with the smallest total within-partition sum of squares.
 *
 * @param data Matrix of observations; accessed via new_extractor()/get_observation().
 * @param current Indices of the observations in the cluster being split.
 * @param top_dim Dimension along which to split.
 * @param value_buffer Scratch buffer, overwritten with the sorted values of 'top_dim'.
 * @param stat_buffer Scratch buffer, overwritten with the per-split SS statistics.
 * @return The partition boundary along 'top_dim'.
 */
template<typename Matrix_, typename Index_, typename Float_>
Float_ optimize_partition(
    const Matrix_& data,
    const std::vector<Index_>& current,
    size_t top_dim,
    std::vector<Float_>& value_buffer,
    std::vector<Float_>& stat_buffer)
{
    size_t N = current.size();
    auto work = data.new_extractor(current.data(), N);
    value_buffer.clear();
    for (size_t i = 0; i < N; ++i) {
        auto dptr = work->get_observation();
        value_buffer.push_back(dptr[top_dim]);
    }

    // Callers should always supply a non-empty cluster, but guard anyway
    // rather than indexing into an empty buffer below.
    if (value_buffer.empty()) {
        return 0;
    }

    std::sort(value_buffer.begin(), value_buffer.end());

    // stat_buffer[i] will hold the total SS when {0, ..., i-1} go in the left
    // partition and {i, ..., N-1} go in the right partition. The first and
    // last entries are 'nonsense' partitions, i.e., all points on one side.
    stat_buffer.clear();
    stat_buffer.reserve(value_buffer.size() + 1);
    stat_buffer.push_back(0);

    // Forward pass: Welford accumulation gives the left-partition SS at every
    // possible split point.
    Float_ mean = 0, ss = 0, count = 1;
    for (auto val : value_buffer) {
        Float_ delta = val - mean;
        mean += delta / count;
        ss += (val - mean) * delta;
        stat_buffer.push_back(ss);
        ++count;
    }

    // Backward pass: add the right-partition SS, skipping the last entry as
    // it's a nonsense partition.
    mean = 0, ss = 0, count = 1;
    auto sbIt = stat_buffer.rbegin() + 1;
    for (auto vIt = value_buffer.rbegin(), vEnd = value_buffer.rend(); vIt != vEnd; ++vIt) {
        Float_ delta = *vIt - mean;
        mean += delta / count;
        ss += (*vIt - mean) * delta;
        *sbIt += ss;
        ++count;
        ++sbIt;
    }

    auto sbBegin = stat_buffer.begin(), sbEnd = stat_buffer.end();
    size_t which_min = static_cast<size_t>(std::min_element(sbBegin, sbEnd) - sbBegin);
    if (which_min == 0) {
        // Should never get to this point as current.size() > 1,
        // but we'll just add this in for safety's sake.
        return value_buffer[0];
    } else if (which_min >= value_buffer.size()) {
        // Symmetric safety check for the other nonsense partition: without it,
        // value_buffer[which_min] below would read out of bounds if floating-point
        // noise ever made the all-in-left SS the minimum.
        return value_buffer.back();
    } else {
        // To avoid issues with ties, we use the average of the two edge
        // points as the partition boundary.
        return (value_buffer[which_min - 1] + value_buffer[which_min]) / 2;
    }
}
126
127}
167template<typename Index_, typename Data_, typename Cluster_, typename Float_, class Matrix_ = Matrix<Index_, Data_> >
168class InitializeVariancePartition final : public Initialize<Index_, Data_, Cluster_, Float_, Matrix_> {
169public:
    /**
     * @param options Options for variance partitioning; see InitializeVariancePartitionOptions.
     */
    InitializeVariancePartition(InitializeVariancePartitionOptions options) : my_options(std::move(options)) {}
174
179
180private:
182
183public:
189 return my_options;
190 }
191
192public:
    /**
     * Initialize centers by variance partitioning: start with one cluster holding
     * everything, then repeatedly split the cluster with the largest (size-adjusted)
     * sum of squares along its most variable dimension.
     *
     * @param data Matrix of observations.
     * @param ncenters Number of centers requested.
     * @param[out] centers Array of length ndim * ncenters; center 'c' occupies the
     * contiguous span [c * ndim, (c + 1) * ndim).
     * @return Number of centers actually initialized; may be less than 'ncenters'
     * if a cluster with zero variance is reached or a split produces an empty side.
     */
    Cluster_ run(const Matrix_& data, Cluster_ ncenters, Float_* centers) const {
        auto nobs = data.num_observations();
        size_t ndim = data.num_dimensions();
        if (nobs == 0 || ndim == 0) {
            return 0;
        }

        // assignments[c] holds the observation indices currently in cluster 'c';
        // cluster 0 starts with everything.
        std::vector<std::vector<Index_> > assignments(ncenters);
        assignments[0].resize(nobs);
        std::iota(assignments.front().begin(), assignments.front().end(), 0);

        // dim_ss[c][d] is the sum of squares of cluster 'c' along dimension 'd';
        // seed cluster 0's center and SS by a single Welford sweep over all data.
        std::vector<std::vector<Float_> > dim_ss(ncenters);
        {
            auto& cur_ss = dim_ss[0];
            cur_ss.resize(ndim);
            std::fill_n(centers, ndim, 0);
            auto matwork = data.new_extractor(0, nobs);
            for (decltype(nobs) i = 0; i < nobs; ++i) {
                auto dptr = matwork->get_observation();
                InitializeVariancePartition_internal::compute_welford(ndim, dptr, centers, cur_ss.data(), static_cast<Float_>(i + 1));
            }
        }

        // Max-heap of (adjusted total SS, cluster id): the top is the next cluster to split.
        std::priority_queue<std::pair<Float_, Cluster_> > highest;
        auto add_to_queue = [&](Cluster_ i) -> void {
            const auto& cur_ss = dim_ss[i];
            Float_ sum_ss = std::accumulate(cur_ss.begin(), cur_ss.end(), static_cast<Float_>(0));

            // Instead of dividing by N and then remultiplying by pow(N, adjustment), we just
            // divide by pow(N, 1 - adjustment) to save some time and precision.
            sum_ss /= std::pow(assignments[i].size(), 1.0 - my_options.size_adjustment);

            highest.emplace(sum_ss, i);
        };
        add_to_queue(0);

        // Scratch buffers reused across iterations to avoid reallocation.
        std::vector<Index_> cur_assignments_copy;
        std::vector<Float_> opt_partition_values, opt_partition_stats;

        for (Cluster_ cluster = 1; cluster < ncenters; ++cluster) {
            auto chosen = highest.top();
            if (chosen.first == 0) {
                return cluster; // no point continuing, we're at zero variance already.
            }
            highest.pop();

            auto* cur_center = centers + static_cast<size_t>(chosen.second) * ndim; // cast to size_t to avoid overflow issues.
            auto& cur_ss = dim_ss[chosen.second];
            auto& cur_assignments = assignments[chosen.second];

            // Split along the dimension with the largest SS, either at the
            // SS-optimal boundary or simply at the current center's coordinate.
            size_t top_dim = std::max_element(cur_ss.begin(), cur_ss.end()) - cur_ss.begin();
            Float_ partition_boundary;
            if (my_options.optimize_partition) {
                partition_boundary = InitializeVariancePartition_internal::optimize_partition(data, cur_assignments, top_dim, opt_partition_values, opt_partition_stats);
            } else {
                partition_boundary = cur_center[top_dim];
            }

            auto* next_center = centers + static_cast<size_t>(cluster) * ndim; // again, size_t to avoid overflow.
            std::fill_n(next_center, ndim, 0);
            auto& next_ss = dim_ss[cluster];
            next_ss.resize(ndim);
            auto& next_assignments = assignments[cluster];

            // Reset the chosen cluster's statistics; both halves are re-accumulated
            // from scratch in the partitioning sweep below.
            cur_assignments_copy.clear();
            std::fill_n(cur_center, ndim, 0);
            std::fill(cur_ss.begin(), cur_ss.end(), 0);
            auto work = data.new_extractor(cur_assignments.data(), cur_assignments.size());

            for (auto i : cur_assignments) {
                auto dptr = work->get_observation(); // make sure this is outside the if(), as it must always be called in each loop iteration to match 'cur_assignments' properly.
                if (dptr[top_dim] < partition_boundary) {
                    cur_assignments_copy.push_back(i);
                    InitializeVariancePartition_internal::compute_welford(ndim, dptr, cur_center, cur_ss.data(), static_cast<Float_>(cur_assignments_copy.size()));
                } else {
                    next_assignments.push_back(i);
                    InitializeVariancePartition_internal::compute_welford(ndim, dptr, next_center, next_ss.data(), static_cast<Float_>(next_assignments.size()));
                }
            }

            // If one or the other is empty, then this entire procedure short
            // circuits as all future iterations will just re-select this
            // cluster (which won't get partitioned properly anyway). In the
            // bigger picture, the quick exit out of the iterations is correct
            // as we should only fail to partition in this manner if all points
            // within each remaining cluster are identical.
            if (next_assignments.empty() || cur_assignments_copy.empty()) {
                return cluster;
            }

            cur_assignments.swap(cur_assignments_copy);
            add_to_queue(chosen.second);
            add_to_queue(cluster);
        }

        return ncenters;
    }
296};
297
298}
299
300#endif
Interface for k-means initialization.
Interface for matrix inputs.
Implements the variance partitioning method of Su and Dy (2007).
Definition InitializeVariancePartition.hpp:168
InitializeVariancePartitionOptions & get_options()
Definition InitializeVariancePartition.hpp:188
InitializeVariancePartition(InitializeVariancePartitionOptions options)
Definition InitializeVariancePartition.hpp:173
Interface for k-means initialization algorithms.
Definition Initialize.hpp:27
virtual Cluster_ run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers) const =0
Namespace for k-means clustering.
Definition compute_wcss.hpp:12
Options for InitializeVariancePartition.
Definition InitializeVariancePartition.hpp:23
bool optimize_partition
Definition InitializeVariancePartition.hpp:34
double size_adjustment
Definition InitializeVariancePartition.hpp:28