kmeans
k-means clustering in C++
Loading...
Searching...
No Matches
InitializeVariancePartition.hpp
Go to the documentation of this file.
1#ifndef KMEANS_INITIALIZE_VARIANCE_PARTITION_HPP
2#define KMEANS_INITIALIZE_VARIANCE_PARTITION_HPP
3
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <numeric>
#include <queue>
#include <utility>
#include <vector>
9
10#include "sanisizer/sanisizer.hpp"
11
12#include "Initialize.hpp"
13#include "Matrix.hpp"
14
20namespace kmeans {
21
38
42namespace InitializeVariancePartition_internal {
43
template<typename Float_>
void compute_welford(const Float_ val, Float_& center, Float_& ss, const Float_ count) {
    // One step of Welford's online algorithm: fold 'val' in as the 'count'-th
    // sample, updating the running mean and the running sum of squares.
    const Float_ old_center = center;
    const Float_ diff = val - old_center;
    center = old_center + diff / count;
    // SS update uses (val - new_mean) * (val - old_mean), which is numerically
    // stabler than accumulating squared deviations directly.
    ss += (val - center) * diff;
}
52
template<typename Data_, typename Float_>
void compute_welford(const std::size_t ndim, const Data_* const dptr, Float_* const center, Float_* const dim_ss, const Float_ count) {
    // Vectorized form of the scalar Welford step: each dimension keeps its own
    // running mean (in 'center') and sum of squares (in 'dim_ss').
    for (std::size_t d = 0; d < ndim; ++d) {
        const Float_ old_mean = center[d];
        const Float_ delta = static_cast<Float_>(dptr[d]) - old_mean;
        const Float_ new_mean = old_mean + delta / count;
        center[d] = new_mean;
        dim_ss[d] += (static_cast<Float_>(dptr[d]) - new_mean) * delta;
    }
}
59
/*
 * Find the split point along dimension 'top_dim' that minimizes the total
 * within-partition sum of squares over the observations in 'current'.
 * Returns the midpoint between the two values flanking the optimal split.
 * 'value_buffer' and 'stat_buffer' are caller-provided scratch space.
 */
template<typename Matrix_, typename Index_, typename Float_>
Float_ optimize_partition(
    const Matrix_& data,
    const std::vector<Index_>& current,
    const std::size_t top_dim,
    std::vector<Float_>& value_buffer,
    std::vector<Float_>& stat_buffer)
{
    // Pull out the value of every observation along the chosen dimension.
    const std::size_t num = current.size();
    auto ext = data.new_extractor(current.data(), static_cast<std::size_t>(num)); // cast safety already checked in InitializeVariancePartition::run().
    value_buffer.clear();
    value_buffer.reserve(num);
    for (std::size_t i = 0; i < num; ++i) {
        value_buffer.push_back(ext->get_observation()[top_dim]);
    }
    std::sort(value_buffer.begin(), value_buffer.end());

    // stat_buffer[i] holds the combined SS when sorted elements {0..i} form the
    // left partition and {i+1..num-1} the right, so it has num - 1 entries.
    // num >= 2 is guaranteed: a single-point cluster has zero variance and is
    // never selected for partition optimization.
    stat_buffer.clear();
    const std::size_t last = num - 1;
    stat_buffer.reserve(last);

    // Forward pass: running Welford SS of the left partition at each split.
    stat_buffer.push_back(0);
    Float_ mu = value_buffer.front(), accum = 0, n = 2;
    for (std::size_t i = 1; i < last; ++i) { // i == 0 skipped: a one-point left partition has SS = 0.
        const Float_ delta = value_buffer[i] - mu;
        mu += delta / n;
        accum += (value_buffer[i] - mu) * delta;
        stat_buffer.push_back(accum);
        ++n;
    }

    // Backward pass: add the right partition's SS at each split point.
    mu = value_buffer.back();
    accum = 0;
    n = 2;
    for (std::size_t j = last - 1; j > 0; --j) { // j == last skipped: a one-point right partition has SS = 0.
        const Float_ delta = value_buffer[j] - mu;
        mu += delta / n;
        accum += (value_buffer[j] - mu) * delta;
        stat_buffer[j - 1] += accum;
        ++n;
    }

    // Iterator arithmetic is safe: can_ptrdiff() was checked in InitializeVariancePartition::run().
    const auto first = stat_buffer.begin();
    const std::size_t best = std::min_element(first, stat_buffer.end()) - first;

    // Use the midpoint of the flanking values as the boundary, which also
    // sidesteps ties; written to avoid floating-point overflow.
    const auto lo = value_buffer[best];
    const auto hi = value_buffer[best + 1];
    return lo + (hi - lo) / 2;
}
124
125}
166template<typename Index_, typename Data_, typename Cluster_, typename Float_, class Matrix_ = Matrix<Index_, Data_> >
167class InitializeVariancePartition final : public Initialize<Index_, Data_, Cluster_, Float_, Matrix_> {
168public:
/**
 * @param options Further options for variance partitioning; moved into the instance.
 */
172 InitializeVariancePartition(InitializeVariancePartitionOptions options) : my_options(std::move(options)) {}
173
178
179private:
181
182public:
188 return my_options;
189 }
190
191public:
195 Cluster_ run(const Matrix_& data, const Cluster_ ncenters, Float_* const centers) const {
196 const auto nobs = data.num_observations();
197 const auto ndim = data.num_dimensions();
198 if (nobs == 0 || ndim == 0) {
199 return 0;
200 }
201
202 auto assignments = sanisizer::create<std::vector<std::vector<Index_> > >(ncenters);
203 sanisizer::resize(assignments[0], nobs);
204 std::iota(assignments.front().begin(), assignments.front().end(), static_cast<Index_>(0));
205 sanisizer::cast<std::size_t>(nobs); // Checking that the maximum size of each 'assignments' can fit into a std::size_t, for optimal_partition().
206
207 auto dim_ss = sanisizer::create<std::vector<std::vector<Float_> > >(ncenters);
208 {
209 auto& cur_ss = dim_ss[0];
210 sanisizer::resize(cur_ss, ndim);
211 std::fill_n(centers, ndim, 0);
212 auto matwork = data.new_extractor(static_cast<Index_>(0), nobs);
213 for (decltype(I(nobs)) i = 0; i < nobs; ++i) {
214 const auto dptr = matwork->get_observation();
215 InitializeVariancePartition_internal::compute_welford(ndim, dptr, centers, cur_ss.data(), static_cast<Float_>(i + 1));
216 }
217 }
218
219 std::priority_queue<std::pair<Float_, Cluster_> > highest;
220 auto add_to_queue = [&](const Cluster_ i) -> void {
221 const auto& cur_ss = dim_ss[i];
222 Float_ sum_ss = std::accumulate(cur_ss.begin(), cur_ss.end(), static_cast<Float_>(0));
223
224 // Instead of dividing by N and then remultiplying by pow(N, adjustment), we just
225 // divide by pow(N, 1 - adjustment) to save some time and precision.
226 sum_ss /= std::pow(assignments[i].size(), 1.0 - my_options.size_adjustment);
227
228 highest.emplace(sum_ss, i);
229 };
230 add_to_queue(0);
231
232 std::vector<Index_> cur_assignments_copy;
233 std::vector<Float_> opt_partition_values, opt_partition_stats;
234
235 // Checking that iterator arithmetic won't overflow the vector's ptrdiff type.
236 sanisizer::can_ptrdiff<decltype(I(opt_partition_values.begin()))>(nobs);
237 sanisizer::can_ptrdiff<decltype(I(dim_ss.front().begin()))>(ndim);
238
239 for (Cluster_ cluster = 1; cluster < ncenters; ++cluster) {
240 const auto chosen = highest.top();
241 if (chosen.first == 0) {
242 return cluster; // no point continuing, we're at zero variance already.
243 }
244 highest.pop();
245
246 const auto cur_center = centers + sanisizer::product_unsafe<std::size_t>(chosen.second, ndim);
247 auto& cur_ss = dim_ss[chosen.second];
248 auto& cur_assignments = assignments[chosen.second];
249
250 // Iterator arithmetic is safe as we checked can_ptrdiff outside the loop.
251 const decltype(I(ndim)) top_dim = std::max_element(cur_ss.begin(), cur_ss.end()) - cur_ss.begin();
252 Float_ partition_boundary;
253 if (my_options.optimize_partition) {
254 partition_boundary = InitializeVariancePartition_internal::optimize_partition(data, cur_assignments, top_dim, opt_partition_values, opt_partition_stats);
255 } else {
256 partition_boundary = cur_center[top_dim];
257 }
258
259 const auto next_center = centers + sanisizer::product_unsafe<std::size_t>(cluster, ndim);
260 std::fill_n(next_center, ndim, 0);
261 auto& next_ss = dim_ss[cluster];
262 next_ss.resize(ndim); // resize() is safe from overflow as we already tested it outside the loop with dim_ss[0].
263 auto& next_assignments = assignments[cluster];
264
265 cur_assignments_copy.clear();
266 std::fill_n(cur_center, ndim, 0);
267 std::fill(cur_ss.begin(), cur_ss.end(), 0);
268 auto work = data.new_extractor(cur_assignments.data(), cur_assignments.size());
269
270 for (const auto i : cur_assignments) {
271 const auto dptr = work->get_observation(); // make sure this is outside the if(), as it must always be called in each loop iteration to match 'cur_assignments' properly.
272 if (dptr[top_dim] < partition_boundary) {
273 cur_assignments_copy.push_back(i);
274 InitializeVariancePartition_internal::compute_welford(ndim, dptr, cur_center, cur_ss.data(), static_cast<Float_>(cur_assignments_copy.size()));
275 } else {
276 next_assignments.push_back(i);
277 InitializeVariancePartition_internal::compute_welford(ndim, dptr, next_center, next_ss.data(), static_cast<Float_>(next_assignments.size()));
278 }
279 }
280
281 // If one or the other is empty, then this entire procedure short-circuits as all future iterations will just re-select this cluster (which won't get partitioned properly anyway).
282 // In the bigger picture, the quick exit out of the iterations is correct as we should only fail to partition in this manner if all points within each remaining cluster are identical.
283 if (next_assignments.empty() || cur_assignments_copy.empty()) {
284 return cluster;
285 }
286
287 cur_assignments.swap(cur_assignments_copy);
288 add_to_queue(chosen.second);
289 add_to_queue(cluster);
290 }
291
292 return ncenters;
293 }
297};
298
299}
300
301#endif
Interface for k-means initialization.
Interface for matrix inputs.
Implements the variance partitioning method of Su and Dy (2007).
Definition InitializeVariancePartition.hpp:167
InitializeVariancePartitionOptions & get_options()
Definition InitializeVariancePartition.hpp:187
InitializeVariancePartition(InitializeVariancePartitionOptions options)
Definition InitializeVariancePartition.hpp:172
Interface for k-means initialization algorithms.
Definition Initialize.hpp:29
virtual Cluster_ run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers) const =0
Perform k-means clustering.
Definition compute_wcss.hpp:16
Options for InitializeVariancePartition.
Definition InitializeVariancePartition.hpp:25
bool optimize_partition
Definition InitializeVariancePartition.hpp:36
double size_adjustment
Definition InitializeVariancePartition.hpp:30