196 Cluster_
run(
const Matrix_& data, Cluster_ ncenters, Float_* centers)
const {
197 auto nobs = data.num_observations();
198 size_t ndim = data.num_dimensions();
199 if (nobs == 0 || ndim == 0) {
203 std::vector<std::vector<Index_> > assignments(ncenters);
204 assignments[0].resize(nobs);
205 std::iota(assignments.front().begin(), assignments.front().end(), 0);
207 std::vector<std::vector<Float_> > dim_ss(ncenters);
209 auto& cur_ss = dim_ss[0];
211 std::fill_n(centers, ndim, 0);
212 auto matwork = data.new_extractor(0, nobs);
213 for (
decltype(nobs) i = 0; i < nobs; ++i) {
214 auto dptr = matwork->get_observation();
215 InitializeVariancePartition_internal::compute_welford(ndim, dptr, centers, cur_ss.data(),
static_cast<Float_
>(i + 1));
219 std::priority_queue<std::pair<Float_, Cluster_> > highest;
220 auto add_to_queue = [&](Cluster_ i) ->
void {
221 const auto& cur_ss = dim_ss[i];
222 Float_ sum_ss = std::accumulate(cur_ss.begin(), cur_ss.end(),
static_cast<Float_
>(0));
226 sum_ss /= std::pow(assignments[i].size(), 1.0 - my_options.
size_adjustment);
228 highest.emplace(sum_ss, i);
232 std::vector<Index_> cur_assignments_copy;
233 std::vector<Float_> opt_partition_values, opt_partition_stats;
235 for (Cluster_ cluster = 1; cluster < ncenters; ++cluster) {
236 auto chosen = highest.top();
237 if (chosen.first == 0) {
242 auto* cur_center = centers +
static_cast<size_t>(chosen.second) * ndim;
243 auto& cur_ss = dim_ss[chosen.second];
244 auto& cur_assignments = assignments[chosen.second];
246 size_t top_dim = std::max_element(cur_ss.begin(), cur_ss.end()) - cur_ss.begin();
247 Float_ partition_boundary;
249 partition_boundary = InitializeVariancePartition_internal::optimize_partition(data, cur_assignments, top_dim, opt_partition_values, opt_partition_stats);
251 partition_boundary = cur_center[top_dim];
254 auto* next_center = centers +
static_cast<size_t>(cluster) * ndim;
255 std::fill_n(next_center, ndim, 0);
256 auto& next_ss = dim_ss[cluster];
257 next_ss.resize(ndim);
258 auto& next_assignments = assignments[cluster];
260 cur_assignments_copy.clear();
261 std::fill_n(cur_center, ndim, 0);
262 std::fill(cur_ss.begin(), cur_ss.end(), 0);
263 auto work = data.new_extractor(cur_assignments.data(), cur_assignments.size());
265 for (
auto i : cur_assignments) {
266 auto dptr = work->get_observation();
267 if (dptr[top_dim] < partition_boundary) {
268 cur_assignments_copy.push_back(i);
269 InitializeVariancePartition_internal::compute_welford(ndim, dptr, cur_center, cur_ss.data(),
static_cast<Float_
>(cur_assignments_copy.size()));
271 next_assignments.push_back(i);
272 InitializeVariancePartition_internal::compute_welford(ndim, dptr, next_center, next_ss.data(),
static_cast<Float_
>(next_assignments.size()));
282 if (next_assignments.empty() || cur_assignments_copy.empty()) {
286 cur_assignments.swap(cur_assignments_copy);
287 add_to_queue(chosen.second);
288 add_to_queue(cluster);