195 Cluster_
run(
const Matrix_& data,
const Cluster_ ncenters, Float_*
const centers)
const {
196 const auto nobs = data.num_observations();
197 const auto ndim = data.num_dimensions();
198 if (nobs == 0 || ndim == 0) {
202 auto assignments = sanisizer::create<std::vector<std::vector<Index_> > >(ncenters);
203 sanisizer::resize(assignments[0], nobs);
204 std::iota(assignments.front().begin(), assignments.front().end(),
static_cast<Index_
>(0));
205 sanisizer::cast<std::size_t>(nobs);
207 auto dim_ss = sanisizer::create<std::vector<std::vector<Float_> > >(ncenters);
209 auto& cur_ss = dim_ss[0];
210 sanisizer::resize(cur_ss, ndim);
211 std::fill_n(centers, ndim, 0);
212 auto matwork = data.new_extractor(
static_cast<Index_
>(0), nobs);
213 for (
decltype(I(nobs)) i = 0; i < nobs; ++i) {
214 const auto dptr = matwork->get_observation();
215 InitializeVariancePartition_internal::compute_welford(ndim, dptr, centers, cur_ss.data(),
static_cast<Float_
>(i + 1));
219 std::priority_queue<std::pair<Float_, Cluster_> > highest;
220 auto add_to_queue = [&](
const Cluster_ i) ->
void {
221 const auto& cur_ss = dim_ss[i];
222 Float_ sum_ss = std::accumulate(cur_ss.begin(), cur_ss.end(),
static_cast<Float_
>(0));
226 sum_ss /= std::pow(assignments[i].size(), 1.0 - my_options.
size_adjustment);
228 highest.emplace(sum_ss, i);
232 std::vector<Index_> cur_assignments_copy;
233 std::vector<Float_> opt_partition_values, opt_partition_stats;
236 sanisizer::can_ptrdiff<
decltype(I(opt_partition_values.begin()))>(nobs);
237 sanisizer::can_ptrdiff<
decltype(I(dim_ss.front().begin()))>(ndim);
239 for (Cluster_ cluster = 1; cluster < ncenters; ++cluster) {
240 const auto chosen = highest.top();
241 if (chosen.first == 0) {
246 const auto cur_center = centers + sanisizer::product_unsafe<std::size_t>(chosen.second, ndim);
247 auto& cur_ss = dim_ss[chosen.second];
248 auto& cur_assignments = assignments[chosen.second];
251 const decltype(I(ndim)) top_dim = std::max_element(cur_ss.begin(), cur_ss.end()) - cur_ss.begin();
252 Float_ partition_boundary;
254 partition_boundary = InitializeVariancePartition_internal::optimize_partition(data, cur_assignments, top_dim, opt_partition_values, opt_partition_stats);
256 partition_boundary = cur_center[top_dim];
259 const auto next_center = centers + sanisizer::product_unsafe<std::size_t>(cluster, ndim);
260 std::fill_n(next_center, ndim, 0);
261 auto& next_ss = dim_ss[cluster];
262 next_ss.resize(ndim);
263 auto& next_assignments = assignments[cluster];
265 cur_assignments_copy.clear();
266 std::fill_n(cur_center, ndim, 0);
267 std::fill(cur_ss.begin(), cur_ss.end(), 0);
268 auto work = data.new_extractor(cur_assignments.data(), cur_assignments.size());
270 for (
const auto i : cur_assignments) {
271 const auto dptr = work->get_observation();
272 if (dptr[top_dim] < partition_boundary) {
273 cur_assignments_copy.push_back(i);
274 InitializeVariancePartition_internal::compute_welford(ndim, dptr, cur_center, cur_ss.data(),
static_cast<Float_
>(cur_assignments_copy.size()));
276 next_assignments.push_back(i);
277 InitializeVariancePartition_internal::compute_welford(ndim, dptr, next_center, next_ss.data(),
static_cast<Float_
>(next_assignments.size()));
283 if (next_assignments.empty() || cur_assignments_copy.empty()) {
287 cur_assignments.swap(cur_assignments_copy);
288 add_to_queue(chosen.second);
289 add_to_queue(cluster);