189 Cluster_
run(
const Matrix_& data, Cluster_ ncenters, Float_* centers)
const {
190 auto nobs = data.num_observations();
191 auto ndim = data.num_dimensions();
192 if (nobs == 0 || ndim == 0) {
196 std::vector<std::vector<typename Matrix_::index_type> > assignments(ncenters);
197 assignments[0].resize(nobs);
198 std::iota(assignments.front().begin(), assignments.front().end(), 0);
200 std::vector<std::vector<Float_> > dim_ss(ncenters);
202 auto& cur_ss = dim_ss[0];
204 std::fill_n(centers, ndim, 0);
205 auto matwork = data.create_workspace(0, nobs);
206 for (
decltype(nobs) i = 0; i < nobs; ++i) {
207 auto dptr = data.get_observation(matwork);
208 InitializeVariancePartition_internal::compute_welford(ndim, dptr, centers, cur_ss.data(),
static_cast<Float_
>(i + 1));
212 std::priority_queue<std::pair<Float_, Cluster_> > highest;
213 auto add_to_queue = [&](Cluster_ i) {
214 const auto& cur_ss = dim_ss[i];
215 Float_ sum_ss = std::accumulate(cur_ss.begin(), cur_ss.end(),
static_cast<Float_
>(0));
219 sum_ss /= std::pow(assignments[i].size(), 1.0 - my_options.
size_adjustment);
221 highest.emplace(sum_ss, i);
225 std::vector<typename Matrix_::index_type> cur_assignments_copy;
226 size_t long_ndim = ndim;
227 std::vector<Float_> opt_partition_values, opt_partition_stats;
229 for (Cluster_ cluster = 1; cluster < ncenters; ++cluster) {
230 auto chosen = highest.top();
231 if (chosen.first == 0) {
236 auto* cur_center = centers +
static_cast<size_t>(chosen.second) * long_ndim;
237 auto& cur_ss = dim_ss[chosen.second];
238 auto& cur_assignments = assignments[chosen.second];
240 size_t top_dim = std::max_element(cur_ss.begin(), cur_ss.end()) - cur_ss.begin();
241 Float_ partition_boundary;
243 partition_boundary = InitializeVariancePartition_internal::optimize_partition(data, cur_assignments, top_dim, opt_partition_values, opt_partition_stats);
245 partition_boundary = cur_center[top_dim];
248 auto* next_center = centers +
static_cast<size_t>(cluster) * long_ndim;
249 std::fill_n(next_center, ndim, 0);
250 auto& next_ss = dim_ss[cluster];
251 next_ss.resize(ndim);
252 auto& next_assignments = assignments[cluster];
254 auto work = data.create_workspace(cur_assignments.data(), cur_assignments.size());
255 cur_assignments_copy.clear();
256 std::fill_n(cur_center, ndim, 0);
257 std::fill(cur_ss.begin(), cur_ss.end(), 0);
259 for (
auto i : cur_assignments) {
260 auto dptr = data.get_observation(work);
261 if (dptr[top_dim] < partition_boundary) {
262 cur_assignments_copy.push_back(i);
263 InitializeVariancePartition_internal::compute_welford(ndim, dptr, cur_center, cur_ss.data(),
static_cast<Float_
>(cur_assignments_copy.size()));
265 next_assignments.push_back(i);
266 InitializeVariancePartition_internal::compute_welford(ndim, dptr, next_center, next_ss.data(),
static_cast<Float_
>(next_assignments.size()));
276 if (next_assignments.empty() || cur_assignments_copy.empty()) {
280 cur_assignments.swap(cur_assignments_copy);
281 add_to_queue(chosen.second);
282 add_to_queue(cluster);