kmeans
k-means clustering in C++
Loading...
Searching...
No Matches
RefineLloyd.hpp
Go to the documentation of this file.
1#ifndef KMEANS_LLOYD_HPP
2#define KMEANS_LLOYD_HPP
3
4#include <vector>
5#include <algorithm>
6
7#include "sanisizer/sanisizer.hpp"
8
9#include "Refine.hpp"
10#include "Details.hpp"
11#include "QuickSearch.hpp"
12#include "is_edge_case.hpp"
13#include "compute_centroids.hpp"
14#include "parallelize.hpp"
15
22namespace kmeans {
23
33
38 int num_threads = 1;
39};
40
66template<typename Index_, typename Data_, typename Cluster_, typename Float_, typename Matrix_ = Matrix<Index_, Data_> >
67class RefineLloyd final : public Refine<Index_, Data_, Cluster_, Float_, Matrix_> {
68private:
69 RefineLloydOptions my_options;
70
71public:
75 RefineLloyd(RefineLloydOptions options) : my_options(std::move(options)) {}
76
80 RefineLloyd() = default;
81
82public:
88 return my_options;
89 }
90
91public:
95 Details<Index_> run(const Matrix_& data, const Cluster_ ncenters, Float_* const centers, Cluster_* const clusters) const {
96 const auto nobs = data.num_observations();
97 if (internal::is_edge_case(nobs, ncenters)) {
98 return internal::process_edge_case(data, ncenters, centers, clusters);
99 }
100
101 auto sizes = sanisizer::create<std::vector<Index_> >(ncenters);
102 const auto ndim = data.num_dimensions();
103 internal::QuickSearch<Float_, Cluster_> index;
104
105 auto num_diff_threads = sanisizer::create<std::vector<Index_> >(my_options.num_threads);
106 std::fill_n(clusters, nobs, 0); // just to avoid using uninitialized values in the first iteration of the loop.
107
108 I<decltype(my_options.max_iterations)> iter = 0;
109 for (; iter < my_options.max_iterations; ++iter) {
110 index.reset(ndim, ncenters, centers);
111
112 parallelize(my_options.num_threads, nobs, [&](const int t, const Index_ start, const Index_ length) -> void {
113 auto work = data.new_extractor(start, length);
114 Index_ num_diff = 0;
115 for (Index_ obs = start, end = start + length; obs < end; ++obs) {
116 const auto dptr = work->get_observation();
117 const auto closest = index.find(dptr);
118 auto& previous = clusters[obs];
119 num_diff += (closest != previous);
120 previous = closest;
121 }
122 num_diff_threads[t] = num_diff;
123 });
124
125 if (iter) {
126 // Checking if it already converged.
127 bool updated = false;
128 for (const auto num_diff : num_diff_threads) {
129 if (num_diff) {
130 updated = true;
131 break;
132 }
133 }
134 if (!updated) {
135 break;
136 }
137 }
138
139 std::fill(sizes.begin(), sizes.end(), 0);
140 for (Index_ obs = 0; obs < nobs; ++obs) {
141 ++sizes[clusters[obs]];
142 }
143 internal::compute_centroids(data, ncenters, centers, clusters, sizes);
144 }
145
146 int status = 0;
147 if (iter == my_options.max_iterations) {
148 status = 2;
149 } else {
150 ++iter; // make it 1-based.
151 }
152 return Details<Index_>(std::move(sizes), iter, status);
153 }
157};
158
159}
160
161#endif
Report detailed clustering statistics.
Interface for k-means refinement.
Implements the Lloyd algorithm for k-means clustering.
Definition RefineLloyd.hpp:67
RefineLloyd(RefineLloydOptions options)
Definition RefineLloyd.hpp:75
RefineLloydOptions & get_options()
Definition RefineLloyd.hpp:87
Interface for k-means refinement algorithms.
Definition Refine.hpp:30
virtual Details< Index_ > run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers, Cluster_ *clusters) const =0
Perform k-means clustering.
Definition compute_wcss.hpp:16
void parallelize(const int num_workers, const Task_ num_tasks, Run_ run_task_range)
Definition parallelize.hpp:28
Utilities for parallelization.
Additional statistics from the k-means algorithm.
Definition Details.hpp:20
Options for RefineLloyd.
Definition RefineLloyd.hpp:27
int num_threads
Definition RefineLloyd.hpp:38
int max_iterations
Definition RefineLloyd.hpp:32