kmeans
k-means clustering in C++
Loading...
Searching...
No Matches
RefineLloyd.hpp
Go to the documentation of this file.
1#ifndef KMEANS_LLOYD_HPP
2#define KMEANS_LLOYD_HPP
3
4#include <vector>
5#include <algorithm>
6
7#include "sanisizer/sanisizer.hpp"
8
9#include "Refine.hpp"
10#include "Details.hpp"
11#include "QuickSearch.hpp"
12#include "is_edge_case.hpp"
13#include "compute_centroids.hpp"
14#include "parallelize.hpp"
15
22namespace kmeans {
23
33
38 int num_threads = 1;
39};
40
66template<typename Index_, typename Data_, typename Cluster_, typename Float_, typename Matrix_ = Matrix<Index_, Data_> >
67class RefineLloyd final : public Refine<Index_, Data_, Cluster_, Float_, Matrix_> {
68private:
69 RefineLloydOptions my_options;
70
71public:
75 RefineLloyd(RefineLloydOptions options) : my_options(std::move(options)) {}
76
80 RefineLloyd() = default;
81
82public:
88 return my_options;
89 }
90
91public:
95 Details<Index_> run(const Matrix_& data, const Cluster_ ncenters, Float_* const centers, Cluster_* const clusters) const {
96 const auto nobs = data.num_observations();
97 if (internal::is_edge_case(nobs, ncenters)) {
98 return internal::process_edge_case(data, ncenters, centers, clusters);
99 }
100
101 auto sizes = sanisizer::create<std::vector<Index_> >(ncenters);
102 auto copy = sanisizer::create<std::vector<Cluster_> >(nobs);
103
104 const auto ndim = data.num_dimensions();
105 internal::QuickSearch<Float_, Cluster_> index;
106
107 decltype(I(my_options.max_iterations)) iter = 0;
108 for (; iter < my_options.max_iterations; ++iter) {
109 index.reset(ndim, ncenters, centers);
110 parallelize(my_options.num_threads, nobs, [&](const int, const Index_ start, const Index_ length) -> void {
111 auto work = data.new_extractor(start, length);
112 for (Index_ obs = start, end = start + length; obs < end; ++obs) {
113 const auto dptr = work->get_observation();
114 copy[obs] = index.find(dptr);
115 }
116 });
117
118 // Checking if it already converged.
119 bool updated = false;
120 for (Index_ obs = 0; obs < nobs; ++obs) {
121 if (copy[obs] != clusters[obs]) {
122 updated = true;
123 break;
124 }
125 }
126 if (!updated) {
127 break;
128 }
129 std::copy(copy.begin(), copy.end(), clusters);
130
131 std::fill(sizes.begin(), sizes.end(), 0);
132 for (Index_ obs = 0; obs < nobs; ++obs) {
133 ++sizes[clusters[obs]];
134 }
135 internal::compute_centroids(data, ncenters, centers, clusters, sizes);
136 }
137
138 int status = 0;
139 if (iter == my_options.max_iterations) {
140 status = 2;
141 } else {
142 ++iter; // make it 1-based.
143 }
144 return Details<Index_>(std::move(sizes), iter, status);
145 }
149};
150
151}
152
153#endif
Report detailed clustering statistics.
Interface for k-means refinement.
Implements the Lloyd algorithm for k-means clustering.
Definition RefineLloyd.hpp:67
RefineLloyd(RefineLloydOptions options)
Definition RefineLloyd.hpp:75
RefineLloydOptions & get_options()
Definition RefineLloyd.hpp:87
Interface for k-means refinement algorithms.
Definition Refine.hpp:30
virtual Details< Index_ > run(const Matrix_ &data, Cluster_ num_centers, Float_ *centers, Cluster_ *clusters) const =0
Perform k-means clustering.
Definition compute_wcss.hpp:16
void parallelize(const int num_workers, const Task_ num_tasks, Run_ run_task_range)
Definition parallelize.hpp:28
Utilities for parallelization.
Additional statistics from the k-means algorithm.
Definition Details.hpp:20
Options for RefineLloyd.
Definition RefineLloyd.hpp:27
int num_threads
Definition RefineLloyd.hpp:38
int max_iterations
Definition RefineLloyd.hpp:32