kmeans
k-means clustering in C++
|
Perform k-means clustering. More...
Classes | |
class | ConsecutiveAccessExtractor |
Extractor for accessing consecutive observations. More... | |
struct | Details |
Additional statistics from the k-means algorithm. More... | |
class | IndexedAccessExtractor |
Extractor for accessing indexed observations. More... | |
class | Initialize |
Interface for k-means initialization algorithms. More... | |
class | InitializeKmeanspp |
k-means++ initialization of Arthur and Vassilvitskii (2007). More... | |
struct | InitializeKmeansppOptions |
Options for InitializeKmeanspp . More... | |
class | InitializeNone |
No-op "initialization" with existing cluster centers. More... | |
class | InitializeRandom |
Initialize by sampling random observations without replacement. More... | |
struct | InitializeRandomOptions |
Options for InitializeRandom . More... | |
class | InitializeVariancePartition |
Implements the variance partitioning method of Su and Dy (2007). More... | |
struct | InitializeVariancePartitionOptions |
Options for InitializeVariancePartition . More... | |
class | Matrix |
Interface for matrix data. More... | |
class | RandomAccessExtractor |
Extractor for accessing random observations. More... | |
class | Refine |
Interface for k-means refinement algorithms. More... | |
class | RefineHartiganWong |
Implements the Hartigan-Wong algorithm for k-means clustering. More... | |
struct | RefineHartiganWongOptions |
Options for RefineHartiganWong . More... | |
class | RefineLloyd |
Implements the Lloyd algorithm for k-means clustering. More... | |
struct | RefineLloydOptions |
Options for RefineLloyd . More... | |
class | RefineMiniBatch |
Implements the mini-batch algorithm for k-means clustering. More... | |
struct | RefineMiniBatchOptions |
Options for RefineMiniBatch . More... | |
struct | Results |
Results of the k-means clustering. More... | |
class | SimpleMatrix |
A simple matrix of observations. More... | |
Typedefs | |
typedef std::mt19937_64 | InitializeKmeansppRng |
typedef std::mt19937_64 | InitializeRandomRng |
typedef std::mt19937_64 | RefineMiniBatchRng |
Functions | |
template<class Matrix_ , typename Cluster_ , typename Float_ > | |
void | compute_wcss (const Matrix_ &data, const Cluster_ num_centers, const Float_ *const centers, const Cluster_ *const clusters, Float_ *const wcss) |
template<typename Index_ , typename Data_ , typename Cluster_ , typename Float_ , class Matrix_ = Matrix<Index_, Data_>> | |
Details< Index_ > | compute (const Matrix_ &data, const Initialize< Index_, Data_, Cluster_, Float_, Matrix_ > &initialize, const Refine< Index_, Data_, Cluster_, Float_, Matrix_ > &refine, Cluster_ num_centers, Float_ *centers, Cluster_ *clusters) |
template<typename Index_ , typename Data_ , typename Cluster_ , typename Float_ > | |
Details< Index_ > | compute (const Matrix< Index_, Data_ > &data, const Initialize< Index_, Data_, Cluster_, Float_, Matrix< Index_, Data_ > > &initialize, const Refine< Index_, Data_, Cluster_, Float_, Matrix< Index_, Data_ > > &refine, const Cluster_ num_centers, Float_ *const centers, Cluster_ *const clusters) |
template<typename Index_ , typename Data_ , typename Cluster_ , typename Float_ , class Matrix_ = Matrix<Index_, Data_>> | |
Results< Index_, Cluster_, Float_ > | compute (const Matrix_ &data, const Initialize< Index_, Data_, Cluster_, Float_, Matrix_ > &initialize, const Refine< Index_, Data_, Cluster_, Float_, Matrix_ > &refine, const Cluster_ num_centers) |
template<typename Index_ , typename Data_ , typename Cluster_ , typename Float_ > | |
Results< Index_, Cluster_, Float_ > | compute (const Matrix< Index_, Data_ > &data, const Initialize< Index_, Data_, Cluster_, Float_, Matrix< Index_, Data_ > > &initialize, const Refine< Index_, Data_, Cluster_, Float_, Matrix< Index_, Data_ > > &refine, const Cluster_ num_centers) |
template<typename Task_ , class Run_ > | |
void | parallelize (const int num_workers, const Task_ num_tasks, Run_ run_task_range) |
Perform k-means clustering.
typedef std::mt19937_64 kmeans::InitializeKmeansppRng |
Type of the pseudo-random number generator for InitializeKmeanspp
.
typedef std::mt19937_64 kmeans::InitializeRandomRng |
Type of the pseudo-random number generator for InitializeRandom
.
typedef std::mt19937_64 kmeans::RefineMiniBatchRng |
Type of the pseudo-random number generator for RefineMiniBatch
.
Results< Index_, Cluster_, Float_ > kmeans::compute | ( | const Matrix< Index_, Data_ > & | data, |
const Initialize< Index_, Data_, Cluster_, Float_, Matrix< Index_, Data_ > > & | initialize, | ||
const Refine< Index_, Data_, Cluster_, Float_, Matrix< Index_, Data_ > > & | refine, | ||
const Cluster_ | num_centers ) |
Overload of compute()
to assist template deduction for the default Matrix
. This allocates and returns the vectors for the centroids and cluster assignments.
Index_ | Integer type of the observation indices. |
Data_ | Numeric type of the input dataset. |
Cluster_ | Integer type of the cluster assignments. |
Float_ | Floating-point type of the centroids. |
data | A matrix containing data for each observation. |
initialize | Initialization method to use. |
refine | Refinement method to use. |
num_centers | Number of cluster centers. |
Details< Index_ > kmeans::compute | ( | const Matrix< Index_, Data_ > & | data, |
const Initialize< Index_, Data_, Cluster_, Float_, Matrix< Index_, Data_ > > & | initialize, | ||
const Refine< Index_, Data_, Cluster_, Float_, Matrix< Index_, Data_ > > & | refine, | ||
const Cluster_ | num_centers, | ||
Float_ *const | centers, | ||
Cluster_ *const | clusters ) |
Overload of compute()
to assist template deduction for the default Matrix
.
Index_ | Integer type of the observation indices. |
Data_ | Numeric type of the input dataset. |
Cluster_ | Integer type of the cluster assignments. |
Float_ | Floating-point type of the centroids. |
data | A matrix containing data for each observation. | |
initialize | Initialization method to use. | |
refine | Refinement method to use. | |
num_centers | Number of cluster centers. | |
[out] | centers | Pointer to an array of length equal to the product of num_centers and data.num_dimensions() . This contains a column-major matrix where rows correspond to dimensions and columns correspond to cluster centers. On output, each column should contain the initial centroid location for its cluster. |
[in] | clusters | Pointer to an array of length equal to the number of observations (from data.num_observations() ). On output, this will contain the 0-based cluster assignment for each observation, where each entry is less than num_centers . |
Details< Index_ > kmeans::compute | ( | const Matrix_ & | data, |
const Initialize< Index_, Data_, Cluster_, Float_, Matrix_ > & | initialize, | ||
const Refine< Index_, Data_, Cluster_, Float_, Matrix_ > & | refine, | ||
Cluster_ | num_centers, | ||
Float_ * | centers, | ||
Cluster_ * | clusters ) |
Index_ | Integer type of the observation indices. This should be the same as the index type of Matrix_ . |
Data_ | Numeric type of the input dataset. This should be the same as the data type of Matrix_ . |
Cluster_ | Integer type of the cluster assignments. |
Float_ | Floating-point type of the centroids. |
Matrix_ | Class satisfying the Matrix interface. |
data | A matrix containing data for each observation. | |
initialize | Initialization method to use. | |
refine | Refinement method to use. | |
num_centers | Number of cluster centers. | |
[out] | centers | Pointer to an array of length equal to the product of num_centers and data.num_dimensions() . This contains a column-major matrix where rows correspond to dimensions and columns correspond to cluster centers. On output, each column should contain the initial centroid location for its cluster. |
[in] | clusters | Pointer to an array of length equal to the number of observations (from data.num_observations() ). On output, this will contain the 0-based cluster assignment for each observation, where each entry is less than num_centers . |
Results< Index_, Cluster_, Float_ > kmeans::compute | ( | const Matrix_ & | data, |
const Initialize< Index_, Data_, Cluster_, Float_, Matrix_ > & | initialize, | ||
const Refine< Index_, Data_, Cluster_, Float_, Matrix_ > & | refine, | ||
const Cluster_ | num_centers ) |
Overload of compute()
that allocates and returns the vectors for the centroids and cluster assignments.
Index_ | Integer type of the observation indices. This should be the same as the index type of Matrix_ . |
Data_ | Numeric type of the input dataset. This should be the same as the data type of Matrix_ . |
Cluster_ | Integer type of the cluster assignments. |
Float_ | Floating-point type of the centroids. |
Matrix_ | Class satisfying the Matrix interface. |
data | A matrix containing data for each observation. |
initialize | Initialization method to use. |
refine | Refinement method to use. |
num_centers | Number of cluster centers. |
void kmeans::compute_wcss | ( | const Matrix_ & | data, |
const Cluster_ | num_centers, | ||
const Float_ *const | centers, | ||
const Cluster_ *const | clusters, | ||
Float_ *const | wcss ) |
Matrix_ | Class satisfying the Matrix interface. |
Cluster_ | Integer type of the cluster assignments. |
Float_ | Floating-point type of the centers and output. |
data | A matrix containing data for each observation. | |
num_centers | Number of cluster centers. | |
[in] | centers | Pointer to an array of length equal to the product of num_centers and data.num_dimensions() . This contains a column-major matrix where rows correspond to dimensions and columns correspond to cluster centers. Each column should contain the initial centroid location for its cluster. |
[in] | clusters | Pointer to an array of length equal to the number of observations (from data.num_observations() ). This should contain the 0-based cluster assignment for each observation, where each value is no greater than num_centers . |
[out] | wcss | Pointer to an array of length equal to num_centers . On output, this will contain the within-cluster sum of squares. |
void kmeans::parallelize | ( | const int | num_workers, |
const Task_ | num_tasks, | ||
Run_ | run_task_range ) |
Task_ | Integer type of the number of tasks. |
Run_ | Function to execute a range of tasks. |
num_workers | Number of workers. |
num_tasks | Number of tasks. |
run_task_range | Function to iterate over a range of tasks within a worker. |
By default, this is an alias to subpar::parallelize_range()
. However, if the KMEANS_CUSTOM_PARALLEL
function-like macro is defined, it is called instead. Any user-defined macro should accept the same arguments as subpar::parallelize_range()
.