initial import

This commit is contained in:
Benjamin Delespierre 2014-10-08 17:03:58 +02:00
commit 6a2a38f64f
6 changed files with 624 additions and 0 deletions

116
README.md Normal file
View File

@ -0,0 +1,116 @@
# PHP K-Means
_Clustering made simple_
<blockquote>k-means clustering is a method of vector quantization, originally from signal processing, that is popular for cluster analysis in data mining. k-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster. This results in a partitioning of the data space into Voronoi cells.</blockquote>
Read more on [Wikipedia](http://en.wikipedia.org/wiki/K-means_clustering)
PHP K-Means, as its name suggests, is an implementation of the K-Means and K-Means++ algorithms for the PHP platform. It works with an unlimited number of dimensions.
## Usage
Given the following points of R²
```PHP
$points = [
[80,55],[86,59],[19,85],[41,47],[57,58],
[76,22],[94,60],[13,93],[90,48],[52,54],
[62,46],[88,44],[85,24],[63,14],[51,40],
[75,31],[86,62],[81,95],[47,22],[43,95],
[71,19],[17,65],[69,21],[59,60],[59,12],
[15,22],[49,93],[56,35],[18,20],[39,59],
[50,15],[81,36],[67,62],[32,15],[75,65],
[10,47],[75,18],[13,45],[30,62],[95,79],
[64,11],[92,14],[94,49],[39,13],[60,68],
[62,10],[74,44],[37,42],[97,60],[47,73],
];
```
We want to find 3 clusters:
```PHP
// create a 2-dimensional space and fill it
$space = new KMeans\Space(2);
foreach ($points as $point)
$space->addPoint($point);
// resolve 3 clusters
$clusters = $space->solve(3);
```
Now we can retrieve each cluster's centroid (the mean of its points) and all the points in it:
```PHP
foreach ($clusters as $i => $cluster)
printf("Cluster %d [%d,%d]: %d points\n", $i, $cluster[0], $cluster[1], count($cluster));
```
Example of output:
```
Cluster 0 [79,58]: 18 points
Cluster 1 [57,19]: 19 points
Cluster 2 [31,66]: 13 points
```
### Heads up!
K-Means algorithm is non-deterministic so you may get different results when running it multiple times with the same data. The more points you add in the space, the more accurate the result will be.
You are strongly advised to read the Wikipedia article thoroughly before using this library.
## K-Means++
When calling the `KMeans\Space::solve` method, you may provide an alternative seeding method in order to initialize the clusters with the [David Arthur and Sergei Vassilvitskii algorithm](http://en.wikipedia.org/wiki/K-means%2B%2B), which avoids poor clustering results.
```PHP
// resolve 3 clusters using David Arthur and Sergei Vassilvitskii seeding algorithm
$clusters = $space->solve(3, KMeans\Space::SEED_DASV);
```
## Howto
### Get coordinates of a point/cluster:
```PHP
$x = $point[0];
$y = $point[1];
// or
list($x,$y) = $point->getCoordinates();
```
### List all points of a space/cluster:
```PHP
foreach ($cluster as $point)
printf('[%d,%d]', $point[0], $point[1]);
```
### Attach data to a point:
```PHP
$space->addPoint($coordinate, $data);
```
### Retrieve point data:
```PHP
$data = $space[$point];
```
### Watch the algorithm run
Each iteration step can be monitored using a callback function passed to `KMeans\Space::solve`:
```PHP
$clusters = $space->solve(3, KMeans\Space::SEED_DEFAULT, function($space, $clusters) {
static $iterations = 0;
printf("Iteration: %d\n", ++$iterations);
foreach ($clusters as $i => $cluster)
printf("Cluster %d [%d,%d]: %d points\n", $i, $cluster[0], $cluster[1], count($cluster));
});
```

16
composer.json Normal file
View File

@ -0,0 +1,16 @@
{
"name": "bdelespierre/php-kmeans",
"type": "library",
"description": "K-Means algorithm for PHP",
"keywords": ["php", "kmeans", "kmeans++", "utility"],
"license": "LGPL",
"authors": [
{ "name": "Benjamin Delespierre", "email": "benjamin.delespierre@gmail.com" }
],
"require": {
"php": ">=5.4.0"
},
"autoload": {
"psr-0": { "KMeans": "src/" }
}
}

34
demo.php Normal file
View File

@ -0,0 +1,34 @@
<?php
// Demo script: clusters 50 two-dimensional points into 3 groups and prints,
// for each cluster, its centroid coordinates and the number of attached points.

// include the library; paths are anchored on this file's directory so the demo
// also works when it is launched from another working directory
require_once __DIR__ . "/src/KMeans/Space.php";
require_once __DIR__ . "/src/KMeans/Point.php";
require_once __DIR__ . "/src/KMeans/Cluster.php";

// prepare 50 2D points to be clustered
$points = [
    [80,55],[86,59],[19,85],[41,47],[57,58],
    [76,22],[94,60],[13,93],[90,48],[52,54],
    [62,46],[88,44],[85,24],[63,14],[51,40],
    [75,31],[86,62],[81,95],[47,22],[43,95],
    [71,19],[17,65],[69,21],[59,60],[59,12],
    [15,22],[49,93],[56,35],[18,20],[39,59],
    [50,15],[81,36],[67,62],[32,15],[75,65],
    [10,47],[75,18],[13,45],[30,62],[95,79],
    [64,11],[92,14],[94,49],[39,13],[60,68],
    [62,10],[74,44],[37,42],[97,60],[47,73],
];

// create a 2-dimensional space
$space = new KMeans\Space(2);

// add points to space
foreach ($points as $coordinates) {
    $space->addPoint($coordinates);
}

// cluster these 50 points in 3 clusters
$clusters = $space->solve(3);

// display the cluster centers and attached points
// (%d truncates the centroid coordinates to integers for display)
foreach ($clusters as $i => $cluster) {
    printf("Cluster %s [%d,%d]: %d points\n", $i, $cluster[0], $cluster[1], count($cluster));
}

106
src/KMeans/Cluster.php Normal file
View File

@ -0,0 +1,106 @@
<?php
/**
* This file is part of PHP K-Means
*
* Copyright (c) 2014 Benjamin Delespierre
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
namespace KMeans;
use \IteratorAggregate;
use \Countable;
use \SplObjectStorage;
use \InvalidArgumentException;
use \LogicException;
/**
 * A cluster is a Point (its own coordinates are the centroid) that also keeps
 * the set of points currently assigned to it.
 *
 * Iterating a cluster yields its points; count() gives how many it holds.
 */
class Cluster extends Point implements IteratorAggregate, Countable
{
    /** @var Space the space this cluster was created in */
    protected $space;

    /** @var SplObjectStorage the points currently attached to this cluster */
    protected $points;

    /**
     * @param Space $space       space the cluster belongs to
     * @param array $coordinates initial centroid coordinates
     */
    public function __construct(Space $space, array $coordinates)
    {
        parent::__construct($space, $coordinates);
        $this->points = new SplObjectStorage;
    }

    /**
     * Exports the cluster as a plain array.
     *
     * @return array ['centroid' => Point::toArray() of the centroid,
     *                'points'   => list of Point::toArray() for each attached point]
     */
    public function toArray()
    {
        $points = [];
        foreach ($this->points as $point) {
            $points[] = $point->toArray();
        }

        return [
            'centroid' => parent::toArray(),
            'points' => $points,
        ];
    }

    /**
     * Attaches a single point to this cluster.
     *
     * @param Point $point
     * @return Point the attached point
     * @throws LogicException when $point is itself a cluster
     */
    public function attach(Point $point)
    {
        if ($point instanceof self) {
            throw new LogicException("cannot attach a cluster to another");
        }

        $this->points->attach($point);

        return $point;
    }

    /**
     * Detaches a single point from this cluster.
     *
     * @param Point $point
     * @return Point the detached point
     */
    public function detach(Point $point)
    {
        $this->points->detach($point);

        return $point;
    }

    /**
     * Attaches every point contained in the given storage.
     *
     * @param SplObjectStorage $points
     */
    public function attachAll(SplObjectStorage $points)
    {
        $this->points->addAll($points);
    }

    /**
     * Detaches every point contained in the given storage.
     *
     * @param SplObjectStorage $points
     */
    public function detachAll(SplObjectStorage $points)
    {
        $this->points->removeAll($points);
    }

    /**
     * Moves the centroid to the arithmetic mean of the attached points.
     *
     * An empty cluster keeps its current coordinates (there is nothing to
     * average and dividing by zero must be avoided).
     */
    public function updateCentroid()
    {
        $count = count($this->points);
        if (!$count) {
            return;
        }

        // per-dimension sum of the attached points' coordinates
        $sums = array_fill(0, $this->dimention, 0);
        foreach ($this->points as $point) {
            for ($n = 0; $n < $this->dimention; $n++) {
                $sums[$n] += $point->coordinates[$n];
            }
        }

        for ($n = 0; $n < $this->dimention; $n++) {
            $this->coordinates[$n] = $sums[$n] / $count;
        }
    }

    /**
     * Returns the storage of attached points so the cluster can be used in foreach.
     *
     * The attribute below is read as a plain comment before PHP 8 and silences
     * the missing-return-type deprecation notice on PHP 8.1+.
     *
     * @return SplObjectStorage
     */
    #[\ReturnTypeWillChange]
    public function getIterator()
    {
        return $this->points;
    }

    /**
     * Number of points attached to this cluster.
     *
     * @return int
     */
    #[\ReturnTypeWillChange]
    public function count()
    {
        return count($this->points);
    }
}

120
src/KMeans/Point.php Normal file
View File

@ -0,0 +1,120 @@
<?php
/**
* This file is part of PHP K-Means
*
* Copyright (c) 2014 Benjamin Delespierre
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
namespace KMeans;
use \ArrayAccess;
use \LogicException;
/**
 * A point of a Space, addressable like an array: $point[0] is its first coordinate.
 */
class Point implements ArrayAccess
{
    /** @var Space the space this point was created in */
    protected $space;

    /** @var int number of dimensions, copied from the space at construction */
    protected $dimention;

    /** @var array the coordinates, indexed from 0 */
    protected $coordinates;

    /**
     * @param Space $space       space the point belongs to
     * @param array $coordinates coordinates of the point (stored as given, not validated here)
     */
    public function __construct(Space $space, array $coordinates)
    {
        $this->space = $space;
        $this->dimention = $space->getDimention();
        $this->coordinates = $coordinates;
    }

    /**
     * Exports the point as a plain array.
     *
     * @return array ['coordinates' => array, 'data' => value stored for this point
     *               in its space, or null when isset() on the space reports nothing]
     */
    public function toArray()
    {
        return [
            'coordinates' => $this->coordinates,
            'data' => isset($this->space[$this]) ? $this->space[$this] : null,
        ];
    }

    /**
     * Computes the SQUARED euclidean distance to another point.
     *
     * No square root is taken: the value is the sum of the squared coordinate
     * differences, which is enough to compare distances with each other.
     *
     * @param Point $point
     * @return int|float squared distance
     * @throws LogicException when the two points belong to different spaces
     */
    public function getDistanceWith(self $point)
    {
        if ($point->space !== $this->space) {
            throw new LogicException("can only calculate distances from points in the same space");
        }

        $distance = 0;
        for ($n = 0; $n < $this->dimention; $n++) {
            $difference = $this->coordinates[$n] - $point->coordinates[$n];
            $distance += $difference * $difference;
        }

        return $distance;
    }

    /**
     * Finds, among the given points, the one closest to this point.
     *
     * On equal distances the first candidate encountered wins.
     *
     * @param array|\Traversable $points candidates (Point instances)
     * @return Point|null the closest candidate, or null when $points is empty
     */
    public function getClosest($points)
    {
        // explicit defaults so an empty candidate list yields null instead of
        // reading an undefined variable
        $minPoint = null;
        $minDistance = null;

        foreach ($points as $point) {
            $distance = $this->getDistanceWith($point);

            if ($minDistance === null || $distance < $minDistance) {
                $minDistance = $distance;
                $minPoint = $point;
            }
        }

        return $minPoint;
    }

    /**
     * Tells whether this point was created in the given space.
     *
     * @param Space $space
     * @return bool
     */
    public function belongsTo(Space $space)
    {
        return $this->space === $space;
    }

    /**
     * @return Space the space this point was created in
     */
    public function getSpace()
    {
        return $this->space;
    }

    /**
     * @return array the coordinates of the point
     */
    public function getCoordinates()
    {
        return $this->coordinates;
    }

    // ArrayAccess implementation, mapping offsets onto coordinates.
    // The #[\ReturnTypeWillChange] attribute is read as a plain comment before
    // PHP 8 and silences the missing-return-type deprecation on PHP 8.1+.

    /**
     * @param mixed $offset coordinate index
     * @return bool whether a non-null coordinate exists at that index
     */
    #[\ReturnTypeWillChange]
    public function offsetExists($offset)
    {
        return isset($this->coordinates[$offset]);
    }

    /**
     * @param mixed $offset coordinate index
     * @return mixed the coordinate stored at that index
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($offset)
    {
        return $this->coordinates[$offset];
    }

    /**
     * @param mixed $offset coordinate index
     * @param mixed $value  new coordinate value
     */
    #[\ReturnTypeWillChange]
    public function offsetSet($offset, $value)
    {
        $this->coordinates[$offset] = $value;
    }

    /**
     * @param mixed $offset coordinate index to remove
     */
    #[\ReturnTypeWillChange]
    public function offsetUnset($offset)
    {
        unset($this->coordinates[$offset]);
    }
}

232
src/KMeans/Space.php Normal file
View File

@ -0,0 +1,232 @@
<?php
/**
* This file is part of PHP K-Means
*
* Copyright (c) 2014 Benjamin Delespierre
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is furnished
* to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
namespace KMeans;
use \SplObjectStorage;
use \LogicException;
use \InvalidArgumentException;
/**
 * A space of fixed dimension holding points (as an SplObjectStorage, each point
 * optionally carrying user data) and able to partition them into clusters.
 */
class Space extends SplObjectStorage
{
    // Default seeding method, initial cluster centroids are randomly chosen
    const SEED_DEFAULT = 1;

    // Alternative seeding method by David Arthur and Sergei Vassilvitskii
    // (see http://en.wikipedia.org/wiki/K-means++)
    const SEED_DASV = 2;

    /** @var int number of dimensions of the space */
    protected $dimention;

    /**
     * @param int $dimention number of dimensions, at least 1
     * @throws LogicException when $dimention is lower than 1
     */
    public function __construct($dimention)
    {
        if ($dimention < 1) {
            throw new LogicException("a space dimention cannot be null or negative");
        }

        $this->dimention = $dimention;
    }

    /**
     * Exports the space as a plain array.
     *
     * @return array ['points' => list of Point::toArray() for each stored point]
     */
    public function toArray()
    {
        $points = [];
        foreach ($this as $point) {
            $points[] = $point->toArray();
        }

        return ['points' => $points];
    }

    /**
     * Creates a point bound to this space WITHOUT storing it in the space.
     *
     * @param array $coordinates one value per dimension
     * @return Point
     * @throws LogicException when the number of coordinates differs from the dimension
     */
    public function newPoint(array $coordinates)
    {
        if (count($coordinates) != $this->dimention) {
            throw new LogicException("(" . implode(',', $coordinates) . ") is not a point of this space");
        }

        return new Point($this, $coordinates);
    }

    /**
     * Creates a point and stores it in the space.
     *
     * @param array $coordinates one value per dimension
     * @param mixed $data        optional data to associate with the point
     * @return Point the newly stored point (usable later as $space[$point])
     * @throws LogicException when the number of coordinates differs from the dimension
     */
    public function addPoint(array $coordinates, $data = null)
    {
        $point = $this->newPoint($coordinates);
        $this->attach($point, $data);

        return $point;
    }

    /**
     * Stores a point in the space; anything that is not a Point is rejected.
     *
     * The attribute below is read as a plain comment before PHP 8 and silences
     * the missing-return-type deprecation notice on PHP 8.1+.
     *
     * @param Point $point
     * @param mixed $data optional data to associate with the point
     * @throws InvalidArgumentException when $point is not a Point
     */
    #[\ReturnTypeWillChange]
    public function attach($point, $data = null)
    {
        if (!$point instanceof Point) {
            throw new InvalidArgumentException("can only attach points to spaces");
        }

        return parent::attach($point, $data);
    }

    /**
     * @return int number of dimensions of the space
     */
    public function getDimention()
    {
        return $this->dimention;
    }

    /**
     * Computes the bounding box of the stored points.
     *
     * @return array|false [Point $min, Point $max] holding the per-dimension
     *                     minimum and maximum, or false when the space is empty
     */
    public function getBoundaries()
    {
        if (!count($this)) {
            return false;
        }

        // null marks "no value seen yet" for a dimension
        $min = $this->newPoint(array_fill(0, $this->dimention, null));
        $max = $this->newPoint(array_fill(0, $this->dimention, null));

        foreach ($this as $point) {
            for ($n = 0; $n < $this->dimention; $n++) {
                if ($min[$n] === null || $min[$n] > $point[$n]) {
                    $min[$n] = $point[$n];
                }
                if ($max[$n] === null || $max[$n] < $point[$n]) {
                    $max[$n] = $point[$n];
                }
            }
        }

        return [$min, $max];
    }

    /**
     * Creates a random point (not stored in the space) whose coordinates are
     * integers drawn with rand() between the two given corners.
     *
     * @param Point $min lower corner
     * @param Point $max upper corner
     * @return Point
     */
    public function getRandomPoint(Point $min, Point $max)
    {
        $coordinates = [];
        for ($n = 0; $n < $this->dimention; $n++) {
            // rand() only accepts integers: truncate explicitly rather than
            // relying on an implicit float-to-int conversion
            $coordinates[$n] = rand((int) $min[$n], (int) $max[$n]);
        }

        return $this->newPoint($coordinates);
    }

    /**
     * Partitions the stored points into clusters.
     *
     * @param int           $nbClusters        number of clusters wanted, at least 1
     * @param int           $seed              self::SEED_DEFAULT or self::SEED_DASV
     * @param callable|null $iterationCallback called before every iteration with
     *                                         ($space, $clusters)
     * @return Cluster[]|Cluster the list of clusters; when exactly one cluster is
     *                           requested, that single Cluster is returned directly
     *                           instead of a one-element array (historical behavior
     *                           kept for backward compatibility)
     * @throws InvalidArgumentException on an invalid callback, cluster count or seed
     * @throws LogicException when the space holds no point
     */
    public function solve($nbClusters, $seed = self::SEED_DEFAULT, $iterationCallback = null)
    {
        if ($iterationCallback && !is_callable($iterationCallback)) {
            throw new InvalidArgumentException("invalid iteration callback");
        }

        // initialize K clusters
        $clusters = $this->initializeClusters($nbClusters, $seed);

        // there's only one cluster, clusterization has no meaning: every point
        // is already attached to it, only its centroid needs to become their mean
        if (count($clusters) == 1) {
            $clusters[0]->updateCentroid();
            return $clusters[0];
        }

        // until convergence is reached
        do {
            // call_user_func also supports "Class::method" strings on old PHP versions
            $iterationCallback && call_user_func($iterationCallback, $this, $clusters);
        } while ($this->iterate($clusters));

        // clustering is done.
        return $clusters;
    }

    /**
     * Creates the initial clusters and attaches every point to the first one.
     *
     * @param int $nbClusters number of clusters to create
     * @param int $seed       self::SEED_DEFAULT or self::SEED_DASV
     * @return Cluster[]
     * @throws InvalidArgumentException on a non-positive cluster count or unknown seed
     * @throws LogicException when the space holds no point
     */
    protected function initializeClusters($nbClusters, $seed)
    {
        if ($nbClusters <= 0) {
            throw new InvalidArgumentException("invalid clusters number");
        }

        // both seeding methods need at least one point to work from
        if (!count($this)) {
            throw new LogicException("cannot initialize clusters in an empty space");
        }

        switch ($seed) {
            // the default seeding method chooses completely random centroids
            case self::SEED_DEFAULT:
                $clusters = $this->seedRandomClusters($nbClusters);
                break;

            // the DASV seeding method consists of finding good initial centroids for the clusters
            case self::SEED_DASV:
                $clusters = $this->seedDasvClusters($nbClusters);
                break;

            default:
                throw new InvalidArgumentException("invalid seeding method");
        }

        // assign all points to the first cluster
        $clusters[0]->attachAll($this);

        return $clusters;
    }

    /**
     * Default seeding: centroids are random points within the space boundaries.
     *
     * @param int $nbClusters
     * @return Cluster[]
     */
    protected function seedRandomClusters($nbClusters)
    {
        // get the space boundaries to avoid placing cluster centroids too far from points
        list($min, $max) = $this->getBoundaries();

        $clusters = [];
        for ($n = 0; $n < $nbClusters; $n++) {
            $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates());
        }

        return $clusters;
    }

    /**
     * DASV (k-means++) seeding: the first centroid is a random stored point, each
     * following one is a stored point drawn with a probability weighted by its
     * squared distance to the closest centroid already chosen.
     *
     * @param int $nbClusters
     * @return Cluster[]
     */
    protected function seedDasvClusters($nbClusters)
    {
        $clusters = [];

        // find a random point
        $position = rand(1, count($this));
        $rank = 0;
        foreach ($this as $point) {
            if (++$rank == $position) {
                $clusters[] = new Cluster($this, $point->getCoordinates());
                break;
            }
        }

        // retains the distances between points and their closest clusters
        $distances = new SplObjectStorage;

        // create the remaining clusters
        for ($i = 1; $i < $nbClusters; $i++) {
            $sum = 0;

            // for each point, get the distance with the closest centroid already chosen
            foreach ($this as $point) {
                $distance = $point->getDistanceWith($point->getClosest($clusters));
                $distances[$point] = $distance;
                $sum += $distance;
            }

            // choose a new random point using a weighted probability distribution;
            // the threshold is a float so that fractional distances keep their weight
            $threshold = $sum * mt_rand() / mt_getrandmax();
            $chosen = null;
            foreach ($this as $point) {
                // remembering every visited point makes the last one the fallback
                // should rounding errors keep the threshold slightly positive
                $chosen = $point;
                if (($threshold -= $distances[$point]) <= 0) {
                    break;
                }
            }

            $clusters[] = new Cluster($this, $chosen->getCoordinates());
        }

        return $clusters;
    }

    /**
     * Runs one k-means step: moves each point to its closest cluster, then
     * recomputes every centroid.
     *
     * @param Cluster[] $clusters
     * @return bool true when at least one point changed cluster (another
     *              iteration is needed), false otherwise
     */
    protected function iterate($clusters)
    {
        $continue = false;

        // migration storages: cluster => storage of points to attach / detach.
        // Moves are recorded first and applied afterwards so the clusters are
        // not modified while they are being iterated.
        $attach = new SplObjectStorage;
        $detach = new SplObjectStorage;

        // calculate proximity amongst points and clusters
        foreach ($clusters as $cluster) {
            foreach ($cluster as $point) {
                // find the closest cluster
                $closest = $point->getClosest($clusters);

                if ($closest === $cluster) {
                    continue;
                }

                // move the point from its old cluster to its closest
                if (!isset($attach[$closest])) {
                    $attach[$closest] = new SplObjectStorage;
                }
                if (!isset($detach[$cluster])) {
                    $detach[$cluster] = new SplObjectStorage;
                }

                $attach[$closest]->attach($point);
                $detach[$cluster]->attach($point);
                $continue = true;
            }
        }

        // perform points migrations
        foreach ($attach as $cluster) {
            $cluster->attachAll($attach[$cluster]);
        }
        foreach ($detach as $cluster) {
            $cluster->detachAll($detach[$cluster]);
        }

        // update all clusters' centroids
        foreach ($clusters as $cluster) {
            $cluster->updateCentroid();
        }

        return $continue;
    }
}