commit 6a2a38f64fbc3c0ca048b512db3aa0bdc11f9163 Author: Benjamin Delespierre Date: Wed Oct 8 17:03:58 2014 +0200 initial import diff --git a/README.md b/README.md new file mode 100644 index 0000000..15de5d9 --- /dev/null +++ b/README.md @@ -0,0 +1,116 @@ +# PHP K-Means +_Clustering made simple_ + +k-means clustering is a method of vector quantization, originally from signal processing, that is popular for cluster analysis in data mining. k-means clustering aims to partition n observations into k clusters in which each observation belongs to the cluster with the nearest mean, serving as a prototype of the cluster. This results in a partitioning of the data space into Voronoi cells. + +Read more on [Wikipedia](http://en.wikipedia.org/wiki/K-means_clustering) + +PHP K-Means, as its name suggests, is an implementation of the K-Means and K-Means++ algorithms for the PHP platform. It works with an unlimited number of dimensions. + +## Usage + +Given the following points of R² + +```PHP +$points = [ + [80,55],[86,59],[19,85],[41,47],[57,58], + [76,22],[94,60],[13,93],[90,48],[52,54], + [62,46],[88,44],[85,24],[63,14],[51,40], + [75,31],[86,62],[81,95],[47,22],[43,95], + [71,19],[17,65],[69,21],[59,60],[59,12], + [15,22],[49,93],[56,35],[18,20],[39,59], + [50,15],[81,36],[67,62],[32,15],[75,65], + [10,47],[75,18],[13,45],[30,62],[95,79], + [64,11],[92,14],[94,49],[39,13],[60,68], + [62,10],[74,44],[37,42],[97,60],[47,73], +]; +``` + +We want to find 3 clusters: + +```PHP +// create a 2-dimensional space and fill it +$space = new KMeans\Space(2); + +foreach ($points as $point) + $space->addPoint($point); + + // resolve 3 clusters +$clusters = $space->solve(3); +``` + +Now we can retrieve each cluster's centroid (the mean of its points) and all the points in it: + +```PHP +foreach ($clusters as $i => $cluster) + printf("Cluster %d [%d,%d]: %d points\n", $i, $cluster[0], $cluster[1], count($cluster)); +``` + +Example of output: + +``` +Cluster 0 [79,58]: 18 
points +Cluster 1 [57,19]: 19 points +Cluster 2 [31,66]: 13 points +``` + +### Heads up! + +The K-Means algorithm is non-deterministic, so you may get different results when running it multiple times with the same data. The more points you add to the space, the more accurate the result will be. + +You are strongly advised to read the Wikipedia article thoroughly before using this library. + +## K-Means++ + +When calling the `KMeans\Space::solve` method, you may provide an alternative seeding method in order to initialize the clusters with the [David Arthur and Sergei Vassilvitskii algorithm](http://en.wikipedia.org/wiki/K-means%2B%2B), which avoids poor clustering results. + +```PHP +// resolve 3 clusters using David Arthur and Sergei Vassilvitskii seeding algorithm +$clusters = $space->solve(3, KMeans\Space::SEED_DASV); +``` + +## Howto + +### Get coordinates of a point/cluster: +```PHP +$x = $point[0]; +$y = $point[1]; + +// or + +list($x,$y) = $point->getCoordinates(); +``` + +### List all points of a space/cluster: + +```PHP +foreach ($cluster as $point) + printf('[%d,%d]', $point[0], $point[1]); +``` + +### Attach data to a point: + +```PHP +$space->addPoint($coordinate, $data); +``` + +### Retrieve point data: + +```PHP +$data = $space[$point]; +``` + +### Watch the algorithm run + +Each iteration step can be monitored using a callback function passed to `KMeans\Space::solve`: + +```PHP +$clusters = $space->solve(3, KMeans\Space::SEED_DEFAULT, function($space, $clusters) { + static $iterations = 0; + + printf("Iteration: %d\n", ++$iterations); + + foreach ($clusters as $i => $cluster) + printf("Cluster %d [%d,%d]: %d points\n", $i, $cluster[0], $cluster[1], count($cluster)); +}); +``` \ No newline at end of file diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..ba5b7b6 --- /dev/null +++ b/composer.json @@ -0,0 +1,16 @@ +{ + "name": "bdelespierre/php-kmeans", + "type": "library", + "description": "K-Means algorithm for PHP", + 
"keywords": ["php", "kmeans", "kmeans++", "utility"], + "license": "LGPL", + "authors": [ + { "name": "Benjamin Delespierre", "email": "benjamin.delespierre@gmail.com" } + ], + "require": { + "php": ">=5.4.0" + }, + "autoload": { + "psr-0": { "KMeans": "src/" } + } +} \ No newline at end of file diff --git a/demo.php b/demo.php new file mode 100644 index 0000000..f1ad192 --- /dev/null +++ b/demo.php @@ -0,0 +1,34 @@ +addPoint($coordinates); + +// cluster these 50 points in 3 clusters +$clusters = $space->solve(3); + +// display the cluster centers and attached points +foreach ($clusters as $i => $cluster) + printf("Cluster %s [%d,%d]: %d points\n", $i, $cluster[0], $cluster[1], count($cluster)); diff --git a/src/KMeans/Cluster.php b/src/KMeans/Cluster.php new file mode 100644 index 0000000..48564e7 --- /dev/null +++ b/src/KMeans/Cluster.php @@ -0,0 +1,106 @@ +points = new SplObjectStorage; + } + + public function toArray() + { + $points = []; + foreach ($this->points as $point) + $points[] = $point->toArray(); + + return [ + 'centroid' => parent::toArray(), + 'points' => $points, + ]; + } + + public function attach(Point $point) + { + if ($point instanceof self) + throw new LogicException("cannot attach a cluster to another"); + + $this->points->attach($point); + return $point; + } + + public function detach(Point $point) + { + $this->points->detach($point); + return $point; + } + + public function attachAll(SplObjectStorage $points) + { + $this->points->addAll($points); + } + + public function detachAll(SplObjectStorage $points) + { + $this->points->removeAll($points); + } + + public function updateCentroid() + { + if (!$count = count($this->points)) + return; + + $centroid = $this->space->newPoint(array_fill(0, $this->dimention, 0)); + + foreach ($this->points as $point) + for ($n=0; $n<$this->dimention; $n++) + $centroid->coordinates[$n] += $point->coordinates[$n]; + + for ($n=0; $n<$this->dimention; $n++) + $this->coordinates[$n] = $centroid->coordinates[$n] / 
$count; + } + + public function getIterator() + { + return $this->points; + } + + public function count() + { + return count($this->points); + } +} \ No newline at end of file diff --git a/src/KMeans/Point.php b/src/KMeans/Point.php new file mode 100644 index 0000000..63a9137 --- /dev/null +++ b/src/KMeans/Point.php @@ -0,0 +1,120 @@ +space = $space; + $this->dimention = $space->getDimention(); + $this->coordinates = $coordinates; + } + + public function toArray() + { + return [ + 'coordinates' => $this->coordinates, + 'data' => isset($this->space[$this]) ? $this->space[$this] : null, + ]; + } + + public function getDistanceWith(self $point) + { + if ($point->space !== $this->space) + throw new LogicException("can only calculate distances from points in the same space"); + + $distance = 0; + for ($n=0; $n<$this->dimention; $n++) { + $difference = $this->coordinates[$n] - $point->coordinates[$n]; + $distance += $difference * $difference; + } + + return $distance; + } + + public function getClosest($points) + { + foreach($points as $point) { + $distance = $this->getDistanceWith($point); + + if (!isset($minDistance)) { + $minDistance = $distance; + $minPoint = $point; + continue; + } + + if ($distance < $minDistance) { + $minDistance = $distance; + $minPoint = $point; + } + } + + return $minPoint; + } + + public function belongsTo(Space $space) + { + return $this->space === $space; + } + + public function getSpace() + { + return $this->space; + } + + public function getCoordinates() + { + return $this->coordinates; + } + + public function offsetExists($offset) + { + return isset($this->coordinates[$offset]); + } + + public function offsetGet($offset) + { + return $this->coordinates[$offset]; + } + + public function offsetSet($offset, $value) + { + $this->coordinates[$offset] = $value; + } + + public function offsetUnset($offset) + { + unset($this->coordinates[$offset]); + } +} \ No newline at end of file diff --git a/src/KMeans/Space.php b/src/KMeans/Space.php new 
file mode 100644 index 0000000..0210c3f --- /dev/null +++ b/src/KMeans/Space.php @@ -0,0 +1,232 @@ +dimention = $dimention; + } + + public function toArray() + { + $points = []; + foreach ($this as $point) + $points[] = $point->toArray(); + + return ['points' => $points]; + } + + public function newPoint(array $coordinates) + { + if (count($coordinates) != $this->dimention) + throw new LogicException("(" . implode(',', $coordinates) . ") is not a point of this space"); + + return new Point($this, $coordinates); + } + + public function addPoint(array $coordinates, $data = null) + { + return $this->attach($this->newPoint($coordinates), $data); + } + + public function attach($point, $data = null) + { + if (!$point instanceof Point) + throw new InvalidArgumentException("can only attach points to spaces"); + + return parent::attach($point, $data); + } + + public function getDimention() + { + return $this->dimention; + } + + public function getBoundaries() + { + if (!count($this)) + return false; + + $min = $this->newPoint(array_fill(0, $this->dimention, null)); + $max = $this->newPoint(array_fill(0, $this->dimention, null)); + + foreach ($this as $point) { + for ($n=0; $n < $this->dimention; $n++) { + ($min[$n] > $point[$n] || $min[$n] === null) && $min[$n] = $point[$n]; + ($max[$n] < $point[$n] || $max[$n] === null) && $max[$n] = $point[$n]; + } + } + + return [$min, $max]; + } + + public function getRandomPoint(Point $min, Point $max) + { + $point = $this->newPoint(array_fill(0, $this->dimention, null)); + + for ($n=0; $n < $this->dimention; $n++) + $point[$n] = rand($min[$n], $max[$n]); + + return $point; + } + + public function solve($nbClusters, $seed = self::SEED_DEFAULT, $iterationCallback = null) + { + if ($iterationCallback && !is_callable($iterationCallback)) + throw new InvalidArgumentException("invalid iteration callback"); + + // initialize K clusters + $clusters = $this->initializeClusters($nbClusters, $seed); + + // there's only one cluster, 
clusterization has no meaning + if (count($clusters) == 1) + return $clusters[0]; + + // until convergence is reached + do { + $iterationCallback && $iterationCallback($this, $clusters); + } while ($this->iterate($clusters)); + + // clustering is done. + return $clusters; + } + + protected function initializeClusters($nbClusters, $seed) + { + if ($nbClusters <= 0) + throw new InvalidArgumentException("invalid clusters number"); + + switch ($seed) { + // the default seeding method chooses completely random centroid + case self::SEED_DEFAULT: + // get the space boundaries to avoid placing clusters centroid too far from points + list($min, $max) = $this->getBoundaries(); + + // initialize N clusters with a random point within space boundaries + for ($n=0; $n<$nbClusters; $n++) + $clusters[] = new Cluster($this, $this->getRandomPoint($min, $max)->getCoordinates()); + + break; + + // the DASV seeding method consists of finding good initial centroids for the clusters + case self::SEED_DASV: + // find a random point + $position = rand(1, count($this)); + for ($i=1, $this->rewind(); $i<$position && $this->valid(); $i++, $this->next()); + $clusters[] = new Cluster($this, $this->current()->getCoordinates()); + + // retains the distances between points and their closest clusters + $distances = new SplObjectStorage; + + // create k clusters + for ($i=1; $i<$nbClusters; $i++) { + $sum = 0; + + // for each points, get the distance with the closest centroid already choosen + foreach ($this as $point) { + $distance = $point->getDistanceWith($point->getClosest($clusters)); + $sum += $distances[$point] = $distance; + } + + // choose a new random point using a weighted probability distribution + $sum = rand(0, $sum); + foreach ($this as $point) { + if (($sum -= $distances[$point]) > 0) + continue; + + $clusters[] = new Cluster($this, $point->getCoordinates()); + break; + } + } + + break; + } + + // assing all points to the first cluster + $clusters[0]->attachAll($this); + + return 
$clusters; + } + + protected function iterate($clusters) + { + $continue = false; + + // migration storages + $attach = new SplObjectStorage; + $detach = new SplObjectStorage; + + // calculate proximity amongst points and clusters + foreach ($clusters as $cluster) { + foreach ($cluster as $point) { + // find the closest cluster + $closest = $point->getClosest($clusters); + + // move the point from its old cluster to its closest + if ($closest !== $cluster) { + isset($attach[$closest]) || $attach[$closest] = new SplObjectStorage; + isset($detach[$cluster]) || $detach[$cluster] = new SplObjectStorage; + + $attach[$closest]->attach($point); + $detach[$cluster]->attach($point); + + $continue = true; + } + } + } + + // perform points migrations + foreach ($attach as $cluster) + $cluster->attachAll($attach[$cluster]); + + foreach ($detach as $cluster) + $cluster->detachAll($detach[$cluster]); + + // update all cluster's centroids + foreach ($clusters as $cluster) + $cluster->updateCentroid(); + + return $continue; + } +} \ No newline at end of file