MOA 12.03
Real Time Analytics for Data Streams
StreamKM.java
Go to the documentation of this file.
00001 package moa.clusterers.streamkm;
00002 
00003 import moa.cluster.Cluster;
00004 import moa.cluster.Clustering;
00005 import moa.clusterers.AbstractClusterer;
00006 import moa.core.Measurement;
00007 import moa.options.IntOption;
00008 
00009 import weka.core.Instance;
00010 
00017 public class StreamKM extends AbstractClusterer {
00018 
00019     public IntOption sizeCoresetOption = new IntOption("sizeCoreset",
00020                         's', "Size of the coreset.", 10000);
00021 
00022     public IntOption numClustersOption = new IntOption(
00023                         "numClusters", 'k',
00024                         "Number of clusters to compute.", 5);
00025                         
00026         public IntOption widthOption = new IntOption("width",
00027                         'w', "Size of Window for training learner.", 100000, 0, Integer.MAX_VALUE);
00028                         
00029         public IntOption randomSeedOption = new IntOption("randomSeed", 'r',
00030                                         "Seed for random behaviour of the classifier.", 1);     
00031                                                         
00032         protected MTRandom clustererRandom;
00033         protected Point[] centresStreamingCoreset;
00034                         
00035         protected int numberInstances;
00036         
00037         protected int dimension;        
00038         protected int length;
00039         protected int numberOfCentres;
00040         protected int coresetsize;      
00041         
00042         protected BucketManager manager;
00043         
00044         protected boolean initialized = false;  
00045         
00046         private final static double THRESHOLD = 1.000;
00047 
00048     @Override
00049     public void resetLearningImpl() {
00050                 this.initialized = false;
00051                 this.coresetsize = sizeCoresetOption.getValue();
00052                 this.numberOfCentres = numClustersOption.getValue();
00053                 this.length = widthOption.getValue();
00054                 this.centresStreamingCoreset = new Point[this.numberOfCentres];
00055 
00056                 //initalize random generator with seed
00057                 this.clustererRandom = new MTRandom(this.randomSeedOption.getValue());
00058         }
00059 
00060     @Override
00061     public void trainOnInstanceImpl(Instance inst) {
00062                 
00063                 if (this.initialized == false) {
00064                         this.dimension =  inst.numAttributes();
00065                         manager = new BucketManager(this.length, this.dimension, this.coresetsize, this.clustererRandom);
00066                         this.initialized = true;
00067                 }
00068                 
00069                 manager.insertPoint(new Point(inst, this.numberInstances));     
00070         
00071         this.numberInstances++;
00072                 if (this.numberInstances % widthOption.getValue() == 0) {
00073                         
00074                         Point[] streamingCoreset = manager.getCoresetFromManager(dimension);
00075                         
00076                         //compute 5 clusterings of the coreset with kMeans++ and take the best
00077                         double minCost = 0.0;
00078                         double curCost = 0.0;
00079                         
00080                         minCost = lloydPlusPlus(numberOfCentres, coresetsize, dimension, streamingCoreset, centresStreamingCoreset);
00081                         curCost = minCost;
00082 
00083                         for(int i = 1; i < 5; i++){
00084                                 Point[] tmpCentresStreamingCoreset= new Point[0];
00085                                 curCost = lloydPlusPlus(numberOfCentres, coresetsize, dimension, streamingCoreset, tmpCentresStreamingCoreset);
00086                                 if(curCost < minCost) {
00087                                         minCost = curCost;
00088                                         centresStreamingCoreset = tmpCentresStreamingCoreset;
00089                                 }
00090                     }
00091                 }
00092     }
00093 
00094     @Override
00095     protected Measurement[] getModelMeasurementsImpl() {
00096         throw new UnsupportedOperationException("Not supported yet.");
00097     }
00098 
00099     @Override
00100     public void getModelDescription(StringBuilder out, int indent) {
00101         throw new UnsupportedOperationException("Not supported yet.");
00102     }
00103 
00104     public boolean isRandomizable() {
00105         return true;
00106     }
00107 
00108     public double[] getVotesForInstance(Instance inst) {
00109         throw new UnsupportedOperationException("Not supported yet.");
00110     }
00111 
00112 
00113     @Override
00114     public Clustering getClusteringResult() {
00115                 if ( !this.initialized ) {
00116                         return new Clustering();
00117                 }
00118                 
00119                 Clustering clustering = new Clustering();
00120                 for ( int i = 0; i < centresStreamingCoreset.length; i++ ) {
00121                         if(centresStreamingCoreset[i] != null){
00122                                 clustering.add(centresStreamingCoreset[i].toCluster());
00123                         }
00124                 }
00125                 
00126                 return clustering;
00127     }
00128 
00129     
00130     public double lloydPlusPlus(int k, int n, int d, Point points[], Point centres[]){
00131                 //printf("starting kMeans++\n");
00132                 //choose random centres
00133                 centres = chooseRandomCentres(k, n, d, points);
00134                 double cost = targetFunctionValue(k, n, centres, points);
00135                 double newCost = cost;
00136                 
00137 
00138                 Point[] massCentres = new Point[k];
00139                 double[] numberOfPoints = new double[k];
00140 
00141                 do{
00142                         cost = newCost;
00143                         //reset centres of mass
00144                         int i = 0;
00145                         for(i = 0; i < k; i++){ 
00146                                 massCentres[i] = new Point(d);
00147                                 numberOfPoints[i] = 0.0;
00148                         }
00149                         //compute centres of mass
00150                         for(i = 0; i < n; i++){
00151                                 int centre = points[i].determineClusterCentreKMeans(k,centres);
00152                                 for(int l = 0; l < massCentres[centre].dimension; l++){
00153                                         if(points[i].weight != 0.0)
00154                                                 massCentres[centre].coordinates[l] += points[i].coordinates[l];
00155                                 }
00156                                 numberOfPoints[centre] += points[i].weight;
00157                         
00158                         }
00159                         
00160                         //move centres
00161                         for(i=0; i<k; i++){
00162                                 for(int l=0; l<centres[i].dimension; l++){
00163                                         centres[i].coordinates[l] = massCentres[i].coordinates[l];
00164                                         centres[i].weight = numberOfPoints[i];
00165                                 }
00166                         }
00167                         
00168                         //calculate costs
00169                         newCost = targetFunctionValue(k, n, centres, points);
00170                         //printf("old cost:%f, new cost:%f \n",cost,newCost);
00171                 } while (newCost < THRESHOLD * cost);
00172 
00173                 /*printf("Centres: \n");
00174                 int i=0;
00175                 for(i=0;i<k;i++){
00176                         printf("(");
00177                         int l = 0;
00178                         for(l=0;l<centres[i].dimension;l++){
00179                                 printf("%f,",centres[i].coordinates[l] / centres[i].weight);
00180                         }
00181                         printf(")\n");
00182                 }
00183                 printf("kMeans++ finished\n");
00184                 */ 
00185                 return newCost; 
00186         }
00187         
00188         private Point[] chooseRandomCentres(int k, int n, int d, Point points[]){
00189 
00190                 //array to store the choosen centres
00191                 Point[] centres = new Point[k]; 
00192 
00193                 //choose the first centre (each point has the same probability of being choosen)
00194                 int i = 0;
00195                 
00196                 int next = 0;
00197                 int j = 0;
00198                 do{ //only choose from the n-i points not already choosen
00199                         next = this.clustererRandom.nextInt(n-1); 
00200                         
00201                         //check if the choosen point is not a dummy
00202                 } while( points[next].weight < 1);
00203                 
00204                 //set j to next unchoosen point
00205                 j = next;
00206                 //copy the choosen point to the array
00207                 centres[i] = points[j].clone();
00208                         
00209                 //set the current centre for all points to the choosen centre
00210                 for(i = 0; i < n; i++){
00211                         points[i].centreIndex = 0;
00212                         points[i].curCost = points[i].costOfPointToCenter(centres[0]);
00213                 
00214                 }
00215                 //choose centre 1 to k-1 with the kMeans++ distribution
00216                 for(i = 1; i < k; i++){
00217 
00218                         double cost = 0.0;
00219                         for(j = 0; j < n; j++){
00220                                 cost += points[j].curCost;
00221                         }
00222                         
00223                         double random = 0;
00224                         double sum = 0.0;
00225                         int pos = -1;
00226                         
00227                         do{
00228                                         random = this.clustererRandom.nextDouble();//genrand_real3();
00229                                         sum = 0.0;
00230                                         pos = -1;
00231 
00232                                 for(j = 0; j < n; j++){
00233                                         sum = sum + points[j].curCost;
00234                                         if(random <= sum/cost){
00235                                                 pos = j;
00236                                                 break;
00237                                         }       
00238                                 }       
00239                         } while (points[pos].weight < 1);
00240                                 
00241                         //copy the choosen centre
00242                         centres[i] = points[pos].clone();
00243                         //check which points are closest to the new centre
00244                         for(j = 0; j < n; j++){
00245                                 double newCost = points[j].costOfPointToCenter(centres[i]);
00246                                 if(points[j].curCost > newCost){
00247                                         points[j].curCost = newCost;
00248                                         points[j].centreIndex = i;
00249                                 }
00250                         }
00251                         
00252                 }
00253                 
00254                 /*printf("random centres: \n");
00255                 for(i = 0; i < k; i++){
00256                         //printf("%d: (",i);
00257                         int l = 0;
00258                         for(l = 0; l < centres[i].dimension; l++){
00259                                 printf("%f,",centres[i].coordinates[l] / centres[i].weight);
00260                         }
00261                         printf(")\n");
00262                 }*/
00263 
00264                 return centres;
00265         }
00266 
00271         public double targetFunctionValue(int k, int n, Point[] centres, Point[] points){
00272                 int i=0;
00273                 double sum = 0.0;
00274                 for(i=0;i<n;i++){
00275                         double nearestCost = -1.0;
00276                         int j=0;
00277                         for(j=0;j<k;j++){
00278                                 double distance = 0.0;
00279                                 int l = 0;
00280                                 for(l=0;l<points[i].dimension;l++){
00281                                         //Centroid coordinate of the point
00282                                         double centroidCoordinatePoint;
00283                                         if(points[i].weight != 0.0){
00284                                                 centroidCoordinatePoint = points[i].coordinates[l] / points[i].weight;
00285                                         } else {
00286                                                 centroidCoordinatePoint = points[i].coordinates[l];
00287                                         }
00288                                         //Centroid coordinate of the centre
00289                                         double centroidCoordinateCentre;
00290                                         if(centres[j].weight != 0.0){
00291                                                 centroidCoordinateCentre = centres[j].coordinates[l] / centres[j].weight;
00292                                         } else {
00293                                                 centroidCoordinateCentre = centres[j].coordinates[l];
00294                                         }
00295                                         distance += (centroidCoordinatePoint-centroidCoordinateCentre) * 
00296                                                         (centroidCoordinatePoint-centroidCoordinateCentre) ;
00297                                         
00298                                 }
00299                                 if(nearestCost <0 || distance < nearestCost) {
00300                                         nearestCost = distance;
00301                                 } 
00302                         }
00303                         sum += nearestCost * points[i].weight;
00304                 }
00305                 return sum;
00306         }
00307 
00308 }
 All Classes Namespaces Files Functions Variables Enumerations