MOA 12.03
Real Time Analytics for Data Streams
TreeCoreset.java
Go to the documentation of this file.
00001 package moa.clusterers.streamkm;
00002 
00008 public class TreeCoreset {
00009 
00013         protected class treeNode {
00014                 //number of points in this node
00015                 int n;
00016                 
00017                 //array with pointers on points
00018                 Point[] points;
00019 
00020                 //pointer on the centre of the treenode
00021                 Point centre;
00022 
00023                 //pointer on the left childnode
00024                 treeNode lc;
00025                 
00026                 //pointer on the right childnode
00027                 treeNode rc;
00028 
00029                 //pointer on the parent node
00030                 treeNode parent;
00031 
00032                 //cost of the treenode
00033                 double cost;
00034                 
00035                 void free(){
00036                         this.parent     = null;
00037                         this.lc         = null;
00038                         this.rc         = null;
00039                         this.points             = null; 
00040                         this.centre     = null;  
00041                 }
00042                 
00043                 public treeNode(int n, Point[] points, Point centre, treeNode parent) {
00044                         this.n = n;
00045                         this.points = points; 
00046                         this.centre = centre; 
00047                         this.lc = null;
00048                         this.rc = null;
00049                         this.parent = parent;
00050                         this.cost = treeNodeTargetFunctionValue();;
00051                 }
00052                 
00056             public treeNode(Point[] setA, Point[] setB, int n_1, int n_2, Point centre, int centreIndex){
00057                         //loop counter variable
00058                         int i;  
00059 
00060                         //the root has no parent and no child nodes in the beginning
00061                         this.parent     = null;
00062                         this.lc         = null;
00063                         this.rc         = null; 
00064 
00065                         //array with points to the points
00066                         this.points = new Point[n_1+n_2];
00067                         this.n = n_1 + n_2;
00068                         
00069                         for(i=0;i<this.n;i++){
00070                                 if(i < n_1){
00071                                         this.points[i] = setA[i];
00072                                         this.points[i].centreIndex = centreIndex;
00073                                 } else {
00074                                         this.points[i] = setB[i-n_1];
00075                                         this.points[i].centreIndex = centreIndex;
00076                                 }
00077                         }
00078 
00079                         //set the centre
00080                         this.centre = centre;
00081 
00082                         //calculate costs
00083                         this.cost = treeNodeTargetFunctionValue();
00084                 }
00085                 
00095                 double treeNodeTargetFunctionValue(){
00096                         //loop counter variable
00097                         int i;
00098                         
00099                         //stores the cost
00100                         double sum = 0.0;
00101 
00102                         for(i=0; i<this.n; i++){
00103                                 //stores the distance
00104                                 double distance = 0.0;
00105 
00106                                 //loop counter variable
00107                                 int l;
00108 
00109                                 for(l=0;l<this.points[i].dimension;l++){
00110                                         //centroid coordinate of the point
00111                                         double centroidCoordinatePoint;
00112                                         if(this.points[i].weight != 0.0){
00113                                                 centroidCoordinatePoint = this.points[i].coordinates[l] / this.points[i].weight;
00114                                         } else {
00115                                                 centroidCoordinatePoint = this.points[i].coordinates[l];
00116                                         }
00117                                         //centroid coordinate of the centre
00118                                         double centroidCoordinateCentre;
00119                                         if(this.centre.weight != 0.0){
00120                                                 centroidCoordinateCentre = this.centre.coordinates[l] / this.centre.weight;
00121                                         } else {
00122                                                 centroidCoordinateCentre = this.centre.coordinates[l];
00123                                         }
00124                                         distance += (centroidCoordinatePoint-centroidCoordinateCentre) * 
00125                                                         (centroidCoordinatePoint-centroidCoordinateCentre) ;
00126                                                 
00127                                 }
00128 
00129                                 sum += distance*this.points[i].weight;  
00130                         }
00131                         return sum;
00132                 }
00133         };
00134 
00138         double treeNodeSplitCost(treeNode node, Point centreA, Point centreB){
00139                 //loop counter variable
00140                 int i;
00141                 
00142                 //stores the cost
00143                 double sum = 0.0;
00144                 
00145                 for(i=0; i<node.n; i++){
00146                         //loop counter variable
00147                         int l;  
00148 
00149                         //stores the distance between p and centreA
00150                         double distanceA = 0.0;
00151 
00152                         for(l=0;l<node.points[i].dimension;l++){
00153                                 //centroid coordinate of the point
00154                                 double centroidCoordinatePoint;
00155                                 if(node.points[i].weight != 0.0){
00156                                         centroidCoordinatePoint = node.points[i].coordinates[l] / node.points[i].weight;
00157                                 } else {
00158                                         centroidCoordinatePoint = node.points[i].coordinates[l];
00159                                 }
00160                                 //centroid coordinate of the centre
00161                                 double centroidCoordinateCentre;
00162                                 if(centreA.weight != 0.0){
00163                                         centroidCoordinateCentre = centreA.coordinates[l] / centreA.weight;
00164                                 } else {
00165                                         centroidCoordinateCentre = centreA.coordinates[l];
00166                                 }
00167 
00168                                 distanceA += (centroidCoordinatePoint-centroidCoordinateCentre) * 
00169                                                 (centroidCoordinatePoint-centroidCoordinateCentre) ;
00170                         }
00171 
00172                         //stores the distance between p and centreB
00173                         double distanceB = 0.0;
00174 
00175                         for(l=0;l<node.points[i].dimension;l++){
00176                                 //centroid coordinate of the point
00177                                 double centroidCoordinatePoint;
00178                                 if(node.points[i].weight != 0.0){
00179                                         centroidCoordinatePoint = node.points[i].coordinates[l] / node.points[i].weight;
00180                                 } else {
00181                                         centroidCoordinatePoint = node.points[i].coordinates[l];
00182                                 }
00183                                 //centroid coordinate of the centre
00184                                 double centroidCoordinateCentre;
00185                                 if(centreB.weight != 0.0){
00186                                         centroidCoordinateCentre = centreB.coordinates[l] / centreB.weight;
00187                                 } else {
00188                                         centroidCoordinateCentre = centreB.coordinates[l];
00189                                 }
00190 
00191                                 distanceB += (centroidCoordinatePoint-centroidCoordinateCentre) * 
00192                                                 (centroidCoordinatePoint-centroidCoordinateCentre) ;
00193                         }
00194 
00195                         //add the cost of the closest centre to the sum
00196                         if(distanceA < distanceB){
00197                                 sum += distanceA*node.points[i].weight;
00198                         } else {
00199                                 sum += distanceB*node.points[i].weight;
00200                         }
00201 
00202                 }
00203                 
00204                 //return the total cost
00205                 return sum;
00206 
00207         }
00208 
00209 
00213         double treeNodeCostOfPoint(treeNode node, Point p){
00214                 if(p.weight == 0.0){
00215                         return 0.0;
00216                 }
00217 
00218                 //stores the distance between centre and p
00219                 double distance = 0.0;
00220                 
00221                 //loop counter variable
00222                 int l;
00223 
00224                 for(l=0;l<p.dimension;l++){
00225                         //centroid coordinate of the point
00226                         double centroidCoordinatePoint;
00227                         if(p.weight != 0.0){
00228                                 centroidCoordinatePoint = p.coordinates[l] / p.weight;
00229                         } else {
00230                                 centroidCoordinatePoint = p.coordinates[l];
00231                         }
00232                         //centroid coordinate of the centre
00233                         double centroidCoordinateCentre;
00234                         if(node.centre.weight != 0.0){
00235                                 centroidCoordinateCentre = node.centre.coordinates[l] / node.centre.weight;
00236                         } else {
00237                                 centroidCoordinateCentre = node.centre.coordinates[l];
00238                         }
00239                         distance += (centroidCoordinatePoint-centroidCoordinateCentre) * 
00240                                         (centroidCoordinatePoint-centroidCoordinateCentre) ;
00241                                         
00242                 }
00243                 return distance * p.weight;
00244         }
00245 
00249         boolean isLeaf(treeNode node){
00250 
00251                 if(node.lc == null && node.rc == null){
00252                         return true;
00253                 } else {
00254                         return false;
00255                 }
00256 
00257         }
00258 
00262    treeNode selectNode(treeNode root, MTRandom clustererRandom){
00263                 
00264                 //random number between 0 and 1
00265                 double random = clustererRandom.nextDouble();
00266                 
00267                 while(!isLeaf(root)){
00268                         if(root.lc.cost == 0 && root.rc.cost == 0){
00269                                 if(root.lc.n == 0){
00270                                         root = root.rc;
00271                                 } else if(root.rc.n == 0){
00272                                         root = root.lc;
00273                                 }else if(random < 0.5){
00274                                         random = clustererRandom.nextDouble();
00275                                         root = root.lc;
00276                                 } else {                
00277                                         random = clustererRandom.nextDouble();          
00278                                         root = root.rc;
00279                                 }
00280                         } else {
00281 
00282                                 if(random < root.lc.cost/root.cost){
00283                         
00284                                         root = root.lc;
00285                                 } else {                
00286 
00287                                         root = root.rc;
00288                                 }
00289                         }
00290                 }
00291 
00292                 return root;
00293         }
00294 
00298         Point chooseCentre(treeNode node, MTRandom clustererRandom){
00299 
00300                 //How many times should we try to choose a centre ??
00301                 int times = 3;
00302                         
00303                 //stores the nodecost if node is split with the best centre
00304                 double minCost = node.cost;
00305                 Point bestCentre = null;
00306                 
00307                 //loop counter variable
00308                 int i;
00309                 int j;
00310                 
00311                 for(j=0;j<times;j++){
00312                         //sum of the relativ cost of the points
00313                         double sum = 0.0;
00314                         //random number between 0 and 1
00315                         double random = clustererRandom.nextDouble();
00316                         
00317                         for(i=0;i<node.n;i++){
00318                         
00319                                 sum += treeNodeCostOfPoint(node,node.points[i]) / node.cost;
00320                                 if(sum >= random){
00321                                         if(node.points[i].weight == 0.0){
00322                                                 //printf("ERROR: CHOOSEN DUMMY NODE THOUGH OTHER AVAILABLE \n");
00323                                                 return null;
00324                                         }
00325                                         double curCost = treeNodeSplitCost(node,node.centre,node.points[i]);
00326                                         if(curCost < minCost){
00327                                                 bestCentre = node.points[i];
00328                                                 minCost = curCost;
00329                                         }
00330                                         break;
00331                                 }
00332                         }
00333                 }
00334                 if(bestCentre == null){
00335                         return node.points[0];
00336                 } else {
00337                         return bestCentre;
00338                 }
00339         }
00340 
00344         Point determineClosestCentre(Point p, Point centreA, Point centreB){
00345                 
00346                 //loop counter variable
00347                 int l;  
00348 
00349                 //stores the distance between p and centreA
00350                 double distanceA = 0.0;
00351 
00352                 for(l=0;l<p.dimension;l++){
00353                         //centroid coordinate of the point
00354                         double centroidCoordinatePoint;
00355                         if(p.weight != 0.0){
00356                                 centroidCoordinatePoint = p.coordinates[l] / p.weight;
00357                         } else {
00358                                 centroidCoordinatePoint = p.coordinates[l];
00359                         }
00360                         //centroid coordinate of the centre
00361                         double centroidCoordinateCentre;
00362                         if(centreA.weight != 0.0){
00363                                 centroidCoordinateCentre = centreA.coordinates[l] / centreA.weight;
00364                         } else {
00365                                 centroidCoordinateCentre = centreA.coordinates[l];
00366                         }
00367 
00368                         distanceA += (centroidCoordinatePoint-centroidCoordinateCentre) * 
00369                                         (centroidCoordinatePoint-centroidCoordinateCentre) ;
00370                 }
00371 
00372                 //stores the distance between p and centreB
00373                 double distanceB = 0.0;
00374 
00375                 for(l=0;l<p.dimension;l++){
00376                         //centroid coordinate of the point
00377                         double centroidCoordinatePoint;
00378                         if(p.weight != 0.0){
00379                                 centroidCoordinatePoint = p.coordinates[l] / p.weight;
00380                         } else {
00381                                 centroidCoordinatePoint = p.coordinates[l];
00382                         }
00383                         //centroid coordinate of the centre
00384                         double centroidCoordinateCentre;
00385                         if(centreB.weight != 0.0){
00386                                 centroidCoordinateCentre = centreB.coordinates[l] / centreB.weight;
00387                         } else {
00388                                 centroidCoordinateCentre = centreB.coordinates[l];
00389                         }
00390 
00391                         distanceB += (centroidCoordinatePoint-centroidCoordinateCentre) * 
00392                                         (centroidCoordinatePoint-centroidCoordinateCentre) ;
00393                 }
00394 
00395                 //return the nearest centre
00396                 if(distanceA < distanceB){
00397                         return centreA;
00398                 } else {
00399                         return centreB;
00400                 }
00401         }
00402 
00406         void split(treeNode parent, Point newCentre, int newCentreIndex){
00407                 
00408                 //loop counter variable
00409                 int i;
00410 
00411                 //1. Counts how many points belong to the new and how many points belong to the old centre
00412                 int nOld = 0;
00413                 int nNew = 0;
00414                 for(i=0;i<parent.n;i++){
00415                         Point centre = determineClosestCentre(parent.points[i], parent.centre, newCentre);
00416                         if(centre == newCentre){
00417                                 nNew++;
00418                         } else {
00419                                 nOld++;
00420                         } 
00421                 }
00422 
00423                 //2. initalizes the arrays for the pointer
00424                 
00425                 //array for pointer on the points belonging to the old centre
00426                 Point[] oldPoints = new Point[nOld];
00427 
00428                 //array for pointer on the points belonging to the new centre
00429                 Point[] newPoints = new Point[nNew];
00430 
00431                 int indexOld = 0;
00432                 int indexNew = 0;
00433 
00434                 for(i=0;i<parent.n;i++){
00435                         Point centre = determineClosestCentre(parent.points[i],parent.centre,newCentre);
00436                         if(centre == newCentre){
00437                                 newPoints[indexNew] = parent.points[i];
00438                                 newPoints[indexNew].centreIndex = newCentreIndex;
00439                                 indexNew++;
00440                         } else if(centre == parent.centre){
00441                                 oldPoints[indexOld] = parent.points[i];
00442                                 indexOld++;
00443                         } else {
00444                                 //printf("ERROR !!! NO CENTER NEAREST !! \n");
00445                         }
00446                 }
00447 
00448                 //left child: old centre
00449                 treeNode lc = new treeNode(nOld, oldPoints, 
00450                                                 parent.centre, parent);
00451                 /*lc.centre = parent.centre;
00452                 lc.points = oldPoints;
00453                 lc.n = nOld;
00454 
00455                 lc.lc = null;
00456                 lc.rc = null;
00457                 lc.parent = parent;
00458 
00459                 treeNodeTargetFunctionValue(lc);*/
00460                 
00461                 //right child: new centre
00462                 treeNode rc = new treeNode(nNew, newPoints, newCentre, 
00463                                                          parent);
00464                 /*rc.centre = newCentre;
00465                 rc.points = newPoints;
00466                 rc.n = nNew;
00467 
00468                 rc.lc = null;
00469                 rc.rc = null;
00470                 rc.parent = parent;
00471 
00472                 treeNodeTargetFunctionValue(rc);*/
00473 
00474                 //set childs of the parent node
00475                 parent.lc = lc;
00476                 parent.rc = rc;
00477 
00478                 //propagate the cost changes to the parent nodes
00479                 while(parent != null){
00480                         parent.cost = parent.lc.cost + parent.rc.cost;
00481                         parent = parent.parent;
00482                 }
00483 
00484         }
00485 
00489         boolean treeFinished(treeNode root){
00490                 return (root.parent == null && root.lc == null && root.rc == null);
00491         }
00492 
00496         void freeTree(treeNode root){
00497 
00498                 while(!treeFinished(root)){
00499                         if(root.lc == null && root.rc == null){
00500                                 root = root.parent;
00501                         } else if(root.lc == null && root.rc != null){
00502                                 //Schau ob rc ein Blatt ist
00503                                 if(isLeaf(root.rc)){
00504                                         //Gebe rechtes Kind frei
00505                                         root.rc.free();
00506                                         root.rc = null;
00507                                 } else {
00508                                         //Fahre mit rechtem Kind fort
00509                                         root = root.rc;
00510                                 }
00511                         } else if(root.lc != null) {
00512                                 if(isLeaf(root.lc)){
00513                                         root.lc.free();
00514                                         root.lc = null;
00515                                 } else {
00516                                         root = root.lc;
00517                                 }
00518                         }
00519                 }
00520                 root.free();
00521 
00522         }
00523 
00527         void unionTreeCoreset(int k,int n_1,int n_2,int d, Point[] setA,Point[] setB, Point[] centres, MTRandom clustererRandom) {
00528                 //printf("Computing coreset...\n");
00529                 //total number of points
00530                 int n = n_1+n_2;
00531 
00532                 //choose the first centre (each point has the same probability of being choosen)
00533                 
00534                 //stores, how many centres have been choosen yet
00535                 int choosenPoints = 0; 
00536                 
00537                 //only choose from the n-i points not already choosen
00538                 int j = clustererRandom.nextInt(n-choosenPoints); 
00539 
00540                 //copy the choosen point
00541                 if(j < n_1){
00542                         //copyPointWithoutInit(&setA[j],&centres[choosenPoints]);
00543                         centres[choosenPoints] = setA[j].clone();
00544                 } else {
00545                         j = j - n_1;
00546                         //copyPointWithoutInit(&setB[j],&centres[choosenPoints]);
00547                         centres[choosenPoints] = setB[j].clone();
00548                 }
00549                 treeNode root = new treeNode(setA,setB,n_1,n_2, centres[choosenPoints],choosenPoints); //??
00550                 choosenPoints = 1;
00551                 
00552                 //choose the remaining points
00553                 while(choosenPoints < k){
00554                         if(root.cost > 0.0){
00555                                 treeNode leaf = selectNode(root, clustererRandom);
00556                                 Point centre = chooseCentre(leaf, clustererRandom);
00557                                 split(leaf,centre,choosenPoints);
00558                                 //copyPointWithoutInit(centre,&centres[choosenPoints]);
00559                                 centres[choosenPoints] = centre;
00560                         } else {
00561                                 //create a dummy point
00562                                 //copyPointWithoutInit(root.centre,&centres[choosenPoints]);
00563                                 centres[choosenPoints] = root.centre;
00564                                 int l;
00565                                 for(l=0;l<root.centre.dimension;l++){
00566                                         centres[choosenPoints].coordinates[l] = -1 * 1000000;
00567                                 }
00568                                 centres[choosenPoints].id = -1;
00569                                 centres[choosenPoints].weight = 0.0;
00570                                 centres[choosenPoints].squareSum = 0.0;
00571                         }
00572                         
00573                         choosenPoints++;
00574                 }
00575 
00576                 //free the tree
00577                 freeTree(root);
00578 
00579                 //recalculate clustering features
00580                 int i;
00581                 for(i=0;i<n;i++){
00582                                 
00583                         if(i < n_1) {
00584                                 
00585                                 int index = setA[i].centreIndex;
00586                                 if(centres[index].id != setA[i].id){
00587                                         centres[index].weight += setA[i].weight;
00588                                         centres[index].squareSum += setA[i].squareSum;
00589                                         int l;
00590                                         for(l=0;l<centres[index].dimension;l++){
00591                                                 if(setA[i].weight != 0.0){
00592                                                         centres[index].coordinates[l] += setA[i].coordinates[l];
00593                                                 }
00594                                         }
00595                                 }
00596                         } else {
00597                                 
00598                                 int index = setB[i-n_1].centreIndex;
00599                                 if(centres[index].id != setB[i-n_1].id){
00600                                         centres[index].weight += setB[i-n_1].weight;
00601                                         centres[index].squareSum += setB[i-n_1].squareSum;
00602                                         int l;
00603                                         for(l=0;l<centres[index].dimension;l++){
00604                                                 if(setB[i-n_1].weight != 0.0){
00605                                                         centres[index].coordinates[l] += setB[i-n_1].coordinates[l];
00606                                                 }
00607                                         }
00608                                 }
00609                         }
00610                 }
00611         }
00612 
00613 
00614 }
 All Classes Namespaces Files Functions Variables Enumerations