MOA 12.03
Real Time Analytics for Data Streams
|
00001 package moa.clusterers.streamkm; 00002 00008 public class TreeCoreset { 00009 00013 protected class treeNode { 00014 //number of points in this node 00015 int n; 00016 00017 //array with pointers on points 00018 Point[] points; 00019 00020 //pointer on the centre of the treenode 00021 Point centre; 00022 00023 //pointer on the left childnode 00024 treeNode lc; 00025 00026 //pointer on the right childnode 00027 treeNode rc; 00028 00029 //pointer on the parent node 00030 treeNode parent; 00031 00032 //cost of the treenode 00033 double cost; 00034 00035 void free(){ 00036 this.parent = null; 00037 this.lc = null; 00038 this.rc = null; 00039 this.points = null; 00040 this.centre = null; 00041 } 00042 00043 public treeNode(int n, Point[] points, Point centre, treeNode parent) { 00044 this.n = n; 00045 this.points = points; 00046 this.centre = centre; 00047 this.lc = null; 00048 this.rc = null; 00049 this.parent = parent; 00050 this.cost = treeNodeTargetFunctionValue();; 00051 } 00052 00056 public treeNode(Point[] setA, Point[] setB, int n_1, int n_2, Point centre, int centreIndex){ 00057 //loop counter variable 00058 int i; 00059 00060 //the root has no parent and no child nodes in the beginning 00061 this.parent = null; 00062 this.lc = null; 00063 this.rc = null; 00064 00065 //array with points to the points 00066 this.points = new Point[n_1+n_2]; 00067 this.n = n_1 + n_2; 00068 00069 for(i=0;i<this.n;i++){ 00070 if(i < n_1){ 00071 this.points[i] = setA[i]; 00072 this.points[i].centreIndex = centreIndex; 00073 } else { 00074 this.points[i] = setB[i-n_1]; 00075 this.points[i].centreIndex = centreIndex; 00076 } 00077 } 00078 00079 //set the centre 00080 this.centre = centre; 00081 00082 //calculate costs 00083 this.cost = treeNodeTargetFunctionValue(); 00084 } 00085 00095 double treeNodeTargetFunctionValue(){ 00096 //loop counter variable 00097 int i; 00098 00099 //stores the cost 00100 double sum = 0.0; 00101 00102 for(i=0; i<this.n; i++){ 00103 //stores the distance 00104 double distance = 0.0; 00105 00106 //loop counter variable 00107 int l; 00108 00109 for(l=0;l<this.points[i].dimension;l++){ 00110 //centroid coordinate of the point 00111 double centroidCoordinatePoint; 00112 if(this.points[i].weight != 0.0){ 00113 centroidCoordinatePoint = this.points[i].coordinates[l] / this.points[i].weight; 00114 } else { 00115 centroidCoordinatePoint = this.points[i].coordinates[l]; 00116 } 00117 //centroid coordinate of the centre 00118 double centroidCoordinateCentre; 00119 if(this.centre.weight != 0.0){ 00120 centroidCoordinateCentre = this.centre.coordinates[l] / this.centre.weight; 00121 } else { 00122 centroidCoordinateCentre = this.centre.coordinates[l]; 00123 } 00124 distance += (centroidCoordinatePoint-centroidCoordinateCentre) * 00125 (centroidCoordinatePoint-centroidCoordinateCentre) ; 00126 00127 } 00128 00129 sum += distance*this.points[i].weight; 00130 } 00131 return sum; 00132 } 00133 }; 00134 00138 double treeNodeSplitCost(treeNode node, Point centreA, Point centreB){ 00139 //loop counter variable 00140 int i; 00141 00142 //stores the cost 00143 double sum = 0.0; 00144 00145 for(i=0; i<node.n; i++){ 00146 //loop counter variable 00147 int l; 00148 00149 //stores the distance between p and centreA 00150 double distanceA = 0.0; 00151 00152 for(l=0;l<node.points[i].dimension;l++){ 00153 //centroid coordinate of the point 00154 double centroidCoordinatePoint; 00155 if(node.points[i].weight != 0.0){ 00156 centroidCoordinatePoint = node.points[i].coordinates[l] / node.points[i].weight; 00157 } else { 00158 centroidCoordinatePoint = node.points[i].coordinates[l]; 00159 } 00160 //centroid coordinate of the centre 00161 double centroidCoordinateCentre; 00162 if(centreA.weight != 0.0){ 00163 centroidCoordinateCentre = centreA.coordinates[l] / centreA.weight; 00164 } else { 00165 centroidCoordinateCentre = centreA.coordinates[l]; 00166 } 00167 00168 distanceA += (centroidCoordinatePoint-centroidCoordinateCentre) * 00169 (centroidCoordinatePoint-centroidCoordinateCentre) ; 00170 } 00171 00172 //stores the distance between p and centreB 00173 double distanceB = 0.0; 00174 00175 for(l=0;l<node.points[i].dimension;l++){ 00176 //centroid coordinate of the point 00177 double centroidCoordinatePoint; 00178 if(node.points[i].weight != 0.0){ 00179 centroidCoordinatePoint = node.points[i].coordinates[l] / node.points[i].weight; 00180 } else { 00181 centroidCoordinatePoint = node.points[i].coordinates[l]; 00182 } 00183 //centroid coordinate of the centre 00184 double centroidCoordinateCentre; 00185 if(centreB.weight != 0.0){ 00186 centroidCoordinateCentre = centreB.coordinates[l] / centreB.weight; 00187 } else { 00188 centroidCoordinateCentre = centreB.coordinates[l]; 00189 } 00190 00191 distanceB += (centroidCoordinatePoint-centroidCoordinateCentre) * 00192 (centroidCoordinatePoint-centroidCoordinateCentre) ; 00193 } 00194 00195 //add the cost of the closest centre to the sum 00196 if(distanceA < distanceB){ 00197 sum += distanceA*node.points[i].weight; 00198 } else { 00199 sum += distanceB*node.points[i].weight; 00200 } 00201 00202 } 00203 00204 //return the total cost 00205 return sum; 00206 00207 } 00208 00209 00213 double treeNodeCostOfPoint(treeNode node, Point p){ 00214 if(p.weight == 0.0){ 00215 return 0.0; 00216 } 00217 00218 //stores the distance between centre and p 00219 double distance = 0.0; 00220 00221 //loop counter variable 00222 int l; 00223 00224 for(l=0;l<p.dimension;l++){ 00225 //centroid coordinate of the point 00226 double centroidCoordinatePoint; 00227 if(p.weight != 0.0){ 00228 centroidCoordinatePoint = p.coordinates[l] / p.weight; 00229 } else { 00230 centroidCoordinatePoint = p.coordinates[l]; 00231 } 00232 //centroid coordinate of the centre 00233 double centroidCoordinateCentre; 00234 if(node.centre.weight != 0.0){ 00235 centroidCoordinateCentre = node.centre.coordinates[l] / node.centre.weight; 00236 } else { 00237 centroidCoordinateCentre = node.centre.coordinates[l]; 00238 } 00239 distance += (centroidCoordinatePoint-centroidCoordinateCentre) * 00240 (centroidCoordinatePoint-centroidCoordinateCentre) ; 00241 00242 } 00243 return distance * p.weight; 00244 } 00245 00249 boolean isLeaf(treeNode node){ 00250 00251 if(node.lc == null && node.rc == null){ 00252 return true; 00253 } else { 00254 return false; 00255 } 00256 00257 } 00258 00262 treeNode selectNode(treeNode root, MTRandom clustererRandom){ 00263 00264 //random number between 0 and 1 00265 double random = clustererRandom.nextDouble(); 00266 00267 while(!isLeaf(root)){ 00268 if(root.lc.cost == 0 && root.rc.cost == 0){ 00269 if(root.lc.n == 0){ 00270 root = root.rc; 00271 } else if(root.rc.n == 0){ 00272 root = root.lc; 00273 }else if(random < 0.5){ 00274 random = clustererRandom.nextDouble(); 00275 root = root.lc; 00276 } else { 00277 random = clustererRandom.nextDouble(); 00278 root = root.rc; 00279 } 00280 } else { 00281 00282 if(random < root.lc.cost/root.cost){ 00283 00284 root = root.lc; 00285 } else { 00286 00287 root = root.rc; 00288 } 00289 } 00290 } 00291 00292 return root; 00293 } 00294 00298 Point chooseCentre(treeNode node, MTRandom clustererRandom){ 00299 00300 //How many times should we try to choose a centre ?? 00301 int times = 3; 00302 00303 //stores the nodecost if node is split with the best centre 00304 double minCost = node.cost; 00305 Point bestCentre = null; 00306 00307 //loop counter variable 00308 int i; 00309 int j; 00310 00311 for(j=0;j<times;j++){ 00312 //sum of the relativ cost of the points 00313 double sum = 0.0; 00314 //random number between 0 and 1 00315 double random = clustererRandom.nextDouble(); 00316 00317 for(i=0;i<node.n;i++){ 00318 00319 sum += treeNodeCostOfPoint(node,node.points[i]) / node.cost; 00320 if(sum >= random){ 00321 if(node.points[i].weight == 0.0){ 00322 //printf("ERROR: CHOOSEN DUMMY NODE THOUGH OTHER AVAILABLE \n"); 00323 return null; 00324 } 00325 double curCost = treeNodeSplitCost(node,node.centre,node.points[i]); 00326 if(curCost < minCost){ 00327 bestCentre = node.points[i]; 00328 minCost = curCost; 00329 } 00330 break; 00331 } 00332 } 00333 } 00334 if(bestCentre == null){ 00335 return node.points[0]; 00336 } else { 00337 return bestCentre; 00338 } 00339 } 00340 00344 Point determineClosestCentre(Point p, Point centreA, Point centreB){ 00345 00346 //loop counter variable 00347 int l; 00348 00349 //stores the distance between p and centreA 00350 double distanceA = 0.0; 00351 00352 for(l=0;l<p.dimension;l++){ 00353 //centroid coordinate of the point 00354 double centroidCoordinatePoint; 00355 if(p.weight != 0.0){ 00356 centroidCoordinatePoint = p.coordinates[l] / p.weight; 00357 } else { 00358 centroidCoordinatePoint = p.coordinates[l]; 00359 } 00360 //centroid coordinate of the centre 00361 double centroidCoordinateCentre; 00362 if(centreA.weight != 0.0){ 00363 centroidCoordinateCentre = centreA.coordinates[l] / centreA.weight; 00364 } else { 00365 centroidCoordinateCentre = centreA.coordinates[l]; 00366 } 00367 00368 distanceA += (centroidCoordinatePoint-centroidCoordinateCentre) * 00369 (centroidCoordinatePoint-centroidCoordinateCentre) ; 00370 } 00371 00372 //stores the distance between p and centreB 00373 double distanceB = 0.0; 00374 00375 for(l=0;l<p.dimension;l++){ 00376 //centroid coordinate of the point 00377 double centroidCoordinatePoint; 00378 if(p.weight != 0.0){ 00379 centroidCoordinatePoint = p.coordinates[l] / p.weight; 00380 } else { 00381 centroidCoordinatePoint = p.coordinates[l]; 00382 } 00383 //centroid coordinate of the centre 00384 double centroidCoordinateCentre; 00385 if(centreB.weight != 0.0){ 00386 centroidCoordinateCentre = centreB.coordinates[l] / centreB.weight; 00387 } else { 00388 centroidCoordinateCentre = centreB.coordinates[l]; 00389 } 00390 00391 distanceB += (centroidCoordinatePoint-centroidCoordinateCentre) * 00392 (centroidCoordinatePoint-centroidCoordinateCentre) ; 00393 } 00394 00395 //return the nearest centre 00396 if(distanceA < distanceB){ 00397 return centreA; 00398 } else { 00399 return centreB; 00400 } 00401 } 00402 00406 void split(treeNode parent, Point newCentre, int newCentreIndex){ 00407 00408 //loop counter variable 00409 int i; 00410 00411 //1. Counts how many points belong to the new and how many points belong to the old centre 00412 int nOld = 0; 00413 int nNew = 0; 00414 for(i=0;i<parent.n;i++){ 00415 Point centre = determineClosestCentre(parent.points[i], parent.centre, newCentre); 00416 if(centre == newCentre){ 00417 nNew++; 00418 } else { 00419 nOld++; 00420 } 00421 } 00422 00423 //2. initalizes the arrays for the pointer 00424 00425 //array for pointer on the points belonging to the old centre 00426 Point[] oldPoints = new Point[nOld]; 00427 00428 //array for pointer on the points belonging to the new centre 00429 Point[] newPoints = new Point[nNew]; 00430 00431 int indexOld = 0; 00432 int indexNew = 0; 00433 00434 for(i=0;i<parent.n;i++){ 00435 Point centre = determineClosestCentre(parent.points[i],parent.centre,newCentre); 00436 if(centre == newCentre){ 00437 newPoints[indexNew] = parent.points[i]; 00438 newPoints[indexNew].centreIndex = newCentreIndex; 00439 indexNew++; 00440 } else if(centre == parent.centre){ 00441 oldPoints[indexOld] = parent.points[i]; 00442 indexOld++; 00443 } else { 00444 //printf("ERROR !!! NO CENTER NEAREST !! \n"); 00445 } 00446 } 00447 00448 //left child: old centre 00449 treeNode lc = new treeNode(nOld, oldPoints, 00450 parent.centre, parent); 00451 /*lc.centre = parent.centre; 00452 lc.points = oldPoints; 00453 lc.n = nOld; 00454 00455 lc.lc = null; 00456 lc.rc = null; 00457 lc.parent = parent; 00458 00459 treeNodeTargetFunctionValue(lc);*/ 00460 00461 //right child: new centre 00462 treeNode rc = new treeNode(nNew, newPoints, newCentre, 00463 parent); 00464 /*rc.centre = newCentre; 00465 rc.points = newPoints; 00466 rc.n = nNew; 00467 00468 rc.lc = null; 00469 rc.rc = null; 00470 rc.parent = parent; 00471 00472 treeNodeTargetFunctionValue(rc);*/ 00473 00474 //set childs of the parent node 00475 parent.lc = lc; 00476 parent.rc = rc; 00477 00478 //propagate the cost changes to the parent nodes 00479 while(parent != null){ 00480 parent.cost = parent.lc.cost + parent.rc.cost; 00481 parent = parent.parent; 00482 } 00483 00484 } 00485 00489 boolean treeFinished(treeNode root){ 00490 return (root.parent == null && root.lc == null && root.rc == null); 00491 } 00492 00496 void freeTree(treeNode root){ 00497 00498 while(!treeFinished(root)){ 00499 if(root.lc == null && root.rc == null){ 00500 root = root.parent; 00501 } else if(root.lc == null && root.rc != null){ 00502 //Schau ob rc ein Blatt ist 00503 if(isLeaf(root.rc)){ 00504 //Gebe rechtes Kind frei 00505 root.rc.free(); 00506 root.rc = null; 00507 } else { 00508 //Fahre mit rechtem Kind fort 00509 root = root.rc; 00510 } 00511 } else if(root.lc != null) { 00512 if(isLeaf(root.lc)){ 00513 root.lc.free(); 00514 root.lc = null; 00515 } else { 00516 root = root.lc; 00517 } 00518 } 00519 } 00520 root.free(); 00521 00522 } 00523 00527 void unionTreeCoreset(int k,int n_1,int n_2,int d, Point[] setA,Point[] setB, Point[] centres, MTRandom clustererRandom) { 00528 //printf("Computing coreset...\n"); 00529 //total number of points 00530 int n = n_1+n_2; 00531 00532 //choose the first centre (each point has the same probability of being choosen) 00533 00534 //stores, how many centres have been choosen yet 00535 int choosenPoints = 0; 00536 00537 //only choose from the n-i points not already choosen 00538 int j = clustererRandom.nextInt(n-choosenPoints); 00539 00540 //copy the choosen point 00541 if(j < n_1){ 00542 //copyPointWithoutInit(&setA[j],¢res[choosenPoints]); 00543 centres[choosenPoints] = setA[j].clone(); 00544 } else { 00545 j = j - n_1; 00546 //copyPointWithoutInit(&setB[j],¢res[choosenPoints]); 00547 centres[choosenPoints] = setB[j].clone(); 00548 } 00549 treeNode root = new treeNode(setA,setB,n_1,n_2, centres[choosenPoints],choosenPoints); //?? 00550 choosenPoints = 1; 00551 00552 //choose the remaining points 00553 while(choosenPoints < k){ 00554 if(root.cost > 0.0){ 00555 treeNode leaf = selectNode(root, clustererRandom); 00556 Point centre = chooseCentre(leaf, clustererRandom); 00557 split(leaf,centre,choosenPoints); 00558 //copyPointWithoutInit(centre,¢res[choosenPoints]); 00559 centres[choosenPoints] = centre; 00560 } else { 00561 //create a dummy point 00562 //copyPointWithoutInit(root.centre,¢res[choosenPoints]); 00563 centres[choosenPoints] = root.centre; 00564 int l; 00565 for(l=0;l<root.centre.dimension;l++){ 00566 centres[choosenPoints].coordinates[l] = -1 * 1000000; 00567 } 00568 centres[choosenPoints].id = -1; 00569 centres[choosenPoints].weight = 0.0; 00570 centres[choosenPoints].squareSum = 0.0; 00571 } 00572 00573 choosenPoints++; 00574 } 00575 00576 //free the tree 00577 freeTree(root); 00578 00579 //recalculate clustering features 00580 int i; 00581 for(i=0;i<n;i++){ 00582 00583 if(i < n_1) { 00584 00585 int index = setA[i].centreIndex; 00586 if(centres[index].id != setA[i].id){ 00587 centres[index].weight += setA[i].weight; 00588 centres[index].squareSum += setA[i].squareSum; 00589 int l; 00590 for(l=0;l<centres[index].dimension;l++){ 00591 if(setA[i].weight != 0.0){ 00592 centres[index].coordinates[l] += setA[i].coordinates[l]; 00593 } 00594 } 00595 } 00596 } else { 00597 00598 int index = setB[i-n_1].centreIndex; 00599 if(centres[index].id != setB[i-n_1].id){ 00600 centres[index].weight += setB[i-n_1].weight; 00601 centres[index].squareSum += setB[i-n_1].squareSum; 00602 int l; 00603 for(l=0;l<centres[index].dimension;l++){ 00604 if(setB[i-n_1].weight != 0.0){ 00605 centres[index].coordinates[l] += setB[i-n_1].coordinates[l]; 00606 } 00607 } 00608 } 00609 } 00610 } 00611 } 00612 00613 00614 }