MOA 12.03
Real Time Analytics for Data Streams
|
00001 /* 00002 * CMM.java 00003 * Copyright (C) 2010 RWTH Aachen University, Germany 00004 * @author Jansen ([email protected]) 00005 * 00006 * This program is free software; you can redistribute it and/or modify 00007 * it under the terms of the GNU General Public License as published by 00008 * the Free Software Foundation; either version 3 of the License, or 00009 * (at your option) any later version. 00010 * 00011 * This program is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 * GNU General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU General Public License 00017 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00018 * 00019 */ 00020 00021 00036 package moa.evaluation; 00037 00038 import java.util.ArrayList; 00039 import moa.cluster.Cluster; 00040 import moa.cluster.Clustering; 00041 import moa.cluster.SphereCluster; 00042 import moa.evaluation.CMM_GTAnalysis.CMMPoint; 00043 import moa.gui.visualization.DataPoint; 00044 00045 00046 public class CMM extends MeasureCollection{ 00050 private Clustering clustering; 00051 00055 private CMM_GTAnalysis gtAnalysis; 00056 00060 private int numPoints; 00061 00065 private int numFClusters; 00066 00071 private int numGT0Classes; 00072 00076 private int matchMap[]; 00077 00082 private double[][] pointInclusionProbFC; 00083 00087 private double pointInclusionProbThreshold = 0.5; 00088 00092 private double lamdaMissed = 1; 00093 00094 00098 public boolean debug = false; 00099 00100 00104 public boolean enableClassMerge = true; 00105 00110 public boolean enableModelError = true; 00111 00112 00113 @Override 00114 protected String[] getNames() { 00115 String[] names = {"CMM","CMM Basic","CMM Missed","CMM Misplaced","CMM Noise", 00116 "CA Seperability", "CA Noise", "CA Modell"}; 00117 return names; 00118 } 00119 00120 @Override 00121 protected boolean[] getDefaultEnabled() { 00122 boolean [] defaults = {false, false, false, false, false, false, false, false}; 00123 return defaults; 00124 } 00125 00126 00127 @Override 00128 public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) throws Exception{ 00129 this.clustering = clustering; 00130 00131 numPoints = points.size(); 00132 numFClusters = clustering.size(); 00133 00134 gtAnalysis = new CMM_GTAnalysis(trueClustering, points, enableClassMerge); 00135 00136 numGT0Classes = gtAnalysis.getNumberOfGT0Classes(); 00137 00138 addValue("CA Seperability",gtAnalysis.getClassSeparability()); 00139 addValue("CA Noise",gtAnalysis.getNoiseSeparability()); 00140 addValue("CA Modell",gtAnalysis.getModelQuality()); 00141 00142 /* init the matching and point distances */ 00143 calculateMatching(); 00144 00145 /* calculate the actual error */ 00146 calculateError(); 00147 } 00148 00149 00153 private void calculateMatching(){ 00154 00158 int[][] mapFC = new int[numFClusters][numGT0Classes]; 00159 00163 int[][] mapGT = new int[numGT0Classes][numGT0Classes]; 00164 int [] sumsFC = new int[numFClusters]; 00165 00166 //calculate fuzzy mapping from 00167 pointInclusionProbFC = new double[numPoints][numFClusters]; 00168 for (int p = 0; p < numPoints; p++) { 00169 CMMPoint cmdp = gtAnalysis.getPoint(p); 00170 //found cluster frequencies 00171 for (int fc = 0; fc < numFClusters; fc++) { 00172 Cluster cl = clustering.get(fc); 00173 pointInclusionProbFC[p][fc] = cl.getInclusionProbability(cmdp); 00174 if (pointInclusionProbFC[p][fc] >= pointInclusionProbThreshold) { 00175 //make sure we don't count points twice that are contained in two merged clusters 00176 if(cmdp.isNoise()) continue; 00177 mapFC[fc][cmdp.workclass()]++; 00178 sumsFC[fc]++; 00179 } 00180 } 00181 00182 //ground truth cluster frequencies 00183 if(!cmdp.isNoise()){ 00184 for(int hc = 0; hc < numGT0Classes;hc++){ 00185 if(hc == cmdp.workclass()){ 00186 mapGT[hc][hc]++; 00187 } 00188 else{ 00189 if(gtAnalysis.getGT0Cluster(hc).getInclusionProbability(cmdp) >= 1){ 00190 mapGT[hc][cmdp.workclass()]++; 00191 } 00192 } 00193 } 00194 } 00195 } 00196 00197 //assign each found cluster to a hidden cluster 00198 matchMap = new int[numFClusters]; 00199 for (int fc = 0; fc < numFClusters; fc++) { 00200 int matchIndex = -1; 00201 //check if we only have one entry anyway 00202 for (int hc0 = 0; hc0 < numGT0Classes; hc0++) { 00203 if(mapFC[fc][hc0]!=0){ 00204 if(matchIndex == -1) 00205 matchIndex = hc0; 00206 else{ 00207 matchIndex = -1; 00208 break; 00209 } 00210 } 00211 } 00212 00213 //more then one entry, so look for most similar frequency profile 00214 int minDiff = Integer.MAX_VALUE; 00215 if(sumsFC[fc]!=0 && matchIndex == -1){ 00216 ArrayList<Integer> fitCandidates = new ArrayList<Integer>(); 00217 for (int hc0 = 0; hc0 < numGT0Classes; hc0++) { 00218 int errDiff = 0; 00219 for (int hc1 = 0; hc1 < numGT0Classes; hc1++) { 00220 //fc profile doesn't fit into current hc profile 00221 double freq_diff = mapFC[fc][hc1] - mapGT[hc0][hc1]; 00222 if(freq_diff > 0){ 00223 errDiff+= freq_diff; 00224 } 00225 } 00226 if(errDiff == 0){ 00227 fitCandidates.add(hc0); 00228 } 00229 if(errDiff < minDiff){ 00230 minDiff = errDiff; 00231 matchIndex = hc0; 00232 } 00233 if(debug){ 00234 //System.out.println("FC"+fc+"("+Arrays.toString(mapFC[fc])+") - HC0_"+hc0+"("+Arrays.toString(mapGT[hc0])+"):"+errDiff); 00235 } 00236 } 00237 //if we have a fitting profile overwrite the min error choice 00238 //if we have multiple fit candidates, use majority vote of corresponding classes 00239 if(fitCandidates.size()!=0){ 00240 int bestGTfit = fitCandidates.get(0); 00241 for(int i = 1; i < fitCandidates.size(); i++){ 00242 int GTfit = fitCandidates.get(i); 00243 if(mapFC[fc][GTfit] > mapFC[fc][bestGTfit]) 00244 bestGTfit=fitCandidates.get(i); 00245 } 00246 matchIndex = bestGTfit; 00247 } 00248 } 00249 00250 matchMap[fc] = matchIndex; 00251 int realMatch = -1; 00252 if(matchIndex==-1){ 00253 if(debug) 00254 System.out.println("No cluster match: needs to be implemented?"); 00255 } 00256 else{ 00257 realMatch = gtAnalysis.getGT0Cluster(matchMap[fc]).getLabel(); 00258 } 00259 clustering.get(fc).setMeasureValue("CMM Match", "C"+realMatch); 00260 clustering.get(fc).setMeasureValue("CMM Workclass", "C"+matchMap[fc]); 00261 } 00262 00263 //print matching table 00264 if(debug){ 00265 for (int i = 0; i < numFClusters; i++) { 00266 System.out.print("C"+((int)clustering.get(i).getId()) + " N:"+((int)clustering.get(i).getWeight())+" | "); 00267 for (int j = 0; j < numGT0Classes; j++) { 00268 System.out.print(mapFC[i][j] + " "); 00269 } 00270 System.out.print(" = "+sumsFC[i] + " | "); 00271 String match = "-"; 00272 if (matchMap[i]!=-1) { 00273 match = Integer.toString(gtAnalysis.getGT0Cluster(matchMap[i]).getLabel()); 00274 } 00275 System.out.println(" --> " + match + "(work:"+matchMap[i]+")"); 00276 } 00277 } 00278 } 00279 00280 00284 private void calculateError(){ 00285 int totalErrorCount = 0; 00286 int totalRedundancy = 0; 00287 int trueCoverage = 0; 00288 int totalCoverage = 0; 00289 00290 int numNoise = 0; 00291 double errorNoise = 0; 00292 double errorNoiseMax = 0; 00293 00294 double errorMissed = 0; 00295 double errorMissedMax = 0; 00296 00297 double errorMisplaced = 0; 00298 double errorMisplacedMax = 0; 00299 00300 double totalError = 0.0; 00301 double totalErrorMax = 0.0; 00302 00306 for (int p = 0; p < numPoints; p++) { 00307 CMMPoint cmdp = gtAnalysis.getPoint(p); 00308 double weight = cmdp.weight(); 00309 //noise counter 00310 if(cmdp.isNoise()){ 00311 numNoise++; 00312 //this is always 1 00313 errorNoiseMax+=cmdp.connectivity*weight; 00314 } 00315 else{ 00316 errorMissedMax+=cmdp.connectivity*weight; 00317 errorMisplacedMax+=cmdp.connectivity*weight; 00318 } 00319 //sum up maxError as the individual errors are the quality weighted between 0-1 00320 totalErrorMax+=cmdp.connectivity*weight; 00321 00322 00323 double err = 0; 00324 int coverage = 0; 00325 00326 //check every FCluster 00327 for (int c = 0; c < numFClusters; c++) { 00328 //contained in cluster c? 00329 if(pointInclusionProbFC[p][c] >= pointInclusionProbThreshold){ 00330 coverage++; 00331 00332 if(!cmdp.isNoise()){ 00333 //PLACED CORRECTLY 00334 if(matchMap[c] == cmdp.workclass()){ 00335 } 00336 //MISPLACED 00337 else{ 00338 double errvalue = misplacedError(cmdp, c); 00339 if(errvalue > err) 00340 err = errvalue; 00341 } 00342 } 00343 else{ 00344 //NOISE 00345 double errvalue = noiseError(cmdp, c); 00346 if(errvalue > err) err = errvalue; 00347 } 00348 } 00349 } 00350 //not in any cluster 00351 if(coverage == 0){ 00352 //MISSED 00353 if(!cmdp.isNoise()){ 00354 err = missedError(cmdp,true); 00355 errorMissed+= weight*err; 00356 } 00357 //NOISE 00358 else{ 00359 } 00360 } 00361 else{ 00362 if(!cmdp.isNoise()){ 00363 errorMisplaced+= err*weight; 00364 } 00365 else{ 00366 errorNoise+= err*weight; 00367 } 00368 } 00369 00370 /* processing of other evaluation values */ 00371 totalError+= err*weight; 00372 if(err!=0)totalErrorCount++; 00373 if(coverage>0) totalCoverage++; //points covered by clustering (incl. noise) 00374 if(coverage>0 && !cmdp.isNoise()) trueCoverage++; //points covered by clustering, don't count noise 00375 if(coverage>1) totalRedundancy++; //include noise 00376 00377 cmdp.p.setMeasureValue("CMM",err); 00378 cmdp.p.setMeasureValue("Redundancy", coverage); 00379 } 00380 00381 addValue("CMM", (totalErrorMax!=0)?1-totalError/totalErrorMax:1); 00382 addValue("CMM Missed", (errorMissedMax!=0)?1-errorMissed/errorMissedMax:1); 00383 addValue("CMM Misplaced", (errorMisplacedMax!=0)?1-errorMisplaced/errorMisplacedMax:1); 00384 addValue("CMM Noise", (errorNoiseMax!=0)?1-errorNoise/errorNoiseMax:1); 00385 addValue("CMM Basic", 1-((double)totalErrorCount/(double)numPoints)); 00386 00387 if(debug){ 00388 System.out.println("-------------"); 00389 } 00390 } 00391 00392 00393 private double noiseError(CMMPoint cmdp, int assignedClusterID){ 00394 int gtAssignedID = matchMap[assignedClusterID]; 00395 double error; 00396 00397 //Cluster wasn't matched, so just contains noise 00398 //TODO: Noiscluster? 00399 //also happens when we decrease the radius and there is only a noise point in the center 00400 if(gtAssignedID==-1){ 00401 error = 1; 00402 cmdp.p.setMeasureValue("CMM Type","noise - cluster"); 00403 } 00404 else{ 00405 if(enableModelError && gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= pointInclusionProbThreshold){ 00406 //set to MIN_ERROR so we can still track the error 00407 error = 0.00001; 00408 cmdp.p.setMeasureValue("CMM Type","noise - byModel"); 00409 } 00410 else{ 00411 error = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID); 00412 cmdp.p.setMeasureValue("CMM Type","noise"); 00413 } 00414 } 00415 00416 return error; 00417 } 00418 00419 private double missedError(CMMPoint cmdp, boolean useHullDistance){ 00420 cmdp.p.setMeasureValue("CMM Type","missed"); 00421 if(!useHullDistance){ 00422 return cmdp.connectivity; 00423 } 00424 else{ 00425 //main idea: look at relative distance of missed point to cluster 00426 double minHullDist = 1; 00427 for (int fc = 0; fc < numFClusters; fc++){ 00428 //if fc is mappend onto the class of the point, check it for its hulldist 00429 if(matchMap[fc]!=-1 && matchMap[fc] == cmdp.workclass()){ 00430 if(clustering.get(fc) instanceof SphereCluster){ 00431 SphereCluster sc = (SphereCluster)clustering.get(fc); 00432 double distanceFC = sc.getCenterDistance(cmdp); 00433 double radius = sc.getRadius(); 00434 double hullDist = (distanceFC-radius)/(distanceFC+radius); 00435 if(hullDist < minHullDist) 00436 minHullDist = hullDist; 00437 } 00438 else{ 00439 double min = 1; 00440 double max = 1; 00441 00442 //TODO: distance for random shape 00443 //generate X points from the cluster with clustering.get(fc).sample(null) 00444 //and find Min and Max values 00445 00446 double hullDist = min/max; 00447 if(hullDist < minHullDist) 00448 minHullDist = hullDist; 00449 } 00450 } 00451 } 00452 00453 //use distance as weight 00454 if(minHullDist>1) minHullDist = 1; 00455 00456 double weight = (1-Math.exp(-lamdaMissed*minHullDist)); 00457 cmdp.p.setMeasureValue("HullDistWeight",weight); 00458 00459 return weight*cmdp.connectivity; 00460 } 00461 } 00462 00463 00464 private double misplacedError(CMMPoint cmdp, int assignedClusterID){ 00465 double weight = 0; 00466 00467 int gtAssignedID = matchMap[assignedClusterID]; 00468 //TODO take care of noise cluster? 00469 if(gtAssignedID ==-1){ 00470 System.out.println("Point "+cmdp.getTimestamp()+" from gtcluster "+cmdp.trueClass+" assigned to noise cluster "+assignedClusterID); 00471 return 1; 00472 } 00473 00474 if(gtAssignedID == cmdp.workclass()) 00475 return 0; 00476 else{ 00477 //assigned and real GT0 cluster are not connected, but does the model have the 00478 //chance of separating this point after all? 00479 if(enableModelError && gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= pointInclusionProbThreshold){ 00480 weight = 0; 00481 cmdp.p.setMeasureValue("CMM Type","missplaced - byModel"); 00482 } 00483 else{ 00484 //point was mapped onto wrong cluster (assigned), so check how far away 00485 //the nearest point is within the wrongly assigned cluster 00486 weight = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID); 00487 } 00488 } 00489 double err_value; 00490 //set to MIN_ERROR so we can still track the error 00491 if(weight == 0){ 00492 err_value= 0.00001; 00493 } 00494 else{ 00495 err_value = weight*cmdp.connectivity; 00496 cmdp.p.setMeasureValue("CMM Type","missplaced"); 00497 } 00498 00499 return err_value; 00500 } 00501 00502 public String getParameterString(){ 00503 String para = gtAnalysis.getParameterString(); 00504 para+="lambdaMissed="+lamdaMissed+";"; 00505 return para; 00506 } 00507 00508 } 00509 00510