MOA 12.03
Real Time Analytics for Data Streams
CMM.java
Go to the documentation of this file.
00001 /*
00002  *    CMM.java
00003  *    Copyright (C) 2010 RWTH Aachen University, Germany
00004  *    @author Jansen (moa@cs.rwth-aachen.de)
00005  *
00006  *    This program is free software; you can redistribute it and/or modify
00007  *    it under the terms of the GNU General Public License as published by
00008  *    the Free Software Foundation; either version 3 of the License, or
00009  *    (at your option) any later version.
00010  *
00011  *    This program is distributed in the hope that it will be useful,
00012  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  *    GNU General Public License for more details.
00015  *
00016  *    You should have received a copy of the GNU General Public License
00017  *    along with this program. If not, see <http://www.gnu.org/licenses/>.
00018  *    
00019  */
00020 
00021 
00036 package moa.evaluation;
00037 
00038 import java.util.ArrayList;
00039 import moa.cluster.Cluster;
00040 import moa.cluster.Clustering;
00041 import moa.cluster.SphereCluster;
00042 import moa.evaluation.CMM_GTAnalysis.CMMPoint;
00043 import moa.gui.visualization.DataPoint;
00044 
00045 
00046 public class CMM extends MeasureCollection{
00050         private Clustering clustering;
00051 
00055     private CMM_GTAnalysis gtAnalysis;
00056 
00060     private int numPoints;
00061     
00065     private int numFClusters;
00066     
00071     private int numGT0Classes;
00072 
00076     private int matchMap[]; 
00077     
00082     private double[][] pointInclusionProbFC;
00083     
00087     private double pointInclusionProbThreshold = 0.5;
00088     
00092     private double lamdaMissed = 1;
00093 
00094     
00098     public boolean debug = false;
00099 
00100     
00104     public boolean enableClassMerge = true;
00105     
00110     public boolean enableModelError = true;
00111     
00112 
00113     @Override
00114     protected String[] getNames() {
00115         String[] names = {"CMM","CMM Basic","CMM Missed","CMM Misplaced","CMM Noise",
00116                             "CA Seperability", "CA Noise", "CA Modell"};
00117         return names;
00118     }
00119 
00120     @Override
00121     protected boolean[] getDefaultEnabled() {
00122         boolean [] defaults = {false, false, false, false, false, false, false, false};
00123         return defaults;
00124     }
00125 
00126     
00127     @Override
00128     public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) throws Exception{
00129         this.clustering = clustering;
00130 
00131         numPoints = points.size();
00132         numFClusters = clustering.size();
00133 
00134         gtAnalysis = new CMM_GTAnalysis(trueClustering, points, enableClassMerge);
00135 
00136         numGT0Classes = gtAnalysis.getNumberOfGT0Classes();
00137 
00138         addValue("CA Seperability",gtAnalysis.getClassSeparability());
00139         addValue("CA Noise",gtAnalysis.getNoiseSeparability());
00140         addValue("CA Modell",gtAnalysis.getModelQuality());
00141 
00142         /* init the matching and point distances */
00143         calculateMatching();
00144 
00145         /* calculate the actual error */
00146         calculateError();
00147     }
00148 
00149     
00153     private void calculateMatching(){
00154 
00158         int[][] mapFC = new int[numFClusters][numGT0Classes];
00159 
00163         int[][] mapGT = new int[numGT0Classes][numGT0Classes];
00164         int [] sumsFC = new int[numFClusters];
00165 
00166         //calculate fuzzy mapping from
00167         pointInclusionProbFC = new double[numPoints][numFClusters];
00168         for (int p = 0; p < numPoints; p++) {
00169             CMMPoint cmdp = gtAnalysis.getPoint(p);
00170             //found cluster frequencies
00171             for (int fc = 0; fc < numFClusters; fc++) {
00172                 Cluster cl = clustering.get(fc);
00173                 pointInclusionProbFC[p][fc] = cl.getInclusionProbability(cmdp);
00174                 if (pointInclusionProbFC[p][fc] >= pointInclusionProbThreshold) {
00175                     //make sure we don't count points twice that are contained in two merged clusters
00176                     if(cmdp.isNoise()) continue;
00177                     mapFC[fc][cmdp.workclass()]++;
00178                     sumsFC[fc]++;
00179                 }
00180             }
00181 
00182             //ground truth cluster frequencies
00183             if(!cmdp.isNoise()){
00184                 for(int hc = 0; hc < numGT0Classes;hc++){
00185                     if(hc == cmdp.workclass()){
00186                         mapGT[hc][hc]++;
00187                     }
00188                     else{
00189                         if(gtAnalysis.getGT0Cluster(hc).getInclusionProbability(cmdp) >= 1){
00190                             mapGT[hc][cmdp.workclass()]++;
00191                         }
00192                     }
00193                 }
00194             }
00195         }
00196 
00197         //assign each found cluster to a hidden cluster
00198         matchMap = new int[numFClusters];
00199         for (int fc = 0; fc < numFClusters; fc++) {
00200             int matchIndex = -1;
00201             //check if we only have one entry anyway
00202             for (int hc0 = 0; hc0 < numGT0Classes; hc0++) {
00203                 if(mapFC[fc][hc0]!=0){
00204                     if(matchIndex == -1)
00205                         matchIndex = hc0;
00206                     else{
00207                         matchIndex = -1;
00208                         break;
00209                     }
00210                 }
00211             }
00212 
00213             //more then one entry, so look for most similar frequency profile
00214             int minDiff = Integer.MAX_VALUE;
00215             if(sumsFC[fc]!=0 && matchIndex == -1){
00216                 ArrayList<Integer> fitCandidates = new ArrayList<Integer>();
00217                 for (int hc0 = 0; hc0 < numGT0Classes; hc0++) {
00218                     int errDiff = 0;
00219                     for (int hc1 = 0; hc1 < numGT0Classes; hc1++) {
00220                         //fc profile doesn't fit into current hc profile
00221                         double freq_diff = mapFC[fc][hc1] - mapGT[hc0][hc1];
00222                         if(freq_diff > 0){
00223                             errDiff+= freq_diff;
00224                         }
00225                     }
00226                     if(errDiff == 0){
00227                         fitCandidates.add(hc0);
00228                     }
00229                     if(errDiff < minDiff){
00230                         minDiff = errDiff;
00231                         matchIndex = hc0;
00232                     }
00233                     if(debug){
00234                         //System.out.println("FC"+fc+"("+Arrays.toString(mapFC[fc])+") - HC0_"+hc0+"("+Arrays.toString(mapGT[hc0])+"):"+errDiff);
00235                     }
00236                 }
00237                 //if we have a fitting profile overwrite the min error choice
00238                 //if we have multiple fit candidates, use majority vote of corresponding classes
00239                 if(fitCandidates.size()!=0){
00240                     int bestGTfit = fitCandidates.get(0);
00241                     for(int i = 1; i < fitCandidates.size(); i++){
00242                         int GTfit = fitCandidates.get(i);
00243                         if(mapFC[fc][GTfit] > mapFC[fc][bestGTfit])
00244                             bestGTfit=fitCandidates.get(i);
00245                     }
00246                     matchIndex = bestGTfit;
00247                 }
00248             }
00249             
00250             matchMap[fc] = matchIndex;
00251             int realMatch = -1;
00252             if(matchIndex==-1){
00253                 if(debug)
00254                     System.out.println("No cluster match: needs to be implemented?");
00255             }
00256             else{
00257                 realMatch = gtAnalysis.getGT0Cluster(matchMap[fc]).getLabel();
00258             }
00259             clustering.get(fc).setMeasureValue("CMM Match", "C"+realMatch);
00260             clustering.get(fc).setMeasureValue("CMM Workclass", "C"+matchMap[fc]);
00261         }
00262 
00263         //print matching table
00264         if(debug){
00265             for (int i = 0; i < numFClusters; i++) {
00266                     System.out.print("C"+((int)clustering.get(i).getId()) + " N:"+((int)clustering.get(i).getWeight())+"  |  ");
00267                     for (int j = 0; j < numGT0Classes; j++) {
00268                           System.out.print(mapFC[i][j] + " ");
00269                     }
00270                     System.out.print(" = "+sumsFC[i] + " | ");
00271                     String match = "-";
00272                     if (matchMap[i]!=-1) {
00273                         match = Integer.toString(gtAnalysis.getGT0Cluster(matchMap[i]).getLabel());
00274                     }
00275                     System.out.println(" --> " + match + "(work:"+matchMap[i]+")");
00276              }
00277         }
00278     }
00279     
00280     
00284     private void calculateError(){
00285         int totalErrorCount = 0;
00286         int totalRedundancy = 0;
00287         int trueCoverage = 0;
00288         int totalCoverage = 0;
00289 
00290         int numNoise = 0;
00291         double errorNoise = 0;
00292         double errorNoiseMax = 0;
00293 
00294         double errorMissed = 0;
00295         double errorMissedMax = 0;
00296 
00297         double errorMisplaced = 0;
00298         double errorMisplacedMax = 0;
00299 
00300         double totalError = 0.0;
00301         double totalErrorMax = 0.0;
00302 
00306         for (int p = 0; p < numPoints; p++) {
00307             CMMPoint cmdp = gtAnalysis.getPoint(p);
00308             double weight = cmdp.weight();
00309             //noise counter
00310             if(cmdp.isNoise()){
00311                 numNoise++;
00312                 //this is always 1
00313                 errorNoiseMax+=cmdp.connectivity*weight;
00314             }
00315             else{
00316                 errorMissedMax+=cmdp.connectivity*weight;
00317                 errorMisplacedMax+=cmdp.connectivity*weight;
00318             }
00319             //sum up maxError as the individual errors are the quality weighted between 0-1
00320             totalErrorMax+=cmdp.connectivity*weight;
00321 
00322 
00323             double err = 0;
00324             int coverage = 0;
00325 
00326             //check every FCluster
00327             for (int c = 0; c < numFClusters; c++) {
00328                 //contained in cluster c?
00329                 if(pointInclusionProbFC[p][c] >= pointInclusionProbThreshold){
00330                     coverage++;
00331 
00332                     if(!cmdp.isNoise()){
00333                         //PLACED CORRECTLY
00334                         if(matchMap[c] == cmdp.workclass()){
00335                         }
00336                         //MISPLACED
00337                         else{
00338                             double errvalue = misplacedError(cmdp, c);
00339                             if(errvalue > err)
00340                                 err = errvalue;
00341                         }
00342                     }
00343                     else{
00344                         //NOISE
00345                         double errvalue = noiseError(cmdp, c);
00346                         if(errvalue > err) err = errvalue;
00347                     }
00348                 }
00349             }
00350             //not in any cluster
00351             if(coverage == 0){
00352                 //MISSED
00353                 if(!cmdp.isNoise()){
00354                     err = missedError(cmdp,true);
00355                     errorMissed+= weight*err;
00356                 }
00357                 //NOISE
00358                 else{
00359                 }
00360             }
00361             else{
00362                 if(!cmdp.isNoise()){
00363                     errorMisplaced+= err*weight;
00364                 }
00365                 else{
00366                     errorNoise+= err*weight;
00367                 }
00368             }
00369 
00370             /* processing of other evaluation values */
00371             totalError+= err*weight;
00372             if(err!=0)totalErrorCount++;
00373             if(coverage>0) totalCoverage++;  //points covered by clustering (incl. noise)
00374             if(coverage>0 && !cmdp.isNoise()) trueCoverage++; //points covered by clustering, don't count noise
00375             if(coverage>1) totalRedundancy++; //include noise
00376 
00377             cmdp.p.setMeasureValue("CMM",err);
00378             cmdp.p.setMeasureValue("Redundancy", coverage);
00379         }
00380 
00381         addValue("CMM", (totalErrorMax!=0)?1-totalError/totalErrorMax:1);
00382         addValue("CMM Missed", (errorMissedMax!=0)?1-errorMissed/errorMissedMax:1);
00383         addValue("CMM Misplaced", (errorMisplacedMax!=0)?1-errorMisplaced/errorMisplacedMax:1);
00384         addValue("CMM Noise", (errorNoiseMax!=0)?1-errorNoise/errorNoiseMax:1);
00385         addValue("CMM Basic", 1-((double)totalErrorCount/(double)numPoints));
00386 
00387         if(debug){
00388             System.out.println("-------------");
00389         }
00390     }
00391 
00392 
00393     private double noiseError(CMMPoint cmdp, int assignedClusterID){
00394         int gtAssignedID = matchMap[assignedClusterID];
00395         double error;
00396         
00397         //Cluster wasn't matched, so just contains noise
00398         //TODO: Noiscluster?
00399         //also happens when we decrease the radius and there is only a noise point in the center
00400         if(gtAssignedID==-1){
00401             error = 1;
00402             cmdp.p.setMeasureValue("CMM Type","noise - cluster");
00403         }
00404         else{
00405             if(enableModelError && gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= pointInclusionProbThreshold){
00406                 //set to MIN_ERROR so we can still track the error
00407                 error = 0.00001;
00408                 cmdp.p.setMeasureValue("CMM Type","noise - byModel");
00409             }
00410             else{
00411                 error = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID);
00412                 cmdp.p.setMeasureValue("CMM Type","noise");
00413             }
00414         }
00415 
00416         return error;
00417     }
00418 
00419     private double missedError(CMMPoint cmdp, boolean useHullDistance){
00420         cmdp.p.setMeasureValue("CMM Type","missed");
00421         if(!useHullDistance){
00422             return cmdp.connectivity;
00423         }
00424         else{
00425             //main idea: look at relative distance of missed point to cluster
00426             double minHullDist = 1;
00427             for (int fc = 0; fc < numFClusters; fc++){
00428                 //if fc is mappend onto the class of the point, check it for its hulldist
00429                 if(matchMap[fc]!=-1 && matchMap[fc] == cmdp.workclass()){
00430                     if(clustering.get(fc) instanceof SphereCluster){
00431                         SphereCluster sc = (SphereCluster)clustering.get(fc);
00432                         double distanceFC = sc.getCenterDistance(cmdp);
00433                         double radius = sc.getRadius();
00434                         double hullDist = (distanceFC-radius)/(distanceFC+radius);
00435                         if(hullDist < minHullDist)
00436                             minHullDist = hullDist;
00437                     }
00438                     else{
00439                         double min = 1;
00440                         double max = 1;
00441 
00442                         //TODO: distance for random shape
00443                         //generate X points from the cluster with clustering.get(fc).sample(null)
00444                         //and find Min and Max values
00445 
00446                         double hullDist = min/max;
00447                         if(hullDist < minHullDist)
00448                             minHullDist = hullDist;
00449                     }
00450                 }
00451             }
00452 
00453             //use distance as weight
00454             if(minHullDist>1) minHullDist = 1;
00455 
00456             double weight = (1-Math.exp(-lamdaMissed*minHullDist));
00457             cmdp.p.setMeasureValue("HullDistWeight",weight);
00458 
00459             return weight*cmdp.connectivity;
00460         }
00461     }
00462 
00463 
00464     private double misplacedError(CMMPoint cmdp, int assignedClusterID){
00465         double weight = 0;
00466 
00467         int gtAssignedID = matchMap[assignedClusterID];
00468         //TODO take care of noise cluster?
00469         if(gtAssignedID ==-1){
00470             System.out.println("Point "+cmdp.getTimestamp()+" from gtcluster "+cmdp.trueClass+" assigned to noise cluster "+assignedClusterID);
00471             return 1;
00472         }
00473 
00474         if(gtAssignedID == cmdp.workclass())
00475             return 0;
00476         else{
00477             //assigned and real GT0 cluster are not connected, but does the model have the
00478             //chance of separating this point after all?
00479             if(enableModelError && gtAnalysis.getGT0Cluster(gtAssignedID).getInclusionProbability(cmdp) >= pointInclusionProbThreshold){
00480                 weight = 0;
00481                 cmdp.p.setMeasureValue("CMM Type","missplaced - byModel");
00482             }
00483             else{
00484                 //point was mapped onto wrong cluster (assigned), so check how far away
00485                 //the nearest point is within the wrongly assigned cluster
00486                 weight = 1 - gtAnalysis.getConnectionValue(cmdp, gtAssignedID);
00487             }
00488         }
00489         double err_value;
00490         //set to MIN_ERROR so we can still track the error
00491         if(weight == 0){
00492             err_value= 0.00001;
00493         }
00494         else{
00495             err_value = weight*cmdp.connectivity;
00496             cmdp.p.setMeasureValue("CMM Type","missplaced");
00497         }
00498 
00499         return err_value;
00500     }
00501 
00502     public String getParameterString(){
00503         String para = gtAnalysis.getParameterString();
00504         para+="lambdaMissed="+lamdaMissed+";";
00505         return para;
00506     }
00507 
00508 }
00509 
00510 
 All Classes Namespaces Files Functions Variables Enumerations