MOA 12.03
Real Time Analytics for Data Streams
StatisticalCollection.java
Go to the documentation of this file.
00001 /*
00002  *    StatisticalCollection.java
00003  *    Copyright (C) 2010 RWTH Aachen University, Germany
00004  *    @author Jansen (moa@cs.rwth-aachen.de)
00005  *
00006  *    This program is free software; you can redistribute it and/or modify
00007  *    it under the terms of the GNU General Public License as published by
00008  *    the Free Software Foundation; either version 3 of the License, or
00009  *    (at your option) any later version.
00010  *
00011  *    This program is distributed in the hope that it will be useful,
00012  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  *    GNU General Public License for more details.
00015  *
00016  *    You should have received a copy of the GNU General Public License
00017  *    along with this program. If not, see <http://www.gnu.org/licenses/>.
00018  *    
00019  */
00020 
00021 package moa.evaluation;
00022 
00023 import java.util.ArrayList;
00024 import java.util.Arrays;
00025 import moa.cluster.Clustering;
00026 import moa.gui.visualization.DataPoint;
00027 
00028 
00029 public class StatisticalCollection extends MeasureCollection{
00030     private boolean debug = false;
00031 
00032     @Override
00033     protected String[] getNames() {
00034         //String[] names = {"van Dongen","Rand statistic", "C Index"};
00035         String[] names = {"van Dongen","Rand statistic"};
00036         return names;
00037     }
00038 
00039     @Override
00040     protected boolean[] getDefaultEnabled() {
00041         boolean [] defaults = {false, false};
00042         return defaults;
00043     }
00044 
00045     @Override
00046     public void evaluateClustering(Clustering clustering, Clustering trueClustering, ArrayList<DataPoint> points) throws Exception {
00047 
00048 
00049         MembershipMatrix mm = new MembershipMatrix(clustering, points);
00050         int numClasses = mm.getNumClasses();
00051         int numCluster = clustering.size()+1;
00052         int n = mm.getTotalEntries();
00053 
00054         double dongenMaxFC = 0;
00055         double dongenMaxSumFC = 0;
00056         for (int i = 0; i < numCluster; i++){
00057                 double max = 0;
00058                 for (int j = 0; j < numClasses; j++) {
00059                     if(mm.getClusterClassWeight(i, j)>max) max = mm.getClusterClassWeight(i, j);
00060                 }
00061                 dongenMaxFC+=max;
00062                 if(mm.getClusterSum(i)>dongenMaxSumFC) dongenMaxSumFC = mm.getClusterSum(i);
00063         }
00064 
00065         double dongenMaxHC = 0;
00066         double dongenMaxSumHC = 0;
00067         for (int j = 0; j < numClasses; j++) {
00068                 double max = 0;
00069                 for (int i = 0; i < numCluster; i++){
00070                     if(mm.getClusterClassWeight(i, j)>max) max = mm.getClusterClassWeight(i, j);
00071                 }
00072                 dongenMaxHC+=max;
00073                 if(mm.getClassSum(j)>dongenMaxSumHC) dongenMaxSumHC = mm.getClassSum(j);
00074         }
00075 
00076         double dongen = (dongenMaxFC + dongenMaxHC)/(2*n);
00077         //normalized dongen
00078         //double dongen = 1-(2*n - dongenMaxFC - dongenMaxHC)/(2*n - dongenMaxSumFC - dongenMaxSumHC);
00079         if(debug)
00080             System.out.println("Dongen HC:"+dongenMaxHC+" FC:"+dongenMaxFC+" Total:"+dongen+" n "+n);
00081 
00082         addValue("van Dongen", dongen);
00083 
00084 
00085         //Rand index
00086         //http://www.cais.ntu.edu.sg/~qihe/menu4.html
00087         double m1 = 0;
00088         for (int j = 0; j < numClasses; j++) {
00089             double v = mm.getClassSum(j);
00090             m1+= v*(v-1)/2.0;
00091         }
00092         double m2 = 0;
00093         for (int i = 0; i < numCluster; i++){
00094             double v = mm.getClusterSum(i);
00095             m2+= v*(v-1)/2.0;
00096         }
00097 
00098         double m = 0;
00099         for (int i = 0; i < numCluster; i++){
00100             for (int j = 0; j < numClasses; j++) {
00101                     double v = mm.getClusterClassWeight(i, j);
00102                     m+= v*(v-1)/2.0;
00103                 }
00104         }
00105         double M = n*(n-1)/2.0;
00106         double rand = (M - m1 - m2 +2*m)/M;
00107         //normalized rand
00108         //double rand = (m - m1*m2/M)/(m1/2.0 + m2/2.0 - m1*m2/M);
00109 
00110         addValue("Rand statistic", rand);
00111 
00112 
00113         //addValue("C Index",cindex(clustering, points));
00114     }
00115 
00116 
00117 
00118     public double cindex(Clustering clustering,  ArrayList<DataPoint> points){
00119         int numClusters = clustering.size();
00120         double withinClustersDistance = 0;
00121         int numDistancesWithin = 0;
00122         double numDistances = 0;
00123 
00124         //double[] withinClusters = new double[numClusters];
00125         double[] minWithinClusters = new double[numClusters];
00126         double[] maxWithinClusters = new double[numClusters];
00127         ArrayList<Integer>[] pointsInClusters = new ArrayList[numClusters];
00128         for (int c = 0; c < numClusters; c++) {
00129             pointsInClusters[c] = new ArrayList<Integer>();
00130             minWithinClusters[c] = Double.MAX_VALUE;
00131             maxWithinClusters[c] = Double.MIN_VALUE;
00132         }
00133 
00134         for (int p = 0; p < points.size(); p++) {
00135             for (int c = 0; c < clustering.size(); c++) {
00136                 if(clustering.get(c).getInclusionProbability(points.get(p)) > 0.8){
00137                     pointsInClusters[c].add(p);
00138                     numDistances++;
00139                 }
00140             }
00141         }
00142 
00143         //calc within cluster distances + min and max values
00144         for (int c = 0; c < numClusters; c++) {
00145             int numDistancesInC = 0;
00146             ArrayList<Integer> pointsInC = pointsInClusters[c];
00147             for (int p = 0; p < pointsInC.size(); p++) {
00148                 DataPoint point = points.get(pointsInC.get(p));
00149                 for (int p1 = p+1; p1 < pointsInC.size(); p1++) {
00150                     numDistancesWithin++;
00151                     numDistancesInC++;
00152                     DataPoint point1 = points.get(pointsInC.get(p1));
00153                     double dist = point.getDistance(point1);
00154                     withinClustersDistance+=dist;
00155                     if(minWithinClusters[c] > dist) minWithinClusters[c] = dist;
00156                     if(maxWithinClusters[c] < dist) maxWithinClusters[c] = dist;
00157                 }
00158             }
00159         }
00160 
00161         double minWithin = Double.MAX_VALUE;
00162         double maxWithin = Double.MIN_VALUE;
00163         for (int c = 0; c < numClusters; c++) {
00164             if(minWithinClusters[c] < minWithin)
00165                minWithin = minWithinClusters[c];
00166             if(maxWithinClusters[c] > maxWithin)
00167                maxWithin = maxWithinClusters[c];
00168         }
00169 
00170         double cindex = 0;
00171         if(numDistancesWithin != 0){
00172             double meanWithinClustersDistance = withinClustersDistance/numDistancesWithin;
00173             cindex = (meanWithinClustersDistance - minWithin)/(maxWithin-minWithin);
00174         }
00175 
00176 
00177         if(debug){
00178             System.out.println("Min:"+Arrays.toString(minWithinClusters));
00179             System.out.println("Max:"+Arrays.toString(maxWithinClusters));
00180             System.out.println("totalWithin:"+numDistancesWithin);
00181         }
00182         return cindex;
00183     }
00184 
00185 
00186 }
 All Classes Namespaces Files Functions Variables Enumerations