MOA 12.03
Real Time Analytics for Data Streams
|
00001 /* 00002 * MetaMultilabelGenerator.java 00003 * Copyright (C) 2010 University of Waikato, Hamilton, New Zealand 00004 * @author Jesse Read ([email protected]) 00005 * 00006 * This program is free software; you can redistribute it and/or modify 00007 * it under the terms of the GNU General Public License as published by 00008 * the Free Software Foundation; either version 3 of the License, or 00009 * (at your option) any later version. 00010 * 00011 * This program is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 * GNU General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU General Public License 00017 * along with this program. If not, see <http://www.gnu.org/licenses/>. 00018 * 00019 */ 00020 package moa.streams.generators.multilabel; 00021 00022 import java.util.*; 00023 00024 import moa.core.InstancesHeader; 00025 import moa.core.MultilabelInstancesHeader; 00026 import moa.core.ObjectRepository; 00027 import moa.options.AbstractOptionHandler; 00028 import moa.options.FloatOption; 00029 import moa.options.ClassOption; 00030 import moa.options.IntOption; 00031 import moa.streams.*; 00032 import moa.tasks.TaskMonitor; 00033 00034 import weka.core.Attribute; 00035 import weka.core.FastVector; 00036 import weka.core.Instance; 00037 import weka.core.Instances; 00038 import weka.core.SparseInstance; 00039 import weka.core.Utils; 00040 00047 public class MetaMultilabelGenerator extends AbstractOptionHandler implements InstanceStream { 00048 00049 private static final long serialVersionUID = 1L; 00050 00051 public ClassOption binaryGeneratorOption = new ClassOption( 00052 "binaryGenerator", 's', "Binary Generator (use this option to specify the number of attributes, but specify two classes only).", InstanceStream.class, "generators.RandomTreeGenerator"); 00053 00054 public IntOption metaRandomSeedOption = new IntOption( 00055 "metaRandomSeed", 'm', "Random seed (for the meta process).", 1); 00056 00057 public IntOption numLabelsOption = new IntOption( 00058 "numLabels", 'c', "Number of labels.", 1); 00059 00060 public IntOption skewOption = new IntOption( 00061 "skew", 'k', "Skewed label distribution: 1 (default) = yes; 0 = no (relatively uniform).", 1, 0, 1); 00062 00063 public FloatOption labelCardinalityOption = new FloatOption( 00064 "labelCardinality", 'z', "Target label cardinality of resulting set", 1.5, 0.0, Integer.MAX_VALUE); 00065 00066 protected MultilabelInstancesHeader m_MultilabelInstancesHeader = null; 00067 00068 protected InstanceStream m_BinaryGenerator = null; 00069 00070 protected Instances multilabelStreamTemplate = null; 00071 00072 protected Random m_MetaRandom = null; 00073 00074 protected int m_N = 0, m_A = 0; 00075 00076 protected double m_Z = 0.0; 00077 00078 protected double skew[] = null, skew_n[] = null; 00079 00080 protected double matrix[][] = null; 00081 00082 protected ArrayList m_FeatureEffects[] = null; 00083 00084 @Override 00085 public void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) { 00086 this.restart(); 00087 } 00088 00089 @Override 00090 public void restart() { 00091 00092 // Extract option 'c' (number of classes(labels)) 00093 this.m_N = numLabelsOption.getValue(); 00094 00095 // Binary generator 00096 this.m_BinaryGenerator = (InstanceStream) getPreparedClassOption(this.binaryGeneratorOption); 00097 this.m_BinaryGenerator.restart(); 00098 00099 // Extract number of attributes (minus class-attribute) 00100 this.m_A = this.m_BinaryGenerator.getHeader().numAttributes() - 1; 00101 00102 // Random seed 00103 this.m_MetaRandom = new Random(this.metaRandomSeedOption.getValue()); 00104 00105 // Setup queue system (so that generated binary instances aren't 'wasted') 00106 this.queue = new LinkedList[2]; 00107 for (int i = 0; i < this.queue.length; i++) { 00108 this.queue[i] = new LinkedList<Instance>(); 00109 } 00110 00111 // Generate the multi-label header 00112 this.m_MultilabelInstancesHeader = generateMultilabelHeader(this.m_BinaryGenerator.getHeader()); 00113 00114 // Determine Z : label cardinality as a percentage of |L| (m_N) 00115 m_Z = labelCardinalityOption.getValue(); 00116 double z = m_Z; 00117 00118 // Chceck that the label sets we generate fit the label cardinality we specified 00119 while (true) { 00120 // Create the label skew 00121 this.skew = fillSkew(m_MetaRandom, z); 00122 // Create a normalised version of the skew (for wwhen we choose at least one label) 00123 this.skew_n = Arrays.copyOf(skew, skew.length); 00124 Utils.normalize(this.skew_n); 00125 // Create a matrix from the label skew 00126 this.matrix = fillMatrix(skew, m_Z / (double) m_N, m_MetaRandom); 00127 double total = 0.0; 00128 for (int i = 0; i < 10000; i++) { 00129 total += (generateSet(discreteRandomIndex(this.skew_n))).size(); 00130 } 00131 total /= 10000.0; 00132 if (total - m_Z < -0.1) { 00133 z += 0.1; 00134 } else if (total - m_Z > 0.1) { 00135 z -= 0.1; 00136 } else { 00137 break; 00138 } 00139 } 00140 00141 // Create the feature-label mappings 00142 m_FeatureEffects = getTopCombinations(m_N * 2); 00143 00144 } 00145 00149 protected MultilabelInstancesHeader generateMultilabelHeader(Instances si) { 00150 Instances mi = new Instances(si, 0, 0); 00151 mi.setClassIndex(-1); 00152 mi.deleteAttributeAt(mi.numAttributes() - 1); 00153 FastVector bfv = new FastVector(); 00154 bfv.addElement("0"); 00155 bfv.addElement("1"); 00156 for (int i = 0; i < this.m_N; i++) { 00157 mi.insertAttributeAt(new Attribute("class" + i, bfv), i); 00158 } 00159 this.multilabelStreamTemplate = mi; 00160 this.multilabelStreamTemplate.setRelationName("SYN_Z" + this.labelCardinalityOption.getValue() + "L" + this.m_N + "X" + m_A + "S" + metaRandomSeedOption.getValue() + ": -C " + this.m_N); 00161 this.multilabelStreamTemplate.setClassIndex(this.m_N); 00162 return new MultilabelInstancesHeader(multilabelStreamTemplate, m_N); 00163 } 00164 00171 private double[] fillSkew(Random r, double z) { 00172 double d[] = new double[m_N]; 00173 for (int i = 0; i < m_N; i++) { 00174 if (skewOption.getValue() >= 1) { 00175 d[i] = m_MetaRandom.nextDouble(); 00176 } else { 00177 d[i] = 1.0; 00178 } 00179 } 00180 Utils.normalize(d, Utils.sum(d) / z); 00181 for (int i = 0; i < m_N; i++) { 00182 if (Double.isNaN(d[i])) { 00183 d[i] = 0.01; 00184 } 00185 } 00186 return d; 00187 } 00188 00194 LinkedList<Instance> queue[] = null; 00195 00196 private Instance getNextWithBinary(int i) { 00197 int lim = 1000; 00198 if (queue[i].size() <= 0) { 00199 int c = -1; 00200 while (lim-- > 0) { 00201 Instance tinst = this.m_BinaryGenerator.nextInstance(); 00202 //System.err.println("next binary : "+tinst); 00203 c = (int) Math.round(tinst.classValue()); 00204 if (i == c) { 00205 return tinst; 00206 } else if (queue[c].size() < 100) { 00207 queue[c].add(tinst); 00208 } 00209 } 00210 System.err.println("[Overflow] The binary stream is too skewed, could not get an example of class " + i + ""); 00211 System.exit(1); 00212 return null; 00213 } else { 00214 return queue[i].remove(); 00215 } 00216 } 00217 00223 private int labelCorrelation(ArrayList<Integer> lbls) { 00224 double r[] = new double[m_N]; 00225 Arrays.fill(r, 1.0); 00226 for (int l : lbls) { 00227 //get row 00228 for (int j = 0; j < matrix[l].length; j++) { 00229 // *= P(j|l) (probability of label 'j', given that label 'l' is in the set 00230 r[j] = (j == l) ? 0.0 : r[j] * matrix[j][l]; 00231 } 00232 } 00233 return discreteRandomIndex(r); 00234 } 00235 00240 @Override 00241 public Instance nextInstance() { 00242 00243 try { 00244 return generateMLInstance(generateSet(discreteRandomIndex(this.skew_n))); 00245 } catch (Exception e) { 00246 e.printStackTrace(); 00247 System.exit(1); 00248 } 00249 return null; 00250 } 00251 00252 private ArrayList generateSet(int l) { 00253 ArrayList<Integer> lbls = new ArrayList<Integer>(); 00254 while (l >= 0) { 00255 lbls.add(l); 00256 l = labelCorrelation(lbls); 00257 } 00258 return lbls; 00259 } 00260 00264 private Instance generateMLInstance(ArrayList<Integer> lbls) throws Exception { 00265 00266 // create a multi-label instance : 00267 Instance ml_x = new SparseInstance(this.multilabelStreamTemplate.numAttributes()); 00268 ml_x.setDataset(this.multilabelStreamTemplate); 00269 00270 // set classes 00271 for (int i = 0; i < m_N; i++) { 00272 ml_x.setValue(i, 0.0); 00273 } 00274 for (int i = 0; i < lbls.size(); i++) { 00275 ml_x.setValue(lbls.get(i), 1.0); 00276 } 00277 00278 // generate binary instances 00279 Instance binary0 = getNextWithBinary(0); 00280 Instance binary1 = getNextWithBinary(1); 00281 00282 // Loop through each feature attribute @warning: assumes class is last index 00283 for (int a = 0; a < m_A; a++) { 00284 00285 // The combination is present: use a positive value 00286 if (lbls.containsAll(m_FeatureEffects[a % m_FeatureEffects.length])) { 00287 ml_x.setValue(m_N + a, binary1.value(a)); 00288 } // The combination is absent: use a negative value 00289 else { 00290 ml_x.setValue(m_N + a, binary0.value(a)); 00291 } 00292 } 00293 00294 return ml_x; 00295 00296 } 00297 00303 private int discreteRandomIndex(double p[]) { 00304 00305 double r = m_MetaRandom.nextDouble(); 00306 00307 if (Utils.sum(p) <= r || Double.isNaN(Utils.sum(p))) { 00308 return -1; //m_MetaRandom.nextInt(p.length); 00309 } 00310 int i = 0; 00311 double sum = 0.0; 00312 while (r > sum) { 00313 // won't be selecting anything 00314 if (i >= p.length) { 00315 return -1; 00316 } 00317 sum += p[i++]; 00318 } 00319 //System.out.println("i="+i); 00320 return i - 1; 00321 } 00322 00323 protected static double genE(int i, double L) { 00324 return L * Math.pow(Math.E, -L * i); 00325 } 00326 00335 protected double[][] fillMatrix(double skew[], double Z, Random r) { 00336 00337 this.matrix = new double[skew.length][skew.length]; 00338 00339 //System.out.println("skew "+Arrays.toString(skew)); 00340 00341 for (int i = 0; i < skew.length; i++) { 00342 matrix[i][i] = Utils.roundDouble(skew[i], 3); 00343 } 00344 00345 for (int i = 0; i < matrix.length; i++) { 00346 for (int j = i + 1; j < matrix[i].length; j++) { 00347 // label-dependence factors 00348 if (r.nextDouble() <= (Z * 2.0)) { 00349 matrix[i][j] = randFromRange(min(P(i), P(j)), max(P(i), P(j))); 00350 matrix[j][i] = (matrix[i][j] * matrix[i][i]) / matrix[j][j]; // Bayes Rule 00351 } // label-exclusivity factors 00352 else { 00353 matrix[i][j] = min(P(i), P(j)); 00354 matrix[j][i] = (matrix[i][j] * matrix[j][j]) / matrix[i][i]; // Bayes Rule 00355 } 00356 // this is just rounding 00357 matrix[i][j] = Utils.roundDouble(matrix[i][j], 3); 00358 matrix[j][i] = Utils.roundDouble(matrix[j][i], 3); 00359 } 00360 } 00361 00362 return matrix; 00363 } 00364 00365 protected double randFromRange(double min, double max) { 00366 return min + genE(m_MetaRandom.nextInt(5), (max - min)); 00367 } 00368 00369 // P(i) 00370 protected double P(int i) { 00371 return matrix[i][i]; 00372 } 00373 00374 // P(i|j) 00375 protected double P(int i, int j) { 00376 return matrix[i][j]; 00377 } 00378 00379 // the highest possible prob. of P(A|B) given A and B 00380 protected double max(double A, double B) { 00381 return Math.min(1.0, (B / A)); 00382 } 00383 00384 // the lowest possible prob. of P(A|B) given A and B 00385 protected double min(double A, double B) { 00386 return Math.max(0.0, (-1.0 + A + B)); 00387 } 00388 00393 private ArrayList[] getTopCombinations(int n) { 00394 00395 HashMap<String, Integer> top = new HashMap<String, Integer>(); 00396 00397 for (int i = 0; i < 10000; i++) { 00398 String s = arrayToString(generateSet(discreteRandomIndex(this.skew_n)), m_N); 00399 top.put(s, top.get(s) != null ? top.get(s) + 1 : 1); 00400 } 00401 00402 HashMap<String, Integer> rating = getAsReverseSortedHashMap(top); 00403 00404 ArrayList al[] = new ArrayList[rating.size()]; 00405 int i = 0; 00406 for (String s : rating.keySet()) { 00407 al[i++] = stringToArray(s); 00408 } 00409 return al; 00410 } 00411 00412 // auxilliary functions follow 00413 private static HashMap<String, Integer> getAsReverseSortedHashMap(HashMap<String, Integer> c) { 00414 00415 Map<String, Integer> tempMap = new HashMap<String, Integer>(); 00416 for (String wsState : c.keySet()) { 00417 tempMap.put(wsState, c.get(wsState)); 00418 } 00419 00420 List<String> mapKeys = new ArrayList<String>(tempMap.keySet()); 00421 List<Integer> mapValues = new ArrayList<Integer>(tempMap.values()); 00422 HashMap<String, Integer> sortedMap = new LinkedHashMap<String, Integer>(); 00423 TreeSet<Integer> sortedSet = new TreeSet<Integer>(mapValues); 00424 Object[] sortedArray = sortedSet.toArray(); 00425 int size = sortedArray.length; 00426 for (int i = 0; i < size; i++) { 00427 sortedMap.put(mapKeys.get(mapValues.indexOf(sortedArray[size - 1 - i])), (Integer) sortedArray[size - 1 - i]); 00428 } 00429 return sortedMap; 00430 } 00431 00432 private static ArrayList stringToArray(String s) { 00433 ArrayList al = new ArrayList(); 00434 for (int i = 0; i < s.length(); i++) { 00435 if (s.charAt(i) == '1') { 00436 al.add(i); 00437 } 00438 } 00439 return al; 00440 } 00441 00442 private static String arrayToString(ArrayList<Integer> lbls, int N) { 00443 StringBuilder sb = new StringBuilder(N); 00444 for (int i = 0; i < N; i++) { 00445 sb.append('0'); 00446 } 00447 for (int l : lbls) { 00448 sb.setCharAt(l, '1'); 00449 } 00450 return sb.toString(); 00451 } 00452 00453 @Override 00454 public InstancesHeader getHeader() { 00455 return m_MultilabelInstancesHeader; 00456 } 00457 00458 @Override 00459 public String getPurposeString() { 00460 return "Generates a multi-label stream using a binary generator."; 00461 } 00462 00463 @Override 00464 public long estimatedRemainingInstances() { 00465 return -1; 00466 } 00467 00468 @Override 00469 public boolean hasMoreInstances() { 00470 return true; 00471 } 00472 00473 @Override 00474 public boolean isRestartable() { 00475 return true; 00476 } 00477 00478 @Override 00479 public void getDescription(StringBuilder sb, int indent) { 00480 // TODO Auto-generated method stub 00481 } 00482 }