MOA 12.03
Real Time Analytics for Data Streams
MetaMultilabelGenerator.java
Go to the documentation of this file.
00001 /*
00002  *    MetaMultilabelGenerator.java
00003  *    Copyright (C) 2010 University of Waikato, Hamilton, New Zealand
00004  *    @author Jesse Read ([email protected])
00005  *
00006  *    This program is free software; you can redistribute it and/or modify
00007  *    it under the terms of the GNU General Public License as published by
00008  *    the Free Software Foundation; either version 3 of the License, or
00009  *    (at your option) any later version.
00010  *
00011  *    This program is distributed in the hope that it will be useful,
00012  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014  *    GNU General Public License for more details.
00015  *
00016  *    You should have received a copy of the GNU General Public License
00017  *    along with this program. If not, see <http://www.gnu.org/licenses/>.
00018  *    
00019  */
00020 package moa.streams.generators.multilabel;
00021 
00022 import java.util.*;
00023 
00024 import moa.core.InstancesHeader;
00025 import moa.core.MultilabelInstancesHeader;
00026 import moa.core.ObjectRepository;
00027 import moa.options.AbstractOptionHandler;
00028 import moa.options.FloatOption;
00029 import moa.options.ClassOption;
00030 import moa.options.IntOption;
00031 import moa.streams.*;
00032 import moa.tasks.TaskMonitor;
00033 
00034 import weka.core.Attribute;
00035 import weka.core.FastVector;
00036 import weka.core.Instance;
00037 import weka.core.Instances;
00038 import weka.core.SparseInstance;
00039 import weka.core.Utils;
00040 
00047 public class MetaMultilabelGenerator extends AbstractOptionHandler implements InstanceStream {
00048 
00049     private static final long serialVersionUID = 1L;
00050 
00051     public ClassOption binaryGeneratorOption = new ClassOption(
00052             "binaryGenerator", 's', "Binary Generator (use this option to specify the number of attributes, but specify two classes only).", InstanceStream.class, "generators.RandomTreeGenerator");
00053 
00054     public IntOption metaRandomSeedOption = new IntOption(
00055             "metaRandomSeed", 'm', "Random seed (for the meta process).", 1);
00056 
00057     public IntOption numLabelsOption = new IntOption(
00058             "numLabels", 'c', "Number of labels.", 1);
00059 
00060     public IntOption skewOption = new IntOption(
00061             "skew", 'k', "Skewed label distribution: 1 (default) = yes; 0 = no (relatively uniform).", 1, 0, 1);
00062 
00063     public FloatOption labelCardinalityOption = new FloatOption(
00064             "labelCardinality", 'z', "Target label cardinality of resulting set", 1.5, 0.0, Integer.MAX_VALUE);
00065 
00066     protected MultilabelInstancesHeader m_MultilabelInstancesHeader = null;
00067 
00068     protected InstanceStream m_BinaryGenerator = null;
00069 
00070     protected Instances multilabelStreamTemplate = null;
00071 
00072     protected Random m_MetaRandom = null;
00073 
00074     protected int m_N = 0, m_A = 0;
00075 
00076     protected double m_Z = 0.0;
00077 
00078     protected double skew[] = null, skew_n[] = null;
00079 
00080     protected double matrix[][] = null;
00081 
00082     protected ArrayList m_FeatureEffects[] = null;
00083 
00084     @Override
00085     public void prepareForUseImpl(TaskMonitor monitor, ObjectRepository repository) {
00086         this.restart();
00087     }
00088 
00089     @Override
00090     public void restart() {
00091 
00092         // Extract option 'c' (number of classes(labels))
00093         this.m_N = numLabelsOption.getValue();
00094 
00095         // Binary generator
00096         this.m_BinaryGenerator = (InstanceStream) getPreparedClassOption(this.binaryGeneratorOption);
00097         this.m_BinaryGenerator.restart();
00098 
00099         // Extract number of attributes (minus class-attribute)
00100         this.m_A = this.m_BinaryGenerator.getHeader().numAttributes() - 1;
00101 
00102         // Random seed
00103         this.m_MetaRandom = new Random(this.metaRandomSeedOption.getValue());
00104 
00105         // Setup queue system (so that generated binary instances aren't 'wasted')
00106         this.queue = new LinkedList[2];
00107         for (int i = 0; i < this.queue.length; i++) {
00108             this.queue[i] = new LinkedList<Instance>();
00109         }
00110 
00111         // Generate the multi-label header
00112         this.m_MultilabelInstancesHeader = generateMultilabelHeader(this.m_BinaryGenerator.getHeader());
00113 
00114         // Determine Z : label cardinality as a percentage of |L| (m_N)
00115         m_Z = labelCardinalityOption.getValue();
00116         double z = m_Z;
00117 
00118         // Chceck that the label sets we generate fit the label cardinality we specified
00119         while (true) {
00120             // Create the label skew
00121             this.skew = fillSkew(m_MetaRandom, z);
00122             // Create a normalised version of the skew (for wwhen we choose at least one label)
00123             this.skew_n = Arrays.copyOf(skew, skew.length);
00124             Utils.normalize(this.skew_n);
00125             // Create a matrix from the label skew
00126             this.matrix = fillMatrix(skew, m_Z / (double) m_N, m_MetaRandom);
00127             double total = 0.0;
00128             for (int i = 0; i < 10000; i++) {
00129                 total += (generateSet(discreteRandomIndex(this.skew_n))).size();
00130             }
00131             total /= 10000.0;
00132             if (total - m_Z < -0.1) {
00133                 z += 0.1;
00134             } else if (total - m_Z > 0.1) {
00135                 z -= 0.1;
00136             } else {
00137                 break;
00138             }
00139         }
00140 
00141         // Create the feature-label mappings
00142         m_FeatureEffects = getTopCombinations(m_N * 2);
00143 
00144     }
00145 
00149     protected MultilabelInstancesHeader generateMultilabelHeader(Instances si) {
00150         Instances mi = new Instances(si, 0, 0);
00151         mi.setClassIndex(-1);
00152         mi.deleteAttributeAt(mi.numAttributes() - 1);
00153         FastVector bfv = new FastVector();
00154         bfv.addElement("0");
00155         bfv.addElement("1");
00156         for (int i = 0; i < this.m_N; i++) {
00157             mi.insertAttributeAt(new Attribute("class" + i, bfv), i);
00158         }
00159         this.multilabelStreamTemplate = mi;
00160         this.multilabelStreamTemplate.setRelationName("SYN_Z" + this.labelCardinalityOption.getValue() + "L" + this.m_N + "X" + m_A + "S" + metaRandomSeedOption.getValue() + ": -C " + this.m_N);
00161         this.multilabelStreamTemplate.setClassIndex(this.m_N);
00162         return new MultilabelInstancesHeader(multilabelStreamTemplate, m_N);
00163     }
00164 
00171     private double[] fillSkew(Random r, double z) {
00172         double d[] = new double[m_N];
00173         for (int i = 0; i < m_N; i++) {
00174             if (skewOption.getValue() >= 1) {
00175                 d[i] = m_MetaRandom.nextDouble();
00176             } else {
00177                 d[i] = 1.0;
00178             }
00179         }
00180         Utils.normalize(d, Utils.sum(d) / z);
00181         for (int i = 0; i < m_N; i++) {
00182             if (Double.isNaN(d[i])) {
00183                 d[i] = 0.01;
00184             }
00185         }
00186         return d;
00187     }
00188 
00194     LinkedList<Instance> queue[] = null;
00195 
00196     private Instance getNextWithBinary(int i) {
00197         int lim = 1000;
00198         if (queue[i].size() <= 0) {
00199             int c = -1;
00200             while (lim-- > 0) {
00201                 Instance tinst = this.m_BinaryGenerator.nextInstance();
00202                 //System.err.println("next binary : "+tinst);
00203                 c = (int) Math.round(tinst.classValue());
00204                 if (i == c) {
00205                     return tinst;
00206                 } else if (queue[c].size() < 100) {
00207                     queue[c].add(tinst);
00208                 }
00209             }
00210             System.err.println("[Overflow] The binary stream is too skewed, could not get an example of class " + i + "");
00211             System.exit(1);
00212             return null;
00213         } else {
00214             return queue[i].remove();
00215         }
00216     }
00217 
00223     private int labelCorrelation(ArrayList<Integer> lbls) {
00224         double r[] = new double[m_N];
00225         Arrays.fill(r, 1.0);
00226         for (int l : lbls) {
00227             //get row
00228             for (int j = 0; j < matrix[l].length; j++) {
00229                 // *= P(j|l) (probability of label 'j', given that label 'l' is in the set
00230                 r[j] = (j == l) ? 0.0 : r[j] * matrix[j][l];
00231             }
00232         }
00233         return discreteRandomIndex(r);
00234     }
00235 
00240     @Override
00241     public Instance nextInstance() {
00242 
00243         try {
00244             return generateMLInstance(generateSet(discreteRandomIndex(this.skew_n)));
00245         } catch (Exception e) {
00246             e.printStackTrace();
00247             System.exit(1);
00248         }
00249         return null;
00250     }
00251 
00252     private ArrayList generateSet(int l) {
00253         ArrayList<Integer> lbls = new ArrayList<Integer>();
00254         while (l >= 0) {
00255             lbls.add(l);
00256             l = labelCorrelation(lbls);
00257         }
00258         return lbls;
00259     }
00260 
00264     private Instance generateMLInstance(ArrayList<Integer> lbls) throws Exception {
00265 
00266         // create a multi-label instance   :
00267         Instance ml_x = new SparseInstance(this.multilabelStreamTemplate.numAttributes());
00268         ml_x.setDataset(this.multilabelStreamTemplate);
00269 
00270         // set classes
00271         for (int i = 0; i < m_N; i++) {
00272             ml_x.setValue(i, 0.0);
00273         }
00274         for (int i = 0; i < lbls.size(); i++) {
00275             ml_x.setValue(lbls.get(i), 1.0);
00276         }
00277 
00278         // generate binary instances
00279         Instance binary0 = getNextWithBinary(0);
00280         Instance binary1 = getNextWithBinary(1);
00281 
00282         // Loop through each feature attribute @warning: assumes class is last index
00283         for (int a = 0; a < m_A; a++) {
00284 
00285             // The combination is present: use a positive value
00286             if (lbls.containsAll(m_FeatureEffects[a % m_FeatureEffects.length])) {
00287                 ml_x.setValue(m_N + a, binary1.value(a));
00288             } // The combination is absent: use a negative value
00289             else {
00290                 ml_x.setValue(m_N + a, binary0.value(a));
00291             }
00292         }
00293 
00294         return ml_x;
00295 
00296     }
00297 
00303     private int discreteRandomIndex(double p[]) {
00304 
00305         double r = m_MetaRandom.nextDouble();
00306 
00307         if (Utils.sum(p) <= r || Double.isNaN(Utils.sum(p))) {
00308             return -1; //m_MetaRandom.nextInt(p.length);
00309         }
00310         int i = 0;
00311         double sum = 0.0;
00312         while (r > sum) {
00313             // won't be selecting anything
00314             if (i >= p.length) {
00315                 return -1;
00316             }
00317             sum += p[i++];
00318         }
00319         //System.out.println("i="+i);
00320         return i - 1;
00321     }
00322 
00323     protected static double genE(int i, double L) {
00324         return L * Math.pow(Math.E, -L * i);
00325     }
00326 
00335     protected double[][] fillMatrix(double skew[], double Z, Random r) {
00336 
00337         this.matrix = new double[skew.length][skew.length];
00338 
00339         //System.out.println("skew "+Arrays.toString(skew));
00340 
00341         for (int i = 0; i < skew.length; i++) {
00342             matrix[i][i] = Utils.roundDouble(skew[i], 3);
00343         }
00344 
00345         for (int i = 0; i < matrix.length; i++) {
00346             for (int j = i + 1; j < matrix[i].length; j++) {
00347                 // label-dependence factors
00348                 if (r.nextDouble() <= (Z * 2.0)) {
00349                     matrix[i][j] = randFromRange(min(P(i), P(j)), max(P(i), P(j)));
00350                     matrix[j][i] = (matrix[i][j] * matrix[i][i]) / matrix[j][j]; // Bayes Rule
00351                 } // label-exclusivity factors
00352                 else {
00353                     matrix[i][j] = min(P(i), P(j));
00354                     matrix[j][i] = (matrix[i][j] * matrix[j][j]) / matrix[i][i]; // Bayes Rule
00355                 }
00356                 // this is just rounding
00357                 matrix[i][j] = Utils.roundDouble(matrix[i][j], 3);
00358                 matrix[j][i] = Utils.roundDouble(matrix[j][i], 3);
00359             }
00360         }
00361 
00362         return matrix;
00363     }
00364 
00365     protected double randFromRange(double min, double max) {
00366         return min + genE(m_MetaRandom.nextInt(5), (max - min));
00367     }
00368 
00369     // P(i)
00370     protected double P(int i) {
00371         return matrix[i][i];
00372     }
00373 
00374     // P(i|j)
00375     protected double P(int i, int j) {
00376         return matrix[i][j];
00377     }
00378 
00379     // the highest possible prob. of P(A|B) given A and B
00380     protected double max(double A, double B) {
00381         return Math.min(1.0, (B / A));
00382     }
00383 
00384     // the lowest possible prob. of P(A|B) given A and B
00385     protected double min(double A, double B) {
00386         return Math.max(0.0, (-1.0 + A + B));
00387     }
00388 
00393     private ArrayList[] getTopCombinations(int n) {
00394 
00395         HashMap<String, Integer> top = new HashMap<String, Integer>();
00396 
00397         for (int i = 0; i < 10000; i++) {
00398             String s = arrayToString(generateSet(discreteRandomIndex(this.skew_n)), m_N);
00399             top.put(s, top.get(s) != null ? top.get(s) + 1 : 1);
00400         }
00401 
00402         HashMap<String, Integer> rating = getAsReverseSortedHashMap(top);
00403 
00404         ArrayList al[] = new ArrayList[rating.size()];
00405         int i = 0;
00406         for (String s : rating.keySet()) {
00407             al[i++] = stringToArray(s);
00408         }
00409         return al;
00410     }
00411 
00412     // auxilliary functions follow
00413     private static HashMap<String, Integer> getAsReverseSortedHashMap(HashMap<String, Integer> c) {
00414 
00415         Map<String, Integer> tempMap = new HashMap<String, Integer>();
00416         for (String wsState : c.keySet()) {
00417             tempMap.put(wsState, c.get(wsState));
00418         }
00419 
00420         List<String> mapKeys = new ArrayList<String>(tempMap.keySet());
00421         List<Integer> mapValues = new ArrayList<Integer>(tempMap.values());
00422         HashMap<String, Integer> sortedMap = new LinkedHashMap<String, Integer>();
00423         TreeSet<Integer> sortedSet = new TreeSet<Integer>(mapValues);
00424         Object[] sortedArray = sortedSet.toArray();
00425         int size = sortedArray.length;
00426         for (int i = 0; i < size; i++) {
00427             sortedMap.put(mapKeys.get(mapValues.indexOf(sortedArray[size - 1 - i])), (Integer) sortedArray[size - 1 - i]);
00428         }
00429         return sortedMap;
00430     }
00431 
00432     private static ArrayList stringToArray(String s) {
00433         ArrayList al = new ArrayList();
00434         for (int i = 0; i < s.length(); i++) {
00435             if (s.charAt(i) == '1') {
00436                 al.add(i);
00437             }
00438         }
00439         return al;
00440     }
00441 
00442     private static String arrayToString(ArrayList<Integer> lbls, int N) {
00443         StringBuilder sb = new StringBuilder(N);
00444         for (int i = 0; i < N; i++) {
00445             sb.append('0');
00446         }
00447         for (int l : lbls) {
00448             sb.setCharAt(l, '1');
00449         }
00450         return sb.toString();
00451     }
00452 
00453     @Override
00454     public InstancesHeader getHeader() {
00455         return m_MultilabelInstancesHeader;
00456     }
00457 
00458     @Override
00459     public String getPurposeString() {
00460         return "Generates a multi-label stream using a binary generator.";
00461     }
00462 
00463     @Override
00464     public long estimatedRemainingInstances() {
00465         return -1;
00466     }
00467 
00468     @Override
00469     public boolean hasMoreInstances() {
00470         return true;
00471     }
00472 
00473     @Override
00474     public boolean isRestartable() {
00475         return true;
00476     }
00477 
00478     @Override
00479     public void getDescription(StringBuilder sb, int indent) {
00480         // TODO Auto-generated method stub
00481     }
00482 }
 All Classes Namespaces Files Functions Variables Enumerations