@COMMENT{{Automatically generated - DO NOT MODIFY!}}

  AUTHOR = {Remco R. Bouckaert and
              Michael Goebel and
              Patricia J. Riddle},
  TITLE = {Generalized Unified Decomposition of Ensemble Loss},
  BOOKTITLE = {Proc 19th Australian Joint Conference on Artificial Intelligence},
  YEAR = 2006,
  PAGES = {1133-1139},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2006/gudel.pdf},
  ABSTRACT = {Goebel et al. [4] presented a unified
decomposition of ensemble loss for explaining ensemble
performance. They considered democratic voting schemes with uniform
weights, where the various base classifiers each can vote for a single
class once only. In this article, we generalize their decomposition to
cover weighted, probabilistic voting schemes and non-uniform
(progressive) voting schemes.  Empirical results suggest that
democratic voting schemes can be outperformed by probabilistic and
progressive voting schemes.  This makes the generalization worth
exploring and we show how to use the generalization to analyze
ensemble loss.}

  AUTHOR = {Remco R. Bouckaert},
  TITLE = {Efficient AUC Learning Curve Calculation},
  BOOKTITLE = {Proc 19th Australian Joint Conference on Artificial Intelligence},
  YEAR = 2006,
  PAGES = {181-191},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2006/roc.pdf},
  ABSTRACT = {A learning curve of a performance measure provides a graphical method 
with many benefits for judging classifier properties. The area under 
the ROC curve (AUC) is a useful and increasingly popular performance 
measure. In this paper, we consider the computational aspects of 
calculating AUC learning curves. A new method is provided for 
incrementally updating exact AUC curves and for calculating approximate 
AUC curves for datasets with millions of instances. Both theoretical 
and empirical justifications are given for the approximation. Variants
for incremental exact and approximate AUC curves are provided as well.}

  AUTHOR = {Remco R. Bouckaert},
  TITLE = {Voting Massive Collections of Bayesian Network Classifiers
              for Data Streams},
  BOOKTITLE = {Proc 19th Australian Joint Conference on Artificial Intelligence},
  YEAR = 2006,
  PAGES = {243-252},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2006/stream.pdf},
  ABSTRACT = {We present a new method for voting exponential (in the
number of attributes) size sets of Bayesian classifiers in polynomial
time with polynomial memory requirements.  Training is linear in the
number of instances in the dataset and can be performed
incrementally. This allows the collection to learn from massive data
streams. The method allows for flexibility in balancing computational
complexity, memory requirements and classification performance. Unlike
many other incremental Bayesian methods, all statistics kept in memory
are directly used in classification.
Experimental results show that the classifiers perform well on both
small and very large data sets, and that classification performance
can be weighed against computational and memory costs.}

  AUTHOR = {Chi-San Lin and Tony Smith},
  TITLE = {A tree-based algorithm for predicate-argument recognition},
  JOURNAL = {Association for Computing Machinery New Zealand Bulletin},
  VOLUME = 2,
  NUMBER = 1,
  MONTH = {January},
  PAGES = {online journal},
  YEAR = 2006

  AUTHOR = {Eibe Frank and Bernhard Pfahringer},
  TITLE = {Improving on Bagging with Input Smearing},
  BOOKTITLE = {Proc 10th Pacific-Asia Conference on Knowledge Discovery and Data Mining},
  SERIES = {Singapore},
  PUBLISHER = {Springer},
  YEAR = 2006,
  PS = {http://www.cs.waikato.ac.nz/~ml/publications/2006/FrankAndPfahringerPAKDD06.ps.gz},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2006/FrankAndPfahringerPAKDD06.pdf},
  ABSTRACT = {Bagging is an ensemble learning method that has proved to
be a useful tool in the arsenal of machine learning
practitioners. Commonly applied in conjunction with decision tree
learners to build an ensemble of decision trees, it often leads to
reduced errors in the predictions when compared to using a single
tree. A single tree is built from a training set of size N . Bagging
is based on the idea that, ideally, we would like to eliminate the
variance due to a particular training set by combining trees built
from all training sets of size N . However, in practice, only one
training set is available, and bagging simulates this platonic method
by sampling with replacement from the original training data to form
new training sets. In this paper we pursue the idea of sampling from a
kernel density estimator of the underlying distribution to form new
training sets, in addition to sampling from the data itself. This can
be viewed as "smearing out" the resampled training data to generate
new datasets, and the amount of "smear" is controlled by a parameter.
We show that the resulting method, called "input smearing", can lead
to improved results when compared to bagging. We present results for
both classification and regression problems.}

  AUTHOR = {Kurt Driessens and Peter Reutemann and Bernhard Pfahringer and Claire Leschi},
  TITLE = {Using Weighted Nearest Neighbor to Benefit from Unlabeled Data},
  BOOKTITLE = {Advances in Knowledge Discovery and Data Mining, 10th Pacific-Asia Conference, PAKDD 2006},
  EDITOR = {Wee Keong Ng and Masaru Kitsuregawa and Jianzhong Li and Kuiyu Chang},
  VOLUME = 3918,
  YEAR = 2006,
  PAGES = {60-69},
  ABSTRACT = {The development of data-mining applications such as textclassification and molecular profiling has shown the need for machine learning algorithms that can benefit from both labeled and unlabeled data, where often the unlabeled examples greatly outnumber the labeled examples. In this paper we present a two-stage classifier that improves its predictive accuracy by making use of the available unlabeled data. It uses a weighted nearest neighbor classification algorithm using the combined example-sets as a knowledge base. The examples from the unlabeled set are "pre-labeled" by an initial classifier that is build using the limited available training data. By choosing appropriate weights for this pre-labeled data, the nearest neighbor classifier consistently improves on the original classifier.},
  EE = {http://dx.doi.org/10.1007/11731139_10},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2006/driessensEtalPAKDD06.pdf}

  AUTHOR = {Bernhard Pfahringer},
  TITLE = {A Semi-Supervised Spam Mail Detector},
  BOOKTITLE = {Proceedings of the ECML/PKDD 2006 Discovery Challenge Workshop},
  EDITOR = {Steffen Bickel},
  PUBLISHER = {Humboldt University Berlin},
  YEAR = 2006,
  PAGES = {48-53},
  ABSTRACT = {This document describes a novel semi-supervised approach to spam classification, which was successful at the ECML/PKDD2006 spam classification challenge. A local learning method based on lazy projections was successfully combined with a variant of a standard semi-supervised learning algorithm.},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2006/Pfahringer-semiSuperSpam-ECMLPKDD2006.pdf}

  AUTHOR = {Eibe Frank and Remco R. Bouckaert},
  TITLE = {Naive Bayes for Text Classification with Unbalanced Classes},
  BOOKTITLE = {Proc 10th European Conference on Principles and Practice of Knowledge Discovery in Databases},
  PAGES = {503-510},
  YEAR = 2006,
  SERIES = {Berlin, Germany},
  PUBLISHER = {Springer},
  ABSTRACT = {Multinomial naive Bayes (MNB) is a popular method for document classification due to its computational efficiency and relatively good predictive performance. It has recently been established that predictive performance can be improved further by appropriate data transformations. In this paper we present another transformation that is designed to combat a potential problem with the application of MNB to unbalanced datasets. We propose an appropriate correction by adjusting attribute priors. This correction can be implemented as another data normalization step, and we show that it can significantly improve the area under the ROC curve. We also show that the modified version of MNB is very closely related to the simple centroid-based classifier and compare the two methods empirically.},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2006/FrankAndBouckaertPKDD06new.pdf}

  AUTHOR = {Michael Mayo},
  TITLE = {Automatic Species Identification of Live Moths},
  BOOKTITLE = {Proc. of the 26th SGAI International Conference on Innovative Techniques and Applications of Artificial Intelligence},
  EDITOR = {Ellis et. al},
  YEAR = 2006,
  PAGES = {195-202},
  ABSTRACT = {A collection consisting of the images of 774 live moth individuals, each moth belonging to one of 35 different UK species, was analysed to determine if data mining techniques could be used effectively for automatic species identification. Feature vectors were extracted from each of the moth images and the machine learning toolkit WEKA was used to classify the moths by species using the feature vectors. Whereas a previous analysis of this image dataset reported in the literature [1] required that each moth's least worn wing region be highlighted manually for each image, WEKA was able to achieve a greater level of accuracy (85\%) using support vector machines without manual specification of a region of interest at all. This paper describes the features that were extracted from the images, and the various experiments using different classifiers and datasets that were performed. The results show that data mining can be usefully applied to the problem of automatic species identification of live specimens in the field.},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2006/MayoSGAI_2006.pdf}

  AUTHOR = {Grant Anderson and Bernhard Pfahringer},
  TITLE = {Random Relational Rules},
  BOOKTITLE = {Extended Abstracts for 16th International Conference on Inductive Logic Programming},
  EDITOR = {Stephen Muggleton and Ramon Otero},
  YEAR = 2006,
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2006/AndersonPfahringer-RandomRelationalRules06.pdf},
  ABSTRACT = {Exhaustive search in relational learning is generally
infeasible, therefore some form of heuristic search is usually
employed, such as in FOIL[1]. On the other hand, so-called stochastic
discrimination provides a framework for combining arbitrary numbers of
weak classifiers (in this case randomly generated relational rules) in
a way where accuracy improves with additional rules, even after maxi-
mal accuracy on the training data has been reached.[2] The weak
classifiers must have a slightly higher probability of covering
instances of their target class than of other classes. As the rules
are also independent and identically distributed, the Central Limit
theorem applies and as the number of weak classifiers/rules grows,
coverages for different classes resemble well-separated normal
distribu- tions. Stochastic discrimination is closely related to other
ensemble methods like Bagging, Boosting, or Random forests, all of
which have been tried in relational learning[3, 4, 5].}

  AUTHOR = {Lin Dong},
  TITLE = {A Comparison of Multi-instance Learning Algorithms},
  SCHOOL = {Department of Computer Science, University of Waikato},
  YEAR = 2006,
  HTTP = {http://hdl.handle.net/10289/2453},
  ABSTRACT = {Motivated by various challenging real-world applications,
such as drug activity prediction and image retrieval, multi-instance
(MI) learning has attracted considerable interest in recent
years. Compared with standard supervised learning, the MI learning
task is more difficult as the label information of each training
example is incomplete. Many MI algorithms have been proposed. Some of
them are specifically designed for MI problems whereas others have
been upgraded or adapted from standard single-instance learning
algorithms. Most algorithms have been evaluated on only one or two
benchmark datasets, and there is a lack of systematic comparisons of
MI learning algorithms. This thesis presents a comprehensive study of
MI learning algorithms that aims to compare their performance and find
a suitable way to properly address different MI problems. First, it
briefly reviews the history of research on MI learning. Then it
discusses five general classes of MI approaches that cover a total of
16 MI algorithms. After that, it presents empirical results for these
algorithms that were obtained from 15 datasets which involve five
different real-world application domains. Finally, some conclusions
are drawn from these results: (1) applying suitable standard
single-instance learners to MI problems can often generate the best
result on the datasets that were tested, (2) algorithms exploiting the
standard asymmetric MI assumption do not show significant advantages
over approaches using the so-called collective assumption, and (3)
different MI approaches are suitable for different application
domains, and no MI algorithm works best on all MI problems.}

  AUTHOR = {Benjamin George Cleland},
  TITLE = {Reinforcement Learning for Racecar Control},
  SCHOOL = {Department of Computer Science, University of Waikato},
  YEAR = 2006,
  HTTP = {http://hdl.handle.net/10289/2507},
  ABSTRACT = {This thesis investigates the use of reinforcement learning to learn to drive a racecar in the simulated environment of the Robot Automobile Racing Simulator. Real-life race driving is known to be difficult for humans, and expert human drivers use complex sequences of actions. There are a large number of variables, some of which change stochastically and all of which may affect the outcome. This makes driving a promising domain for testing and developing Machine Learning techniques that have the potential to be robust enough to work in the real world. Therefore the principles of the algorithms from this work may be applicable to a range of problems. The investigation starts by finding a suitable data structure to represent the information learnt. This is tested using supervised learning. Reinforcement learning is added and roughly tuned, and the supervised learning is then removed. A simple tabular representation is found satisfactory, and this avoids difficulties with more complex methods and allows the investigation to concentrate on the essentials of learning. Various reward sources are tested and a combination of three are found to produce the best performance. Exploration of the problem space is investigated. Results show exploration is essential but controlling how much is done is also important. It turns out the learning episodes need to be very long and because of this the task needs to be treated as continuous by using discounting to limit the size of the variables stored. Eligibility traces are used with success to make the learning more efficient. The tabular representation is made more compact by hashing and more accurate by using smaller buckets. This slows the learning but produces better driving. The improvement given by a rough form of generalisation indicates the replacement of the tabular method by a function approximator is warranted. These results show reinforcement learning can work within the Robot Automobile Racing Simulator, and lay the foundations for building a more efficient and competitive agent.}