2005.bib

@COMMENT{{Automatically generated - DO NOT MODIFY!}}

@INPROCEEDINGS{Pfahringer2005,
  AUTHOR = {Pfahringer, Bernhard and Reutemann, Peter and Mayo, Mike},
  TITLE = {A novel two stage scheme utilizing the test set for model selection in text classification},
  EDITOR = {Ghosh, Ranadhir and Verma, Brijesh and Li, Xue},
  BOOKTITLE = {Proc Workshop on Learning Algorithms for Pattern Recognition, Eighteenth Australian Joint Conference on Artificial Intelligence (AI'05)},
  LOCATION = {Sydney, Australia},
  MONTH = dec,
  PUBLISHER = {University of Technology},
  ADDRESS = {Sydney, Australia},
  PAGES = {60--65},
  NOTE = {5--9 December 2005},
  YEAR = {2005},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/pfahringerEtalOziDM05.pdf},
  ABSTRACT = {Text classification is a natural application domain for
semi-supervised learning, as labeling documents is expensive, but
on the other hand usually an abundance of unlabeled
documents is available. We describe a novel simple
two-stage scheme based on dagging which allows for utilizing
the test set in model selection. The dagging ensemble can
also be used by itself instead of the original classifier. We
evaluate the performance of a meta classifier choosing
between various base learners and their respective dagging
ensembles. The selection process seems to perform robustly
especially for small percentages of available labels for
training.}
}

@INPROCEEDINGS{Holmes2005,
  AUTHOR = {Holmes, Geoffrey and Kirkby, Richard and Pfahringer, Bernhard},
  TITLE = {Tie Breaking in {Hoeffding} trees},
  EDITOR = {Gama, J. and Aguilar-Ruiz, J. S.},
  BOOKTITLE = {Proc Workshop W6: Second International Workshop on Knowledge Discovery in Data Streams},
  LOCATION = {Porto, Portugal},
  MONTH = oct,
  PAGES = {107--116},
  YEAR = {2005}
}

@INPROCEEDINGS{Holmes2005_2,
  AUTHOR = {Holmes, Geoffrey and Pfahringer, Bernhard and Kirkby, Richard},
  TITLE = {Cache hierarchy inspired compression: a novel architecture for data streams},
  EDITOR = {Kulathuramaiyer, Narayanan and Yeo, Alvin W. and Chai, Wang Yin and Eng, Tan Chong},
  BOOKTITLE = {Proc Fourth International Conference on Information Technology in Asia (CITA'05)},
  LOCATION = {Sarawak, Malaysia},
  MONTH = dec,
  PAGES = {130--136},
  NOTE = {12--15 December 2005},
  YEAR = {2005},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/holmes-etal-chic.pdf},
  ABSTRACT = {We present an architecture for data streams
based on structures typically found in web cache
hierarchies. The main idea is to build a meta level
analyser from a number of levels constructed over time
from a data stream. We present the general architecture
for such a system and an application to classification.
This architecture is an instance of the general wrapper
idea allowing us to reuse standard batch learning
algorithms in an inherently incremental learning
environment. By artificially generating data sources we
demonstrate that a hierarchy containing a mixture of
models is able to adapt over time to the source of the
data. In these experiments the hierarchies use an
elementary performance based replacement policy and
unweighted voting for making classification decisions.}
}

@PROCEEDINGS{DBLP:conf/ilp/2005,
  TITLE = {Inductive Logic Programming, 15th International Conference, ILP 2005, Bonn, Germany, August 10-13, 2005, Proceedings},
  EDITOR = {Kramer, Stefan and Pfahringer, Bernhard},
  BOOKTITLE = {ILP},
  SERIES = {Lecture Notes in Computer Science},
  VOLUME = {3625},
  PUBLISHER = {Springer},
  YEAR = {2005},
  ISBN = {3-540-28177-0},
  BIBSOURCE = {DBLP, http://dblp.uni-trier.de}
}

@INPROCEEDINGS{SchmidbergerF05,
  AUTHOR = {Gabi Schmidberger and
               Eibe Frank},
  TITLE = {Unsupervised Discretization Using Tree-Based Density Estimation},
  BOOKTITLE = {Proc 9th European
               Conference on Principles and Practice of Knowledge Discovery
               in Databases},
  ADDRESS = {Porto, Portugal},
  YEAR = {2005},
  PAGES = {240--251},
  PUBLISHER = {Springer},
  PS = {http://www.cs.waikato.ac.nz/~ml/publications/2005/schmidberger_and_frank_cr.ps.gz},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/schmidberger_and_frank_cr.pdf},
  ABSTRACT = {This paper presents an unsupervised discretization method
that performs density estimation for univariate data. The subintervals
that the discretization produces can be used as the bins of a histogram.
Histograms are a very simple and broadly understood means for displaying
data, and our method automatically adapts bin widths to the data.
It uses the log-likelihood as the scoring function to select cut points and
the cross-validated log-likelihood to select the number of intervals. We
compare this method with equal-width discretization where we also select
the number of bins using the cross-validated log-likelihood and with
equal-frequency discretization.}
}

@INPROCEEDINGS{DongFK05,
  AUTHOR = {Lin Dong and
               Eibe Frank and
               Stefan Kramer},
  TITLE = {Ensembles of Balanced Nested Dichotomies for Multi-class
               Problems},
  BOOKTITLE = {Proc 9th European
               Conference on Principles and Practice of Knowledge Discovery
               in Databases},
  ADDRESS = {Porto, Portugal},
  YEAR = {2005},
  PAGES = {84--95},
  PUBLISHER = {Springer},
  PS = {http://www.cs.waikato.ac.nz/~ml/publications/2005/dong_et_al_cr.ps.gz},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/dong_et_al_cr.pdf},
  ABSTRACT = {A system of nested dichotomies is a hierarchical
decomposition of a multi-class problem with c classes into c - 1
two-class problems and can be represented as a tree
structure. Ensembles of randomly-generated nested dichotomies have
proven to be an effective approach to multi-class learning problems
[1]. However, sampling trees by giving each tree equal probability
means that the depth of a tree is limited only by the number of
classes, and very unbalanced trees can negatively affect runtime. In
this paper we investigate two approaches to building balanced nested
dichotomies---class-balanced nested dichotomies and data-balanced nested
dichotomies---and evaluate them in the same ensemble setting. Using
C4.5 decision trees as the base models, we show that both approaches
can reduce runtime with little or no effect on accuracy, especially
on problems with many classes. We also investigate the effect of
caching models when building ensembles of nested dichotomies.}
}

@INPROCEEDINGS{SumnerFH05,
  AUTHOR = {Marc Sumner and
               Eibe Frank and
               Mark A. Hall},
  TITLE = {Speeding Up Logistic Model Tree Induction},
  BOOKTITLE = {Proc 9th European
               Conference on Principles and Practice of Knowledge Discovery
               in Databases},
  ADDRESS = {Porto, Portugal},
  YEAR = {2005},
  PAGES = {675--683},
  PUBLISHER = {Springer},
  PS = {http://www.cs.waikato.ac.nz/~ml/publications/2005/SumnerFrankHallCameraReady.ps.gz},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/SumnerFrankHallCameraReady.pdf},
  ABSTRACT = {Logistic Model Trees have been shown to be very accurate
and compact classifiers [8]. Their greatest disadvantage is the computational
complexity of inducing the logistic regression models in the tree.
We address this issue by using the AIC criterion [1] instead of
cross-validation to prevent overfitting these models. In addition, a weight
trimming heuristic is used which produces a significant speedup. We compare
the training time and accuracy of the new induction process with the
original one on various datasets and show that the training time often
decreases while the classification accuracy diminishes only slightly.}
}

@INPROCEEDINGS{HolmesKP05,
  AUTHOR = {Geoffrey Holmes and
               Richard Kirkby and
               Bernhard Pfahringer},
  TITLE = {Stress-Testing {Hoeffding} Trees},
  BOOKTITLE = {Proc 9th European
               Conference on Principles and Practice of Knowledge Discovery
               in Databases},
  ADDRESS = {Porto, Portugal},
  YEAR = {2005},
  PAGES = {495--502},
  PUBLISHER = {Springer}
}

@ARTICLE{LandwehrHF05,
  AUTHOR = {Niels Landwehr and
               Mark Hall and
               Eibe Frank},
  TITLE = {Logistic Model Trees},
  JOURNAL = {Machine Learning},
  VOLUME = {59},
  NUMBER = {1-2},
  YEAR = {2005},
  PAGES = {161--205},
  ABSTRACT = {Tree induction methods and linear models are popular techniques for supervised learning tasks, both for the prediction of nominal classes and numeric values. For predicting numeric quantities, there has been work on combining these two schemes into 'model trees', i.e. trees that contain linear regression functions at the leaves. In this paper, we present an algorithm that adapts this idea for classification problems, using logistic regression instead of linear regression. We use a stagewise fitting process to construct the logistic regression models that can select relevant attributes in the data in a natural way, and show how this approach can be used to build the logistic regression models at the leaves by incrementally refining those constructed at higher levels in the tree. We compare the performance of our algorithm to several other state-of-the-art learning schemes on 36 benchmark UCI datasets, and show that it produces accurate and compact classifiers.},
  PS = {http://www.cs.waikato.ac.nz/~ml/publications/2005/LMT.ps.gz},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/LMT.pdf}
}

@ARTICLE{WangTHFFMM05,
  AUTHOR = {Yu Wang and
               Igor V. Tetko and
               Mark A. Hall and
               Eibe Frank and
               Axel Facius and
               Klaus F. X. Mayer and
               Hans-Werner Mewes},
  TITLE = {Gene selection from microarray data for cancer classification
               -- a machine learning approach},
  JOURNAL = {Computational Biology and Chemistry},
  VOLUME = {29},
  NUMBER = {1},
  YEAR = {2005},
  PAGES = {37--46},
  HTTP = {http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve\&db=PubMed\&list_uids=15680584\&dopt=Citation}
}

@INCOLLECTION{witten05:_kea,
  AUTHOR = {Ian H. Witten and Gordon W. Paynter and Eibe Frank and
Carl Gutwin and Craig G. Nevill-Manning},
  TITLE = {Kea: Practical automatic keyphrase extraction},
  BOOKTITLE = {Design and Usability of Digital Libraries: Case Studies in the Asia Pacific},
  PAGES = {129--152},
  PUBLISHER = {Information Science Publishing},
  YEAR = 2005,
  EDITOR = {Y.-L. Theng and S. Foo},
  ADDRESS = {London},
  ABSTRACT = {Keyphrases provide semantic metadata that summarize and characterize documents. This chapter describes Kea, an algorithm for automatically extracting keyphrases from text. Kea identifies candidate keyphrases using lexical methods, calculates feature values for each candidate, and uses a machine-learning algorithm to predict which candidates are good keyphrases. The machine-learning scheme first builds a prediction model using training documents with known keyphrases, and then uses the model to find keyphrases in new documents. We use a large text corpus to evaluate Kea's effectiveness in terms of how many author-assigned keyphrases are correctly identified. The system is simple, robust, and available under the GNU General Public License; the chapter gives instructions for use.},
  PS = {http://www.cs.waikato.ac.nz/~ml/publications/2005/chap_Witten-et-al_Windows.ps.gz},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/chap_Witten-et-al_Windows.pdf}
}

@BOOK{witten05:_data_minin,
  AUTHOR = {Ian H. Witten and Eibe Frank},
  TITLE = {Data Mining: Practical Machine Learning Tools and Techniques},
  PUBLISHER = {Morgan Kaufmann},
  YEAR = 2005,
  ADDRESS = {San Francisco},
  EDITION = {Second},
  HTTP = {http://www.cs.waikato.ac.nz/~ml/weka/book.html}
}

@INCOLLECTION{FrankHHKP05,
  AUTHOR = {Eibe Frank and
               Mark A. Hall and
               Geoffrey Holmes and
               Richard Kirkby and
               Bernhard Pfahringer and Ian H. Witten and Leonhard Trigg},
  TITLE = {{WEKA} -- A Machine Learning Workbench for Data Mining},
  BOOKTITLE = {The Data Mining and Knowledge Discovery Handbook},
  YEAR = {2005},
  PAGES = {1305--1314},
  EDITOR = {Oded Maimon and
               Lior Rokach},
  PUBLISHER = {Springer},
  PS = {http://www.cs.waikato.ac.nz/~ml/publications/2005/weka_dmh.ps.gz},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/weka_dmh.pdf},
  ABSTRACT = {The Weka workbench is an organized collection of state-of-the-art machine learning algorithms and data preprocessing tools. The basic way
of interacting with these methods is by invoking them from the command
line. However, convenient interactive graphical user interfaces are
provided for data exploration, for setting up large-scale experiments on
distributed computing platforms, and for designing configurations for
streamed data processing. These interfaces constitute an advanced
environment for experimental data mining. The system is written in Java
and distributed under the terms of the GNU General Public License.}
}

@INPROCEEDINGS{Smith2005a,
  AUTHOR = {Lin, Chi-San and Smith, Tony C.},
  TITLE = {Semantic Role-Labeling via Consensus Pattern-Matching},
  BOOKTITLE = {Proceedings of the Ninth Conference on Computational Natural Language Learning},
  YEAR = {2005},
  PAGES = {185--188},
  ADDRESS = {Ann Arbor, Michigan},
  MONTH = jun,
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/Lin-etal-SRL.pdf},
  ABSTRACT = {This paper describes a system for semantic
role labeling for the CoNLL2005 Shared
task.  We divide the task into two sub-tasks:
boundary recognition by a general tree-based
predicate-argument recognition algorithm
to convert a parse tree into a flat representation
of all predicates and their
related boundaries, and role labeling by a
consensus model using a pattern-matching
framework to find suitable roles for core
constituents and adjuncts.  We describe the
system architecture and report results for
the CoNLL2005 development dataset.}
}

@INPROCEEDINGS{BouckaertS05,
  AUTHOR = {Remco R. Bouckaert and Milan Studen{\'y}},
  TITLE = {Racing for Conditional Independence Inference},
  BOOKTITLE = {Symbolic and Quantitative Approaches to Reasoning with Uncertainty, 8th European Conference, ECSQARU 2005},
  YEAR = {2005},
  SERIES = {Lecture Notes in Computer Science},
  VOLUME = {3571},
  PAGES = {221--232},
  EE = {http://dx.doi.org/10.1007/11518655_20},
  ABSTRACT = {In this article, we consider the computational aspects of deciding
   whether a conditional independence statement $t$ is implied by a list
   of conditional independence statements $L$ using the implication related
   to the method of structural imsets.
   We present two methods which have the interesting complementary properties
   that one method performs well to prove that $t$ is implied by $L$, while the
   other performs well to prove that $t$ is not implied by $L$. However, both
   methods do not perform well the opposite. This gives rise to a parallel
   algorithm in which both methods race against each other in order to
   determine effectively whether $t$ is or is not implied. \\
   \\
   Some empirical evidence is provided that suggest this racing algorithms
   method performs a lot better than an existing method based on so-called
   skeletal characterization of the respective implication.
   Furthermore, the method is able to handle more than five variables.},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/remco-ecsqaru.pdf}
}

@INPROCEEDINGS{Bouckaert2005,
  AUTHOR = {Bouckaert, Remco R.},
  TITLE = {Low Replicability of Machine Learning Experiments is not a Small Data Set Phenomenon},
  BOOKTITLE = {Proceedings of the ICML'05 Workshop on Meta-Learning},
  YEAR = {2005},
  ABSTRACT = {This paper investigates the relation between replicability of
   experiments
   for deciding which of two algorithms performs better on a given data set.
   We prove that lack of replicability is not just a small data phenomenon
   (as was shown before),
   but is present in experiments on medium and large data sets as well.
   We establish intuition in the relation between data set size, power and
   replicability.\\
   \\
   The main method for improving replicability is to increase the number of
   samples. For large data sets and/or inefficient learning algorithms, this
   implies that experiments may take a long time to completion.
   We propose a procedure for deciding which of two learning algorithms is
   best that has a high replicability but takes moderate computational effort.},
  PDF = {http://www.cs.waikato.ac.nz/~ml/publications/2005/bouckaert-icml05.pdf}
}

@INPROCEEDINGS{Mike2005,
  AUTHOR = {Mike Mayo},
  TITLE = {{Bayesian} Sequence Learning for Predicting Protein Cleavage Points},
  BOOKTITLE = {Advances in Knowledge Discovery and Data Mining: 9th Pacific-Asia Conference, PAKDD 2005},
  YEAR = {2005},
  PAGES = {192},
  ADDRESS = {Hanoi, Vietnam},
  NOTE = {May 18--20, 2005},
  ABSTRACT = {A challenging problem in data mining is the application of efficient techniques to automatically annotate the vast databases of biological sequence data. This paper describes one such application in this area, to the prediction of the position of signal peptide cleavage points along protein sequences. It is shown that the method, based on Bayesian statistics, is comparable in terms of accuracy to the existing state-of-the-art neural network techniques while providing explanatory information for its predictions.},
  PDF = {http://www.cs.waikato.ac.nz/~mmayo/papers/PAKDD05.pdf}
}

@ARTICLE{Mike2005_2,
  AUTHOR = {Mike Mayo},
  TITLE = {Learning {Petri} Net Models of Non-Linear Gene Interactions},
  JOURNAL = {BioSystems},
  YEAR = {2005},
  VOLUME = {85},
  NUMBER = {1},
  PAGES = {74--82},
  ABSTRACT = {Understanding how an individual's genetic make-up influences their risk of disease is a problem of paramount importance. Although machine learning techniques are able to uncover the relationships between genotype and disease, the problem of automatically building the best biochemical model or "explanation" of the relationship has received less attention. In this paper, I describe a method based on random hill climbing that automatically builds Petri net models of non-linear (or multi-factorial) disease-causing gene-gene interactions. Petri nets are a suitable formalism for this problem, because they are used to model concurrent, dynamic processes analogous to biochemical reaction networks. I show that this method is routinely able to identify perfect Petri net models for three disease-causing gene-gene interactions recently reported in the literature.},
  PDF = {http://www.cs.waikato.ac.nz/~mmayo/papers/BioSystems05.pdf}
}