/** * Java program for classifying short text messages into two classes. */ import weka.core.*; import weka.classifiers.*; import weka.filters.*; import java.io.*; import java.util.*; public class MessageClassifier implements Serializable { /* Our (rather arbitrary) set of keywords. */ private final String[] m_Keywords = {"product", "only", "offer", "great", "amazing", "phantastic", "opportunity", "buy", "now"}; /* The training data. */ private Instances m_Data = null; /* The filter. */ private Filter m_Filter = new DiscretizeFilter(); /* The classifier. */ private Classifier m_Classifier = new IBk(); /** * Constructs empty training dataset. */ public MessageClassifier() throws Exception { String nameOfDataset = "MessageClassificationProblem"; // Create numeric attributes. FastVector attributes = new FastVector(m_Keywords.length + 1); for (int i = 0 ; i < m_Keywords.length; i++) { attributes.addElement(new Attribute(m_Keywords[i])); } // Add class attribute. FastVector classValues = new FastVector(2); classValues.addElement("miss"); classValues.addElement("hit"); attributes.addElement(new Attribute("Class", classValues)); // Create dataset with initial capacity of 100, and set index of class. m_Data = new Instances(nameOfDataset, attributes, 100); m_Data.setClassIndex(m_Data.numAttributes() - 1); } /** * Updates model using the given training message. */ public void updateModel(String message, String classValue) throws Exception { // Convert message string into instance. Instance instance = makeInstance(cleanupString(message)); // Add class value to instance. instance.setClassValue(classValue); // Add instance to training data. m_Data.add(instance); // Use filter. m_Filter.inputFormat(m_Data); Instances filteredData = Filter.useFilter(m_Data, m_Filter); // Rebuild classifier. m_Classifier.buildClassifier(filteredData); } /** * Classifies a given message. */ public void classifyMessage(String message) throws Exception { // Check if classifier has been built. if (m_Data.numInstances() == 0) { throw new Exception("No classifier available."); } // Convert message string into instance. Instance instance = makeInstance(cleanupString(message)); // Filter instance. m_Filter.input(instance); Instance filteredInstance = m_Filter.output(); // Get index of predicted class value. double predicted = m_Classifier.classifyInstance(filteredInstance); // Classify instance. System.err.println("Message classified as : " + m_Data.classAttribute().value((int)predicted)); } /** * Method that converts a text message into an instance. */ private Instance makeInstance(String messageText) { StringTokenizer tokenizer = new StringTokenizer(messageText); Instance instance = new Instance(m_Keywords.length + 1); String token; // Initialize counts to zero. for (int i = 0; i < m_Keywords.length; i++) { instance.setValue(i, 0); } // Compute attribute values. while (tokenizer.hasMoreTokens()) { token = tokenizer.nextToken(); for (int i = 0; i < m_Keywords.length; i++) { if (token.equals(m_Keywords[i])) { instance.setValue(i, instance.value(i) + 1.0); break; } } } // Give instance access to attribute information from the dataset. instance.setDataset(m_Data); return instance; } /** * Method that deletes all non-letters from a string, and lowercases it. */ private String cleanupString(String messageText) { char[] result = new char[messageText.length()]; int position = 0; for (int i = 0; i < messageText.length(); i++) { if (Character.isLetter(messageText.charAt(i)) || Character.isWhitespace(messageText.charAt(i))) { result[position++] = Character.toLowerCase(messageText.charAt(i)); } } return new String(result); } /** * Main method. */ public static void main(String[] options) { MessageClassifier messageCl; byte[] charArray; try { // Read message file into string. String messageFileString = Utils.getOption('m', options); if (messageFileString.length() != 0) { FileInputStream messageFile = new FileInputStream(messageFileString); int numChars = messageFile.available(); charArray = new byte[numChars]; messageFile.read(charArray); messageFile.close(); } else { throw new Exception ("Name of message file not provided."); } // Check if class value is given. String classValue = Utils.getOption('c', options); // Check for model file. If existent, read it, otherwise create new // one. String modelFileString = Utils.getOption('t', options); if (modelFileString.length() != 0) { try { FileInputStream modelInFile = new FileInputStream(modelFileString); ObjectInputStream modelInObjectFile = new ObjectInputStream(modelInFile); messageCl = (MessageClassifier) modelInObjectFile.readObject(); modelInFile.close(); } catch (FileNotFoundException e) { messageCl = new MessageClassifier(); } } else { throw new Exception ("Name of data file not provided."); } // Check if there are any options left Utils.checkForRemainingOptions(options); // Process message. if (classValue.length() != 0) { messageCl.updateModel(new String(charArray), classValue); } else { messageCl.classifyMessage(new String(charArray)); } // If class has been given, updated message classifier must be saved if (classValue.length() != 0) { FileOutputStream modelOutFile = new FileOutputStream(modelFileString); ObjectOutputStream modelOutObjectFile = new ObjectOutputStream(modelOutFile); modelOutObjectFile.writeObject(messageCl); modelOutObjectFile.flush(); modelOutFile.close(); } } catch (Exception e) { e.printStackTrace(); } } }