/* * $Log: KWIC.java,v $ * Revision 1.6 2002/04/15 22:33:10 sean * removed superfluous imports * * Revision 1.5 2002/03/22 02:21:15 sean * rename vars * * Revision 1.4 2001/07/18 02:39:28 sean * Improvements to javadocs * * Revision 1.3 2001/07/18 02:11:42 sean * Try to fix javadocs * * Revision 1.2 2001/05/28 04:24:06 sean * Updated documentation * * Revision 1.1.1.1 2001/05/07 00:17:36 len * Initial import * * Revision 1.2 2001/04/19 00:25:43 sean * Initial coding effort 3 hours * */ package com.reeltwo.util; import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.FileInputStream; import java.io.File; import java.io.InputStream; import java.io.IOException; /** * KWIC.java * * Given a keyword, find all occurrences of the keyword in the supplied * documents. For each occurrence show the surrounding context.

 *
 * Options:
 *
 * <pre>
 * -w      Turn off word boundary detection
 * -c      Perform case sensitive searching
 * -l      Specify amount of left context
 * -r      Specify amount of right context
 * </pre>
 *
// end of legacy header comment */
/**
 * Keyword-in-context (KWIC) search.
 *
 * Given a keyword, find all occurrences of the keyword in the supplied
 * documents. For each occurrence print one line to standard output showing
 * the surrounding context, with the match itself wrapped in the ANSI
 * bold / reset escape sequences.
 *
 * Command line options:
 *
 * <pre>
 * -w      Turn off word boundary detection
 * -c      Perform case sensitive searching
 * -l n    Specify amount of left context
 * -r n    Specify amount of right context
 * </pre>
 *
 * Created: Tue Apr 17 15:37:22 2001
 *
 * @author Sean A. Irvine
 * @version $Revision: 1.6 $
 */
public class KWIC {

  /**
   * Extra characters taken on each side of a match so that, after the
   * context has been trimmed, there is still enough text to fill the
   * requested width.
   */
  private static final int SLACK = 16;

  /** Usage line printed by {@link #main(String[])}. */
  private static final String USAGE = "USAGE: KWIC [-c][-w][-l n][-r n] word file*";

  // globals controlling matching and output appearance

  /** match only words? */
  public boolean mWordBoundary = true;

  /** case insensitive matching? */
  public boolean mCaseInsensitive = true;

  /** characters of left context */
  public int mLeftContext = 30;

  /** characters of right context */
  public int mRightContext = 30;

  /**
   * Find occurrences of a keyword in a given input stream. Display
   * all such occurrences with surrounding context.
   *
   * The stream is read line by line (platform default charset), tabs are
   * turned into spaces and the lines are joined with single spaces before
   * searching. Overlapping occurrences are all reported. The stream is not
   * closed; that remains the caller's responsibility. An I/O error while
   * reading is reported on standard error and whatever was read up to that
   * point is still searched.
   *
   * @param w keyword to search for; nothing happens if null or empty
   * @param s stream to search; nothing happens if null
   */
  public void kwic(String w, InputStream s) {
    if (w == null || s == null) {
      return;
    }
    final int wlen = w.length();
    if (wlen == 0) {
      return;
    }

    final String text = readFlattened(s);
    final int tlen = text.length();
    // negative widths (the fields are public) are treated as "no context"
    final int left = Math.max(0, mLeftContext);
    final int right = Math.max(0, mRightContext);

    for (int offset = 0; offset + wlen <= tlen; offset++) {
      // Compare against the original text rather than a lower-cased copy:
      // lower-casing is locale dependent and can change the string length,
      // which would make offsets in the copy disagree with the original.
      if (!text.regionMatches(mCaseInsensitive, offset, w, 0, wlen)) {
        continue;
      }
      // make sure we have a word boundary if this is required
      if (mWordBoundary && !onWordBoundary(text, offset, wlen)) {
        continue;
      }

      // extra SLACK to allow for whitespace to be trimmed; computed in long
      // so that very large context widths cannot overflow
      final int begin = (int) Math.max(0L, (long) offset - left - SLACK);
      final int end = (int) Math.min((long) tlen, (long) offset + wlen + right + SLACK);

      String lc = text.substring(begin, offset);
      String rc = text.substring(offset + wlen, end);
      if (mWordBoundary) {
        lc = lc.trim();
        rc = rc.trim();
      }

      System.out.println(rightAlign(lc, left)
                         + " \033[1m"
                         + text.substring(offset, offset + wlen)
                         + "\033[0m "
                         + leftAlign(rc, right));
    }
  }

  /**
   * Match the given string against the contents of the specified file,
   * or recursively against all the files in the specified directory.
   *
   * A plain file that cannot be opened is reported on standard error and
   * skipped. The file's stream is always closed once it has been searched.
   *
   * @param keyword string to match
   * @param f file or directory to match in; nothing happens if null
   * @throws IOException if <code>f</code> does not exist or is a directory
   *         whose contents cannot be listed
   */
  public void kwic(String keyword, File f) throws IOException {
    if (f == null) {
      return;
    }
    if (!f.exists()) {
      throw new IOException("File does not exist");
    }

    if (f.isDirectory()) {
      // recurse over directory contents
      File[] d = f.listFiles();
      if (d == null) {
        // listFiles() yields null on an I/O error or unreadable directory
        throw new IOException("Cannot list directory: " + f);
      }
      for (int i = 0; i < d.length; i++) {
        kwic(keyword, d[i]);
      }
      return;
    }

    InputStream in = null;
    try {
      in = new FileInputStream(f);
      kwic(keyword, in);
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (in != null) {
        try {
          in.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
  }

  /**
   * Command line entry point: <code>KWIC [-c][-w][-l n][-r n] word file*</code>.
   *
   * Prints the usage line and exits if fewer than two arguments are given,
   * if no keyword follows the options, or if <code>-l</code> / <code>-r</code>
   * is not followed by an integer.
   *
   * @param args command line arguments
   */
  public static void main(String[] args) {
    if (args.length < 2) {
      usage(0);
    }
    KWIC k = new KWIC();

    // process command line arguments
    int i = 0;
    while (i < args.length && args[i].length() > 0 && args[i].charAt(0) == '-') {
      if (args[i].length() > 1) {
        switch (args[i].charAt(1)) {
          case 'c':
            k.mCaseInsensitive = false;
            break;
          case 'w':
            k.mWordBoundary = false;
            break;
          case 'l':
            i += 1;
            k.mLeftContext = parseWidth(args, i);
            break;
          case 'r':
            i += 1;
            k.mRightContext = parseWidth(args, i);
            break;
          default:
            System.out.println("Unrecognized option: " + args[i]);
            break;
        }
      }
      i += 1;
    }

    if (i >= args.length) {
      // options consumed every argument: there is no keyword to search for
      usage(1);
    }
    String word = args[i++];
    while (i < args.length) {
      try {
        k.kwic(word, new File(args[i++]));
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }

  /** Read the whole stream into one string: tabs become spaces, lines are joined by a space. */
  private static String readFlattened(InputStream s) {
    BufferedReader r = new BufferedReader(new InputStreamReader(s));
    StringBuffer b = new StringBuffer();
    try {
      String line;
      // readLine() already strips the line terminator
      while ((line = r.readLine()) != null) {
        b.append(line.replace('\t', ' '));
        b.append(' ');
      }
    } catch (IOException e) {
      // best effort: report and search what was read so far
      e.printStackTrace();
    }
    return b.toString();
  }

  /** True if the match at [offset, offset + wlen) has no letter directly before or after it. */
  private static boolean onWordBoundary(String text, int offset, int wlen) {
    boolean letterBefore = offset > 0 && Character.isLetter(text.charAt(offset - 1));
    int after = offset + wlen;
    boolean letterAfter = after < text.length() && Character.isLetter(text.charAt(after));
    return !letterBefore && !letterAfter;
  }

  /** The last <code>width</code> characters of s, padded on the left with spaces if s is shorter. */
  private static String rightAlign(String s, int width) {
    int len = s.length();
    if (len >= width) {
      return s.substring(len - width);
    }
    StringBuffer b = new StringBuffer(width);
    for (int i = len; i < width; i++) {
      b.append(' ');
    }
    return b.append(s).toString();
  }

  /** The first <code>width</code> characters of s, padded on the right with spaces if s is shorter. */
  private static String leftAlign(String s, int width) {
    int len = s.length();
    if (len >= width) {
      return s.substring(0, width);
    }
    StringBuffer b = new StringBuffer(width);
    b.append(s);
    for (int i = len; i < width; i++) {
      b.append(' ');
    }
    return b.toString();
  }

  /** Parse the context width at args[index]; a negative value is negated. Exits with usage on error. */
  private static int parseWidth(String[] args, int index) {
    if (index >= args.length) {
      usage(1);
      return 0; // not reached
    }
    try {
      int n = Integer.parseInt(args[index]);
      return n < 0 ? -n : n;
    } catch (NumberFormatException e) {
      System.out.println("Not a number: " + args[index]);
      usage(1);
      return 0; // not reached
    }
  }

  /** Print the usage line and terminate the JVM with the given exit status. */
  private static void usage(int status) {
    System.out.println(USAGE);
    System.exit(status);
  }
}