Moved a lot of stuff out of TextGenerator, mainly to declutter, so I can think about what isn't working and why not a bit more cleanly.
This commit is contained in:
Simon Brooke 2013-10-31 08:51:41 +00:00
parent e59f160f70
commit 61a8b7ad97
5 changed files with 487 additions and 386 deletions

View file

@@ -0,0 +1,60 @@
/*
* Proprietary unpublished source code property of
* Simon Brooke <simon@journeyman.cc>.
*
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
*/
package cc.journeyman.milkwood;
import java.io.IOException;
import java.io.InputStream;
import java.io.StreamTokenizer;
import java.util.LinkedList;
import java.util.Queue;
/**
* Read an input stream of text and digest it into a set of generation rules.
* Separated out of TextGenerator mainly to declutter that class.
*
* @author simon
*
*/
public class Digester {
/**
* Read tokens from the input stream, and compile them into the rule tree
* below this root.
*
* @param in
* the input stream from which I read.
* @param tupleLength
* the length of the tuples I read.
* @param root
* the ruleset to which I shall add.
* @return the number of tokens read.
* @throws IOException if we can't read from the file system.
*/
protected int read(final InputStream in, final int tupleLength,
final RuleTreeNode root) throws IOException {
int result = 0;
final Queue<WordSequence> openTuples = new LinkedList<WordSequence>();
final Tokeniser tok = new Tokeniser(in);
for (int type = tok.nextToken(); type != StreamTokenizer.TT_EOF; type = tok
.nextToken()) {
result++;
final WordSequence newTuple = new WordSequence();
String token = tok.readBareToken();
openTuples.add(newTuple);
for (WordSequence tuple : openTuples) {
tuple.add(token);
}
if (openTuples.size() > tupleLength) {
root.addSequence(openTuples.remove());
}
}
return result;
}
}
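
A minimal sketch (mine, not part of this commit) of how the sliding window above behaves; DigesterSketch and the sample text are hypothetical, and the caller sits in the same package because read is protected. Each incoming token is appended to every open tuple, and once more than tupleLength tuples are open the oldest is retired into the rule tree.

package cc.journeyman.milkwood; // same package: read(..) is protected

import java.io.ByteArrayInputStream;
import java.io.IOException;

public class DigesterSketch {
    public static void main(String[] args) throws IOException {
        RuleTreeNode root = new RuleTreeNode();
        // Six tokens; with tupleLength 2 the sequences [i came i],
        // [came i saw], [i saw i] and [saw i conquered] are retired into
        // the tree (each tupleLength + 1 words: window plus successor);
        // [i conquered] and [conquered] are still open at end of stream.
        int count = new Digester().read(
                new ByteArrayInputStream("i came i saw i conquered".getBytes()),
                2, root);
        System.out.println(count); // 6: one increment per token read
    }
}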

View file

@@ -15,60 +15,97 @@ import java.io.OutputStream;
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
*/
/**
*
*
* @author Simon Brooke <simon@journeyman.cc>
*/
public class Milkwood {
/**
* Parse command line arguments and kick off the process. Expected
* arguments include:
* <dl>
* <dt>-i, -input</dt>
* <dd>Input file, expected to be an English (or, frankly, other natural
* language) text. Defaults to standard in.</dd>
* <dt>-n, -tuple-length</dt>
* <dd>The length of tuples into which the file will be analysed, default 2.</dd>
* <dt>-o, -output</dt>
* <dd>Output file, to which generated text will be written.
* Defaults to standard out.</dd>
* </dl>
*
* @param args the command line arguments
* @exception FileNotFoundException if the user specifies a file which
* isn't available.
* @exception IOException if we could not read from input or write to output.
*/
public static void main(String[] args) throws FileNotFoundException, IOException {
InputStream in = System.in;
OutputStream out = System.out;
int tupleLength = 2;
for (int cursor = 0; cursor < args.length; cursor++) {
String arg = args[cursor];
/**
* Parse command line arguments and kick off the process. Expected arguments
* include:
* <dl>
* <dt>-d, -debug</dt>
* <dd>Print debugging output to standard error</dd>
* <dt>-i, -input</dt>
* <dd>Input file, expected to be an English (or, frankly, other natural
* language) text. Defaults to standard in.</dd>
* <dt>-n, -tuple-length</dt>
* <dd>The length of tuples into which the file will be analysed, default 2.
* </dd>
* <dt>-o, -output</dt>
* <dd>Output file, to which generated text will be written. Defaults to
* standard out.</dd>
* </dl>
*
* @param args
* the command line arguments
* @exception FileNotFoundException
* if the user specifies a file which isn't available.
* @exception IOException if we could not read from input or write to output.
*/
public static void main(String[] args) throws FileNotFoundException,
IOException {
InputStream in = System.in;
OutputStream out = System.out;
int tupleLength = 2;
boolean debug = false;
if (arg.startsWith("-") && arg.length() > 1) {
switch (arg.charAt(1)) {
case 'i':
// input
in = new FileInputStream(new File(args[++cursor]));
break;
case 'o': // output
out = new FileOutputStream(new File(args[++cursor]));
break;
case 'n':
case 't': // tuple length
tupleLength = Integer.parseInt(args[++cursor]);
break;
default:
throw new IllegalArgumentException(
String.format("Unrecognised argument '%s'", arg));
}
}
}
for (int cursor = 0; cursor < args.length; cursor++) {
String arg = args[cursor];
if (arg.startsWith("-") && arg.length() > 1) {
switch (arg.charAt(1)) {
case 'd':
debug = true;
break;
case 'i':
// input
in = new FileInputStream(new File(args[++cursor]));
break;
case 'o': // output
out = new FileOutputStream(new File(args[++cursor]));
break;
case 'n':
case 't': // tuple length
tupleLength = Integer.parseInt(args[++cursor]);
break;
default:
throw new IllegalArgumentException(String.format(
"Unrecognised argument '%s'", arg));
}
}
}
new Milkwood().readAndGenerate(in, out, tupleLength, debug);
}
/**
* Read tokens from this input and use them to generate text on this output.
*
* @param in
* the input stream to read.
* @param out
* the output stream to write to.
* @param tupleLength
* the length of tuples to be used in generation.
* @param debug
* whether to print debugging output.
* @throws IOException
* if the file system buggers up, which is not, in the cosmic
* scheme of things, very likely.
*/
void readAndGenerate(final InputStream in, final OutputStream out,
final int tupleLength, boolean debug) throws IOException {
/* The root of the rule tree I shall build. */
RuleTreeNode root = new RuleTreeNode();
int length = new Digester().read(in, tupleLength, root);
if (debug) {
System.err.println(root.toString());
}
new TextGenerator().generate(out, tupleLength, root, length);
}
new TextGenerator().readAndGenerate( in, out, tupleLength);
}
}
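
For orientation (my note, not part of the diff): the flow is now main -> readAndGenerate -> Digester.read -> TextGenerator.generate. A hedged sketch of driving it programmatically, assuming same-package access since readAndGenerate is package-private; MilkwoodSketch and corpus.txt are hypothetical names.

package cc.journeyman.milkwood; // same package: readAndGenerate is package-private

import java.io.FileInputStream;
import java.io.IOException;

public class MilkwoodSketch {
    public static void main(String[] args) throws IOException {
        // Equivalent to the command line flags: -i corpus.txt -n 2 -d
        new Milkwood().readAndGenerate(
                new FileInputStream("corpus.txt"), // hypothetical input file
                System.out, 2, true); // tuple length 2, debug on
    }
}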

View file

@@ -23,6 +23,10 @@ import java.util.Stack;
* @author Simon Brooke <simon@journeyman.cc>
*/
public class RuleTreeNode {
/**
* The magic token which identifies the root node of a rule tree.
*/
public static final String ROOTMAGICTOKEN = "*ROOT*";
/**
* The line separator on this platform.
*/
@@ -41,6 +45,13 @@ public class RuleTreeNode {
* Potential successors of this node
*/
private Map<String,RuleTreeNode> rules = new HashMap<String,RuleTreeNode>();
/**
* If no argument passed, generate a root node.
*/
public RuleTreeNode() {
this( RuleTreeNode.ROOTMAGICTOKEN);
}
/**
* Create me wrapping this word.

View file

@@ -6,229 +6,127 @@
*/
package cc.journeyman.milkwood;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Queue;
import java.util.Random;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
*
* @author Simon Brooke <simon@journeyman.cc>
*/
class TextGenerator {
/**
* The magic token which identifies the root node of the
* rule tree.
* The magic token which is deemed to end sentences.
*/
private static final String ROOTMAGICTOKEN = "*ROOT*";
public static final String PERIOD = ".";
/**
* The special magic token which is deemed to end sentences.
*/
public static final String PERIOD = ".";
/**
* The average number of sentences in a paragraph.
*/
public static final int AVSENTENCESPERPARA = 5;
/**
* A random number generator.
*/
private static Random RANDOM = new Random();
/**
* Dictionary of first-words we know about; each first-word maps
* onto a tuple of tuples of word sequences beginning with that
* word, so 'I' might map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I, CONQUERED, COMMA]].
*/
TupleDictionary dictionary = new TupleDictionary();
* The average number of sentences in a paragraph.
*/
public static final int AVSENTENCESPERPARA = 5;
/**
* A random number generator.
*/
private static Random RANDOM = new Random();
/**
* Dictionary of first-words we know about; each first-word maps onto a
* tuple of tuples of word sequences beginning with that word, so 'I' might
* map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I, CONQUERED, COMMA]].
*/
TupleDictionary dictionary = new TupleDictionary();
public TextGenerator() {
}
public TextGenerator() {
}
/**
* Read tokens from this input and use them to generate text on this output.
* @param in the input stream to read.
* @param out the output stream to write to.
* @param tupleLength the length of tuples to be used in generation.
* @throws IOException if the file system buggers up, which is not, in the
* cosmic scheme of things, very likely.
*/
void readAndGenerate(InputStream in, OutputStream out, int tupleLength) throws IOException {
/* The root of the rule tree I shall build. */
RuleTreeNode root = new RuleTreeNode( ROOTMAGICTOKEN);
int length = read(in, tupleLength, root);
System.err.println( root.toString());
generate( out, tupleLength, root, length);
}
/**
* Read tokens from the input stream, and compile them into a ruleset below root.
* @param in the input stream from which I read.
* @param tupleLength the length of the tuples I read.
* @param root the ruleset to which I shall add.
* @return the number of tokens read.
* @throws IOException
*/
private int read(InputStream in, int tupleLength, RuleTreeNode root) throws IOException {
int result = 0;
Queue<WordSequence> openTuples = new LinkedList<WordSequence>();
StreamTokenizer tok = prepareTokenizer(in);
for (int type = tok.nextToken(); type != StreamTokenizer.TT_EOF; type = tok.nextToken()) {
result ++;
final WordSequence newTuple = new WordSequence();
String token = readBareToken(tok, type);
public void generate(OutputStream out, int tupleLength, RuleTreeNode root,
int length) throws IOException {
WordSequence tokens = this.compose(root, tupleLength, length);
openTuples.add(newTuple);
for ( WordSequence tuple : openTuples) {
tuple.add(token);
}
if (openTuples.size() > tupleLength) {
root.addSequence( openTuples.remove());
}
}
return result;
}
/**
* There surely must be a better way to get just the token out of a
* StreamTokenizer...!
* @param tok the tokenizer.
* @return just the next token.
*/
private String readBareToken(StreamTokenizer tok, int type) {
final String token;
switch (type) {
case StreamTokenizer.TT_EOL:
token = "FIXME"; // TODO: fix this!
break;
case StreamTokenizer.TT_NUMBER:
token = new Double(tok.nval).toString();
break;
case StreamTokenizer.TT_WORD:
token = tok.sval.toLowerCase();
break;
default:
StringBuffer buffy = new StringBuffer();
buffy.append((char) type);
token = buffy.toString();
break;
if (tokens.contains(PERIOD)) {
// TODO: eq = equal?
tokens = this.truncateAtLastInstance(tokens, PERIOD);
}
return token;
this.generate(out, tokens);
}
/**
* Prepare a tokeniser on this input stream, set up to handle at least
* Western European natural language text.
* @param in the stream.
* @return a suitable tokeniser.
*/
private StreamTokenizer prepareTokenizer(InputStream in) {
Reader gentle = new BufferedReader(new InputStreamReader(in));
StreamTokenizer tok = new StreamTokenizer(gentle);
tok.resetSyntax();
tok.whitespaceChars(8, 15);
tok.whitespaceChars(28, 32);
/* treat quotemarks as white space */
tok.whitespaceChars((int) '\"', (int) '\"');
tok.whitespaceChars((int) '\'', (int) '\'');
tok.wordChars((int) '0', (int) '9');
tok.wordChars((int) 'A', (int) 'Z');
tok.wordChars((int) 'a', (int) 'z');
tok.parseNumbers();
return tok;
/**
* Write this sequence of tokens on this stream, sorting out minor issues of
* orthography.
*
* @param out
* the stream.
* @param tokens
* the tokens.
* @throws IOException
* if it is impossible to write (e.g. file system full).
*/
private void generate(OutputStream out, WordSequence tokens)
throws IOException {
BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out));
boolean capitaliseNext = true;
try {
for (String token : tokens) {
capitaliseNext = writeToken(dickens, capitaliseNext, token);
}
} finally {
dickens.flush();
dickens.close();
}
}
private void generate(OutputStream out, int tupleLength, RuleTreeNode root, int length) throws IOException {
WordSequence tokens = this.compose( root, tupleLength, length);
if ( tokens.contains(PERIOD)) {
// TODO: eq = equal?
tokens = this.truncateAtLastInstance( tokens, PERIOD);
}
this.generate( out, tokens);
}
/**
* Write this sequence of tokens on this stream, sorting out minor
* issues of orthography.
* @param out the stream.
* @param tokens the tokens.
* @throws IOException if it is impossible to write (e.g. file system full).
*/
private void generate(OutputStream out, WordSequence tokens) throws IOException {
BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out));
boolean capitaliseNext = true;
try {
for (String token : tokens) {
capitaliseNext = writeToken(dickens, capitaliseNext, token);
}
} finally {
dickens.flush();
dickens.close();
}
}
/**
* Deal with end of paragraph, capital after full stop, and other
* minor orthographic conventions.
* @param dickens the scrivener who writes for us.
* @param capitalise whether or not the token should be capitalised.
* @param token the token to write.
* @return true if the next token to be written should be capitalised.
* @throws IOException
*/
/**
* Deal with end of paragraph, capital after full stop, and other minor
* orthographic conventions.
*
* @param dickens
* the scrivener who writes for us.
* @param capitalise
* whether or not the token should be capitalised.
* @param token
* the token to write.
* @return true if the next token to be written should be capitalised.
* @throws IOException
*/
private boolean writeToken(BufferedWriter dickens, boolean capitalise,
String token) throws IOException {
if ( this.spaceBefore(token)) {
dickens.write( " ");
if (this.spaceBefore(token)) {
dickens.write(" ");
}
if ( capitalise) {
dickens.write(token.substring(0, 1).toUpperCase(Locale.getDefault()));
if (capitalise) {
dickens.write(token.substring(0, 1)
.toUpperCase(Locale.getDefault()));
dickens.write(token.substring(1));
} else {
dickens.write(token);
}
this.maybeParagraph( token, dickens);
this.maybeParagraph(token, dickens);
return (token.endsWith(PERIOD));
}
/**
* Return false if token is punctuation, else true. Wouldn't it be
* nice if Java provided Character.isPunctuation(char)? However, since it
* doesn't, I can give this slightly special semantics: return true only if
* this is punctuation which would not normally be preceded with a space.
* @param token the token to consider.
* @return true if the token should be preceded by a space, else false.
*/
private boolean spaceBefore(String token) {
final boolean result;
/**
* Return false if token is punctuation, else true. Wouldn't it be nice if
* Java provided Character.isPunctuation(char)? However, since it doesn't, I
* can give this slightly special semantics: return true only if this is
* punctuation which would not normally be preceded with a space.
*
* @param token
* the token to consider.
* @return true if the token should be preceded by a space, else false.
*/
private boolean spaceBefore(String token) {
final boolean result;
if (token.length() == 1) {
switch (token.charAt(0)) {
case '.':
@@ -241,9 +139,10 @@ class TextGenerator {
* the apostrophe lost
*/
case 't':
/* similar; probably 'doesn't' or 'shouldn't' or other cases
* of 'not' with an elided 'o'.
*/
/*
* similar; probably 'doesn't' or 'shouldn't' or other cases of
* 'not' with an elided 'o'.
*/
result = false;
break;
default:
@@ -253,107 +152,120 @@ class TextGenerator {
} else {
result = false;
}
return result;
}
/**
* If this token is an end-of-sentence token, then, on one chance in
* AVSENTENCESPERPARA, have the writer write two new lines. NOTE: The tokeniser is treating
* PERIOD ('.') as a word character, even though it has not been told to.
* Token.endsWith( PERIOD) is a hack to get round this problem.
* TODO: investigate and fix.
*
* @param token a token
* @param dickens our scrivener
* @throws IOException if Mr Dickens has run out of ink
*/
private void maybeParagraph(String token, BufferedWriter dickens) throws IOException {
if ( token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
dickens.write("\n\n");
}
}
return result;
}
/**
* Recursive, backtracking, output generator.
* @param rules
* @param tupleLength
* @param length
* @return
*/
private WordSequence compose(RuleTreeNode rules, int tupleLength, int length) {
Stack<String> preamble = composePreamble( rules);
WordSequence result = new WordSequence();
// composing the preamble will have ended with *ROOT* on top of the stack;
// get rid of it.
preamble.pop();
result.addAll(preamble);
result.addAll(this.compose( preamble, rules, rules, tupleLength, length));
return result;
}
/**
* Recursively attempt to find sequences in the ruleset to append to
* what's been composed so far.
* @param glanceBack
* @param allRules
* @param currentRules
* @param tupleLength
* @param length
* @return
*/
/**
* If this token is an end-of-sentence token, then, on one chance in
* AVSENTENCESPERPARA, have the writer write two new lines. NOTE: The tokeniser is treating
* PERIOD ('.') as a word character, even though it has not been told to.
* Token.endsWith( PERIOD) is a hack to get round this problem. TODO:
* investigate and fix.
*
* @param token
* a token
* @param dickens
* our scrivener
* @throws IOException
* if Mr Dickens has run out of ink
*/
private void maybeParagraph(String token, BufferedWriter dickens)
throws IOException {
if (token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
dickens.write("\n\n");
}
}
/**
* Recursive, backtracking, output generator.
*
* @param rules
* @param tupleLength
* @param length
* @return
*/
private WordSequence compose(RuleTreeNode rules, int tupleLength, int length) {
Stack<String> preamble = composePreamble(rules);
WordSequence result = new WordSequence();
// composing the preamble will have ended with *ROOT* on top of the
// stack;
// get rid of it.
preamble.pop();
result.addAll(preamble);
result.addAll(this.compose(preamble, rules, rules, tupleLength, length));
return result;
}
/**
* Recursively attempt to find sequences in the ruleset to append to what's
* been composed so far.
*
* @param glanceBack
* @param allRules
* @param currentRules
* @param tupleLength
* @param length
* @return
*/
private WordSequence compose(Stack<String> glanceBack,
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
int length) {
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
assert (allRules.getWord() == ROOTMAGICTOKEN) : "Shouldn't happen: bad rule set";
WordSequence result;
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shouldn't happen: bad rule set";
WordSequence result;
try {
@SuppressWarnings("unchecked")
String here = currentRules.getWord((Stack<String>) glanceBack.clone());
System.err.println( String.format( "Trying token %s", here));
try {
@SuppressWarnings("unchecked")
String here = currentRules.getWord((Stack<String>) glanceBack
.clone());
System.err.println(String.format("Trying token %s", here));
result = new WordSequence();
result.add(here);
result = new WordSequence();
result.add(here);
if (length != 0) {
/* we're not done yet */
Collection<String> options = allRules.getSuccessors();
if (length != 0) {
/* we're not done yet */
Collection<String> options = allRules.getSuccessors();
for (String next : options) {
WordSequence rest =
this.tryOption( (Stack<String>) glanceBack.clone(), allRules,
currentRules.getRule(next), tupleLength, length - 1);
for (String next : options) {
@SuppressWarnings("unchecked")
WordSequence rest = this
.tryOption((Stack<String>) glanceBack.clone(),
allRules, currentRules.getRule(next),
tupleLength, length - 1);
if (rest != null) {
/* we have a solution */
result.addAll(rest);
break;
}
}
}
} catch (NoSuchPathException ex) {
Logger.getLogger(TextGenerator.class.getName()).log(Level.WARNING,
String.format("No path %s: Backtracking...", glanceBack));
result = null;
}
if (rest != null) {
/* we have a solution */
result.addAll(rest);
break;
}
}
}
} catch (NoSuchPathException ex) {
System.err.println( String.format("No path %s: Backtracking...", glanceBack));
result = null;
}
return result;
}
/**
* Try composing with this ruleset
* @param glanceBack
* @param allRules all the rules there are.
* @param currentRules the current node in the rule tree.
* @param tupleLength the size of the glanceback window we're considering.
* @param length
* @return
*/
return result;
}
/**
* Try composing with this ruleset
*
* @param glanceBack
* @param allRules
* all the rules there are.
* @param currentRules
* the current node in the rule tree.
* @param tupleLength
* the size of the glanceback window we're considering.
* @param length
* @return
*/
private WordSequence tryOption(Stack<String> glanceBack,
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
int length) {
@@ -364,69 +276,76 @@ class TextGenerator {
length);
}
/**
* Return a new stack comprising all the items on the current stack,
* with this new string added at the bottom
*
* @param stack the stack to restack.
* @param bottom the item to place on the bottom.
* @return the restacked stack.
*/
private Stack<String> restack(Stack<String> stack, String bottom) {
final Stack<String> result;
if (stack.isEmpty()) {
result = new Stack<String>();
result.push(bottom);
} else {
String top = stack.pop();
result = restack(stack, bottom);
result.push(top);
}
return result;
}
/**
* Return a new stack comprising all the items on the current stack, with
* this new string added at the bottom
*
* @param stack
* the stack to restack.
* @param bottom
* the item to place on the bottom.
* @return the restacked stack.
*/
private Stack<String> restack(Stack<String> stack, String bottom) {
final Stack<String> result;
if (stack.isEmpty()) {
result = new Stack<String>();
result.push(bottom);
} else {
String top = stack.pop();
result = restack(stack, bottom);
result.push(top);
}
return result;
}
/**
* Random walk of the rule tree to extract (from the root) a legal sequence
* of words the length of our tuple.
*
* @param rules
* the rule tree (fragment) to walk.
* @return a sequence of words.
*/
private Stack<String> composePreamble(RuleTreeNode rules) {
final Stack<String> result;
final RuleTreeNode successor = rules.getRule();
/**
* Random walk of the rule tree to extract (from the root) a legal sequence of words the length of our tuple.
*
* @param rules the rule tree (fragment) to walk.
* @return a sequence of words.
*/
private Stack<String> composePreamble(RuleTreeNode rules) {
final Stack<String> result;
final RuleTreeNode successor = rules.getRule();
if (successor == null) {
result = new Stack<String>();
} else {
result = this.composePreamble(successor);
result.push(rules.getWord());
}
return result;
}
if (successor == null) {
result = new Stack<String>();
} else {
result = this.composePreamble(successor);
result.push(rules.getWord());
}
return result;
}
/**
*
* @param tokens
* a sequence of tokens
* @param marker
* a marker to terminate after the last occurrence of.
* @return a copy of tokens, truncated at the last occurrence of the marker.
*/
private WordSequence truncateAtLastInstance(WordSequence tokens,
String marker) {
final WordSequence result = new WordSequence();
/**
*
* @param tokens a sequence of tokens
* @param marker a marker to terminate after the last occurrence of.
* @return a copy of tokens, truncated at the last occurrence of the marker.
*/
private WordSequence truncateAtLastInstance(WordSequence tokens,
String marker) {
final WordSequence result = new WordSequence();
if (!tokens.isEmpty()) {
if (!tokens.isEmpty()) {
String token = tokens.remove();
result.add(token);
if (!(marker.equals(token) && !tokens.contains(marker))) {
/*
* woah, double negatives. If the token we're looking at is the
* marker, and the remainder of the tokens does not include the
* marker, we're done. Otherwise, we continue. OK?
*/
result.addAll(this.truncateAtLastInstance(tokens, marker));
}
}
String token = tokens.remove();
result.add(token);
if (!(marker.equals(token) && !tokens.contains(marker))) {
/* woah, double negatives. If the token we're looking at is the
* marker, and the remainder of the tokens does not include the
* marker, we're done. Otherwise, we continue. OK? */
result.addAll(this.truncateAtLastInstance(tokens, marker));
}
}
return result;
}
return result;
}
}
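
Of the helpers this diff merely reflows, restack deserves a worked example: it recurses to the base of the stack, pushes the new word there, then replays the original items back on top in order. Since the method is private, here is a standalone transcription (mine; RestackSketch is a hypothetical name) with a tiny driver.

package cc.journeyman.milkwood;

import java.util.Stack;

public class RestackSketch {
    /** Body copied from TextGenerator.restack above: returns a new stack
     *  with bottom beneath all existing items, their order preserved.
     *  Note that it consumes the stack it is given. */
    static Stack<String> restack(Stack<String> stack, String bottom) {
        final Stack<String> result;
        if (stack.isEmpty()) {
            result = new Stack<String>();
            result.push(bottom);
        } else {
            String top = stack.pop();
            result = restack(stack, bottom);
            result.push(top);
        }
        return result;
    }

    public static void main(String[] args) {
        Stack<String> s = new Stack<String>();
        s.push("i");    // bottom
        s.push("came"); // top
        System.out.println(restack(s, "saw")); // prints [saw, i, came]
    }
}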

View file

@@ -0,0 +1,74 @@
/*
* Proprietary unpublished source code property of
* Simon Brooke <simon@journeyman.cc>.
*
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
*/
package cc.journeyman.milkwood;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
/**
* A tokeniser which reads tokens in a manner which suits me. Although this
* implementation is based on a StreamTokenizer, the point of separating this
* out into its own class is that if I had more time I could reimplement.
*
* @author simon
*
*/
public class Tokeniser extends StreamTokenizer {
public Tokeniser(Reader r) {
super(r);
this.resetSyntax();
this.whitespaceChars(8, 15);
this.whitespaceChars(28, 32);
/*
* treat quotemarks as white space. Actually it would be better if quote
* marks were white space only if preceded or followed by whitespace, so
* that, e.g., 'don't' and 'can't' appeared as single tokens. But that
* means really reimplementing the parser and I don't have time.
*/
this.whitespaceChars((int) '\"', (int) '\"');
this.whitespaceChars((int) '\'', (int) '\'');
this.wordChars((int) '0', (int) '9');
this.wordChars((int) 'A', (int) 'Z');
this.wordChars((int) 'a', (int) 'z');
}
public Tokeniser(InputStream in) {
this(new BufferedReader(new InputStreamReader(in)));
}
/**
* There surely must be a better way to get just the token out of a
* StreamTokenizer...!
*/
public String readBareToken() {
final String token;
switch (this.ttype) {
case StreamTokenizer.TT_EOL:
token = "FIXME"; // TODO: fix this!
break;
case StreamTokenizer.TT_NUMBER:
token = new Double(this.nval).toString();
break;
case StreamTokenizer.TT_WORD:
token = this.sval.toLowerCase();
break;
default:
StringBuffer buffy = new StringBuffer();
buffy.append((char) this.ttype);
token = buffy.toString();
break;
}
return token;
}
}
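
A minimal usage sketch for the new class (mine, not part of the commit; TokeniserSketch is a hypothetical name): word tokens fold to lower case, quote marks vanish as whitespace, and punctuation comes back as single-character tokens. Note that, unlike the old prepareTokenizer, the constructor above does not call parseNumbers(), so digits simply arrive as word characters.

package cc.journeyman.milkwood;

import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;

public class TokeniserSketch {
    public static void main(String[] args) throws IOException {
        Tokeniser tok = new Tokeniser(new StringReader("I came, I saw."));
        for (int type = tok.nextToken(); type != StreamTokenizer.TT_EOF;
                type = tok.nextToken()) {
            System.out.println(tok.readBareToken());
        }
        // Expected output, one token per line: i came , i saw .
    }
}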