From 0012a72e3f3b5049b54345df46446aa19fdf5aaf Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Thu, 31 Oct 2013 09:42:20 +0000 Subject: [PATCH] Further 'refactored' (read: decluttered) to isolate the code that's failing. --- README.txt | 3 + src/cc/journeyman/milkwood/Composer.java | 173 +++++++++ src/cc/journeyman/milkwood/Milkwood.java | 72 +++- src/cc/journeyman/milkwood/TextGenerator.java | 351 ------------------ src/cc/journeyman/milkwood/WordSequence.java | 47 ++- src/cc/journeyman/milkwood/Writer.java | 163 ++++++++ 6 files changed, 454 insertions(+), 355 deletions(-) create mode 100644 src/cc/journeyman/milkwood/Composer.java delete mode 100644 src/cc/journeyman/milkwood/TextGenerator.java create mode 100644 src/cc/journeyman/milkwood/Writer.java diff --git a/README.txt b/README.txt index 5cccef9..163facc 100644 --- a/README.txt +++ b/README.txt @@ -74,3 +74,6 @@ New Git repository, and this time pushed out to Goldsmith, so that local power p Decluttered the TextGenerator class by moving the whole read stage into two new classes, Generator and Tokeniser. More declutter needed. +Right, fully decluttered, All bugs(!) are in new class Composer. I have a little Liszt... + + diff --git a/src/cc/journeyman/milkwood/Composer.java b/src/cc/journeyman/milkwood/Composer.java new file mode 100644 index 0000000..068b5f5 --- /dev/null +++ b/src/cc/journeyman/milkwood/Composer.java @@ -0,0 +1,173 @@ +package cc.journeyman.milkwood; + +import java.util.Collection; +import java.util.Stack; + +/** + * Composes text output based on a rule tree. + * + * @author simon + * + */ +public class Composer { + /** + * Whether or not I am in debugging mode. + */ + private final boolean debug; + + /** + * + * @param debug + * Whether or not I am in debugging mode. + */ + public Composer(boolean debug) { + this.debug = debug; + } + + /** + * Recursive, backtracking, output generator. + * + * @param rules + * @param tupleLength + * @param length + * @return + */ + public WordSequence compose(RuleTreeNode rules, int tupleLength, int length) { + Stack preamble = composePreamble(rules); + WordSequence result = new WordSequence(); + + // composing the preamble will have ended with *ROOT* on top of the + // stack; + // get rid of it. + preamble.pop(); + + result.addAll(preamble); + + result.addAll(this.compose(preamble, rules, rules, tupleLength, length)); + return result; + } + + /** + * Recursively attempt to find sequences in the ruleset to append to what's + * been composed so far. + * + * @param glanceBack + * @param allRules + * @param currentRules + * @param tupleLength + * @param length + * @return + */ + private WordSequence compose(Stack glanceBack, + RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength, + int length) { + assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size"; + assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set"; + WordSequence result; + + try { + @SuppressWarnings("unchecked") + String here = currentRules.getWord((Stack) glanceBack + .clone()); + System.err.println(String.format("Trying token %s", here)); + + result = new WordSequence(); + result.add(here); + + if (length != 0) { + /* we're not done yet */ + Collection options = allRules.getSuccessors(); + + for (String next : options) { + @SuppressWarnings("unchecked") + WordSequence rest = this + .tryOption((Stack) glanceBack.clone(), + allRules, currentRules.getRule(next), + tupleLength, length - 1); + + if (rest != null) { + /* we have a solution */ + result.addAll(rest); + break; + } + } + } + } catch (NoSuchPathException ex) { + if (debug) { + System.err.println(String.format("No path %s: Backtracking...", + glanceBack)); + } + result = null; + } + + return result; + } + + /** + * Try composing with this ruleset + * + * @param glanceBack + * @param allRules + * all the rules there are. + * @param currentRules + * the current node in the rule tree. + * @param tupleLength + * the size of the glanceback window we're considering. + * @param length + * @return + */ + private WordSequence tryOption(Stack glanceBack, + RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength, + int length) { + final Stack restack = this.restack(glanceBack, + currentRules.getWord()); + restack.pop(); + return this.compose(restack, allRules, currentRules, tupleLength, + length); + } + + /** + * Return a new stack comprising all the items on the current stack, with + * this new string added at the bottom + * + * @param stack + * the stack to restack. + * @param bottom + * the item to place on the bottom. + * @return the restacked stack. + */ + private Stack restack(Stack stack, String bottom) { + final Stack result; + if (stack.isEmpty()) { + result = new Stack(); + result.push(bottom); + } else { + String top = stack.pop(); + result = restack(stack, bottom); + result.push(top); + } + return result; + } + + /** + * Random walk of the rule tree to extract (from the root) a legal sequence + * of words the length of our tuple. + * + * @param rules + * the rule tree (fragment) to walk. + * @return a sequence of words. + */ + private Stack composePreamble(RuleTreeNode rules) { + final Stack result; + final RuleTreeNode successor = rules.getRule(); + + if (successor == null) { + result = new Stack(); + } else { + result = this.composePreamble(successor); + result.push(rules.getWord()); + } + return result; + } + +} diff --git a/src/cc/journeyman/milkwood/Milkwood.java b/src/cc/journeyman/milkwood/Milkwood.java index 2343614..d09947f 100644 --- a/src/cc/journeyman/milkwood/Milkwood.java +++ b/src/cc/journeyman/milkwood/Milkwood.java @@ -20,6 +20,10 @@ import java.io.OutputStream; * @author Simon Brooke */ public class Milkwood { + /** + * The magic token which is deemed to end sentences. + */ + public static final String PERIOD = "."; /** * Parse command line arguments and kick off the process. Expected arguments @@ -46,6 +50,7 @@ public class Milkwood { */ public static void main(String[] args) throws FileNotFoundException, IOException { + /* defaults */ InputStream in = System.in; OutputStream out = System.out; int tupleLength = 2; @@ -76,8 +81,11 @@ public class Milkwood { } } } - - new Milkwood().readAndGenerate(in, out, tupleLength, debug); + try { + new Milkwood().readAndGenerate(in, out, tupleLength, debug); + } finally { + out.close(); + } } /** @@ -99,13 +107,71 @@ public class Milkwood { final int tupleLength, boolean debug) throws IOException { /* The root of the rule tree I shall build. */ RuleTreeNode root = new RuleTreeNode(); + int length = read(in, tupleLength, debug, root); + + WordSequence tokens = compose(tupleLength, debug, root, length); + + write(out, debug, tokens); + } + + /** + * Digest the input into a set of rules. + * + * @param in + * the input stream. + * @param tupleLength + * the length of tuples we shall consider. + * @param debug + * whether or not to print debugging output. + * @param root + * the root of the rule tree. + * @return the number of tokens read. + * @throws IOException + * if the file system buggers up, which is not, in the cosmic + * scheme of things, very likely. + */ + private int read(final InputStream in, final int tupleLength, + boolean debug, RuleTreeNode root) throws IOException { int length = new Digester().read(in, tupleLength, root); if (debug) { System.err.println(root.toString()); } + return length; + } - new TextGenerator().generate(out, tupleLength, root, length); + private WordSequence compose(final int tupleLength, boolean debug, + RuleTreeNode root, int length) { + WordSequence tokens = new Composer(debug).compose(root, tupleLength, + length); + + if (tokens.contains(PERIOD)) { + tokens = tokens.truncateAtLastInstance(PERIOD); + } + return tokens; + } + + /** + * Write this sequence of tokens to this output. + * + * @param out + * the stream to which to write. + * @param debug + * whether or not to print debugging output. + * @param tokens + * the sequence of tokens to write. + * @throws IOException + * if the file system buggers up, which is not, in the cosmic + * scheme of things, very likely. + */ + private void write(final OutputStream out, boolean debug, + WordSequence tokens) throws IOException { + Writer scrivenor = new Writer(out, debug); + try { + scrivenor.generate(tokens); + } finally { + scrivenor.close(); + } } } diff --git a/src/cc/journeyman/milkwood/TextGenerator.java b/src/cc/journeyman/milkwood/TextGenerator.java deleted file mode 100644 index b66129a..0000000 --- a/src/cc/journeyman/milkwood/TextGenerator.java +++ /dev/null @@ -1,351 +0,0 @@ -/* - * Proprietary unpublished source code property of - * Simon Brooke . - * - * Copyright (c) 2013 Simon Brooke - */ -package cc.journeyman.milkwood; - -import java.io.BufferedWriter; -import java.io.IOException; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.util.Collection; -import java.util.Locale; -import java.util.Random; -import java.util.Stack; - -/** - * - * @author Simon Brooke - */ -class TextGenerator { - - /** - * The magic token which is deemed to end sentences. - */ - public static final String PERIOD = "."; - - /** - * The average number of sentences in a paragraph. - */ - public static final int AVSENTENCESPERPARA = 5; - /** - * A random number generator. - */ - private static Random RANDOM = new Random(); - /** - * Dictionary of first-words we know about; each first-word maps onto a - * tuple of tuples of word sequences beginning with that word, so 'I' might - * map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]]. - */ - TupleDictionary dictionary = new TupleDictionary(); - - public TextGenerator() { - } - - - public void generate(OutputStream out, int tupleLength, RuleTreeNode root, - int length) throws IOException { - WordSequence tokens = this.compose(root, tupleLength, length); - - if (tokens.contains(PERIOD)) { - // TODO: eq = equal? - tokens = this.truncateAtLastInstance(tokens, PERIOD); - } - - this.generate(out, tokens); - } - - /** - * Write this sequence of tokens on this stream, sorting out minor issues of - * orthography. - * - * @param out - * the stream. - * @param tokens - * the tokens. - * @throws IOException - * if it is impossible to write (e.g. file system full). - */ - private void generate(OutputStream out, WordSequence tokens) - throws IOException { - BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out)); - boolean capitaliseNext = true; - - try { - for (String token : tokens) { - capitaliseNext = writeToken(dickens, capitaliseNext, token); - } - } finally { - dickens.flush(); - dickens.close(); - } - } - - /** - * Deal with end of paragraph, capital after full stop, and other minor - * orthographic conventions. - * - * @param dickens - * the scrivenor who writes for us. - * @param capitalise - * whether or not the token should be capitalised - * @param token - * the token to write; - * @returnvtrue if the next token to be written should be capitalised. - * @throws IOException - */ - private boolean writeToken(BufferedWriter dickens, boolean capitalise, - String token) throws IOException { - if (this.spaceBefore(token)) { - dickens.write(" "); - } - if (capitalise) { - dickens.write(token.substring(0, 1) - .toUpperCase(Locale.getDefault())); - dickens.write(token.substring(1)); - } else { - dickens.write(token); - } - - this.maybeParagraph(token, dickens); - - return (token.endsWith(PERIOD)); - } - - /** - * Return false if token is punctuation, else true. Wouldn't it be nice if - * Java provided Character.isPunctuation(char)? However, since it doesn't, I - * can give this slightly special semantics: return true only if this is - * punctuation which would not normally be preceded with a space. - * - * @param ch - * a character. - * @return true if the should be preceded by a space, else false. - */ - private boolean spaceBefore(String token) { - final boolean result; - - if (token.length() == 1) { - switch (token.charAt(0)) { - case '.': - case ',': - case ':': - case ';': - case 's': - /* - * an 's' on its own is probably evidence of a possessive with - * the apostrophe lost - */ - case 't': - /* - * similar; probably 'doesn't' or 'shouldn't' or other cases of - * 'not' with an elided 'o'. - */ - result = false; - break; - default: - result = true; - break; - } - } else { - result = false; - } - - return result; - } - - /** - * If this token is an end-of-sentence token, then, on one chance in some, - * have the writer write two new lines. NOTE: The tokeniser is treating - * PERIOD ('.') as a word character, even though it has not been told to. - * Token.endsWith( PERIOD) is a hack to get round this problem. TODO: - * investigate and fix. - * - * @param token - * a token - * @param dickens - * our scrivenor - * @throws IOException - * if Mr Dickens has run out of ink - */ - private void maybeParagraph(String token, BufferedWriter dickens) - throws IOException { - if (token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) { - dickens.write("\n\n"); - } - } - - /** - * Recursive, backtracking, output generator. - * - * @param rules - * @param tupleLength - * @param length - * @return - */ - private WordSequence compose(RuleTreeNode rules, int tupleLength, int length) { - Stack preamble = composePreamble(rules); - WordSequence result = new WordSequence(); - - // composing the preamble will have ended with *ROOT* on top of the - // stack; - // get rid of it. - preamble.pop(); - - result.addAll(preamble); - - result.addAll(this.compose(preamble, rules, rules, tupleLength, length)); - return result; - } - - /** - * Recursively attempt to find sequences in the ruleset to append to what's - * been composed so far. - * - * @param glanceBack - * @param allRules - * @param currentRules - * @param tupleLength - * @param length - * @return - */ - private WordSequence compose(Stack glanceBack, - RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength, - int length) { - assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size"; - assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set"; - WordSequence result; - - try { - @SuppressWarnings("unchecked") - String here = currentRules.getWord((Stack) glanceBack - .clone()); - System.err.println(String.format("Trying token %s", here)); - - result = new WordSequence(); - result.add(here); - - if (length != 0) { - /* we're not done yet */ - Collection options = allRules.getSuccessors(); - - for (String next : options) { - @SuppressWarnings("unchecked") - WordSequence rest = this - .tryOption((Stack) glanceBack.clone(), - allRules, currentRules.getRule(next), - tupleLength, length - 1); - - if (rest != null) { - /* we have a solution */ - result.addAll(rest); - break; - } - } - } - } catch (NoSuchPathException ex) { - System.err.println( String.format("No path %s: Backtracking...", glanceBack)); - result = null; - } - - return result; - } - - /** - * Try composing with this ruleset - * - * @param glanceBack - * @param allRules - * all the rules there are. - * @param currentRules - * the current node in the rule tree. - * @param tupleLength - * the size of the glanceback window we're considering. - * @param length - * @return - */ - private WordSequence tryOption(Stack glanceBack, - RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength, - int length) { - final Stack restack = this.restack(glanceBack, - currentRules.getWord()); - restack.pop(); - return this.compose(restack, allRules, currentRules, tupleLength, - length); - } - - /** - * Return a new stack comprising all the items on the current stack, with - * this new string added at the bottom - * - * @param stack - * the stack to restack. - * @param bottom - * the item to place on the bottom. - * @return the restacked stack. - */ - private Stack restack(Stack stack, String bottom) { - final Stack result; - if (stack.isEmpty()) { - result = new Stack(); - result.push(bottom); - } else { - String top = stack.pop(); - result = restack(stack, bottom); - result.push(top); - } - return result; - } - - /** - * Random walk of the rule tree to extract (from the root) a legal sequence - * of words the length of our tuple. - * - * @param rules - * the rule tree (fragment) to walk. - * @return a sequence of words. - */ - private Stack composePreamble(RuleTreeNode rules) { - final Stack result; - final RuleTreeNode successor = rules.getRule(); - - if (successor == null) { - result = new Stack(); - } else { - result = this.composePreamble(successor); - result.push(rules.getWord()); - } - return result; - } - - /** - * - * @param tokens - * a sequence of tokens - * @param marker - * a marker to terminate after the last occurrance of. - * @return a copy of tokens, truncated at the last occurrance of the marker. - */ - private WordSequence truncateAtLastInstance(WordSequence tokens, - String marker) { - final WordSequence result = new WordSequence(); - - if (!tokens.isEmpty()) { - - String token = tokens.remove(); - result.add(token); - if (!(marker.equals(token) && !tokens.contains(marker))) { - /* - * woah, double negatives. If the token we're looking at is the - * marker, and the remainder of the tokens does not include the - * marker, we're done. Otherwise, we continue. OK? - */ - result.addAll(this.truncateAtLastInstance(tokens, marker)); - } - } - - return result; - } -} diff --git a/src/cc/journeyman/milkwood/WordSequence.java b/src/cc/journeyman/milkwood/WordSequence.java index 8706f45..6908848 100644 --- a/src/cc/journeyman/milkwood/WordSequence.java +++ b/src/cc/journeyman/milkwood/WordSequence.java @@ -13,10 +13,55 @@ import java.util.Queue; * An ordered sequence of words. Of course it implements Queue since it is a * LinkedList and LinkedList implements Queue, but I want to make it explicitly * clear that this is a queue and can be used as such. + * * @author Simon Brooke */ class WordSequence extends LinkedList implements Queue { private static final long serialVersionUID = 1L; - + + /** + * + * @param tokens + * a sequence of tokens + * @param marker + * a marker to terminate after the last occurrance of. + * @return a copy of tokens, truncated at the last occurrance of the marker. + */ + public WordSequence truncateAtLastInstance(String marker) { + final WordSequence result = new WordSequence(); + + for (String token : this) { + if (token.endsWith(marker) && !this.contains(marker)) { + /* + * If the token we're looking at ends with the marker, and the + * remainder of the tokens does not include a token ending with + * the marker, we're done. Otherwise, we continue. OK? + */ + break; + } + result.add(token); + } + + return result; + } + + /** + * Specialisation: Working around the bug that the tokeniser treats PERIOD as a word character. + */ + @Override + public boolean contains(Object target) { + boolean result = false; + if (target != null) { + String marker = target.toString(); + + for (String token : this) { + if (token.endsWith(marker)) { + result = true; + break; + } + } + } + return result; + } } diff --git a/src/cc/journeyman/milkwood/Writer.java b/src/cc/journeyman/milkwood/Writer.java new file mode 100644 index 0000000..75467ab --- /dev/null +++ b/src/cc/journeyman/milkwood/Writer.java @@ -0,0 +1,163 @@ +/* + * Proprietary unpublished source code property of + * Simon Brooke . + * + * Copyright (c) 2013 Simon Brooke + */ +package cc.journeyman.milkwood; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.util.Locale; +import java.util.Random; + +/** + * + * @author Simon Brooke + */ +class Writer extends BufferedWriter { + /** + * The average number of sentences in a paragraph. + */ + public static final int AVSENTENCESPERPARA = 5; + /** + * A random number generator. + */ + private static Random RANDOM = new Random(); + /** + * Dictionary of first-words we know about; each first-word maps onto a + * tuple of tuples of word sequences beginning with that word, so 'I' might + * map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]]. + */ + TupleDictionary dictionary = new TupleDictionary(); + + /** + * Whether or not I am in debugging mode. + */ + @SuppressWarnings("unused") + private final boolean debug; + + /** + * @param out + * the output stream to which I shall write. + * @param debug + * Whether or not I am in debugging mode. + */ + public Writer(OutputStream out, final boolean debug) { + super(new OutputStreamWriter(out)); + this.debug = debug; + } + + /** + * Write this sequence of tokens on this stream, sorting out minor issues of + * orthography. + * + * @param tokens + * the tokens. + * @throws IOException + * if it is impossible to write (e.g. file system full). + */ + public void generate(WordSequence tokens) throws IOException { + boolean capitaliseNext = true; + + try { + for (String token : tokens) { + capitaliseNext = writeToken(capitaliseNext, token); + } + } finally { + this.flush(); + this.close(); + } + } + + /** + * Deal with end of paragraph, capital after full stop, and other minor + * orthographic conventions. + * + * @param capitalise + * whether or not the token should be capitalised + * @param token + * the token to write; + * @returnvtrue if the next token to be written should be capitalised. + * @throws IOException + */ + private boolean writeToken(boolean capitalise, String token) + throws IOException { + if (this.spaceBefore(token)) { + this.write(" "); + } + if (capitalise) { + this.write(token.substring(0, 1).toUpperCase(Locale.getDefault())); + this.write(token.substring(1)); + } else { + this.write(token); + } + + this.maybeParagraph(token); + + return (token.endsWith(Milkwood.PERIOD)); + } + + /** + * Return false if token is punctuation, else true. Wouldn't it be nice if + * Java provided Character.isPunctuation(char)? However, since it doesn't, I + * can give this slightly special semantics: return true only if this is + * punctuation which would not normally be preceded with a space. + * + * @param ch + * a character. + * @return true if the should be preceded by a space, else false. + */ + private boolean spaceBefore(String token) { + final boolean result; + + if (token.length() == 1) { + switch (token.charAt(0)) { + case '.': + case ',': + case ':': + case ';': + case 's': + /* + * an 's' on its own is probably evidence of a possessive with + * the apostrophe lost + */ + case 't': + /* + * similar; probably 'doesn't' or 'shouldn't' or other cases of + * 'not' with an elided 'o'. + */ + result = false; + break; + default: + result = true; + break; + } + } else { + result = false; + } + + return result; + } + + /** + * If this token is an end-of-sentence token, then, on one chance in some, + * have the writer write two new lines. NOTE: The tokeniser is treating + * PERIOD ('.') as a word character, even though it has not been told to. + * Token.endsWith( PERIOD) is a hack to get round this problem. TODO: + * investigate and fix. + * + * @param token + * a token + * @throws IOException + * if Mr this has run out of ink + */ + private void maybeParagraph(String token) throws IOException { + if (token.endsWith(Milkwood.PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) { + this.write("\n\n"); + } + } + +}