From a876cb6d1b7cb41dac963c2e920d9b49a94b79f0 Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Thu, 31 Oct 2013 11:32:11 +0000 Subject: [PATCH] All working very beautifully. --- README.txt | 5 + src/cc/journeyman/milkwood/Composer.java | 168 ++++++++----------- src/cc/journeyman/milkwood/Milkwood.java | 32 ++-- src/cc/journeyman/milkwood/RuleTreeNode.java | 25 ++- src/cc/journeyman/milkwood/Tokeniser.java | 7 + src/cc/journeyman/milkwood/WordStack.java | 57 +++++++ src/cc/journeyman/milkwood/Writer.java | 18 +- 7 files changed, 195 insertions(+), 117 deletions(-) create mode 100644 src/cc/journeyman/milkwood/WordStack.java diff --git a/README.txt b/README.txt index 163facc..b2c7bc2 100644 --- a/README.txt +++ b/README.txt @@ -76,4 +76,9 @@ Decluttered the TextGenerator class by moving the whole read stage into two new Right, fully decluttered, All bugs(!) are in new class Composer. I have a little Liszt... +Parsing word tuples for n > 2 working sweetly. That is not the problem! + +Major refactoring and cleanup of the compose stage... + +ye! Utuvienyes diff --git a/src/cc/journeyman/milkwood/Composer.java b/src/cc/journeyman/milkwood/Composer.java index 068b5f5..c69f863 100644 --- a/src/cc/journeyman/milkwood/Composer.java +++ b/src/cc/journeyman/milkwood/Composer.java @@ -1,7 +1,7 @@ package cc.journeyman.milkwood; import java.util.Collection; -import java.util.Stack; +import java.util.Collections; /** * Composes text output based on a rule tree. @@ -27,23 +27,29 @@ public class Composer { /** * Recursive, backtracking, output generator. * - * @param rules - * @param tupleLength - * @param length - * @return + * @param rules the rule set we're working to. + * @param length the number of tokens still to be output. + * @return if a successful path forward is found, that path, else null. 
*/ - public WordSequence compose(RuleTreeNode rules, int tupleLength, int length) { - Stack preamble = composePreamble(rules); + public WordSequence compose(RuleTreeNode rules, int length) { + WordStack preamble = composePreamble(rules); WordSequence result = new WordSequence(); // composing the preamble will have ended with *ROOT* on top of the // stack; // get rid of it. preamble.pop(); + + if (debug) { + System.err.println( "Preamble: " + preamble); + } result.addAll(preamble); - - result.addAll(this.compose(preamble, rules, rules, tupleLength, length)); + + WordStack body = this.compose(preamble, rules, length); + Collections.reverse(body); + result.addAll(body); + return result; } @@ -51,103 +57,63 @@ public class Composer { * Recursively attempt to find sequences in the ruleset to append to what's * been composed so far. * - * @param glanceBack - * @param allRules - * @param currentRules - * @param tupleLength - * @param length - * @return + * @param glanceBack the last few words output. + * @param rules the rule set we're working to. + * @param length the number of tokens still to be output. + * @return if a successful path forward is found, that path, else null. 
*/ - private WordSequence compose(Stack glanceBack, - RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength, + private WordStack compose(WordStack glanceBack, RuleTreeNode rules, int length) { - assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size"; - assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set"; - WordSequence result; - - try { - @SuppressWarnings("unchecked") - String here = currentRules.getWord((Stack) glanceBack - .clone()); - System.err.println(String.format("Trying token %s", here)); - - result = new WordSequence(); - result.add(here); - - if (length != 0) { - /* we're not done yet */ - Collection options = allRules.getSuccessors(); - - for (String next : options) { - @SuppressWarnings("unchecked") - WordSequence rest = this - .tryOption((Stack) glanceBack.clone(), - allRules, currentRules.getRule(next), - tupleLength, length - 1); - - if (rest != null) { - /* we have a solution */ - result.addAll(rest); - break; - } - } - } - } catch (NoSuchPathException ex) { - if (debug) { - System.err.println(String.format("No path %s: Backtracking...", - glanceBack)); - } - result = null; + final WordStack result; + + if ( debug) { + System.err.println( String.format( "%d: %s", length, glanceBack)); } - return result; - } - - /** - * Try composing with this ruleset - * - * @param glanceBack - * @param allRules - * all the rules there are. - * @param currentRules - * the current node in the rule tree. - * @param tupleLength - * the size of the glanceback window we're considering. 
- * @param length - * @return - */ - private WordSequence tryOption(Stack glanceBack, - RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength, - int length) { - final Stack restack = this.restack(glanceBack, - currentRules.getWord()); - restack.pop(); - return this.compose(restack, allRules, currentRules, tupleLength, - length); - } - - /** - * Return a new stack comprising all the items on the current stack, with - * this new string added at the bottom - * - * @param stack - * the stack to restack. - * @param bottom - * the item to place on the bottom. - * @return the restacked stack. - */ - private Stack restack(Stack stack, String bottom) { - final Stack result; - if (stack.isEmpty()) { - result = new Stack(); - result.push(bottom); + /* are we there yet? */ + if (length == 0) { + result = new WordStack(); } else { - String top = stack.pop(); - result = restack(stack, bottom); - result.push(top); + /* + * are there any rules in this ruleset which match the current + * sliding window? if so, then recurse; if not, then fail. + */ + Collection words = rules.match(glanceBack.duplicate()); + + if (words.isEmpty()) { + /* backtrack */ + result = null; + } else { + result = tryOptions(words, glanceBack, rules, length); + } } return result; } + + /** + * Try each of these candidates in turn, attempting to recurse. + * @param candidates words which could potentially be added to the output. + * @param glanceBack the last few words output. + * @param allRules the rule set we're working to. + * @param length the number of tokens still to be output. + * @return if a successful path forward is found, that path, else null. + */ + private WordStack tryOptions(Collection candidates, + WordStack glanceBack, RuleTreeNode allRules, int length) { + WordStack result = null; + + for ( String candidate : candidates) { + result = compose( new WordStack(glanceBack, candidate), allRules, length - 1); + if ( result != null) { + /* by Jove, I think she's got it! 
*/ + result.push(candidate); + break; + } + } + + return result; + } + /** * Random walk of the rule tree to extract (from the root) a legal sequence @@ -157,12 +123,12 @@ public class Composer { * the rule tree (fragment) to walk. * @return a sequence of words. */ - private Stack composePreamble(RuleTreeNode rules) { - final Stack result; + private WordStack composePreamble(RuleTreeNode rules) { + final WordStack result; final RuleTreeNode successor = rules.getRule(); if (successor == null) { - result = new Stack(); + result = new WordStack(); } else { result = this.composePreamble(successor); result.push(rules.getWord()); diff --git a/src/cc/journeyman/milkwood/Milkwood.java b/src/cc/journeyman/milkwood/Milkwood.java index d09947f..19011b7 100644 --- a/src/cc/journeyman/milkwood/Milkwood.java +++ b/src/cc/journeyman/milkwood/Milkwood.java @@ -31,13 +31,15 @@ public class Milkwood { *
*
-d, -debug
*
Print debugging output to standard error
- *
-i, -input
+ *
-i [FILE], -input [FILE]
*
Input file, expected to be an English (or, frankly, other natural * language) text. Defaults to standard in.
- *
-n, -tuple-length
- *
The length of tuples into which the file will be analised, default 2. + *
-l [NN], -length [NN]
+ *
The length in tokens of the desired output. Defaults to 100. + *
-n [NN], -tuple-length [NN]
+ *
The length of tuples into which the file will be analysed, default 2. *
- *
-o, -output
+ *
-o [FILE], -output [FILE]
*
Output file, to which generated text will be written. Defaults to * standard out.
*
@@ -55,6 +57,7 @@ public class Milkwood { OutputStream out = System.out; int tupleLength = 2; boolean debug = false; + int length = 100; for (int cursor = 0; cursor < args.length; cursor++) { String arg = args[cursor]; @@ -71,6 +74,9 @@ public class Milkwood { case 'o': // output out = new FileOutputStream(new File(args[++cursor])); break; + case 'l': // length + length = Integer.parseInt(args[++cursor]); + break; case 'n': case 't': // tuple length tupleLength = Integer.parseInt(args[++cursor]); @@ -82,7 +88,7 @@ public class Milkwood { } } try { - new Milkwood().readAndGenerate(in, out, tupleLength, debug); + new Milkwood().readAndGenerate(in, out, tupleLength, length, debug); } finally { out.close(); } @@ -97,6 +103,8 @@ public class Milkwood { * the output stream to write to. * @param tupleLength * the length of tuples to be used in generation. + * @param length + * the length in tokens of the output to be generated. * @param debug * whether to print debugging output. * @throws IOException @@ -104,14 +112,19 @@ public class Milkwood { * scheme of things, very likely. */ void readAndGenerate(final InputStream in, final OutputStream out, - final int tupleLength, boolean debug) throws IOException { + final int tupleLength, int length, boolean debug) + throws IOException { /* The root of the rule tree I shall build. 
*/ RuleTreeNode root = new RuleTreeNode(); - int length = read(in, tupleLength, debug, root); + read(in, tupleLength, debug, root); WordSequence tokens = compose(tupleLength, debug, root, length); write(out, debug, tokens); + + if ( debug) { + System.err.println( "\n\nCompleted."); + } } /** @@ -142,8 +155,7 @@ public class Milkwood { private WordSequence compose(final int tupleLength, boolean debug, RuleTreeNode root, int length) { - WordSequence tokens = new Composer(debug).compose(root, tupleLength, - length); + WordSequence tokens = new Composer(debug).compose(root, length); if (tokens.contains(PERIOD)) { tokens = tokens.truncateAtLastInstance(PERIOD); @@ -168,7 +180,7 @@ public class Milkwood { WordSequence tokens) throws IOException { Writer scrivenor = new Writer(out, debug); try { - scrivenor.generate(tokens); + scrivenor.writeSequence(tokens); } finally { scrivenor.close(); } diff --git a/src/cc/journeyman/milkwood/RuleTreeNode.java b/src/cc/journeyman/milkwood/RuleTreeNode.java index 144fb2c..413b9b4 100644 --- a/src/cc/journeyman/milkwood/RuleTreeNode.java +++ b/src/cc/journeyman/milkwood/RuleTreeNode.java @@ -170,7 +170,7 @@ public class RuleTreeNode { final RuleTreeNode successor = this.getRule(path.pop()); if (successor == null) { - throw new NoSuchPathException(); + result = null; } else { result = successor.getWord(path); } @@ -178,4 +178,27 @@ public class RuleTreeNode { return result; } + + /** + * Find all the terminal strings in the current rule set which would match this path. + * @param path the path to match + * @return a collection (possibly empty) of potential successors. 
+ */ + public Collection match(WordStack path) { + final Collection result; + + if ( path.isEmpty()) { + result = this.getSuccessors(); + } else { + final RuleTreeNode successor = this.getRule(path.pop()); + + if (successor == null) { + result = new ArrayList(); + } else { + result = successor.match(path); + } + } + + return result; + } } diff --git a/src/cc/journeyman/milkwood/Tokeniser.java b/src/cc/journeyman/milkwood/Tokeniser.java index 86a279c..7ce945b 100644 --- a/src/cc/journeyman/milkwood/Tokeniser.java +++ b/src/cc/journeyman/milkwood/Tokeniser.java @@ -36,6 +36,13 @@ public class Tokeniser extends StreamTokenizer { */ this.whitespaceChars((int) '\"', (int) '\"'); this.whitespaceChars((int) '\'', (int) '\''); + /* + * treat underscore and hyphen as whitespace as well. Again, hyphen with + * either leading or trailing non-whitespace probably ought to be + * treated specially, but... + */ + this.whitespaceChars((int) '_', (int) '_'); + this.whitespaceChars((int) '-', (int) '-'); this.wordChars((int) '0', (int) '9'); this.wordChars((int) 'A', (int) 'Z'); this.wordChars((int) 'a', (int) 'z'); diff --git a/src/cc/journeyman/milkwood/WordStack.java b/src/cc/journeyman/milkwood/WordStack.java new file mode 100644 index 0000000..9760182 --- /dev/null +++ b/src/cc/journeyman/milkwood/WordStack.java @@ -0,0 +1,57 @@ +package cc.journeyman.milkwood; + +import java.util.Stack; + +/** + * Sliding window which rules may match. + * + * @author simon + * + */ +public class WordStack extends Stack { + + private static final long serialVersionUID = 1L; + + /** + * Create a new, empty, wordstack. + */ + public WordStack() { + super(); + } + + /** + * create a new window from this window, having this new word as its + * terminal and omitting the current first word. That is, the new window + * should be as long as the old, with each word shuffled up one place. + * + * @param prototype the window to copy from. + * @param terminal the new terminal word. 
+ */ + public WordStack(WordStack prototype, String terminal) { + this(); + + WordStack copy = prototype.duplicate(); + copy.pop(); + this.populate( copy, terminal); + } + + private void populate(WordStack copy, String terminal) { + if ( copy.isEmpty()) { + this.push(terminal); + } else { + String token = copy.pop(); + this.populate(copy, terminal); + this.push( token); + } + } + + /** + * A wrapper round clone which hides all the ugly casting. + * + * @return a duplicate copy of myself. + */ + public WordStack duplicate() { + return (WordStack) this.clone(); + } + +} diff --git a/src/cc/journeyman/milkwood/Writer.java b/src/cc/journeyman/milkwood/Writer.java index 75467ab..527a342 100644 --- a/src/cc/journeyman/milkwood/Writer.java +++ b/src/cc/journeyman/milkwood/Writer.java @@ -14,6 +14,8 @@ import java.util.Locale; import java.util.Random; /** + * A special purpose writer to write sequences of tokens, chopping them up into + * paragraphs on the fly. * * @author Simon Brooke */ @@ -59,7 +61,7 @@ class Writer extends BufferedWriter { * @throws IOException * if it is impossible to write (e.g. file system full). 
*/ - public void generate(WordSequence tokens) throws IOException { + public void writeSequence(WordSequence tokens) throws IOException { boolean capitaliseNext = true; try { @@ -113,7 +115,11 @@ class Writer extends BufferedWriter { private boolean spaceBefore(String token) { final boolean result; - if (token.length() == 1) { + switch (token.length()) { + case 0: + result = false; + break; + case 1: switch (token.charAt(0)) { case '.': case ',': @@ -135,8 +141,9 @@ class Writer extends BufferedWriter { result = true; break; } - } else { - result = false; + break; + default: + result = true; } return result; @@ -155,7 +162,8 @@ class Writer extends BufferedWriter { * if Mr this has run out of ink */ private void maybeParagraph(String token) throws IOException { - if (token.endsWith(Milkwood.PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) { + if (token.endsWith(Milkwood.PERIOD) + && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) { this.write("\n\n"); } }