All working very beautifully.

This commit is contained in:
Simon Brooke 2013-10-31 11:32:11 +00:00
parent 0012a72e3f
commit a876cb6d1b
7 changed files with 195 additions and 117 deletions

View file

@ -76,4 +76,9 @@ Decluttered the TextGenerator class by moving the whole read stage into two new
Right, fully decluttered, All bugs(!) are in new class Composer. I have a little Liszt... Right, fully decluttered, All bugs(!) are in new class Composer. I have a little Liszt...
Parsing word tuples for n > 2 working sweetly. That is not the problem!
Major refactoring and cleanup of the compose stage...
ye! Utuvienyes

View file

@ -1,7 +1,7 @@
package cc.journeyman.milkwood; package cc.journeyman.milkwood;
import java.util.Collection; import java.util.Collection;
import java.util.Stack; import java.util.Collections;
/** /**
* Composes text output based on a rule tree. * Composes text output based on a rule tree.
@ -27,13 +27,12 @@ public class Composer {
/** /**
* Recursive, backtracking, output generator. * Recursive, backtracking, output generator.
* *
* @param rules * @param rules the rule set we're working to.
* @param tupleLength * @param length the number of tokens still to be output.
* @param length * @return if a successful path forward is found, that path, else null.
* @return
*/ */
public WordSequence compose(RuleTreeNode rules, int tupleLength, int length) { public WordSequence compose(RuleTreeNode rules, int length) {
Stack<String> preamble = composePreamble(rules); WordStack preamble = composePreamble(rules);
WordSequence result = new WordSequence(); WordSequence result = new WordSequence();
// composing the preamble will have ended with *ROOT* on top of the // composing the preamble will have ended with *ROOT* on top of the
@ -41,9 +40,16 @@ public class Composer {
// get rid of it. // get rid of it.
preamble.pop(); preamble.pop();
if (debug) {
System.err.println( "Preamble: " + preamble);
}
result.addAll(preamble); result.addAll(preamble);
result.addAll(this.compose(preamble, rules, rules, tupleLength, length)); WordStack body = this.compose(preamble, rules, length);
Collections.reverse(body);
result.addAll(body);
return result; return result;
} }
@ -51,103 +57,63 @@ public class Composer {
* Recursively attempt to find sequences in the ruleset to append to what's * Recursively attempt to find sequences in the ruleset to append to what's
* been composed so far. * been composed so far.
* *
* @param glanceBack * @param glanceBack the last few words output.
* @param allRules * @param rules the rule set we're working to.
* @param currentRules * @param length the number of tokens still to be output.
* @param tupleLength * @return if a successful path forward is found, that path, else null.
* @param length
* @return
*/ */
private WordSequence compose(Stack<String> glanceBack, private WordStack compose(WordStack glanceBack, RuleTreeNode rules,
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
int length) { int length) {
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size"; final WordStack result;
assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
WordSequence result;
try { if ( debug) {
@SuppressWarnings("unchecked") System.err.println( String.format( "%d: %s", length, glanceBack));
String here = currentRules.getWord((Stack<String>) glanceBack }
.clone());
System.err.println(String.format("Trying token %s", here));
result = new WordSequence(); /* are we there yet? */
result.add(here); if (length == 0) {
result = new WordStack();
} else {
/*
* are there any rules in this ruleset which matches the current
* sliding window? if so, then recurse; if not, then fail.
*/
Collection<String> words = rules.match(glanceBack.duplicate());
if (length != 0) { if (words.isEmpty()) {
/* we're not done yet */ /* backtrack */
Collection<String> options = allRules.getSuccessors(); result = null;
} else {
result = tryOptions(words, glanceBack, rules, length);
}
}
return result;
}
for (String next : options) { /**
@SuppressWarnings("unchecked") * Try each of these candidates in turn, attempting to recurse.
WordSequence rest = this * @param candidates words which could potentially be added to the output.
.tryOption((Stack<String>) glanceBack.clone(), * @param glanceBack the last few words output.
allRules, currentRules.getRule(next), * @param allRules the rule set we're working to.
tupleLength, length - 1); * @param length the number of tokens still to be output.
* @return if a successful path forward is found, that path, else null.
*/
private WordStack tryOptions(Collection<String> candidates,
WordStack glanceBack, RuleTreeNode allRules, int length) {
WordStack result = null;
if (rest != null) { for ( String candidate : candidates) {
/* we have a solution */ result = compose( new WordStack(glanceBack, candidate), allRules, length - 1);
result.addAll(rest); if ( result != null) {
/* by Jove, I think she's got it! */
result.push(candidate);
break; break;
} }
} }
}
} catch (NoSuchPathException ex) {
if (debug) {
System.err.println(String.format("No path %s: Backtracking...",
glanceBack));
}
result = null;
}
return result; return result;
} }
/**
* Try composing with this ruleset
*
* @param glanceBack
* @param allRules
* all the rules there are.
* @param currentRules
* the current node in the rule tree.
* @param tupleLength
* the size of the glanceback window we're considering.
* @param length
* @return
*/
private WordSequence tryOption(Stack<String> glanceBack,
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
int length) {
final Stack<String> restack = this.restack(glanceBack,
currentRules.getWord());
restack.pop();
return this.compose(restack, allRules, currentRules, tupleLength,
length);
}
/**
* Return a new stack comprising all the items on the current stack, with
* this new string added at the bottom
*
* @param stack
* the stack to restack.
* @param bottom
* the item to place on the bottom.
* @return the restacked stack.
*/
private Stack<String> restack(Stack<String> stack, String bottom) {
final Stack<String> result;
if (stack.isEmpty()) {
result = new Stack<String>();
result.push(bottom);
} else {
String top = stack.pop();
result = restack(stack, bottom);
result.push(top);
}
return result;
}
/** /**
* Random walk of the rule tree to extract (from the root) a legal sequence * Random walk of the rule tree to extract (from the root) a legal sequence
@ -157,12 +123,12 @@ public class Composer {
* the rule tree (fragment) to walk. * the rule tree (fragment) to walk.
* @return a sequence of words. * @return a sequence of words.
*/ */
private Stack<String> composePreamble(RuleTreeNode rules) { private WordStack composePreamble(RuleTreeNode rules) {
final Stack<String> result; final WordStack result;
final RuleTreeNode successor = rules.getRule(); final RuleTreeNode successor = rules.getRule();
if (successor == null) { if (successor == null) {
result = new Stack<String>(); result = new WordStack();
} else { } else {
result = this.composePreamble(successor); result = this.composePreamble(successor);
result.push(rules.getWord()); result.push(rules.getWord());

View file

@ -31,13 +31,15 @@ public class Milkwood {
* <dl> * <dl>
* <dt>-d, -debug</dt> * <dt>-d, -debug</dt>
* <dd>Print debugging output to standard error</dd> * <dd>Print debugging output to standard error</dd>
* <dt>-i, -input</dt> * <dt>-i [FILE], -input [FILE]</dt>
* <dd>Input file, expected to be an English (or, frankly, other natural * <dd>Input file, expected to be an English (or, frankly, other natural
* language) text. Defaults to standard in.</dd> * language) text. Defaults to standard in.</dd>
* <dt>-n, -tuple-length</dt> * <dt>-l [NN], -length [NN]</dt>
* <dd>The length of tuples into which the file will be analised, default 2. * <dd>The length in tuples of the desired output. Defaults to 100.
* <dt>-n [NN], -tuple-length [NN]</dt>
* <dd>The length of tuples into which the file will be analysed, default 2.
* </dd> * </dd>
* <dt>-o, -output</dt> * <dt>-o [FILE], -output [FILE]</dt>
* <dd>Output file, to which generated text will be written. Defaults to * <dd>Output file, to which generated text will be written. Defaults to
* standard out.</dd> * standard out.</dd>
* </dl> * </dl>
@ -55,6 +57,7 @@ public class Milkwood {
OutputStream out = System.out; OutputStream out = System.out;
int tupleLength = 2; int tupleLength = 2;
boolean debug = false; boolean debug = false;
int length = 100;
for (int cursor = 0; cursor < args.length; cursor++) { for (int cursor = 0; cursor < args.length; cursor++) {
String arg = args[cursor]; String arg = args[cursor];
@ -71,6 +74,9 @@ public class Milkwood {
case 'o': // output case 'o': // output
out = new FileOutputStream(new File(args[++cursor])); out = new FileOutputStream(new File(args[++cursor]));
break; break;
case 'l': // length
length = Integer.parseInt(args[++cursor]);
break;
case 'n': case 'n':
case 't': // tuple length case 't': // tuple length
tupleLength = Integer.parseInt(args[++cursor]); tupleLength = Integer.parseInt(args[++cursor]);
@ -82,7 +88,7 @@ public class Milkwood {
} }
} }
try { try {
new Milkwood().readAndGenerate(in, out, tupleLength, debug); new Milkwood().readAndGenerate(in, out, tupleLength, length, debug);
} finally { } finally {
out.close(); out.close();
} }
@ -97,6 +103,8 @@ public class Milkwood {
* the output stream to write to. * the output stream to write to.
* @param tupleLength * @param tupleLength
* the length of tuples to be used in generation. * the length of tuples to be used in generation.
* @param length
* the length in tokens of the output to be generated.
* @param debug * @param debug
* whether to print debugging output. * whether to print debugging output.
* @throws IOException * @throws IOException
@ -104,14 +112,19 @@ public class Milkwood {
* scheme of things, very likely. * scheme of things, very likely.
*/ */
void readAndGenerate(final InputStream in, final OutputStream out, void readAndGenerate(final InputStream in, final OutputStream out,
final int tupleLength, boolean debug) throws IOException { final int tupleLength, int length, boolean debug)
throws IOException {
/* The root of the rule tree I shall build. */ /* The root of the rule tree I shall build. */
RuleTreeNode root = new RuleTreeNode(); RuleTreeNode root = new RuleTreeNode();
int length = read(in, tupleLength, debug, root); read(in, tupleLength, debug, root);
WordSequence tokens = compose(tupleLength, debug, root, length); WordSequence tokens = compose(tupleLength, debug, root, length);
write(out, debug, tokens); write(out, debug, tokens);
if ( debug) {
System.err.println( "\n\nCompleted.");
}
} }
/** /**
@ -142,8 +155,7 @@ public class Milkwood {
private WordSequence compose(final int tupleLength, boolean debug, private WordSequence compose(final int tupleLength, boolean debug,
RuleTreeNode root, int length) { RuleTreeNode root, int length) {
WordSequence tokens = new Composer(debug).compose(root, tupleLength, WordSequence tokens = new Composer(debug).compose(root, length);
length);
if (tokens.contains(PERIOD)) { if (tokens.contains(PERIOD)) {
tokens = tokens.truncateAtLastInstance(PERIOD); tokens = tokens.truncateAtLastInstance(PERIOD);
@ -168,7 +180,7 @@ public class Milkwood {
WordSequence tokens) throws IOException { WordSequence tokens) throws IOException {
Writer scrivenor = new Writer(out, debug); Writer scrivenor = new Writer(out, debug);
try { try {
scrivenor.generate(tokens); scrivenor.writeSequence(tokens);
} finally { } finally {
scrivenor.close(); scrivenor.close();
} }

View file

@ -170,7 +170,7 @@ public class RuleTreeNode {
final RuleTreeNode successor = this.getRule(path.pop()); final RuleTreeNode successor = this.getRule(path.pop());
if (successor == null) { if (successor == null) {
throw new NoSuchPathException(); result = null;
} else { } else {
result = successor.getWord(path); result = successor.getWord(path);
} }
@ -178,4 +178,27 @@ public class RuleTreeNode {
return result; return result;
} }
/**
 * Walk down the rule tree along the given path and report every terminal
 * string that could follow it.
 *
 * Words are popped from <code>path</code> one at a time as the walk
 * descends, so the stack passed in is consumed by this call.
 *
 * @param path the path to match; emptied (wholly or partly) as a side effect.
 * @return the successors of the node the path leads to, or a new empty
 *         collection if some word on the path has no matching rule.
 */
public Collection<String> match(WordStack path) {
	if (path.isEmpty()) {
		/* nothing left to follow: everything below this node is a candidate. */
		return this.getSuccessors();
	}

	final RuleTreeNode next = this.getRule(path.pop());

	if (next == null) {
		/* the path leads nowhere in this rule set. */
		return new ArrayList<String>();
	}

	return next.match(path);
}
} }

View file

@ -36,6 +36,13 @@ public class Tokeniser extends StreamTokenizer {
*/ */
this.whitespaceChars((int) '\"', (int) '\"'); this.whitespaceChars((int) '\"', (int) '\"');
this.whitespaceChars((int) '\'', (int) '\''); this.whitespaceChars((int) '\'', (int) '\'');
/*
* treat underscore and hyphen as whitespace as well. Again, hyphen with
* either leading or trailing non-whitespace probably ought to be
* treated specially, but...
*/
this.whitespaceChars((int) '_', (int) '_');
this.whitespaceChars((int) '-', (int) '-');
this.wordChars((int) '0', (int) '9'); this.wordChars((int) '0', (int) '9');
this.wordChars((int) 'A', (int) 'Z'); this.wordChars((int) 'A', (int) 'Z');
this.wordChars((int) 'a', (int) 'z'); this.wordChars((int) 'a', (int) 'z');

View file

@ -0,0 +1,57 @@
package cc.journeyman.milkwood;
import java.util.Stack;
/**
* Sliding window which rules may match.
*
* @author simon
*
*/
public class WordStack extends Stack<String> {
	private static final long serialVersionUID = 1L;

	/**
	 * Create a new, empty, wordstack.
	 */
	public WordStack() {
		super();
	}

	/**
	 * Create a new window from an existing window, having this new word as
	 * its terminal and omitting the current first word. The new window is as
	 * long as the old, with each word shuffled up one place.
	 *
	 * In stack terms: the word on top of <code>prototype</code> is dropped,
	 * <code>terminal</code> is placed at the bottom of the new stack, and the
	 * remaining words of <code>prototype</code> are stacked above it in their
	 * original order. <code>prototype</code> itself is not modified.
	 *
	 * @param prototype the window to copy from; must not be empty.
	 * @param terminal the new terminal word.
	 * @throws java.util.EmptyStackException if <code>prototype</code> is empty.
	 */
	public WordStack(WordStack prototype, String terminal) {
		this();
		/* work on a copy so the caller's window is left untouched. */
		final WordStack remainder = prototype.duplicate();
		/* discard the word on top; this also rejects an empty prototype. */
		remainder.pop();

		/* terminal goes in first (bottom), survivors follow bottom-to-top. */
		this.push(terminal);
		this.addAll(remainder);
	}

	/**
	 * A wrapper round clone which hides all the ugly casting.
	 *
	 * @return a duplicate copy of myself.
	 */
	public WordStack duplicate() {
		return (WordStack) this.clone();
	}
}

View file

@ -14,6 +14,8 @@ import java.util.Locale;
import java.util.Random; import java.util.Random;
/** /**
* A special purpose writer to write sequences of tokens, chopping them up into
* paragraphs on the fly.
* *
* @author Simon Brooke <simon@journeyman.cc> * @author Simon Brooke <simon@journeyman.cc>
*/ */
@ -59,7 +61,7 @@ class Writer extends BufferedWriter {
* @throws IOException * @throws IOException
* if it is impossible to write (e.g. file system full). * if it is impossible to write (e.g. file system full).
*/ */
public void generate(WordSequence tokens) throws IOException { public void writeSequence(WordSequence tokens) throws IOException {
boolean capitaliseNext = true; boolean capitaliseNext = true;
try { try {
@ -113,7 +115,11 @@ class Writer extends BufferedWriter {
private boolean spaceBefore(String token) { private boolean spaceBefore(String token) {
final boolean result; final boolean result;
if (token.length() == 1) { switch (token.length()) {
case 0:
result = false;
break;
case 1:
switch (token.charAt(0)) { switch (token.charAt(0)) {
case '.': case '.':
case ',': case ',':
@ -135,8 +141,9 @@ class Writer extends BufferedWriter {
result = true; result = true;
break; break;
} }
} else { break;
result = false; default:
result = true;
} }
return result; return result;
@ -155,7 +162,8 @@ class Writer extends BufferedWriter {
* if Mr this has run out of ink * if Mr this has run out of ink
*/ */
private void maybeParagraph(String token) throws IOException { private void maybeParagraph(String token) throws IOException {
if (token.endsWith(Milkwood.PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) { if (token.endsWith(Milkwood.PERIOD)
&& RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
this.write("\n\n"); this.write("\n\n");
} }
} }