Further 'refactored' (read: decluttered) to isolate the code that's failing.

This commit is contained in:
Simon Brooke 2013-10-31 09:42:20 +00:00
parent 4fb7a9830f
commit 0012a72e3f
6 changed files with 454 additions and 355 deletions

View file

@ -74,3 +74,6 @@ New Git repository, and this time pushed out to Goldsmith, so that local power p
Decluttered the TextGenerator class by moving the whole read stage into two new classes, Generator and Tokeniser. More declutter needed.
Right, fully decluttered. All bugs(!) are now in the new class Composer. I have a little Liszt...

View file

@ -0,0 +1,173 @@
package cc.journeyman.milkwood;
import java.util.Collection;
import java.util.Stack;
/**
* Composes text output based on a rule tree.
*
* @author simon
*
*/
public class Composer {
/**
* Whether or not I am in debugging mode.
*/
private final boolean debug;
/**
*
* @param debug
* Whether or not I am in debugging mode.
*/
public Composer(boolean debug) {
this.debug = debug;
}
/**
* Recursive, backtracking, output generator.
*
* @param rules
* @param tupleLength
* @param length
* @return
*/
public WordSequence compose(RuleTreeNode rules, int tupleLength, int length) {
Stack<String> preamble = composePreamble(rules);
WordSequence result = new WordSequence();
// composing the preamble will have ended with *ROOT* on top of the
// stack;
// get rid of it.
preamble.pop();
result.addAll(preamble);
result.addAll(this.compose(preamble, rules, rules, tupleLength, length));
return result;
}
/**
* Recursively attempt to find sequences in the ruleset to append to what's
* been composed so far.
*
* @param glanceBack
* @param allRules
* @param currentRules
* @param tupleLength
* @param length
* @return
*/
private WordSequence compose(Stack<String> glanceBack,
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
int length) {
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
WordSequence result;
try {
@SuppressWarnings("unchecked")
String here = currentRules.getWord((Stack<String>) glanceBack
.clone());
System.err.println(String.format("Trying token %s", here));
result = new WordSequence();
result.add(here);
if (length != 0) {
/* we're not done yet */
Collection<String> options = allRules.getSuccessors();
for (String next : options) {
@SuppressWarnings("unchecked")
WordSequence rest = this
.tryOption((Stack<String>) glanceBack.clone(),
allRules, currentRules.getRule(next),
tupleLength, length - 1);
if (rest != null) {
/* we have a solution */
result.addAll(rest);
break;
}
}
}
} catch (NoSuchPathException ex) {
if (debug) {
System.err.println(String.format("No path %s: Backtracking...",
glanceBack));
}
result = null;
}
return result;
}
/**
* Try composing with this ruleset
*
* @param glanceBack
* @param allRules
* all the rules there are.
* @param currentRules
* the current node in the rule tree.
* @param tupleLength
* the size of the glanceback window we're considering.
* @param length
* @return
*/
private WordSequence tryOption(Stack<String> glanceBack,
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
int length) {
final Stack<String> restack = this.restack(glanceBack,
currentRules.getWord());
restack.pop();
return this.compose(restack, allRules, currentRules, tupleLength,
length);
}
/**
* Return a new stack comprising all the items on the current stack, with
* this new string added at the bottom
*
* @param stack
* the stack to restack.
* @param bottom
* the item to place on the bottom.
* @return the restacked stack.
*/
private Stack<String> restack(Stack<String> stack, String bottom) {
final Stack<String> result;
if (stack.isEmpty()) {
result = new Stack<String>();
result.push(bottom);
} else {
String top = stack.pop();
result = restack(stack, bottom);
result.push(top);
}
return result;
}
/**
* Random walk of the rule tree to extract (from the root) a legal sequence
* of words the length of our tuple.
*
* @param rules
* the rule tree (fragment) to walk.
* @return a sequence of words.
*/
private Stack<String> composePreamble(RuleTreeNode rules) {
final Stack<String> result;
final RuleTreeNode successor = rules.getRule();
if (successor == null) {
result = new Stack<String>();
} else {
result = this.composePreamble(successor);
result.push(rules.getWord());
}
return result;
}
}

View file

@ -20,6 +20,10 @@ import java.io.OutputStream;
* @author Simon Brooke <simon@journeyman.cc> * @author Simon Brooke <simon@journeyman.cc>
*/ */
public class Milkwood { public class Milkwood {
/**
* The magic token which is deemed to end sentences.
*/
public static final String PERIOD = ".";
/** /**
* Parse command line arguments and kick off the process. Expected arguments * Parse command line arguments and kick off the process. Expected arguments
@ -46,6 +50,7 @@ public class Milkwood {
*/ */
public static void main(String[] args) throws FileNotFoundException, public static void main(String[] args) throws FileNotFoundException,
IOException { IOException {
/* defaults */
InputStream in = System.in; InputStream in = System.in;
OutputStream out = System.out; OutputStream out = System.out;
int tupleLength = 2; int tupleLength = 2;
@ -76,8 +81,11 @@ public class Milkwood {
} }
} }
} }
try {
new Milkwood().readAndGenerate(in, out, tupleLength, debug); new Milkwood().readAndGenerate(in, out, tupleLength, debug);
} finally {
out.close();
}
} }
/** /**
@ -99,13 +107,71 @@ public class Milkwood {
final int tupleLength, boolean debug) throws IOException { final int tupleLength, boolean debug) throws IOException {
/* The root of the rule tree I shall build. */ /* The root of the rule tree I shall build. */
RuleTreeNode root = new RuleTreeNode(); RuleTreeNode root = new RuleTreeNode();
int length = read(in, tupleLength, debug, root);
WordSequence tokens = compose(tupleLength, debug, root, length);
write(out, debug, tokens);
}
/**
 * Digest the input into a set of rules.
 *
 * @param in
 *            the input stream.
 * @param tupleLength
 *            the length of tuples we shall consider.
 * @param debug
 *            whether or not to print debugging output; when true the rule
 *            tree is printed to standard error after reading.
 * @param root
 *            the root of the rule tree.
 * @return the number of tokens read.
 * @throws IOException
 *             if reading the input fails.
 */
private int read(final InputStream in, final int tupleLength,
        boolean debug, RuleTreeNode root) throws IOException {
    final Digester digester = new Digester();
    final int tokenCount = digester.read(in, tupleLength, root);

    if (debug) {
        System.err.println(root.toString());
    }

    return tokenCount;
}
new TextGenerator().generate(out, tupleLength, root, length); private WordSequence compose(final int tupleLength, boolean debug,
RuleTreeNode root, int length) {
WordSequence tokens = new Composer(debug).compose(root, tupleLength,
length);
if (tokens.contains(PERIOD)) {
tokens = tokens.truncateAtLastInstance(PERIOD);
}
return tokens;
}
/**
* Write this sequence of tokens to this output.
*
* @param out
* the stream to which to write.
* @param debug
* whether or not to print debugging output.
* @param tokens
* the sequence of tokens to write.
* @throws IOException
* if the file system buggers up, which is not, in the cosmic
* scheme of things, very likely.
*/
private void write(final OutputStream out, boolean debug,
WordSequence tokens) throws IOException {
Writer scrivenor = new Writer(out, debug);
try {
scrivenor.generate(tokens);
} finally {
scrivenor.close();
}
} }
} }

View file

@ -1,351 +0,0 @@
/*
* Proprietary unpublished source code property of
* Simon Brooke <simon@journeyman.cc>.
*
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
*/
package cc.journeyman.milkwood;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.Collection;
import java.util.Locale;
import java.util.Random;
import java.util.Stack;
/**
*
* @author Simon Brooke <simon@journeyman.cc>
*/
class TextGenerator {
/**
* The magic token which is deemed to end sentences.
*/
public static final String PERIOD = ".";
/**
* The average number of sentences in a paragraph.
*/
public static final int AVSENTENCESPERPARA = 5;
/**
* A random number generator.
*/
private static Random RANDOM = new Random();
/**
* Dictionary of first-words we know about; each first-word maps onto a
* tuple of tuples of word sequences beginning with that word, so 'I' might
* map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]].
*/
TupleDictionary dictionary = new TupleDictionary();
public TextGenerator() {
}
public void generate(OutputStream out, int tupleLength, RuleTreeNode root,
int length) throws IOException {
WordSequence tokens = this.compose(root, tupleLength, length);
if (tokens.contains(PERIOD)) {
// TODO: eq = equal?
tokens = this.truncateAtLastInstance(tokens, PERIOD);
}
this.generate(out, tokens);
}
/**
* Write this sequence of tokens on this stream, sorting out minor issues of
* orthography.
*
* @param out
* the stream.
* @param tokens
* the tokens.
* @throws IOException
* if it is impossible to write (e.g. file system full).
*/
private void generate(OutputStream out, WordSequence tokens)
throws IOException {
BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out));
boolean capitaliseNext = true;
try {
for (String token : tokens) {
capitaliseNext = writeToken(dickens, capitaliseNext, token);
}
} finally {
dickens.flush();
dickens.close();
}
}
/**
* Deal with end of paragraph, capital after full stop, and other minor
* orthographic conventions.
*
* @param dickens
* the scrivenor who writes for us.
* @param capitalise
* whether or not the token should be capitalised
* @param token
* the token to write;
* @returnvtrue if the next token to be written should be capitalised.
* @throws IOException
*/
private boolean writeToken(BufferedWriter dickens, boolean capitalise,
String token) throws IOException {
if (this.spaceBefore(token)) {
dickens.write(" ");
}
if (capitalise) {
dickens.write(token.substring(0, 1)
.toUpperCase(Locale.getDefault()));
dickens.write(token.substring(1));
} else {
dickens.write(token);
}
this.maybeParagraph(token, dickens);
return (token.endsWith(PERIOD));
}
/**
* Return false if token is punctuation, else true. Wouldn't it be nice if
* Java provided Character.isPunctuation(char)? However, since it doesn't, I
* can give this slightly special semantics: return true only if this is
* punctuation which would not normally be preceded with a space.
*
* @param ch
* a character.
* @return true if the should be preceded by a space, else false.
*/
private boolean spaceBefore(String token) {
final boolean result;
if (token.length() == 1) {
switch (token.charAt(0)) {
case '.':
case ',':
case ':':
case ';':
case 's':
/*
* an 's' on its own is probably evidence of a possessive with
* the apostrophe lost
*/
case 't':
/*
* similar; probably 'doesn't' or 'shouldn't' or other cases of
* 'not' with an elided 'o'.
*/
result = false;
break;
default:
result = true;
break;
}
} else {
result = false;
}
return result;
}
/**
* If this token is an end-of-sentence token, then, on one chance in some,
* have the writer write two new lines. NOTE: The tokeniser is treating
* PERIOD ('.') as a word character, even though it has not been told to.
* Token.endsWith( PERIOD) is a hack to get round this problem. TODO:
* investigate and fix.
*
* @param token
* a token
* @param dickens
* our scrivenor
* @throws IOException
* if Mr Dickens has run out of ink
*/
private void maybeParagraph(String token, BufferedWriter dickens)
throws IOException {
if (token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
dickens.write("\n\n");
}
}
/**
* Recursive, backtracking, output generator.
*
* @param rules
* @param tupleLength
* @param length
* @return
*/
private WordSequence compose(RuleTreeNode rules, int tupleLength, int length) {
Stack<String> preamble = composePreamble(rules);
WordSequence result = new WordSequence();
// composing the preamble will have ended with *ROOT* on top of the
// stack;
// get rid of it.
preamble.pop();
result.addAll(preamble);
result.addAll(this.compose(preamble, rules, rules, tupleLength, length));
return result;
}
/**
* Recursively attempt to find sequences in the ruleset to append to what's
* been composed so far.
*
* @param glanceBack
* @param allRules
* @param currentRules
* @param tupleLength
* @param length
* @return
*/
private WordSequence compose(Stack<String> glanceBack,
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
int length) {
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
WordSequence result;
try {
@SuppressWarnings("unchecked")
String here = currentRules.getWord((Stack<String>) glanceBack
.clone());
System.err.println(String.format("Trying token %s", here));
result = new WordSequence();
result.add(here);
if (length != 0) {
/* we're not done yet */
Collection<String> options = allRules.getSuccessors();
for (String next : options) {
@SuppressWarnings("unchecked")
WordSequence rest = this
.tryOption((Stack<String>) glanceBack.clone(),
allRules, currentRules.getRule(next),
tupleLength, length - 1);
if (rest != null) {
/* we have a solution */
result.addAll(rest);
break;
}
}
}
} catch (NoSuchPathException ex) {
System.err.println( String.format("No path %s: Backtracking...", glanceBack));
result = null;
}
return result;
}
/**
* Try composing with this ruleset
*
* @param glanceBack
* @param allRules
* all the rules there are.
* @param currentRules
* the current node in the rule tree.
* @param tupleLength
* the size of the glanceback window we're considering.
* @param length
* @return
*/
private WordSequence tryOption(Stack<String> glanceBack,
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
int length) {
final Stack<String> restack = this.restack(glanceBack,
currentRules.getWord());
restack.pop();
return this.compose(restack, allRules, currentRules, tupleLength,
length);
}
/**
* Return a new stack comprising all the items on the current stack, with
* this new string added at the bottom
*
* @param stack
* the stack to restack.
* @param bottom
* the item to place on the bottom.
* @return the restacked stack.
*/
private Stack<String> restack(Stack<String> stack, String bottom) {
final Stack<String> result;
if (stack.isEmpty()) {
result = new Stack<String>();
result.push(bottom);
} else {
String top = stack.pop();
result = restack(stack, bottom);
result.push(top);
}
return result;
}
/**
* Random walk of the rule tree to extract (from the root) a legal sequence
* of words the length of our tuple.
*
* @param rules
* the rule tree (fragment) to walk.
* @return a sequence of words.
*/
private Stack<String> composePreamble(RuleTreeNode rules) {
final Stack<String> result;
final RuleTreeNode successor = rules.getRule();
if (successor == null) {
result = new Stack<String>();
} else {
result = this.composePreamble(successor);
result.push(rules.getWord());
}
return result;
}
/**
*
* @param tokens
* a sequence of tokens
* @param marker
* a marker to terminate after the last occurrance of.
* @return a copy of tokens, truncated at the last occurrance of the marker.
*/
private WordSequence truncateAtLastInstance(WordSequence tokens,
String marker) {
final WordSequence result = new WordSequence();
if (!tokens.isEmpty()) {
String token = tokens.remove();
result.add(token);
if (!(marker.equals(token) && !tokens.contains(marker))) {
/*
* woah, double negatives. If the token we're looking at is the
* marker, and the remainder of the tokens does not include the
* marker, we're done. Otherwise, we continue. OK?
*/
result.addAll(this.truncateAtLastInstance(tokens, marker));
}
}
return result;
}
}

View file

@ -13,10 +13,55 @@ import java.util.Queue;
* An ordered sequence of words. Of course it implements Queue since it is a * An ordered sequence of words. Of course it implements Queue since it is a
* LinkedList and LinkedList implements Queue, but I want to make it explicitly * LinkedList and LinkedList implements Queue, but I want to make it explicitly
* clear that this is a queue and can be used as such. * clear that this is a queue and can be used as such.
*
* @author Simon Brooke <simon@journeyman.cc> * @author Simon Brooke <simon@journeyman.cc>
*/ */
class WordSequence extends LinkedList<String> implements Queue<String> {
    private static final long serialVersionUID = 1L;

    /**
     * Copy this sequence up to and including the last token which ends with
     * the marker. This sequence itself is not modified.
     *
     * @param marker
     *            a marker to terminate after the last occurrance of.
     * @return a copy of this sequence, truncated after the last token ending
     *         with the marker; a copy of the whole sequence if no token ends
     *         with the marker.
     */
    public WordSequence truncateAtLastInstance(String marker) {
        /* first pass: find the position of the last token ending in marker */
        int lastIndex = -1;
        int index = 0;
        for (String token : this) {
            if (endsWith(token, marker)) {
                lastIndex = index;
            }
            index++;
        }

        final WordSequence result = new WordSequence();

        if (lastIndex < 0) {
            /* no marker anywhere: nothing to truncate */
            result.addAll(this);
        } else {
            /* second pass: copy everything up to and including that token */
            int position = 0;
            for (String token : this) {
                if (position > lastIndex) {
                    break;
                }
                result.add(token);
                position++;
            }
        }

        return result;
    }

    /**
     * Specialisation: Working around the bug that the tokeniser treats PERIOD
     * as a word character. NOTE: unlike the inherited method this matches any
     * token which <em>ends with</em> the string form of the target, rather
     * than only tokens equal to it.
     *
     * @param target
     *            the object whose string form is sought.
     * @return true if target is not null and some token in this sequence ends
     *         with its string form.
     */
    @Override
    public boolean contains(Object target) {
        if (target == null) {
            return false;
        }

        final String marker = target.toString();
        for (String token : this) {
            if (endsWith(token, marker)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Null-safe test of whether a token ends with a marker.
     *
     * @param token
     *            the token to test; may be null.
     * @param marker
     *            the suffix sought.
     * @return true if token is not null and ends with marker.
     */
    private static boolean endsWith(String token, String marker) {
        return token != null && token.endsWith(marker);
    }
}

View file

@ -0,0 +1,163 @@
/*
* Proprietary unpublished source code property of
* Simon Brooke <simon@journeyman.cc>.
*
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
*/
package cc.journeyman.milkwood;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.Locale;
import java.util.Random;
/**
 * A buffered writer which writes a sequence of tokens out as text, applying
 * minor orthographic conventions (capitals after full stops, spacing around
 * punctuation, occasional paragraph breaks).
 *
 * @author Simon Brooke <simon@journeyman.cc>
 */
class Writer extends BufferedWriter {
    /**
     * The average number of sentences in a paragraph.
     */
    public static final int AVSENTENCESPERPARA = 5;
    /**
     * A random number generator.
     */
    private static final Random RANDOM = new Random();
    /**
     * Dictionary of first-words we know about; each first-word maps onto a
     * tuple of tuples of word sequences beginning with that word, so 'I' might
     * map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]].
     */
    TupleDictionary dictionary = new TupleDictionary();

    /**
     * Whether or not I am in debugging mode.
     */
    @SuppressWarnings("unused")
    private final boolean debug;

    /**
     * @param out
     *            the output stream to which I shall write.
     * @param debug
     *            Whether or not I am in debugging mode.
     */
    public Writer(OutputStream out, final boolean debug) {
        super(new OutputStreamWriter(out));
        this.debug = debug;
    }

    /**
     * Write this sequence of tokens on this stream, sorting out minor issues of
     * orthography. This writer is closed once the tokens have been written,
     * or when writing fails.
     *
     * @param tokens
     *            the tokens.
     * @throws IOException
     *             if it is impossible to write (e.g. file system full).
     */
    public void generate(WordSequence tokens) throws IOException {
        boolean capitaliseNext = true;

        try {
            for (String token : tokens) {
                capitaliseNext = writeToken(capitaliseNext, token);
            }
        } finally {
            /*
             * close() flushes first; a separate flush() here could throw and
             * cause the close to be skipped.
             */
            this.close();
        }
    }

    /**
     * Deal with end of paragraph, capital after full stop, and other minor
     * orthographic conventions.
     *
     * @param capitalise
     *            whether or not the token should be capitalised
     * @param token
     *            the token to write; null or empty tokens are skipped.
     * @return true if the next token to be written should be capitalised.
     * @throws IOException
     *             if the token cannot be written.
     */
    private boolean writeToken(boolean capitalise, String token)
            throws IOException {
        if (token == null || token.isEmpty()) {
            /*
             * nothing to write, and substring(0, 1) would throw; carry the
             * pending capitalisation over to the next real token.
             */
            return capitalise;
        }

        if (this.spaceBefore(token)) {
            this.write(" ");
        }

        if (capitalise) {
            this.write(token.substring(0, 1).toUpperCase(Locale.getDefault()));
            this.write(token.substring(1));
        } else {
            this.write(token);
        }

        this.maybeParagraph(token);

        return (token.endsWith(Milkwood.PERIOD));
    }

    /**
     * Decide whether a token should be preceded by a space. Wouldn't it be
     * nice if Java provided Character.isPunctuation(char)? However, since it
     * doesn't, I can give this slightly special semantics: return false only
     * for single-character tokens which would not normally be preceded with
     * a space.
     *
     * @param token
     *            a token.
     * @return true if the token should be preceded by a space, else false.
     */
    private boolean spaceBefore(String token) {
        final boolean result;

        if (token.length() == 1) {
            switch (token.charAt(0)) {
            case '.':
            case ',':
            case ':':
            case ';':
            case 's':
                /*
                 * an 's' on its own is probably evidence of a possessive with
                 * the apostrophe lost
                 */
            case 't':
                /*
                 * similar; probably 'doesn't' or 'shouldn't' or other cases of
                 * 'not' with an elided 'o'.
                 */
                result = false;
                break;
            default:
                result = true;
                break;
            }
        } else {
            /* an ordinary multi-character word: separate it from the last */
            result = true;
        }

        return result;
    }

    /**
     * If this token is an end-of-sentence token, then, on one chance in some,
     * write two new lines. NOTE: The tokeniser is treating PERIOD ('.') as a
     * word character, even though it has not been told to.
     * Token.endsWith( PERIOD) is a hack to get round this problem. TODO:
     * investigate and fix.
     *
     * @param token
     *            a token
     * @throws IOException
     *             if the paragraph break cannot be written.
     */
    private void maybeParagraph(String token) throws IOException {
        if (token.endsWith(Milkwood.PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
            this.write("\n\n");
        }
    }
}