Further 'refactored' (read: decluttered) to isolate the code that's failing.
This commit is contained in:
parent
4fb7a9830f
commit
0012a72e3f
|
@ -74,3 +74,6 @@ New Git repository, and this time pushed out to Goldsmith, so that local power p
|
||||||
|
|
||||||
Decluttered the TextGenerator class by moving the whole read stage into two new classes, Generator and Tokeniser. More declutter needed.
|
Decluttered the TextGenerator class by moving the whole read stage into two new classes, Generator and Tokeniser. More declutter needed.
|
||||||
|
|
||||||
|
Right, fully decluttered. All bugs(!) are now in the new class Composer. I have a little Liszt...
|
||||||
|
|
||||||
|
|
||||||
|
|
173
src/cc/journeyman/milkwood/Composer.java
Normal file
173
src/cc/journeyman/milkwood/Composer.java
Normal file
|
@ -0,0 +1,173 @@
|
||||||
|
package cc.journeyman.milkwood;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.Stack;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Composes text output based on a rule tree.
|
||||||
|
*
|
||||||
|
* @author simon
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class Composer {
|
||||||
|
/**
|
||||||
|
* Whether or not I am in debugging mode.
|
||||||
|
*/
|
||||||
|
private final boolean debug;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param debug
|
||||||
|
* Whether or not I am in debugging mode.
|
||||||
|
*/
|
||||||
|
public Composer(boolean debug) {
|
||||||
|
this.debug = debug;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Recursive, backtracking, output generator.
|
||||||
|
*
|
||||||
|
* @param rules
|
||||||
|
* @param tupleLength
|
||||||
|
* @param length
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
public WordSequence compose(RuleTreeNode rules, int tupleLength, int length) {
|
||||||
|
Stack<String> preamble = composePreamble(rules);
|
||||||
|
WordSequence result = new WordSequence();
|
||||||
|
|
||||||
|
// composing the preamble will have ended with *ROOT* on top of the
|
||||||
|
// stack;
|
||||||
|
// get rid of it.
|
||||||
|
preamble.pop();
|
||||||
|
|
||||||
|
result.addAll(preamble);
|
||||||
|
|
||||||
|
result.addAll(this.compose(preamble, rules, rules, tupleLength, length));
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Recursively attempt to find sequences in the ruleset to append to what's
|
||||||
|
* been composed so far.
|
||||||
|
*
|
||||||
|
* @param glanceBack
|
||||||
|
* @param allRules
|
||||||
|
* @param currentRules
|
||||||
|
* @param tupleLength
|
||||||
|
* @param length
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private WordSequence compose(Stack<String> glanceBack,
|
||||||
|
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
||||||
|
int length) {
|
||||||
|
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
|
||||||
|
assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
|
||||||
|
WordSequence result;
|
||||||
|
|
||||||
|
try {
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
String here = currentRules.getWord((Stack<String>) glanceBack
|
||||||
|
.clone());
|
||||||
|
System.err.println(String.format("Trying token %s", here));
|
||||||
|
|
||||||
|
result = new WordSequence();
|
||||||
|
result.add(here);
|
||||||
|
|
||||||
|
if (length != 0) {
|
||||||
|
/* we're not done yet */
|
||||||
|
Collection<String> options = allRules.getSuccessors();
|
||||||
|
|
||||||
|
for (String next : options) {
|
||||||
|
@SuppressWarnings("unchecked")
|
||||||
|
WordSequence rest = this
|
||||||
|
.tryOption((Stack<String>) glanceBack.clone(),
|
||||||
|
allRules, currentRules.getRule(next),
|
||||||
|
tupleLength, length - 1);
|
||||||
|
|
||||||
|
if (rest != null) {
|
||||||
|
/* we have a solution */
|
||||||
|
result.addAll(rest);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (NoSuchPathException ex) {
|
||||||
|
if (debug) {
|
||||||
|
System.err.println(String.format("No path %s: Backtracking...",
|
||||||
|
glanceBack));
|
||||||
|
}
|
||||||
|
result = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Try composing with this ruleset
|
||||||
|
*
|
||||||
|
* @param glanceBack
|
||||||
|
* @param allRules
|
||||||
|
* all the rules there are.
|
||||||
|
* @param currentRules
|
||||||
|
* the current node in the rule tree.
|
||||||
|
* @param tupleLength
|
||||||
|
* the size of the glanceback window we're considering.
|
||||||
|
* @param length
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private WordSequence tryOption(Stack<String> glanceBack,
|
||||||
|
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
||||||
|
int length) {
|
||||||
|
final Stack<String> restack = this.restack(glanceBack,
|
||||||
|
currentRules.getWord());
|
||||||
|
restack.pop();
|
||||||
|
return this.compose(restack, allRules, currentRules, tupleLength,
|
||||||
|
length);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return a new stack comprising all the items on the current stack, with
|
||||||
|
* this new string added at the bottom
|
||||||
|
*
|
||||||
|
* @param stack
|
||||||
|
* the stack to restack.
|
||||||
|
* @param bottom
|
||||||
|
* the item to place on the bottom.
|
||||||
|
* @return the restacked stack.
|
||||||
|
*/
|
||||||
|
private Stack<String> restack(Stack<String> stack, String bottom) {
|
||||||
|
final Stack<String> result;
|
||||||
|
if (stack.isEmpty()) {
|
||||||
|
result = new Stack<String>();
|
||||||
|
result.push(bottom);
|
||||||
|
} else {
|
||||||
|
String top = stack.pop();
|
||||||
|
result = restack(stack, bottom);
|
||||||
|
result.push(top);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Random walk of the rule tree to extract (from the root) a legal sequence
|
||||||
|
* of words the length of our tuple.
|
||||||
|
*
|
||||||
|
* @param rules
|
||||||
|
* the rule tree (fragment) to walk.
|
||||||
|
* @return a sequence of words.
|
||||||
|
*/
|
||||||
|
private Stack<String> composePreamble(RuleTreeNode rules) {
|
||||||
|
final Stack<String> result;
|
||||||
|
final RuleTreeNode successor = rules.getRule();
|
||||||
|
|
||||||
|
if (successor == null) {
|
||||||
|
result = new Stack<String>();
|
||||||
|
} else {
|
||||||
|
result = this.composePreamble(successor);
|
||||||
|
result.push(rules.getWord());
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -20,6 +20,10 @@ import java.io.OutputStream;
|
||||||
* @author Simon Brooke <simon@journeyman.cc>
|
* @author Simon Brooke <simon@journeyman.cc>
|
||||||
*/
|
*/
|
||||||
public class Milkwood {
|
public class Milkwood {
|
||||||
|
/**
|
||||||
|
* The magic token which is deemed to end sentences.
|
||||||
|
*/
|
||||||
|
public static final String PERIOD = ".";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse command line arguments and kick off the process. Expected arguments
|
* Parse command line arguments and kick off the process. Expected arguments
|
||||||
|
@ -46,6 +50,7 @@ public class Milkwood {
|
||||||
*/
|
*/
|
||||||
public static void main(String[] args) throws FileNotFoundException,
|
public static void main(String[] args) throws FileNotFoundException,
|
||||||
IOException {
|
IOException {
|
||||||
|
/* defaults */
|
||||||
InputStream in = System.in;
|
InputStream in = System.in;
|
||||||
OutputStream out = System.out;
|
OutputStream out = System.out;
|
||||||
int tupleLength = 2;
|
int tupleLength = 2;
|
||||||
|
@ -76,8 +81,11 @@ public class Milkwood {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
try {
|
||||||
new Milkwood().readAndGenerate(in, out, tupleLength, debug);
|
new Milkwood().readAndGenerate(in, out, tupleLength, debug);
|
||||||
|
} finally {
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -99,13 +107,71 @@ public class Milkwood {
|
||||||
final int tupleLength, boolean debug) throws IOException {
|
final int tupleLength, boolean debug) throws IOException {
|
||||||
/* The root of the rule tree I shall build. */
|
/* The root of the rule tree I shall build. */
|
||||||
RuleTreeNode root = new RuleTreeNode();
|
RuleTreeNode root = new RuleTreeNode();
|
||||||
|
int length = read(in, tupleLength, debug, root);
|
||||||
|
|
||||||
|
WordSequence tokens = compose(tupleLength, debug, root, length);
|
||||||
|
|
||||||
|
write(out, debug, tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Digest the input into a set of rules.
|
||||||
|
*
|
||||||
|
* @param in
|
||||||
|
* the input stream.
|
||||||
|
* @param tupleLength
|
||||||
|
* the length of tuples we shall consider.
|
||||||
|
* @param debug
|
||||||
|
* whether or not to print debugging output.
|
||||||
|
* @param root
|
||||||
|
* the root of the rule tree.
|
||||||
|
* @return the number of tokens read.
|
||||||
|
* @throws IOException
|
||||||
|
* if the file system buggers up, which is not, in the cosmic
|
||||||
|
* scheme of things, very likely.
|
||||||
|
*/
|
||||||
|
private int read(final InputStream in, final int tupleLength,
|
||||||
|
boolean debug, RuleTreeNode root) throws IOException {
|
||||||
int length = new Digester().read(in, tupleLength, root);
|
int length = new Digester().read(in, tupleLength, root);
|
||||||
|
|
||||||
if (debug) {
|
if (debug) {
|
||||||
System.err.println(root.toString());
|
System.err.println(root.toString());
|
||||||
}
|
}
|
||||||
|
return length;
|
||||||
|
}
|
||||||
|
|
||||||
new TextGenerator().generate(out, tupleLength, root, length);
|
private WordSequence compose(final int tupleLength, boolean debug,
|
||||||
|
RuleTreeNode root, int length) {
|
||||||
|
WordSequence tokens = new Composer(debug).compose(root, tupleLength,
|
||||||
|
length);
|
||||||
|
|
||||||
|
if (tokens.contains(PERIOD)) {
|
||||||
|
tokens = tokens.truncateAtLastInstance(PERIOD);
|
||||||
|
}
|
||||||
|
return tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write this sequence of tokens to this output.
|
||||||
|
*
|
||||||
|
* @param out
|
||||||
|
* the stream to which to write.
|
||||||
|
* @param debug
|
||||||
|
* whether or not to print debugging output.
|
||||||
|
* @param tokens
|
||||||
|
* the sequence of tokens to write.
|
||||||
|
* @throws IOException
|
||||||
|
* if the file system buggers up, which is not, in the cosmic
|
||||||
|
* scheme of things, very likely.
|
||||||
|
*/
|
||||||
|
private void write(final OutputStream out, boolean debug,
|
||||||
|
WordSequence tokens) throws IOException {
|
||||||
|
Writer scrivenor = new Writer(out, debug);
|
||||||
|
try {
|
||||||
|
scrivenor.generate(tokens);
|
||||||
|
} finally {
|
||||||
|
scrivenor.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,351 +0,0 @@
|
||||||
/*
|
|
||||||
* Proprietary unpublished source code property of
|
|
||||||
* Simon Brooke <simon@journeyman.cc>.
|
|
||||||
*
|
|
||||||
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
|
|
||||||
*/
|
|
||||||
package cc.journeyman.milkwood;
|
|
||||||
|
|
||||||
import java.io.BufferedWriter;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.OutputStream;
|
|
||||||
import java.io.OutputStreamWriter;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Locale;
|
|
||||||
import java.util.Random;
|
|
||||||
import java.util.Stack;
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @author Simon Brooke <simon@journeyman.cc>
|
|
||||||
*/
|
|
||||||
class TextGenerator {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The magic token which is deemed to end sentences.
|
|
||||||
*/
|
|
||||||
public static final String PERIOD = ".";
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The average number of sentences in a paragraph.
|
|
||||||
*/
|
|
||||||
public static final int AVSENTENCESPERPARA = 5;
|
|
||||||
/**
|
|
||||||
* A random number generator.
|
|
||||||
*/
|
|
||||||
private static Random RANDOM = new Random();
|
|
||||||
/**
|
|
||||||
* Dictionary of first-words we know about; each first-word maps onto a
|
|
||||||
* tuple of tuples of word sequences beginning with that word, so 'I' might
|
|
||||||
* map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]].
|
|
||||||
*/
|
|
||||||
TupleDictionary dictionary = new TupleDictionary();
|
|
||||||
|
|
||||||
public TextGenerator() {
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public void generate(OutputStream out, int tupleLength, RuleTreeNode root,
|
|
||||||
int length) throws IOException {
|
|
||||||
WordSequence tokens = this.compose(root, tupleLength, length);
|
|
||||||
|
|
||||||
if (tokens.contains(PERIOD)) {
|
|
||||||
// TODO: eq = equal?
|
|
||||||
tokens = this.truncateAtLastInstance(tokens, PERIOD);
|
|
||||||
}
|
|
||||||
|
|
||||||
this.generate(out, tokens);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Write this sequence of tokens on this stream, sorting out minor issues of
|
|
||||||
* orthography.
|
|
||||||
*
|
|
||||||
* @param out
|
|
||||||
* the stream.
|
|
||||||
* @param tokens
|
|
||||||
* the tokens.
|
|
||||||
* @throws IOException
|
|
||||||
* if it is impossible to write (e.g. file system full).
|
|
||||||
*/
|
|
||||||
private void generate(OutputStream out, WordSequence tokens)
|
|
||||||
throws IOException {
|
|
||||||
BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out));
|
|
||||||
boolean capitaliseNext = true;
|
|
||||||
|
|
||||||
try {
|
|
||||||
for (String token : tokens) {
|
|
||||||
capitaliseNext = writeToken(dickens, capitaliseNext, token);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
dickens.flush();
|
|
||||||
dickens.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Deal with end of paragraph, capital after full stop, and other minor
|
|
||||||
* orthographic conventions.
|
|
||||||
*
|
|
||||||
* @param dickens
|
|
||||||
* the scrivenor who writes for us.
|
|
||||||
* @param capitalise
|
|
||||||
* whether or not the token should be capitalised
|
|
||||||
* @param token
|
|
||||||
* the token to write;
|
|
||||||
* @returnvtrue if the next token to be written should be capitalised.
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
|
||||||
private boolean writeToken(BufferedWriter dickens, boolean capitalise,
|
|
||||||
String token) throws IOException {
|
|
||||||
if (this.spaceBefore(token)) {
|
|
||||||
dickens.write(" ");
|
|
||||||
}
|
|
||||||
if (capitalise) {
|
|
||||||
dickens.write(token.substring(0, 1)
|
|
||||||
.toUpperCase(Locale.getDefault()));
|
|
||||||
dickens.write(token.substring(1));
|
|
||||||
} else {
|
|
||||||
dickens.write(token);
|
|
||||||
}
|
|
||||||
|
|
||||||
this.maybeParagraph(token, dickens);
|
|
||||||
|
|
||||||
return (token.endsWith(PERIOD));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return false if token is punctuation, else true. Wouldn't it be nice if
|
|
||||||
* Java provided Character.isPunctuation(char)? However, since it doesn't, I
|
|
||||||
* can give this slightly special semantics: return true only if this is
|
|
||||||
* punctuation which would not normally be preceded with a space.
|
|
||||||
*
|
|
||||||
* @param ch
|
|
||||||
* a character.
|
|
||||||
* @return true if the should be preceded by a space, else false.
|
|
||||||
*/
|
|
||||||
private boolean spaceBefore(String token) {
|
|
||||||
final boolean result;
|
|
||||||
|
|
||||||
if (token.length() == 1) {
|
|
||||||
switch (token.charAt(0)) {
|
|
||||||
case '.':
|
|
||||||
case ',':
|
|
||||||
case ':':
|
|
||||||
case ';':
|
|
||||||
case 's':
|
|
||||||
/*
|
|
||||||
* an 's' on its own is probably evidence of a possessive with
|
|
||||||
* the apostrophe lost
|
|
||||||
*/
|
|
||||||
case 't':
|
|
||||||
/*
|
|
||||||
* similar; probably 'doesn't' or 'shouldn't' or other cases of
|
|
||||||
* 'not' with an elided 'o'.
|
|
||||||
*/
|
|
||||||
result = false;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
result = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
result = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* If this token is an end-of-sentence token, then, on one chance in some,
|
|
||||||
* have the writer write two new lines. NOTE: The tokeniser is treating
|
|
||||||
* PERIOD ('.') as a word character, even though it has not been told to.
|
|
||||||
* Token.endsWith( PERIOD) is a hack to get round this problem. TODO:
|
|
||||||
* investigate and fix.
|
|
||||||
*
|
|
||||||
* @param token
|
|
||||||
* a token
|
|
||||||
* @param dickens
|
|
||||||
* our scrivenor
|
|
||||||
* @throws IOException
|
|
||||||
* if Mr Dickens has run out of ink
|
|
||||||
*/
|
|
||||||
private void maybeParagraph(String token, BufferedWriter dickens)
|
|
||||||
throws IOException {
|
|
||||||
if (token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
|
|
||||||
dickens.write("\n\n");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Recursive, backtracking, output generator.
|
|
||||||
*
|
|
||||||
* @param rules
|
|
||||||
* @param tupleLength
|
|
||||||
* @param length
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
private WordSequence compose(RuleTreeNode rules, int tupleLength, int length) {
|
|
||||||
Stack<String> preamble = composePreamble(rules);
|
|
||||||
WordSequence result = new WordSequence();
|
|
||||||
|
|
||||||
// composing the preamble will have ended with *ROOT* on top of the
|
|
||||||
// stack;
|
|
||||||
// get rid of it.
|
|
||||||
preamble.pop();
|
|
||||||
|
|
||||||
result.addAll(preamble);
|
|
||||||
|
|
||||||
result.addAll(this.compose(preamble, rules, rules, tupleLength, length));
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Recursively attempt to find sequences in the ruleset to append to what's
|
|
||||||
* been composed so far.
|
|
||||||
*
|
|
||||||
* @param glanceBack
|
|
||||||
* @param allRules
|
|
||||||
* @param currentRules
|
|
||||||
* @param tupleLength
|
|
||||||
* @param length
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
private WordSequence compose(Stack<String> glanceBack,
|
|
||||||
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
|
||||||
int length) {
|
|
||||||
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
|
|
||||||
assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
|
|
||||||
WordSequence result;
|
|
||||||
|
|
||||||
try {
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
String here = currentRules.getWord((Stack<String>) glanceBack
|
|
||||||
.clone());
|
|
||||||
System.err.println(String.format("Trying token %s", here));
|
|
||||||
|
|
||||||
result = new WordSequence();
|
|
||||||
result.add(here);
|
|
||||||
|
|
||||||
if (length != 0) {
|
|
||||||
/* we're not done yet */
|
|
||||||
Collection<String> options = allRules.getSuccessors();
|
|
||||||
|
|
||||||
for (String next : options) {
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
WordSequence rest = this
|
|
||||||
.tryOption((Stack<String>) glanceBack.clone(),
|
|
||||||
allRules, currentRules.getRule(next),
|
|
||||||
tupleLength, length - 1);
|
|
||||||
|
|
||||||
if (rest != null) {
|
|
||||||
/* we have a solution */
|
|
||||||
result.addAll(rest);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (NoSuchPathException ex) {
|
|
||||||
System.err.println( String.format("No path %s: Backtracking...", glanceBack));
|
|
||||||
result = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Try composing with this ruleset
|
|
||||||
*
|
|
||||||
* @param glanceBack
|
|
||||||
* @param allRules
|
|
||||||
* all the rules there are.
|
|
||||||
* @param currentRules
|
|
||||||
* the current node in the rule tree.
|
|
||||||
* @param tupleLength
|
|
||||||
* the size of the glanceback window we're considering.
|
|
||||||
* @param length
|
|
||||||
* @return
|
|
||||||
*/
|
|
||||||
private WordSequence tryOption(Stack<String> glanceBack,
|
|
||||||
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
|
||||||
int length) {
|
|
||||||
final Stack<String> restack = this.restack(glanceBack,
|
|
||||||
currentRules.getWord());
|
|
||||||
restack.pop();
|
|
||||||
return this.compose(restack, allRules, currentRules, tupleLength,
|
|
||||||
length);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return a new stack comprising all the items on the current stack, with
|
|
||||||
* this new string added at the bottom
|
|
||||||
*
|
|
||||||
* @param stack
|
|
||||||
* the stack to restack.
|
|
||||||
* @param bottom
|
|
||||||
* the item to place on the bottom.
|
|
||||||
* @return the restacked stack.
|
|
||||||
*/
|
|
||||||
private Stack<String> restack(Stack<String> stack, String bottom) {
|
|
||||||
final Stack<String> result;
|
|
||||||
if (stack.isEmpty()) {
|
|
||||||
result = new Stack<String>();
|
|
||||||
result.push(bottom);
|
|
||||||
} else {
|
|
||||||
String top = stack.pop();
|
|
||||||
result = restack(stack, bottom);
|
|
||||||
result.push(top);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Random walk of the rule tree to extract (from the root) a legal sequence
|
|
||||||
* of words the length of our tuple.
|
|
||||||
*
|
|
||||||
* @param rules
|
|
||||||
* the rule tree (fragment) to walk.
|
|
||||||
* @return a sequence of words.
|
|
||||||
*/
|
|
||||||
private Stack<String> composePreamble(RuleTreeNode rules) {
|
|
||||||
final Stack<String> result;
|
|
||||||
final RuleTreeNode successor = rules.getRule();
|
|
||||||
|
|
||||||
if (successor == null) {
|
|
||||||
result = new Stack<String>();
|
|
||||||
} else {
|
|
||||||
result = this.composePreamble(successor);
|
|
||||||
result.push(rules.getWord());
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
*
|
|
||||||
* @param tokens
|
|
||||||
* a sequence of tokens
|
|
||||||
* @param marker
|
|
||||||
* a marker to terminate after the last occurrance of.
|
|
||||||
* @return a copy of tokens, truncated at the last occurrance of the marker.
|
|
||||||
*/
|
|
||||||
private WordSequence truncateAtLastInstance(WordSequence tokens,
|
|
||||||
String marker) {
|
|
||||||
final WordSequence result = new WordSequence();
|
|
||||||
|
|
||||||
if (!tokens.isEmpty()) {
|
|
||||||
|
|
||||||
String token = tokens.remove();
|
|
||||||
result.add(token);
|
|
||||||
if (!(marker.equals(token) && !tokens.contains(marker))) {
|
|
||||||
/*
|
|
||||||
* woah, double negatives. If the token we're looking at is the
|
|
||||||
* marker, and the remainder of the tokens does not include the
|
|
||||||
* marker, we're done. Otherwise, we continue. OK?
|
|
||||||
*/
|
|
||||||
result.addAll(this.truncateAtLastInstance(tokens, marker));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -13,10 +13,55 @@ import java.util.Queue;
|
||||||
* An ordered sequence of words. Of course it implements Queue since it is a
|
* An ordered sequence of words. Of course it implements Queue since it is a
|
||||||
* LinkedList and LinkedList implements Queue, but I want to make it explicitly
|
* LinkedList and LinkedList implements Queue, but I want to make it explicitly
|
||||||
* clear that this is a queue and can be used as such.
|
* clear that this is a queue and can be used as such.
|
||||||
|
*
|
||||||
* @author Simon Brooke <simon@journeyman.cc>
|
* @author Simon Brooke <simon@journeyman.cc>
|
||||||
*/
|
*/
|
||||||
class WordSequence extends LinkedList<String> implements Queue<String> {

    private static final long serialVersionUID = 1L;

    /**
     * Copy this sequence up to and including the last token which ends with
     * the marker. This sequence itself is not modified.
     *
     * @param marker
     *            a marker to terminate after the last occurrence of.
     * @return a copy of this sequence, truncated after the last token ending
     *         with the marker; a complete copy if no token ends with the
     *         marker (or the marker is null).
     */
    public WordSequence truncateAtLastInstance(String marker) {
        final WordSequence result = new WordSequence();

        /* First pass: find the position of the last token ending in marker. */
        int lastMatch = -1;
        if (marker != null) {
            int position = 0;
            for (String token : this) {
                if (token != null && token.endsWith(marker)) {
                    lastMatch = position;
                }
                position++;
            }
        }

        if (lastMatch < 0) {
            /* nothing to truncate at: return everything */
            result.addAll(this);
            return result;
        }

        /* Second pass: copy everything up to and including that token. */
        int copied = 0;
        for (String token : this) {
            if (copied > lastMatch) {
                break;
            }
            result.add(token);
            copied++;
        }

        return result;
    }

    /**
     * Specialisation: Working around the bug that the tokeniser treats PERIOD
     * as a word character. A token counts as a match if it ends with the
     * string form of the target, not only if it equals it.
     *
     * @param target
     *            the thing sought; its toString() is used as the marker.
     * @return true if any token in this sequence ends with the marker; false
     *         if none does or the target is null.
     */
    @Override
    public boolean contains(Object target) {
        boolean result = false;
        if (target != null) {
            String marker = target.toString();

            for (String token : this) {
                /* a LinkedList may hold nulls; they never match */
                if (token != null && token.endsWith(marker)) {
                    result = true;
                    break;
                }
            }
        }
        return result;
    }
}
|
||||||
|
|
163
src/cc/journeyman/milkwood/Writer.java
Normal file
163
src/cc/journeyman/milkwood/Writer.java
Normal file
|
@ -0,0 +1,163 @@
|
||||||
|
/*
|
||||||
|
* Proprietary unpublished source code property of
|
||||||
|
* Simon Brooke <simon@journeyman.cc>.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
|
||||||
|
*/
|
||||||
|
package cc.journeyman.milkwood;
|
||||||
|
|
||||||
|
import java.io.BufferedWriter;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStream;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.util.Locale;
|
||||||
|
import java.util.Random;
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @author Simon Brooke <simon@journeyman.cc>
|
||||||
|
*/
|
||||||
|
class Writer extends BufferedWriter {
|
||||||
|
/**
|
||||||
|
* The average number of sentences in a paragraph.
|
||||||
|
*/
|
||||||
|
public static final int AVSENTENCESPERPARA = 5;
|
||||||
|
/**
|
||||||
|
* A random number generator.
|
||||||
|
*/
|
||||||
|
private static Random RANDOM = new Random();
|
||||||
|
/**
|
||||||
|
* Dictionary of first-words we know about; each first-word maps onto a
|
||||||
|
* tuple of tuples of word sequences beginning with that word, so 'I' might
|
||||||
|
* map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]].
|
||||||
|
*/
|
||||||
|
TupleDictionary dictionary = new TupleDictionary();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Whether or not I am in debugging mode.
|
||||||
|
*/
|
||||||
|
@SuppressWarnings("unused")
|
||||||
|
private final boolean debug;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param out
|
||||||
|
* the output stream to which I shall write.
|
||||||
|
* @param debug
|
||||||
|
* Whether or not I am in debugging mode.
|
||||||
|
*/
|
||||||
|
public Writer(OutputStream out, final boolean debug) {
|
||||||
|
super(new OutputStreamWriter(out));
|
||||||
|
this.debug = debug;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write this sequence of tokens on this stream, sorting out minor issues of
|
||||||
|
* orthography.
|
||||||
|
*
|
||||||
|
* @param tokens
|
||||||
|
* the tokens.
|
||||||
|
* @throws IOException
|
||||||
|
* if it is impossible to write (e.g. file system full).
|
||||||
|
*/
|
||||||
|
public void generate(WordSequence tokens) throws IOException {
|
||||||
|
boolean capitaliseNext = true;
|
||||||
|
|
||||||
|
try {
|
||||||
|
for (String token : tokens) {
|
||||||
|
capitaliseNext = writeToken(capitaliseNext, token);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
this.flush();
|
||||||
|
this.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Deal with end of paragraph, capital after full stop, and other minor
|
||||||
|
* orthographic conventions.
|
||||||
|
*
|
||||||
|
* @param capitalise
|
||||||
|
* whether or not the token should be capitalised
|
||||||
|
* @param token
|
||||||
|
* the token to write;
|
||||||
|
* @returnvtrue if the next token to be written should be capitalised.
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
private boolean writeToken(boolean capitalise, String token)
|
||||||
|
throws IOException {
|
||||||
|
if (this.spaceBefore(token)) {
|
||||||
|
this.write(" ");
|
||||||
|
}
|
||||||
|
if (capitalise) {
|
||||||
|
this.write(token.substring(0, 1).toUpperCase(Locale.getDefault()));
|
||||||
|
this.write(token.substring(1));
|
||||||
|
} else {
|
||||||
|
this.write(token);
|
||||||
|
}
|
||||||
|
|
||||||
|
this.maybeParagraph(token);
|
||||||
|
|
||||||
|
return (token.endsWith(Milkwood.PERIOD));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Return false if token is punctuation, else true. Wouldn't it be nice if
|
||||||
|
* Java provided Character.isPunctuation(char)? However, since it doesn't, I
|
||||||
|
* can give this slightly special semantics: return true only if this is
|
||||||
|
* punctuation which would not normally be preceded with a space.
|
||||||
|
*
|
||||||
|
* @param ch
|
||||||
|
* a character.
|
||||||
|
* @return true if the should be preceded by a space, else false.
|
||||||
|
*/
|
||||||
|
private boolean spaceBefore(String token) {
|
||||||
|
final boolean result;
|
||||||
|
|
||||||
|
if (token.length() == 1) {
|
||||||
|
switch (token.charAt(0)) {
|
||||||
|
case '.':
|
||||||
|
case ',':
|
||||||
|
case ':':
|
||||||
|
case ';':
|
||||||
|
case 's':
|
||||||
|
/*
|
||||||
|
* an 's' on its own is probably evidence of a possessive with
|
||||||
|
* the apostrophe lost
|
||||||
|
*/
|
||||||
|
case 't':
|
||||||
|
/*
|
||||||
|
* similar; probably 'doesn't' or 'shouldn't' or other cases of
|
||||||
|
* 'not' with an elided 'o'.
|
||||||
|
*/
|
||||||
|
result = false;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
result = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
result = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If this token is an end-of-sentence token, then, on one chance in some,
|
||||||
|
* have the writer write two new lines. NOTE: The tokeniser is treating
|
||||||
|
* PERIOD ('.') as a word character, even though it has not been told to.
|
||||||
|
* Token.endsWith( PERIOD) is a hack to get round this problem. TODO:
|
||||||
|
* investigate and fix.
|
||||||
|
*
|
||||||
|
* @param token
|
||||||
|
* a token
|
||||||
|
* @throws IOException
|
||||||
|
* if Mr this has run out of ink
|
||||||
|
*/
|
||||||
|
private void maybeParagraph(String token) throws IOException {
|
||||||
|
if (token.endsWith(Milkwood.PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
|
||||||
|
this.write("\n\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Loading…
Reference in a new issue