Further 'refactored' (read: decluttered) to isolate the code that's failing.
This commit is contained in:
parent
4fb7a9830f
commit
0012a72e3f
|
@ -74,3 +74,6 @@ New Git repository, and this time pushed out to Goldsmith, so that local power p
|
|||
|
||||
Decluttered the TextGenerator class by moving the whole read stage into two new classes, Generator and Tokeniser. More declutter needed.
|
||||
|
||||
Right, fully decluttered. All bugs(!) are in the new class Composer. I have a little Liszt...
|
||||
|
||||
|
||||
|
|
173
src/cc/journeyman/milkwood/Composer.java
Normal file
173
src/cc/journeyman/milkwood/Composer.java
Normal file
|
@ -0,0 +1,173 @@
|
|||
package cc.journeyman.milkwood;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Stack;
|
||||
|
||||
/**
|
||||
* Composes text output based on a rule tree.
|
||||
*
|
||||
* @author simon
|
||||
*
|
||||
*/
|
||||
public class Composer {
|
||||
/**
|
||||
* Whether or not I am in debugging mode.
|
||||
*/
|
||||
private final boolean debug;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param debug
|
||||
* Whether or not I am in debugging mode.
|
||||
*/
|
||||
public Composer(boolean debug) {
|
||||
this.debug = debug;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursive, backtracking, output generator.
|
||||
*
|
||||
* @param rules
|
||||
* @param tupleLength
|
||||
* @param length
|
||||
* @return
|
||||
*/
|
||||
public WordSequence compose(RuleTreeNode rules, int tupleLength, int length) {
|
||||
Stack<String> preamble = composePreamble(rules);
|
||||
WordSequence result = new WordSequence();
|
||||
|
||||
// composing the preamble will have ended with *ROOT* on top of the
|
||||
// stack;
|
||||
// get rid of it.
|
||||
preamble.pop();
|
||||
|
||||
result.addAll(preamble);
|
||||
|
||||
result.addAll(this.compose(preamble, rules, rules, tupleLength, length));
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively attempt to find sequences in the ruleset to append to what's
|
||||
* been composed so far.
|
||||
*
|
||||
* @param glanceBack
|
||||
* @param allRules
|
||||
* @param currentRules
|
||||
* @param tupleLength
|
||||
* @param length
|
||||
* @return
|
||||
*/
|
||||
private WordSequence compose(Stack<String> glanceBack,
|
||||
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
||||
int length) {
|
||||
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
|
||||
assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
|
||||
WordSequence result;
|
||||
|
||||
try {
|
||||
@SuppressWarnings("unchecked")
|
||||
String here = currentRules.getWord((Stack<String>) glanceBack
|
||||
.clone());
|
||||
System.err.println(String.format("Trying token %s", here));
|
||||
|
||||
result = new WordSequence();
|
||||
result.add(here);
|
||||
|
||||
if (length != 0) {
|
||||
/* we're not done yet */
|
||||
Collection<String> options = allRules.getSuccessors();
|
||||
|
||||
for (String next : options) {
|
||||
@SuppressWarnings("unchecked")
|
||||
WordSequence rest = this
|
||||
.tryOption((Stack<String>) glanceBack.clone(),
|
||||
allRules, currentRules.getRule(next),
|
||||
tupleLength, length - 1);
|
||||
|
||||
if (rest != null) {
|
||||
/* we have a solution */
|
||||
result.addAll(rest);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (NoSuchPathException ex) {
|
||||
if (debug) {
|
||||
System.err.println(String.format("No path %s: Backtracking...",
|
||||
glanceBack));
|
||||
}
|
||||
result = null;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Try composing with this ruleset
|
||||
*
|
||||
* @param glanceBack
|
||||
* @param allRules
|
||||
* all the rules there are.
|
||||
* @param currentRules
|
||||
* the current node in the rule tree.
|
||||
* @param tupleLength
|
||||
* the size of the glanceback window we're considering.
|
||||
* @param length
|
||||
* @return
|
||||
*/
|
||||
private WordSequence tryOption(Stack<String> glanceBack,
|
||||
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
||||
int length) {
|
||||
final Stack<String> restack = this.restack(glanceBack,
|
||||
currentRules.getWord());
|
||||
restack.pop();
|
||||
return this.compose(restack, allRules, currentRules, tupleLength,
|
||||
length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a new stack comprising all the items on the current stack, with
|
||||
* this new string added at the bottom
|
||||
*
|
||||
* @param stack
|
||||
* the stack to restack.
|
||||
* @param bottom
|
||||
* the item to place on the bottom.
|
||||
* @return the restacked stack.
|
||||
*/
|
||||
private Stack<String> restack(Stack<String> stack, String bottom) {
|
||||
final Stack<String> result;
|
||||
if (stack.isEmpty()) {
|
||||
result = new Stack<String>();
|
||||
result.push(bottom);
|
||||
} else {
|
||||
String top = stack.pop();
|
||||
result = restack(stack, bottom);
|
||||
result.push(top);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Random walk of the rule tree to extract (from the root) a legal sequence
|
||||
* of words the length of our tuple.
|
||||
*
|
||||
* @param rules
|
||||
* the rule tree (fragment) to walk.
|
||||
* @return a sequence of words.
|
||||
*/
|
||||
private Stack<String> composePreamble(RuleTreeNode rules) {
|
||||
final Stack<String> result;
|
||||
final RuleTreeNode successor = rules.getRule();
|
||||
|
||||
if (successor == null) {
|
||||
result = new Stack<String>();
|
||||
} else {
|
||||
result = this.composePreamble(successor);
|
||||
result.push(rules.getWord());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
|
@ -20,6 +20,10 @@ import java.io.OutputStream;
|
|||
* @author Simon Brooke <simon@journeyman.cc>
|
||||
*/
|
||||
public class Milkwood {
|
||||
/**
|
||||
* The magic token which is deemed to end sentences.
|
||||
*/
|
||||
public static final String PERIOD = ".";
|
||||
|
||||
/**
|
||||
* Parse command line arguments and kick off the process. Expected arguments
|
||||
|
@ -46,6 +50,7 @@ public class Milkwood {
|
|||
*/
|
||||
public static void main(String[] args) throws FileNotFoundException,
|
||||
IOException {
|
||||
/* defaults */
|
||||
InputStream in = System.in;
|
||||
OutputStream out = System.out;
|
||||
int tupleLength = 2;
|
||||
|
@ -76,8 +81,11 @@ public class Milkwood {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
new Milkwood().readAndGenerate(in, out, tupleLength, debug);
|
||||
} finally {
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -99,13 +107,71 @@ public class Milkwood {
|
|||
final int tupleLength, boolean debug) throws IOException {
|
||||
/* The root of the rule tree I shall build. */
|
||||
RuleTreeNode root = new RuleTreeNode();
|
||||
int length = read(in, tupleLength, debug, root);
|
||||
|
||||
WordSequence tokens = compose(tupleLength, debug, root, length);
|
||||
|
||||
write(out, debug, tokens);
|
||||
}
|
||||
|
||||
/**
|
||||
* Digest the input into a set of rules.
|
||||
*
|
||||
* @param in
|
||||
* the input stream.
|
||||
* @param tupleLength
|
||||
* the length of tuples we shall consider.
|
||||
* @param debug
|
||||
* whether or not to print debugging output.
|
||||
* @param root
|
||||
* the root of the rule tree.
|
||||
* @return the number of tokens read.
|
||||
* @throws IOException
|
||||
* if the file system buggers up, which is not, in the cosmic
|
||||
* scheme of things, very likely.
|
||||
*/
|
||||
private int read(final InputStream in, final int tupleLength,
|
||||
boolean debug, RuleTreeNode root) throws IOException {
|
||||
int length = new Digester().read(in, tupleLength, root);
|
||||
|
||||
if (debug) {
|
||||
System.err.println(root.toString());
|
||||
}
|
||||
return length;
|
||||
}
|
||||
|
||||
new TextGenerator().generate(out, tupleLength, root, length);
|
||||
private WordSequence compose(final int tupleLength, boolean debug,
|
||||
RuleTreeNode root, int length) {
|
||||
WordSequence tokens = new Composer(debug).compose(root, tupleLength,
|
||||
length);
|
||||
|
||||
if (tokens.contains(PERIOD)) {
|
||||
tokens = tokens.truncateAtLastInstance(PERIOD);
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write this sequence of tokens to this output.
|
||||
*
|
||||
* @param out
|
||||
* the stream to which to write.
|
||||
* @param debug
|
||||
* whether or not to print debugging output.
|
||||
* @param tokens
|
||||
* the sequence of tokens to write.
|
||||
* @throws IOException
|
||||
* if the file system buggers up, which is not, in the cosmic
|
||||
* scheme of things, very likely.
|
||||
*/
|
||||
private void write(final OutputStream out, boolean debug,
|
||||
WordSequence tokens) throws IOException {
|
||||
Writer scrivenor = new Writer(out, debug);
|
||||
try {
|
||||
scrivenor.generate(tokens);
|
||||
} finally {
|
||||
scrivenor.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,351 +0,0 @@
|
|||
/*
|
||||
* Proprietary unpublished source code property of
|
||||
* Simon Brooke <simon@journeyman.cc>.
|
||||
*
|
||||
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
|
||||
*/
|
||||
package cc.journeyman.milkwood;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.util.Collection;
|
||||
import java.util.Locale;
|
||||
import java.util.Random;
|
||||
import java.util.Stack;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Simon Brooke <simon@journeyman.cc>
|
||||
*/
|
||||
class TextGenerator {
|
||||
|
||||
/**
|
||||
* The magic token which is deemed to end sentences.
|
||||
*/
|
||||
public static final String PERIOD = ".";
|
||||
|
||||
/**
|
||||
* The average number of sentences in a paragraph.
|
||||
*/
|
||||
public static final int AVSENTENCESPERPARA = 5;
|
||||
/**
|
||||
* A random number generator.
|
||||
*/
|
||||
private static Random RANDOM = new Random();
|
||||
/**
|
||||
* Dictionary of first-words we know about; each first-word maps onto a
|
||||
* tuple of tuples of word sequences beginning with that word, so 'I' might
|
||||
* map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]].
|
||||
*/
|
||||
TupleDictionary dictionary = new TupleDictionary();
|
||||
|
||||
public TextGenerator() {
|
||||
}
|
||||
|
||||
|
||||
public void generate(OutputStream out, int tupleLength, RuleTreeNode root,
|
||||
int length) throws IOException {
|
||||
WordSequence tokens = this.compose(root, tupleLength, length);
|
||||
|
||||
if (tokens.contains(PERIOD)) {
|
||||
// TODO: eq = equal?
|
||||
tokens = this.truncateAtLastInstance(tokens, PERIOD);
|
||||
}
|
||||
|
||||
this.generate(out, tokens);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write this sequence of tokens on this stream, sorting out minor issues of
|
||||
* orthography.
|
||||
*
|
||||
* @param out
|
||||
* the stream.
|
||||
* @param tokens
|
||||
* the tokens.
|
||||
* @throws IOException
|
||||
* if it is impossible to write (e.g. file system full).
|
||||
*/
|
||||
private void generate(OutputStream out, WordSequence tokens)
|
||||
throws IOException {
|
||||
BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out));
|
||||
boolean capitaliseNext = true;
|
||||
|
||||
try {
|
||||
for (String token : tokens) {
|
||||
capitaliseNext = writeToken(dickens, capitaliseNext, token);
|
||||
}
|
||||
} finally {
|
||||
dickens.flush();
|
||||
dickens.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Deal with end of paragraph, capital after full stop, and other minor
|
||||
* orthographic conventions.
|
||||
*
|
||||
* @param dickens
|
||||
* the scrivenor who writes for us.
|
||||
* @param capitalise
|
||||
* whether or not the token should be capitalised
|
||||
* @param token
|
||||
* the token to write;
|
||||
* @returnvtrue if the next token to be written should be capitalised.
|
||||
* @throws IOException
|
||||
*/
|
||||
private boolean writeToken(BufferedWriter dickens, boolean capitalise,
|
||||
String token) throws IOException {
|
||||
if (this.spaceBefore(token)) {
|
||||
dickens.write(" ");
|
||||
}
|
||||
if (capitalise) {
|
||||
dickens.write(token.substring(0, 1)
|
||||
.toUpperCase(Locale.getDefault()));
|
||||
dickens.write(token.substring(1));
|
||||
} else {
|
||||
dickens.write(token);
|
||||
}
|
||||
|
||||
this.maybeParagraph(token, dickens);
|
||||
|
||||
return (token.endsWith(PERIOD));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return false if token is punctuation, else true. Wouldn't it be nice if
|
||||
* Java provided Character.isPunctuation(char)? However, since it doesn't, I
|
||||
* can give this slightly special semantics: return true only if this is
|
||||
* punctuation which would not normally be preceded with a space.
|
||||
*
|
||||
* @param ch
|
||||
* a character.
|
||||
* @return true if the should be preceded by a space, else false.
|
||||
*/
|
||||
private boolean spaceBefore(String token) {
|
||||
final boolean result;
|
||||
|
||||
if (token.length() == 1) {
|
||||
switch (token.charAt(0)) {
|
||||
case '.':
|
||||
case ',':
|
||||
case ':':
|
||||
case ';':
|
||||
case 's':
|
||||
/*
|
||||
* an 's' on its own is probably evidence of a possessive with
|
||||
* the apostrophe lost
|
||||
*/
|
||||
case 't':
|
||||
/*
|
||||
* similar; probably 'doesn't' or 'shouldn't' or other cases of
|
||||
* 'not' with an elided 'o'.
|
||||
*/
|
||||
result = false;
|
||||
break;
|
||||
default:
|
||||
result = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
result = false;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* If this token is an end-of-sentence token, then, on one chance in some,
|
||||
* have the writer write two new lines. NOTE: The tokeniser is treating
|
||||
* PERIOD ('.') as a word character, even though it has not been told to.
|
||||
* Token.endsWith( PERIOD) is a hack to get round this problem. TODO:
|
||||
* investigate and fix.
|
||||
*
|
||||
* @param token
|
||||
* a token
|
||||
* @param dickens
|
||||
* our scrivenor
|
||||
* @throws IOException
|
||||
* if Mr Dickens has run out of ink
|
||||
*/
|
||||
private void maybeParagraph(String token, BufferedWriter dickens)
|
||||
throws IOException {
|
||||
if (token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
|
||||
dickens.write("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursive, backtracking, output generator.
|
||||
*
|
||||
* @param rules
|
||||
* @param tupleLength
|
||||
* @param length
|
||||
* @return
|
||||
*/
|
||||
private WordSequence compose(RuleTreeNode rules, int tupleLength, int length) {
|
||||
Stack<String> preamble = composePreamble(rules);
|
||||
WordSequence result = new WordSequence();
|
||||
|
||||
// composing the preamble will have ended with *ROOT* on top of the
|
||||
// stack;
|
||||
// get rid of it.
|
||||
preamble.pop();
|
||||
|
||||
result.addAll(preamble);
|
||||
|
||||
result.addAll(this.compose(preamble, rules, rules, tupleLength, length));
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively attempt to find sequences in the ruleset to append to what's
|
||||
* been composed so far.
|
||||
*
|
||||
* @param glanceBack
|
||||
* @param allRules
|
||||
* @param currentRules
|
||||
* @param tupleLength
|
||||
* @param length
|
||||
* @return
|
||||
*/
|
||||
private WordSequence compose(Stack<String> glanceBack,
|
||||
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
||||
int length) {
|
||||
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
|
||||
assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
|
||||
WordSequence result;
|
||||
|
||||
try {
|
||||
@SuppressWarnings("unchecked")
|
||||
String here = currentRules.getWord((Stack<String>) glanceBack
|
||||
.clone());
|
||||
System.err.println(String.format("Trying token %s", here));
|
||||
|
||||
result = new WordSequence();
|
||||
result.add(here);
|
||||
|
||||
if (length != 0) {
|
||||
/* we're not done yet */
|
||||
Collection<String> options = allRules.getSuccessors();
|
||||
|
||||
for (String next : options) {
|
||||
@SuppressWarnings("unchecked")
|
||||
WordSequence rest = this
|
||||
.tryOption((Stack<String>) glanceBack.clone(),
|
||||
allRules, currentRules.getRule(next),
|
||||
tupleLength, length - 1);
|
||||
|
||||
if (rest != null) {
|
||||
/* we have a solution */
|
||||
result.addAll(rest);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (NoSuchPathException ex) {
|
||||
System.err.println( String.format("No path %s: Backtracking...", glanceBack));
|
||||
result = null;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Try composing with this ruleset
|
||||
*
|
||||
* @param glanceBack
|
||||
* @param allRules
|
||||
* all the rules there are.
|
||||
* @param currentRules
|
||||
* the current node in the rule tree.
|
||||
* @param tupleLength
|
||||
* the size of the glanceback window we're considering.
|
||||
* @param length
|
||||
* @return
|
||||
*/
|
||||
private WordSequence tryOption(Stack<String> glanceBack,
|
||||
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
||||
int length) {
|
||||
final Stack<String> restack = this.restack(glanceBack,
|
||||
currentRules.getWord());
|
||||
restack.pop();
|
||||
return this.compose(restack, allRules, currentRules, tupleLength,
|
||||
length);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a new stack comprising all the items on the current stack, with
|
||||
* this new string added at the bottom
|
||||
*
|
||||
* @param stack
|
||||
* the stack to restack.
|
||||
* @param bottom
|
||||
* the item to place on the bottom.
|
||||
* @return the restacked stack.
|
||||
*/
|
||||
private Stack<String> restack(Stack<String> stack, String bottom) {
|
||||
final Stack<String> result;
|
||||
if (stack.isEmpty()) {
|
||||
result = new Stack<String>();
|
||||
result.push(bottom);
|
||||
} else {
|
||||
String top = stack.pop();
|
||||
result = restack(stack, bottom);
|
||||
result.push(top);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Random walk of the rule tree to extract (from the root) a legal sequence
|
||||
* of words the length of our tuple.
|
||||
*
|
||||
* @param rules
|
||||
* the rule tree (fragment) to walk.
|
||||
* @return a sequence of words.
|
||||
*/
|
||||
private Stack<String> composePreamble(RuleTreeNode rules) {
|
||||
final Stack<String> result;
|
||||
final RuleTreeNode successor = rules.getRule();
|
||||
|
||||
if (successor == null) {
|
||||
result = new Stack<String>();
|
||||
} else {
|
||||
result = this.composePreamble(successor);
|
||||
result.push(rules.getWord());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param tokens
|
||||
* a sequence of tokens
|
||||
* @param marker
|
||||
* a marker to terminate after the last occurrance of.
|
||||
* @return a copy of tokens, truncated at the last occurrance of the marker.
|
||||
*/
|
||||
private WordSequence truncateAtLastInstance(WordSequence tokens,
|
||||
String marker) {
|
||||
final WordSequence result = new WordSequence();
|
||||
|
||||
if (!tokens.isEmpty()) {
|
||||
|
||||
String token = tokens.remove();
|
||||
result.add(token);
|
||||
if (!(marker.equals(token) && !tokens.contains(marker))) {
|
||||
/*
|
||||
* woah, double negatives. If the token we're looking at is the
|
||||
* marker, and the remainder of the tokens does not include the
|
||||
* marker, we're done. Otherwise, we continue. OK?
|
||||
*/
|
||||
result.addAll(this.truncateAtLastInstance(tokens, marker));
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -13,10 +13,55 @@ import java.util.Queue;
|
|||
* An ordered sequence of words. Of course it implements Queue since it is a
|
||||
* LinkedList and LinkedList implements Queue, but I want to make it explicitly
|
||||
* clear that this is a queue and can be used as such.
|
||||
*
|
||||
* @author Simon Brooke <simon@journeyman.cc>
|
||||
*/
|
||||
class WordSequence extends LinkedList<String> implements Queue<String> {

	private static final long serialVersionUID = 1L;

	/**
	 * Copy this sequence up to and including the last token which ends with
	 * the marker. This sequence itself is not modified.
	 *
	 * @param marker
	 *            a marker to terminate after the last occurrence of; must not
	 *            be null.
	 * @return a copy of this sequence, truncated after the last token ending
	 *         with the marker; a complete copy if no token ends with it.
	 */
	public WordSequence truncateAtLastInstance(String marker) {
		final WordSequence result = new WordSequence();

		/* first pass: find where the last marked token is */
		int lastMarked = -1;
		int index = 0;
		for (String token : this) {
			if (token != null && token.endsWith(marker)) {
				lastMarked = index;
			}
			index++;
		}

		/* second pass: copy everything up to and including that token */
		final int keep = (lastMarked < 0) ? this.size() : lastMarked + 1;
		int copied = 0;
		for (String token : this) {
			if (copied == keep) {
				break;
			}
			result.add(token);
			copied++;
		}

		return result;
	}

	/**
	 * Specialisation: Working around the bug that the tokeniser treats PERIOD
	 * as a word character. A token counts as a match if it ends with the
	 * string form of the target, not only if it is equal to it.
	 *
	 * @param target
	 *            the thing sought; null never matches.
	 * @return true if any token in this sequence ends with the string form of
	 *         the target.
	 */
	@Override
	public boolean contains(Object target) {
		if (target == null) {
			return false;
		}

		final String marker = target.toString();

		for (String token : this) {
			/* a LinkedList may hold nulls; they cannot end with anything */
			if (token != null && token.endsWith(marker)) {
				return true;
			}
		}

		return false;
	}
}
|
||||
|
|
163
src/cc/journeyman/milkwood/Writer.java
Normal file
163
src/cc/journeyman/milkwood/Writer.java
Normal file
|
@ -0,0 +1,163 @@
|
|||
/*
|
||||
* Proprietary unpublished source code property of
|
||||
* Simon Brooke <simon@journeyman.cc>.
|
||||
*
|
||||
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
|
||||
*/
|
||||
package cc.journeyman.milkwood;
|
||||
|
||||
import java.io.BufferedWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.util.Locale;
|
||||
import java.util.Random;
|
||||
|
||||
/**
|
||||
*
|
||||
* @author Simon Brooke <simon@journeyman.cc>
|
||||
*/
|
||||
class Writer extends BufferedWriter {
|
||||
/**
|
||||
* The average number of sentences in a paragraph.
|
||||
*/
|
||||
public static final int AVSENTENCESPERPARA = 5;
|
||||
/**
|
||||
* A random number generator.
|
||||
*/
|
||||
private static Random RANDOM = new Random();
|
||||
/**
|
||||
* Dictionary of first-words we know about; each first-word maps onto a
|
||||
* tuple of tuples of word sequences beginning with that word, so 'I' might
|
||||
* map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]].
|
||||
*/
|
||||
TupleDictionary dictionary = new TupleDictionary();
|
||||
|
||||
/**
|
||||
* Whether or not I am in debugging mode.
|
||||
*/
|
||||
@SuppressWarnings("unused")
|
||||
private final boolean debug;
|
||||
|
||||
/**
|
||||
* @param out
|
||||
* the output stream to which I shall write.
|
||||
* @param debug
|
||||
* Whether or not I am in debugging mode.
|
||||
*/
|
||||
public Writer(OutputStream out, final boolean debug) {
|
||||
super(new OutputStreamWriter(out));
|
||||
this.debug = debug;
|
||||
}
|
||||
|
||||
/**
|
||||
* Write this sequence of tokens on this stream, sorting out minor issues of
|
||||
* orthography.
|
||||
*
|
||||
* @param tokens
|
||||
* the tokens.
|
||||
* @throws IOException
|
||||
* if it is impossible to write (e.g. file system full).
|
||||
*/
|
||||
public void generate(WordSequence tokens) throws IOException {
|
||||
boolean capitaliseNext = true;
|
||||
|
||||
try {
|
||||
for (String token : tokens) {
|
||||
capitaliseNext = writeToken(capitaliseNext, token);
|
||||
}
|
||||
} finally {
|
||||
this.flush();
|
||||
this.close();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Deal with end of paragraph, capital after full stop, and other minor
|
||||
* orthographic conventions.
|
||||
*
|
||||
* @param capitalise
|
||||
* whether or not the token should be capitalised
|
||||
* @param token
|
||||
* the token to write;
|
||||
* @returnvtrue if the next token to be written should be capitalised.
|
||||
* @throws IOException
|
||||
*/
|
||||
private boolean writeToken(boolean capitalise, String token)
|
||||
throws IOException {
|
||||
if (this.spaceBefore(token)) {
|
||||
this.write(" ");
|
||||
}
|
||||
if (capitalise) {
|
||||
this.write(token.substring(0, 1).toUpperCase(Locale.getDefault()));
|
||||
this.write(token.substring(1));
|
||||
} else {
|
||||
this.write(token);
|
||||
}
|
||||
|
||||
this.maybeParagraph(token);
|
||||
|
||||
return (token.endsWith(Milkwood.PERIOD));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return false if token is punctuation, else true. Wouldn't it be nice if
|
||||
* Java provided Character.isPunctuation(char)? However, since it doesn't, I
|
||||
* can give this slightly special semantics: return true only if this is
|
||||
* punctuation which would not normally be preceded with a space.
|
||||
*
|
||||
* @param ch
|
||||
* a character.
|
||||
* @return true if the should be preceded by a space, else false.
|
||||
*/
|
||||
private boolean spaceBefore(String token) {
|
||||
final boolean result;
|
||||
|
||||
if (token.length() == 1) {
|
||||
switch (token.charAt(0)) {
|
||||
case '.':
|
||||
case ',':
|
||||
case ':':
|
||||
case ';':
|
||||
case 's':
|
||||
/*
|
||||
* an 's' on its own is probably evidence of a possessive with
|
||||
* the apostrophe lost
|
||||
*/
|
||||
case 't':
|
||||
/*
|
||||
* similar; probably 'doesn't' or 'shouldn't' or other cases of
|
||||
* 'not' with an elided 'o'.
|
||||
*/
|
||||
result = false;
|
||||
break;
|
||||
default:
|
||||
result = true;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
result = false;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* If this token is an end-of-sentence token, then, on one chance in some,
|
||||
* have the writer write two new lines. NOTE: The tokeniser is treating
|
||||
* PERIOD ('.') as a word character, even though it has not been told to.
|
||||
* Token.endsWith( PERIOD) is a hack to get round this problem. TODO:
|
||||
* investigate and fix.
|
||||
*
|
||||
* @param token
|
||||
* a token
|
||||
* @throws IOException
|
||||
* if Mr this has run out of ink
|
||||
*/
|
||||
private void maybeParagraph(String token) throws IOException {
|
||||
if (token.endsWith(Milkwood.PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
|
||||
this.write("\n\n");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in a new issue