Moved a lot of stuff out of TextGenerator mainly to declutter, so I can

think about what isn't working and why not a bit more cleanly.
This commit is contained in:
Simon Brooke 2013-10-31 08:51:41 +00:00
parent e59f160f70
commit 61a8b7ad97
5 changed files with 487 additions and 386 deletions

View file

@ -0,0 +1,60 @@
/*
* Proprietary unpublished source code property of
* Simon Brooke <simon@journeyman.cc>.
*
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
*/
package cc.journeyman.milkwood;
import java.io.IOException;
import java.io.InputStream;
import java.io.StreamTokenizer;
import java.util.LinkedList;
import java.util.Queue;
/**
* Read an input stream of text and digest it into a set of generation rules.
* Separated out of TextGenerator mainly to declutter that class.
*
* @author simon
*
*/
public class Digester {
/**
* Read tokens from the input stream, and compile them into the rule tree
* below this root.
*
* @param in
* the input stream from which I read.
* @param tupleLength
* the length of the tuples I read.
* @param root
* the ruleset to which I shall add.
* @return the number of tokens read.
* @throws IOException if can't read from file system.
*/
protected int read(final InputStream in, final int tupleLength,
final RuleTreeNode root) throws IOException {
int result = 0;
final Queue<WordSequence> openTuples = new LinkedList<WordSequence>();
final Tokeniser tok = new Tokeniser(in);
for (int type = tok.nextToken(); type != StreamTokenizer.TT_EOF; type = tok
.nextToken()) {
result++;
final WordSequence newTuple = new WordSequence();
String token = tok.readBareToken();
openTuples.add(newTuple);
for (WordSequence tuple : openTuples) {
tuple.add(token);
}
if (openTuples.size() > tupleLength) {
root.addSequence(openTuples.remove());
}
}
return result;
}
}

View file

@ -15,7 +15,6 @@ import java.io.OutputStream;
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
*/
/**
*
* @author Simon Brooke <simon@journeyman.cc>
@ -23,34 +22,43 @@ import java.io.OutputStream;
public class Milkwood {
/**
* Parse command line arguments and kick off the process. Expected
* arguments include:
* Parse command line arguments and kick off the process. Expected arguments
* include:
* <dl>
* <dt>-d, -debug</dt>
* <dd>Print debugging output to standard error</dd>
* <dt>-i, -input</dt>
* <dd>Input file, expected to be an English (or, frankly, other natural
* language) text. Defaults to standard in.</dd>
* <dt>-n, -tuple-length</dt>
* <dd>The length of tuples into which the file will be analised, default 2.</dd>
* <dd>The length of tuples into which the file will be analysed, default 2.
* </dd>
* <dt>-o, -output</dt>
* <dd>Output file, to which generated text will be written.
* Defaults to standard out.</dd>
* <dd>Output file, to which generated text will be written. Defaults to
* standard out.</dd>
* </dl>
*
* @param args the command line arguments
* @exception FileNotFoundException if the user specifies a file which
* isn't available.
* @param args
* the command line arguments
* @exception FileNotFoundException
* if the user specifies a file which isn't available.
* @exception IOException if it could not read from input or write to output.
*/
public static void main(String[] args) throws FileNotFoundException, IOException {
public static void main(String[] args) throws FileNotFoundException,
IOException {
InputStream in = System.in;
OutputStream out = System.out;
int tupleLength = 2;
boolean debug = false;
for (int cursor = 0; cursor < args.length; cursor++) {
String arg = args[cursor];
if (arg.startsWith("-") && arg.length() > 1) {
switch (arg.charAt(1)) {
case 'd':
debug = true;
break;
case 'i':
// input
in = new FileInputStream(new File(args[++cursor]));
@ -63,12 +71,41 @@ public class Milkwood {
tupleLength = Integer.parseInt(args[++cursor]);
break;
default:
throw new IllegalArgumentException(
String.format("Unrecognised argument '%s'", arg));
throw new IllegalArgumentException(String.format(
"Unrecognised argument '%s'", arg));
}
}
}
new TextGenerator().readAndGenerate( in, out, tupleLength);
new Milkwood().readAndGenerate(in, out, tupleLength, debug);
}
/**
* Read tokens from this input and use them to generate text on this output.
*
* @param in
* the input stream to read.
* @param out
* the output stream to write to.
* @param tupleLength
* the length of tuples to be used in generation.
* @param debug
* whether to print debugging output.
* @throws IOException
* if the file system buggers up, which is not, in the cosmic
* scheme of things, very likely.
*/
void readAndGenerate(final InputStream in, final OutputStream out,
final int tupleLength, boolean debug) throws IOException {
/* The root of the rule tree I shall build. */
RuleTreeNode root = new RuleTreeNode();
int length = new Digester().read(in, tupleLength, root);
if (debug) {
System.err.println(root.toString());
}
new TextGenerator().generate(out, tupleLength, root, length);
}
}

View file

@ -23,6 +23,10 @@ import java.util.Stack;
* @author Simon Brooke <simon@journeyman.cc>
*/
public class RuleTreeNode {
/**
* The magic token which identifies the root node of a rule tree.
*/
public static final String ROOTMAGICTOKEN = "*ROOT*";
/**
* The line separator on this platform.
*/
@ -42,6 +46,13 @@ public class RuleTreeNode {
*/
private Map<String,RuleTreeNode> rules = new HashMap<String,RuleTreeNode>();
/**
* If no argument passed, generate a root node.
*/
public RuleTreeNode() {
this( RuleTreeNode.ROOTMAGICTOKEN);
}
/**
* Create me wrapping this word.
* @param word the word I represent.

View file

@ -6,39 +6,23 @@
*/
package cc.journeyman.milkwood;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Queue;
import java.util.Random;
import java.util.Stack;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
*
* @author Simon Brooke <simon@journeyman.cc>
*/
class TextGenerator {
/**
* The magic token which identifies the root node of the
* rule tree.
*/
private static final String ROOTMAGICTOKEN = "*ROOT*";
/**
* The special magic token which is deemed to end sentences.
* The magic token which is deemed to end sentences.
*/
public static final String PERIOD = ".";
@ -51,116 +35,18 @@ class TextGenerator {
*/
private static Random RANDOM = new Random();
/**
* Dictionary of first-words we know about; each first-word maps
* onto a tuple of tuples of word sequences beginning with that
* word, so 'I' might map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]].
* Dictionary of first-words we know about; each first-word maps onto a
* tuple of tuples of word sequences beginning with that word, so 'I' might
* map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]].
*/
TupleDictionary dictionary = new TupleDictionary();
public TextGenerator() {
}
/**
* Read tokens from this input and use them to generate text on this output.
* @param in the input stream to read.
* @param out the output stream to write to.
* @param tupleLength the length of tuples to be used in generation.
* @throws IOException if the file system buggers up, which is not, in the
* cosmic scheme of things, very likely.
*/
void readAndGenerate(InputStream in, OutputStream out, int tupleLength) throws IOException {
/* The root of the rule tree I shall build. */
RuleTreeNode root = new RuleTreeNode( ROOTMAGICTOKEN);
int length = read(in, tupleLength, root);
System.err.println( root.toString());
generate( out, tupleLength, root, length);
}
/**
* Read tokens from the input stream, and compile them into a ruleset below root.
* @param in the input stream from which I read.
* @param tupleLength the length of the tuples I read.
* @param root the ruleset to which I shall add.
* @return the number of tokens read.
* @throws IOException
*/
private int read(InputStream in, int tupleLength, RuleTreeNode root) throws IOException {
int result = 0;
Queue<WordSequence> openTuples = new LinkedList<WordSequence>();
StreamTokenizer tok = prepareTokenizer(in);
for (int type = tok.nextToken(); type != StreamTokenizer.TT_EOF; type = tok.nextToken()) {
result ++;
final WordSequence newTuple = new WordSequence();
String token = readBareToken(tok, type);
openTuples.add(newTuple);
for ( WordSequence tuple : openTuples) {
tuple.add(token);
}
if (openTuples.size() > tupleLength) {
root.addSequence( openTuples.remove());
}
}
return result;
}
/**
* There surely must be a better way to get just the token out of a
* StreamTokenizer...!
* @param tok the tokenizer.
* @return just the next token.
*/
private String readBareToken(StreamTokenizer tok, int type) {
final String token;
switch (type) {
case StreamTokenizer.TT_EOL:
token = "FIXME"; // TODO: fix this!
break;
case StreamTokenizer.TT_NUMBER:
token = new Double(tok.nval).toString();
break;
case StreamTokenizer.TT_WORD:
token = tok.sval.toLowerCase();
break;
default:
StringBuffer buffy = new StringBuffer();
buffy.append((char) type);
token = buffy.toString();
break;
}
return token;
}
/**
* Prepare a tokeniser on this input stream, set up to handle at least
* Western European natural language text.
* @param in the stream.
* @return a suitable tokeniser.
*/
private StreamTokenizer prepareTokenizer(InputStream in) {
Reader gentle = new BufferedReader(new InputStreamReader(in));
StreamTokenizer tok = new StreamTokenizer(gentle);
tok.resetSyntax();
tok.whitespaceChars(8, 15);
tok.whitespaceChars(28, 32);
/* treat quotemarks as white space */
tok.whitespaceChars((int) '\"', (int) '\"');
tok.whitespaceChars((int) '\'', (int) '\'');
tok.wordChars((int) '0', (int) '9');
tok.wordChars((int) 'A', (int) 'Z');
tok.wordChars((int) 'a', (int) 'z');
tok.parseNumbers();
return tok;
}
private void generate(OutputStream out, int tupleLength, RuleTreeNode root, int length) throws IOException {
public void generate(OutputStream out, int tupleLength, RuleTreeNode root,
int length) throws IOException {
WordSequence tokens = this.compose(root, tupleLength, length);
if (tokens.contains(PERIOD)) {
@ -172,13 +58,18 @@ class TextGenerator {
}
/**
* Write this sequence of tokens on this stream, sorting out minor
* issues of orthography.
* @param out the stream.
* @param tokens the tokens.
* @throws IOException if it is impossible to write (e.g. file system full).
* Write this sequence of tokens on this stream, sorting out minor issues of
* orthography.
*
* @param out
* the stream.
* @param tokens
* the tokens.
* @throws IOException
* if it is impossible to write (e.g. file system full).
*/
private void generate(OutputStream out, WordSequence tokens) throws IOException {
private void generate(OutputStream out, WordSequence tokens)
throws IOException {
BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out));
boolean capitaliseNext = true;
@ -193,11 +84,15 @@ class TextGenerator {
}
/**
* Deal with end of paragraph, capital after full stop, and other
* minor orthographic conventions.
* @param dickens the scrivenor who writes for us.
* @param capitalise whether or not the token should be capitalised
* @param token the token to write;
* Deal with end of paragraph, capital after full stop, and other minor
* orthographic conventions.
*
* @param dickens
* the scrivenor who writes for us.
* @param capitalise
* whether or not the token should be capitalised
* @param token
* the token to write;
* @return true if the next token to be written should be capitalised.
* @throws IOException
*/
@ -207,7 +102,8 @@ class TextGenerator {
dickens.write(" ");
}
if (capitalise) {
dickens.write(token.substring(0, 1).toUpperCase(Locale.getDefault()));
dickens.write(token.substring(0, 1)
.toUpperCase(Locale.getDefault()));
dickens.write(token.substring(1));
} else {
dickens.write(token);
@ -219,11 +115,13 @@ class TextGenerator {
}
/**
* Return false if token is punctuation, else true. Wouldn't it be
* nice if Java provided Character.isPunctuation(char)? However, since it
* doesn't, I can give this slightly special semantics: return true only if
* this is punctuation which would not normally be preceded with a space.
* @param ch a character.
* Return false if token is punctuation, else true. Wouldn't it be nice if
* Java provided Character.isPunctuation(char)? However, since it doesn't, I
* can give this slightly special semantics: return true only if this is
* punctuation which would not normally be preceded with a space.
*
* @param ch
* a character.
* @return true if the should be preceded by a space, else false.
*/
private boolean spaceBefore(String token) {
@ -241,8 +139,9 @@ class TextGenerator {
* the apostrophe lost
*/
case 't':
/* similar; probably 'doesn't' or 'shouldn't' or other cases
* of 'not' with an elided 'o'.
/*
* similar; probably 'doesn't' or 'shouldn't' or other cases of
* 'not' with an elided 'o'.
*/
result = false;
break;
@ -258,17 +157,21 @@ class TextGenerator {
}
/**
* If this token is an end-of-sentence token, then, on one chance in
* some, have the writer write two new lines. NOTE: The tokeniser is treating
* If this token is an end-of-sentence token, then, on one chance in some,
* have the writer write two new lines. NOTE: The tokeniser is treating
* PERIOD ('.') as a word character, even though it has not been told to.
* Token.endsWith( PERIOD) is a hack to get round this problem.
* TODO: investigate and fix.
* Token.endsWith( PERIOD) is a hack to get round this problem. TODO:
* investigate and fix.
*
* @param token a token
* @param dickens our scrivenor
* @throws IOException if Mr Dickens has run out of ink
* @param token
* a token
* @param dickens
* our scrivenor
* @throws IOException
* if Mr Dickens has run out of ink
*/
private void maybeParagraph(String token, BufferedWriter dickens) throws IOException {
private void maybeParagraph(String token, BufferedWriter dickens)
throws IOException {
if (token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
dickens.write("\n\n");
}
@ -276,6 +179,7 @@ class TextGenerator {
/**
* Recursive, backtracking, output generator.
*
* @param rules
* @param tupleLength
* @param length
@ -285,7 +189,8 @@ class TextGenerator {
Stack<String> preamble = composePreamble(rules);
WordSequence result = new WordSequence();
// composing the preamble will have ended with *ROOT* on top of the stack;
// composing the preamble will have ended with *ROOT* on top of the
// stack;
// get rid of it.
preamble.pop();
@ -296,8 +201,9 @@ class TextGenerator {
}
/**
* Recursively attempt to find sequences in the ruleset to append to
* what's been composed so far.
* Recursively attempt to find sequences in the ruleset to append to what's
* been composed so far.
*
* @param glanceBack
* @param allRules
* @param currentRules
@ -309,12 +215,13 @@ class TextGenerator {
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
int length) {
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
assert (allRules.getWord() == ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
WordSequence result;
try {
@SuppressWarnings("unchecked")
String here = currentRules.getWord((Stack<String>) glanceBack.clone());
String here = currentRules.getWord((Stack<String>) glanceBack
.clone());
System.err.println(String.format("Trying token %s", here));
result = new WordSequence();
@ -325,9 +232,11 @@ class TextGenerator {
Collection<String> options = allRules.getSuccessors();
for (String next : options) {
WordSequence rest =
this.tryOption( (Stack<String>) glanceBack.clone(), allRules,
currentRules.getRule(next), tupleLength, length - 1);
@SuppressWarnings("unchecked")
WordSequence rest = this
.tryOption((Stack<String>) glanceBack.clone(),
allRules, currentRules.getRule(next),
tupleLength, length - 1);
if (rest != null) {
/* we have a solution */
@ -337,8 +246,7 @@ class TextGenerator {
}
}
} catch (NoSuchPathException ex) {
Logger.getLogger(TextGenerator.class.getName()).log(Level.WARNING,
String.format("No path %s: Backtracking...", glanceBack));
System.err.println( String.format("No path %s: Backtracking...", glanceBack));
result = null;
}
@ -347,10 +255,14 @@ class TextGenerator {
/**
* Try composing with this ruleset
*
* @param glanceBack
* @param allRules all the rules there are.
* @param currentRules the current node in the rule tree.
* @param tupleLength the size of the glanceback window we're considering.
* @param allRules
* all the rules there are.
* @param currentRules
* the current node in the rule tree.
* @param tupleLength
* the size of the glanceback window we're considering.
* @param length
* @return
*/
@ -365,11 +277,13 @@ class TextGenerator {
}
/**
* Return a new stack comprising all the items on the current stack,
* with this new string added at the bottom
* Return a new stack comprising all the items on the current stack, with
* this new string added at the bottom
*
* @param stack the stack to restack.
* @param bottom the item to place on the bottom.
* @param stack
* the stack to restack.
* @param bottom
* the item to place on the bottom.
* @return the restacked stack.
*/
private Stack<String> restack(Stack<String> stack, String bottom) {
@ -385,11 +299,12 @@ class TextGenerator {
return result;
}
/**
* Random walk of the rule tree to extract (from the root) a legal sequence of words the length of our tuple.
* Random walk of the rule tree to extract (from the root) a legal sequence
* of words the length of our tuple.
*
* @param rules the rule tree (fragment) to walk.
* @param rules
* the rule tree (fragment) to walk.
* @return a sequence of words.
*/
private Stack<String> composePreamble(RuleTreeNode rules) {
@ -407,8 +322,10 @@ class TextGenerator {
/**
*
* @param tokens a sequence of tokens
* @param marker a marker to terminate after the last occurrance of.
* @param tokens
* a sequence of tokens
* @param marker
* a marker to terminate after the last occurrence of.
* @return a copy of tokens, truncated at the last occurrence of the marker.
*/
private WordSequence truncateAtLastInstance(WordSequence tokens,
@ -420,9 +337,11 @@ class TextGenerator {
String token = tokens.remove();
result.add(token);
if (!(marker.equals(token) && !tokens.contains(marker))) {
/* woah, double negatives. If the token we're looking at is the
/*
* woah, double negatives. If the token we're looking at is the
* marker, and the remainder of the tokens does not include the
* marker, we're done. Otherwise, we continue. OK? */
* marker, we're done. Otherwise, we continue. OK?
*/
result.addAll(this.truncateAtLastInstance(tokens, marker));
}
}

View file

@ -0,0 +1,74 @@
/*
* Proprietary unpublished source code property of
* Simon Brooke <simon@journeyman.cc>.
*
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
*/
package cc.journeyman.milkwood;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
/**
* A tokeniser which reads tokens in a manner which suits me. Although this
* implementation is based on a StreamTokenizer, the point of separating this
* out into its own class is that if I had more time I could reimplement.
*
* @author simon
*
*/
/**
 * A tokeniser which reads tokens in a manner which suits me. Although this
 * implementation is based on a StreamTokenizer, the point of separating this
 * out into its own class is that if I had more time I could reimplement.
 */
public class Tokeniser extends StreamTokenizer {
    /**
     * Create a tokeniser reading from this reader, configured to handle at
     * least Western European natural language text.
     *
     * @param r the reader to tokenise.
     */
    public Tokeniser(Reader r) {
        super(r);
        this.resetSyntax();
        this.whitespaceChars(8, 15);
        this.whitespaceChars(28, 32);
        /*
         * treat quotemarks as white space. Actually it would be better if quote
         * marks were white space only if preceded or followed by whitespace, so
         * that, e.g., 'don't' and 'can't' appeared as single tokens. But that
         * means really reimplementing the parser and I don't have time.
         */
        this.whitespaceChars((int) '\"', (int) '\"');
        this.whitespaceChars((int) '\'', (int) '\'');
        this.wordChars((int) '0', (int) '9');
        this.wordChars((int) 'A', (int) 'Z');
        this.wordChars((int) 'a', (int) 'z');
        /*
         * NOTE(review): the pre-refactor tokeniser in TextGenerator also called
         * parseNumbers(), so numbers arrived as TT_NUMBER; here digits are word
         * characters and numbers arrive as TT_WORD. Confirm this difference is
         * intentional before relying on the TT_NUMBER branch of readBareToken.
         */
    }

    /**
     * Create a tokeniser reading from this input stream.
     *
     * @param in the stream to tokenise.
     */
    public Tokeniser(InputStream in) {
        this(new BufferedReader(new InputStreamReader(in)));
    }

    /**
     * There surely must be a better way to get just the token out of a
     * StreamTokenizer...!
     *
     * @return the text of the current token: words lower-cased, numbers
     *         rendered as decimal strings, any other character as a
     *         single-character string.
     */
    public String readBareToken() {
        final String token;

        switch (this.ttype) {
        case StreamTokenizer.TT_EOL:
            token = "FIXME"; // TODO: fix this!
            break;
        case StreamTokenizer.TT_NUMBER:
            /* Double.toString avoids the deprecated Double(double) boxing
             * constructor; the rendered string is identical. */
            token = Double.toString(this.nval);
            break;
        case StreamTokenizer.TT_WORD:
            token = this.sval.toLowerCase();
            break;
        default:
            /* ttype holds the ordinary character itself in this case. */
            token = String.valueOf((char) this.ttype);
            break;
        }
        return token;
    }
}