Moved a lot of stuff out of TextGenerator mainly to declutter, so I can
think about what isn't working and why not a bit more cleanly.
This commit is contained in:
parent
e59f160f70
commit
61a8b7ad97
60
src/cc/journeyman/milkwood/Digester.java
Normal file
60
src/cc/journeyman/milkwood/Digester.java
Normal file
|
@ -0,0 +1,60 @@
|
||||||
|
/*
|
||||||
|
* Proprietary unpublished source code property of
|
||||||
|
* Simon Brooke <simon@journeyman.cc>.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
|
||||||
|
*/
|
||||||
|
package cc.journeyman.milkwood;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.StreamTokenizer;
|
||||||
|
import java.util.LinkedList;
|
||||||
|
import java.util.Queue;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read an input stream of text and digest it into a set of generation rules.
|
||||||
|
* Separated out of TextGenerator mainly to declutter tht class.
|
||||||
|
*
|
||||||
|
* @author simon
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
public class Digester {
|
||||||
|
/**
|
||||||
|
* Read tokens from the input stream, and compile them into the rule tree
|
||||||
|
* below this root.
|
||||||
|
*
|
||||||
|
* @param in
|
||||||
|
* the input stream from which I read.
|
||||||
|
* @param tupleLength
|
||||||
|
* the length of the tuples I read.
|
||||||
|
* @param root
|
||||||
|
* the ruleset to which I shall add.
|
||||||
|
* @return the number of tokens read.
|
||||||
|
* @throws IOException if can't read from file system.
|
||||||
|
*/
|
||||||
|
protected int read(final InputStream in, final int tupleLength,
|
||||||
|
final RuleTreeNode root) throws IOException {
|
||||||
|
int result = 0;
|
||||||
|
final Queue<WordSequence> openTuples = new LinkedList<WordSequence>();
|
||||||
|
final Tokeniser tok = new Tokeniser(in);
|
||||||
|
|
||||||
|
for (int type = tok.nextToken(); type != StreamTokenizer.TT_EOF; type = tok
|
||||||
|
.nextToken()) {
|
||||||
|
result++;
|
||||||
|
final WordSequence newTuple = new WordSequence();
|
||||||
|
String token = tok.readBareToken();
|
||||||
|
|
||||||
|
openTuples.add(newTuple);
|
||||||
|
for (WordSequence tuple : openTuples) {
|
||||||
|
tuple.add(token);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (openTuples.size() > tupleLength) {
|
||||||
|
root.addSequence(openTuples.remove());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
}
|
|
@ -15,60 +15,97 @@ import java.io.OutputStream;
|
||||||
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
|
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @author Simon Brooke <simon@journeyman.cc>
|
* @author Simon Brooke <simon@journeyman.cc>
|
||||||
*/
|
*/
|
||||||
public class Milkwood {
|
public class Milkwood {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse command line arguments and kick off the process. Expected
|
* Parse command line arguments and kick off the process. Expected arguments
|
||||||
* arguments include:
|
* include:
|
||||||
* <dl>
|
* <dl>
|
||||||
* <dt>-i, -input</dt>
|
* <dt>-d, -debug</dt>
|
||||||
* <dd>Input file, expected to be an English (or, frankly, other natural
|
* <dd>Print debugging output to standard error</dd>
|
||||||
* language) text. Defaults to standard in.</dd>
|
* <dt>-i, -input</dt>
|
||||||
* <dt>-n, -tuple-length</dt>
|
* <dd>Input file, expected to be an English (or, frankly, other natural
|
||||||
* <dd>The length of tuples into which the file will be analysed, default 2.</dd>
|
* language) text. Defaults to standard in.</dd>
|
||||||
* <dt>-o, -output</dt>
|
* <dt>-n, -tuple-length</dt>
|
||||||
* <dd>Output file, to which generated text will be written.
|
* <dd>The length of tuples into which the file will be analysed, default 2.
|
||||||
* Defaults to standard out.</dd>
|
* </dd>
|
||||||
* </dl>
|
* <dt>-o, -output</dt>
|
||||||
*
|
* <dd>Output file, to which generated text will be written. Defaults to
|
||||||
* @param args the command line arguments
|
* standard out.</dd>
|
||||||
* @exception FileNotFoundException if the user specifies a file which
|
* </dl>
|
||||||
* isn't available.
|
*
|
||||||
* @exception IOException if it could not read from input or write to output.
|
* @param args
|
||||||
*/
|
* the command line arguments
|
||||||
public static void main(String[] args) throws FileNotFoundException, IOException {
|
* @exception FileNotFoundException
|
||||||
InputStream in = System.in;
|
* if the user specifies a file which isn't available.
|
||||||
OutputStream out = System.out;
|
* @exception IOException if it could not read from input or write to output.
|
||||||
int tupleLength = 2;
|
*/
|
||||||
|
public static void main(String[] args) throws FileNotFoundException,
|
||||||
|
IOException {
|
||||||
|
InputStream in = System.in;
|
||||||
|
OutputStream out = System.out;
|
||||||
|
int tupleLength = 2;
|
||||||
|
boolean debug = false;
|
||||||
|
|
||||||
for (int cursor = 0; cursor < args.length; cursor++) {
|
for (int cursor = 0; cursor < args.length; cursor++) {
|
||||||
String arg = args[cursor];
|
String arg = args[cursor];
|
||||||
|
|
||||||
if (arg.startsWith("-") && arg.length() > 1) {
|
if (arg.startsWith("-") && arg.length() > 1) {
|
||||||
switch (arg.charAt(1)) {
|
switch (arg.charAt(1)) {
|
||||||
case 'i':
|
case 'd':
|
||||||
// input
|
debug = true;
|
||||||
in = new FileInputStream(new File(args[++cursor]));
|
break;
|
||||||
break;
|
case 'i':
|
||||||
case 'o': // output
|
// input
|
||||||
out = new FileOutputStream(new File(args[++cursor]));
|
in = new FileInputStream(new File(args[++cursor]));
|
||||||
break;
|
break;
|
||||||
case 'n':
|
case 'o': // output
|
||||||
case 't': // tuple length
|
out = new FileOutputStream(new File(args[++cursor]));
|
||||||
tupleLength = Integer.parseInt(args[++cursor]);
|
break;
|
||||||
break;
|
case 'n':
|
||||||
default:
|
case 't': // tuple length
|
||||||
throw new IllegalArgumentException(
|
tupleLength = Integer.parseInt(args[++cursor]);
|
||||||
String.format("Unrecognised argument '%s'", arg));
|
break;
|
||||||
}
|
default:
|
||||||
}
|
throw new IllegalArgumentException(String.format(
|
||||||
}
|
"Unrecognised argument '%s'", arg));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
new Milkwood().readAndGenerate(in, out, tupleLength, debug);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Read tokens from this input and use them to generate text on this output.
|
||||||
|
*
|
||||||
|
* @param in
|
||||||
|
* the input stream to read.
|
||||||
|
* @param out
|
||||||
|
* the output stream to write to.
|
||||||
|
* @param tupleLength
|
||||||
|
* the length of tuples to be used in generation.
|
||||||
|
* @param debug
|
||||||
|
* whether to print debugging output.
|
||||||
|
* @throws IOException
|
||||||
|
* if the file system buggers up, which is not, in the cosmic
|
||||||
|
* scheme of things, very likely.
|
||||||
|
*/
|
||||||
|
void readAndGenerate(final InputStream in, final OutputStream out,
|
||||||
|
final int tupleLength, boolean debug) throws IOException {
|
||||||
|
/* The root of the rule tree I shall build. */
|
||||||
|
RuleTreeNode root = new RuleTreeNode();
|
||||||
|
int length = new Digester().read(in, tupleLength, root);
|
||||||
|
|
||||||
|
if (debug) {
|
||||||
|
System.err.println(root.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
new TextGenerator().generate(out, tupleLength, root, length);
|
||||||
|
}
|
||||||
|
|
||||||
new TextGenerator().readAndGenerate( in, out, tupleLength);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,6 +23,10 @@ import java.util.Stack;
|
||||||
* @author Simon Brooke <simon@journeyman.cc>
|
* @author Simon Brooke <simon@journeyman.cc>
|
||||||
*/
|
*/
|
||||||
public class RuleTreeNode {
|
public class RuleTreeNode {
|
||||||
|
/**
|
||||||
|
* The magic token which identifies the root node of a rule tree.
|
||||||
|
*/
|
||||||
|
public static final String ROOTMAGICTOKEN = "*ROOT*";
|
||||||
/**
|
/**
|
||||||
* The line separator on this platform.
|
* The line separator on this platform.
|
||||||
*/
|
*/
|
||||||
|
@ -42,6 +46,13 @@ public class RuleTreeNode {
|
||||||
*/
|
*/
|
||||||
private Map<String,RuleTreeNode> rules = new HashMap<String,RuleTreeNode>();
|
private Map<String,RuleTreeNode> rules = new HashMap<String,RuleTreeNode>();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If no argument passed, generate a root node.
|
||||||
|
*/
|
||||||
|
public RuleTreeNode() {
|
||||||
|
this( RuleTreeNode.ROOTMAGICTOKEN);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create me wrapping this word.
|
* Create me wrapping this word.
|
||||||
* @param word the word I represent.
|
* @param word the word I represent.
|
||||||
|
|
|
@ -6,228 +6,126 @@
|
||||||
*/
|
*/
|
||||||
package cc.journeyman.milkwood;
|
package cc.journeyman.milkwood;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.BufferedWriter;
|
import java.io.BufferedWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
import java.io.OutputStreamWriter;
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.Reader;
|
|
||||||
import java.io.StreamTokenizer;
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.LinkedList;
|
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Queue;
|
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
import java.util.Stack;
|
import java.util.Stack;
|
||||||
import java.util.logging.Level;
|
|
||||||
import java.util.logging.Logger;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
* @author Simon Brooke <simon@journeyman.cc>
|
* @author Simon Brooke <simon@journeyman.cc>
|
||||||
*/
|
*/
|
||||||
class TextGenerator {
|
class TextGenerator {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The magic token which identifies the root node of the
|
* The magic token which is deemed to end sentences.
|
||||||
* rule tree.
|
|
||||||
*/
|
*/
|
||||||
private static final String ROOTMAGICTOKEN = "*ROOT*";
|
public static final String PERIOD = ".";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The special magic token which is deemed to end sentences.
|
* The average number of sentences in a paragraph.
|
||||||
*/
|
*/
|
||||||
public static final String PERIOD = ".";
|
public static final int AVSENTENCESPERPARA = 5;
|
||||||
|
/**
|
||||||
|
* A random number generator.
|
||||||
|
*/
|
||||||
|
private static Random RANDOM = new Random();
|
||||||
|
/**
|
||||||
|
* Dictionary of first-words we know about; each first-word maps onto a
|
||||||
|
* tuple of tuples of word sequences beginning with that word, so 'I' might
|
||||||
|
* map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]].
|
||||||
|
*/
|
||||||
|
TupleDictionary dictionary = new TupleDictionary();
|
||||||
|
|
||||||
/**
|
public TextGenerator() {
|
||||||
* The average number of sentences in a paragraph.
|
}
|
||||||
*/
|
|
||||||
public static final int AVSENTENCESPERPARA = 5;
|
|
||||||
/**
|
|
||||||
* A random number generator.
|
|
||||||
*/
|
|
||||||
private static Random RANDOM = new Random();
|
|
||||||
/**
|
|
||||||
* Dictionary of first-words we know about; each first-word maps
|
|
||||||
* onto a tuple of tuples of word sequences beginning with that
|
|
||||||
* word, so 'I' might map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]].
|
|
||||||
*/
|
|
||||||
TupleDictionary dictionary = new TupleDictionary();
|
|
||||||
|
|
||||||
public TextGenerator() {
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
public void generate(OutputStream out, int tupleLength, RuleTreeNode root,
|
||||||
* Read tokens from this input and use them to generate text on this output.
|
int length) throws IOException {
|
||||||
* @param in the input stream to read.
|
WordSequence tokens = this.compose(root, tupleLength, length);
|
||||||
* @param out the output stream to write to.
|
|
||||||
* @param tupleLength the length of tuples to be used in generation.
|
|
||||||
* @throws IOException if the file system buggers up, which is not, in the
|
|
||||||
* cosmic scheme of things, very likely.
|
|
||||||
*/
|
|
||||||
void readAndGenerate(InputStream in, OutputStream out, int tupleLength) throws IOException {
|
|
||||||
/* The root of the rule tree I shall build. */
|
|
||||||
RuleTreeNode root = new RuleTreeNode( ROOTMAGICTOKEN);
|
|
||||||
int length = read(in, tupleLength, root);
|
|
||||||
|
|
||||||
System.err.println( root.toString());
|
if (tokens.contains(PERIOD)) {
|
||||||
|
// TODO: eq = equal?
|
||||||
generate( out, tupleLength, root, length);
|
tokens = this.truncateAtLastInstance(tokens, PERIOD);
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Read tokens from the input stream, and compile them into a ruleset below root.
|
|
||||||
* @param in the input stream from which I read.
|
|
||||||
* @param tupleLength the length of the tuples I read.
|
|
||||||
* @param root the ruleset to which I shall add.
|
|
||||||
* @return the number of tokens read.
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
|
||||||
private int read(InputStream in, int tupleLength, RuleTreeNode root) throws IOException {
|
|
||||||
int result = 0;
|
|
||||||
Queue<WordSequence> openTuples = new LinkedList<WordSequence>();
|
|
||||||
StreamTokenizer tok = prepareTokenizer(in);
|
|
||||||
|
|
||||||
for (int type = tok.nextToken(); type != StreamTokenizer.TT_EOF; type = tok.nextToken()) {
|
|
||||||
result ++;
|
|
||||||
final WordSequence newTuple = new WordSequence();
|
|
||||||
String token = readBareToken(tok, type);
|
|
||||||
|
|
||||||
openTuples.add(newTuple);
|
|
||||||
for ( WordSequence tuple : openTuples) {
|
|
||||||
tuple.add(token);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (openTuples.size() > tupleLength) {
|
|
||||||
root.addSequence( openTuples.remove());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* There surely must be a better way to get just the token out of a
|
|
||||||
* StreamTokenizer...!
|
|
||||||
* @param tok the tokenizer.
|
|
||||||
* @return just the next token.
|
|
||||||
*/
|
|
||||||
private String readBareToken(StreamTokenizer tok, int type) {
|
|
||||||
final String token;
|
|
||||||
|
|
||||||
switch (type) {
|
|
||||||
case StreamTokenizer.TT_EOL:
|
|
||||||
token = "FIXME"; // TODO: fix this!
|
|
||||||
break;
|
|
||||||
case StreamTokenizer.TT_NUMBER:
|
|
||||||
token = new Double(tok.nval).toString();
|
|
||||||
break;
|
|
||||||
case StreamTokenizer.TT_WORD:
|
|
||||||
token = tok.sval.toLowerCase();
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
StringBuffer buffy = new StringBuffer();
|
|
||||||
buffy.append((char) type);
|
|
||||||
token = buffy.toString();
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
return token;
|
|
||||||
|
this.generate(out, tokens);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Prepare a tokeniser on this input stream, set up to handle at least
|
* Write this sequence of tokens on this stream, sorting out minor issues of
|
||||||
* Western European natural language text.
|
* orthography.
|
||||||
* @param in the stream.
|
*
|
||||||
* @return a suitable tokeniser.
|
* @param out
|
||||||
*/
|
* the stream.
|
||||||
private StreamTokenizer prepareTokenizer(InputStream in) {
|
* @param tokens
|
||||||
Reader gentle = new BufferedReader(new InputStreamReader(in));
|
* the tokens.
|
||||||
StreamTokenizer tok = new StreamTokenizer(gentle);
|
* @throws IOException
|
||||||
|
* if it is impossible to write (e.g. file system full).
|
||||||
|
*/
|
||||||
|
private void generate(OutputStream out, WordSequence tokens)
|
||||||
|
throws IOException {
|
||||||
|
BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out));
|
||||||
|
boolean capitaliseNext = true;
|
||||||
|
|
||||||
tok.resetSyntax();
|
try {
|
||||||
tok.whitespaceChars(8, 15);
|
for (String token : tokens) {
|
||||||
tok.whitespaceChars(28, 32);
|
capitaliseNext = writeToken(dickens, capitaliseNext, token);
|
||||||
/* treat quotemarks as white space */
|
}
|
||||||
tok.whitespaceChars((int) '\"', (int) '\"');
|
} finally {
|
||||||
tok.whitespaceChars((int) '\'', (int) '\'');
|
dickens.flush();
|
||||||
tok.wordChars((int) '0', (int) '9');
|
dickens.close();
|
||||||
tok.wordChars((int) 'A', (int) 'Z');
|
}
|
||||||
tok.wordChars((int) 'a', (int) 'z');
|
|
||||||
tok.parseNumbers();
|
|
||||||
return tok;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void generate(OutputStream out, int tupleLength, RuleTreeNode root, int length) throws IOException {
|
/**
|
||||||
WordSequence tokens = this.compose( root, tupleLength, length);
|
* Deal with end of paragraph, capital after full stop, and other minor
|
||||||
|
* orthographic conventions.
|
||||||
if ( tokens.contains(PERIOD)) {
|
*
|
||||||
// TODO: eq = equal?
|
* @param dickens
|
||||||
tokens = this.truncateAtLastInstance( tokens, PERIOD);
|
* the scrivenor who writes for us.
|
||||||
}
|
* @param capitalise
|
||||||
|
* whether or not the token should be capitalised
|
||||||
this.generate( out, tokens);
|
* @param token
|
||||||
}
|
* the token to write;
|
||||||
|
* @return true if the next token to be written should be capitalised.
|
||||||
/**
|
* @throws IOException
|
||||||
* Write this sequence of tokens on this stream, sorting out minor
|
*/
|
||||||
* issues of orthography.
|
|
||||||
* @param out the stream.
|
|
||||||
* @param tokens the tokens.
|
|
||||||
* @throws IOException if it is impossible to write (e.g. file system full).
|
|
||||||
*/
|
|
||||||
private void generate(OutputStream out, WordSequence tokens) throws IOException {
|
|
||||||
BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out));
|
|
||||||
boolean capitaliseNext = true;
|
|
||||||
|
|
||||||
try {
|
|
||||||
for (String token : tokens) {
|
|
||||||
capitaliseNext = writeToken(dickens, capitaliseNext, token);
|
|
||||||
}
|
|
||||||
} finally {
|
|
||||||
dickens.flush();
|
|
||||||
dickens.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Deal with end of paragraph, capital after full stop, and other
|
|
||||||
* minor orthographic conventions.
|
|
||||||
* @param dickens the scrivenor who writes for us.
|
|
||||||
* @param capitalise whether or not the token should be capitalised
|
|
||||||
* @param token the token to write;
|
|
||||||
* @return true if the next token to be written should be capitalised.
|
|
||||||
* @throws IOException
|
|
||||||
*/
|
|
||||||
private boolean writeToken(BufferedWriter dickens, boolean capitalise,
|
private boolean writeToken(BufferedWriter dickens, boolean capitalise,
|
||||||
String token) throws IOException {
|
String token) throws IOException {
|
||||||
if ( this.spaceBefore(token)) {
|
if (this.spaceBefore(token)) {
|
||||||
dickens.write( " ");
|
dickens.write(" ");
|
||||||
}
|
}
|
||||||
if ( capitalise) {
|
if (capitalise) {
|
||||||
dickens.write(token.substring(0, 1).toUpperCase(Locale.getDefault()));
|
dickens.write(token.substring(0, 1)
|
||||||
|
.toUpperCase(Locale.getDefault()));
|
||||||
dickens.write(token.substring(1));
|
dickens.write(token.substring(1));
|
||||||
} else {
|
} else {
|
||||||
dickens.write(token);
|
dickens.write(token);
|
||||||
}
|
}
|
||||||
|
|
||||||
this.maybeParagraph( token, dickens);
|
this.maybeParagraph(token, dickens);
|
||||||
|
|
||||||
return (token.endsWith(PERIOD));
|
return (token.endsWith(PERIOD));
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return false if token is punctuation, else true. Wouldn't it be
|
* Return false if token is punctuation, else true. Wouldn't it be nice if
|
||||||
* nice if Java provided Character.isPunctuation(char)? However, since it
|
* Java provided Character.isPunctuation(char)? However, since it doesn't, I
|
||||||
* doesn't, I can give this slightly special semantics: return true only if
|
* can give this slightly special semantics: return true only if this is
|
||||||
* this is punctuation which would not normally be preceded with a space.
|
* punctuation which would not normally be preceded with a space.
|
||||||
* @param ch a character.
|
*
|
||||||
* @return true if the token should be preceded by a space, else false.
|
* @param ch
|
||||||
*/
|
* a character.
|
||||||
private boolean spaceBefore(String token) {
|
* @return true if the token should be preceded by a space, else false.
|
||||||
final boolean result;
|
*/
|
||||||
|
private boolean spaceBefore(String token) {
|
||||||
|
final boolean result;
|
||||||
|
|
||||||
if (token.length() == 1) {
|
if (token.length() == 1) {
|
||||||
switch (token.charAt(0)) {
|
switch (token.charAt(0)) {
|
||||||
|
@ -241,9 +139,10 @@ class TextGenerator {
|
||||||
* the apostrophe lost
|
* the apostrophe lost
|
||||||
*/
|
*/
|
||||||
case 't':
|
case 't':
|
||||||
/* similar; probably 'doesn't' or 'shouldn't' or other cases
|
/*
|
||||||
* of 'not' with an elided 'o'.
|
* similar; probably 'doesn't' or 'shouldn't' or other cases of
|
||||||
*/
|
* 'not' with an elided 'o'.
|
||||||
|
*/
|
||||||
result = false;
|
result = false;
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
@ -254,106 +153,119 @@ class TextGenerator {
|
||||||
result = false;
|
result = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* If this token is an end-of-sentence token, then, on one chance in
|
* If this token is an end-of-sentence token, then, on one chance in some,
|
||||||
* some, have the writer write two new lines. NOTE: The tokeniser is treating
|
* have the writer write two new lines. NOTE: The tokeniser is treating
|
||||||
* PERIOD ('.') as a word character, even though it has not been told to.
|
* PERIOD ('.') as a word character, even though it has not been told to.
|
||||||
* Token.endsWith( PERIOD) is a hack to get round this problem.
|
* Token.endsWith( PERIOD) is a hack to get round this problem. TODO:
|
||||||
* TODO: investigate and fix.
|
* investigate and fix.
|
||||||
*
|
*
|
||||||
* @param token a token
|
* @param token
|
||||||
* @param dickens our scrivenor
|
* a token
|
||||||
* @throws IOException if Mr Dickens has run out of ink
|
* @param dickens
|
||||||
*/
|
* our scrivenor
|
||||||
private void maybeParagraph(String token, BufferedWriter dickens) throws IOException {
|
* @throws IOException
|
||||||
if ( token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
|
* if Mr Dickens has run out of ink
|
||||||
dickens.write("\n\n");
|
*/
|
||||||
}
|
private void maybeParagraph(String token, BufferedWriter dickens)
|
||||||
}
|
throws IOException {
|
||||||
|
if (token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) {
|
||||||
|
dickens.write("\n\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Recursive, backtracking, output generator.
|
* Recursive, backtracking, output generator.
|
||||||
* @param rules
|
*
|
||||||
* @param tupleLength
|
* @param rules
|
||||||
* @param length
|
* @param tupleLength
|
||||||
* @return
|
* @param length
|
||||||
*/
|
* @return
|
||||||
private WordSequence compose(RuleTreeNode rules, int tupleLength, int length) {
|
*/
|
||||||
Stack<String> preamble = composePreamble( rules);
|
private WordSequence compose(RuleTreeNode rules, int tupleLength, int length) {
|
||||||
WordSequence result = new WordSequence();
|
Stack<String> preamble = composePreamble(rules);
|
||||||
|
WordSequence result = new WordSequence();
|
||||||
|
|
||||||
// composing the preamble will have ended with *ROOT* on top of the stack;
|
// composing the preamble will have ended with *ROOT* on top of the
|
||||||
// get rid of it.
|
// stack;
|
||||||
preamble.pop();
|
// get rid of it.
|
||||||
|
preamble.pop();
|
||||||
|
|
||||||
result.addAll(preamble);
|
result.addAll(preamble);
|
||||||
|
|
||||||
result.addAll(this.compose( preamble, rules, rules, tupleLength, length));
|
result.addAll(this.compose(preamble, rules, rules, tupleLength, length));
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Recursively attempt to find sequences in the ruleset to append to
|
* Recursively attempt to find sequences in the ruleset to append to what's
|
||||||
* what's been composed so far.
|
* been composed so far.
|
||||||
* @param glanceBack
|
*
|
||||||
* @param allRules
|
* @param glanceBack
|
||||||
* @param currentRules
|
* @param allRules
|
||||||
* @param tupleLength
|
* @param currentRules
|
||||||
* @param length
|
* @param tupleLength
|
||||||
* @return
|
* @param length
|
||||||
*/
|
* @return
|
||||||
|
*/
|
||||||
private WordSequence compose(Stack<String> glanceBack,
|
private WordSequence compose(Stack<String> glanceBack,
|
||||||
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
||||||
int length) {
|
int length) {
|
||||||
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
|
assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size";
|
||||||
assert (allRules.getWord() == ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
|
assert (allRules.getWord() == RuleTreeNode.ROOTMAGICTOKEN) : "Shoudn't happen: bad rule set";
|
||||||
WordSequence result;
|
WordSequence result;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
String here = currentRules.getWord((Stack<String>) glanceBack.clone());
|
String here = currentRules.getWord((Stack<String>) glanceBack
|
||||||
System.err.println( String.format( "Trying token %s", here));
|
.clone());
|
||||||
|
System.err.println(String.format("Trying token %s", here));
|
||||||
|
|
||||||
result = new WordSequence();
|
result = new WordSequence();
|
||||||
result.add(here);
|
result.add(here);
|
||||||
|
|
||||||
if (length != 0) {
|
if (length != 0) {
|
||||||
/* we're not done yet */
|
/* we're not done yet */
|
||||||
Collection<String> options = allRules.getSuccessors();
|
Collection<String> options = allRules.getSuccessors();
|
||||||
|
|
||||||
for (String next : options) {
|
for (String next : options) {
|
||||||
WordSequence rest =
|
@SuppressWarnings("unchecked")
|
||||||
this.tryOption( (Stack<String>) glanceBack.clone(), allRules,
|
WordSequence rest = this
|
||||||
currentRules.getRule(next), tupleLength, length - 1);
|
.tryOption((Stack<String>) glanceBack.clone(),
|
||||||
|
allRules, currentRules.getRule(next),
|
||||||
|
tupleLength, length - 1);
|
||||||
|
|
||||||
if (rest != null) {
|
if (rest != null) {
|
||||||
/* we have a solution */
|
/* we have a solution */
|
||||||
result.addAll(rest);
|
result.addAll(rest);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (NoSuchPathException ex) {
|
} catch (NoSuchPathException ex) {
|
||||||
Logger.getLogger(TextGenerator.class.getName()).log(Level.WARNING,
|
System.err.println( String.format("No path %s: Backtracking...", glanceBack));
|
||||||
String.format("No path %s: Backtracking...", glanceBack));
|
result = null;
|
||||||
result = null;
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Try composing with this ruleset
|
* Try composing with this ruleset
|
||||||
* @param glanceBack
|
*
|
||||||
* @param allRules all the rules there are.
|
* @param glanceBack
|
||||||
* @param currentRules the current node in the rule tree.
|
* @param allRules
|
||||||
* @param tupleLength the size of the glanceback window we're considering.
|
* all the rules there are.
|
||||||
* @param length
|
* @param currentRules
|
||||||
* @return
|
* the current node in the rule tree.
|
||||||
*/
|
* @param tupleLength
|
||||||
|
* the size of the glanceback window we're considering.
|
||||||
|
* @param length
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
private WordSequence tryOption(Stack<String> glanceBack,
|
private WordSequence tryOption(Stack<String> glanceBack,
|
||||||
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength,
|
||||||
int length) {
|
int length) {
|
||||||
|
@ -364,69 +276,76 @@ class TextGenerator {
|
||||||
length);
|
length);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Return a new stack comprising all the items on the current stack,
|
* Return a new stack comprising all the items on the current stack, with
|
||||||
* with this new string added at the bottom
|
* this new string added at the bottom
|
||||||
*
|
*
|
||||||
* @param stack the stack to restack.
|
* @param stack
|
||||||
* @param bottom the item to place on the bottom.
|
* the stack to restack.
|
||||||
* @return the restacked stack.
|
* @param bottom
|
||||||
*/
|
* the item to place on the bottom.
|
||||||
private Stack<String> restack(Stack<String> stack, String bottom) {
|
* @return the restacked stack.
|
||||||
final Stack<String> result;
|
*/
|
||||||
if (stack.isEmpty()) {
|
private Stack<String> restack(Stack<String> stack, String bottom) {
|
||||||
result = new Stack<String>();
|
final Stack<String> result;
|
||||||
result.push(bottom);
|
if (stack.isEmpty()) {
|
||||||
} else {
|
result = new Stack<String>();
|
||||||
String top = stack.pop();
|
result.push(bottom);
|
||||||
result = restack(stack, bottom);
|
} else {
|
||||||
result.push(top);
|
String top = stack.pop();
|
||||||
}
|
result = restack(stack, bottom);
|
||||||
return result;
|
result.push(top);
|
||||||
}
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Random walk of the rule tree to extract (from the root) a legal sequence
|
||||||
|
* of words the length of our tuple.
|
||||||
|
*
|
||||||
|
* @param rules
|
||||||
|
* the rule tree (fragment) to walk.
|
||||||
|
* @return a sequence of words.
|
||||||
|
*/
|
||||||
|
private Stack<String> composePreamble(RuleTreeNode rules) {
|
||||||
|
final Stack<String> result;
|
||||||
|
final RuleTreeNode successor = rules.getRule();
|
||||||
|
|
||||||
/**
|
if (successor == null) {
|
||||||
* Random walk of the rule tree to extract (from the root) a legal sequence of words the length of our tuple.
|
result = new Stack<String>();
|
||||||
*
|
} else {
|
||||||
* @param rules the rule tree (fragment) to walk.
|
result = this.composePreamble(successor);
|
||||||
* @return a sequence of words.
|
result.push(rules.getWord());
|
||||||
*/
|
}
|
||||||
private Stack<String> composePreamble(RuleTreeNode rules) {
|
return result;
|
||||||
final Stack<String> result;
|
}
|
||||||
final RuleTreeNode successor = rules.getRule();
|
|
||||||
|
|
||||||
if (successor == null) {
|
/**
|
||||||
result = new Stack<String>();
|
*
|
||||||
} else {
|
* @param tokens
|
||||||
result = this.composePreamble(successor);
|
* a sequence of tokens
|
||||||
result.push(rules.getWord());
|
* @param marker
|
||||||
}
|
* a marker to terminate after the last occurrence of.
|
||||||
return result;
|
* @return a copy of tokens, truncated at the last occurrence of the marker.
|
||||||
}
|
*/
|
||||||
|
private WordSequence truncateAtLastInstance(WordSequence tokens,
|
||||||
|
String marker) {
|
||||||
|
final WordSequence result = new WordSequence();
|
||||||
|
|
||||||
/**
|
if (!tokens.isEmpty()) {
|
||||||
*
|
|
||||||
* @param tokens a sequence of tokens
|
|
||||||
* @param marker a marker to terminate after the last occurrence of.
|
|
||||||
* @return a copy of tokens, truncated at the last occurrence of the marker.
|
|
||||||
*/
|
|
||||||
private WordSequence truncateAtLastInstance(WordSequence tokens,
|
|
||||||
String marker) {
|
|
||||||
final WordSequence result = new WordSequence();
|
|
||||||
|
|
||||||
if (!tokens.isEmpty()) {
|
String token = tokens.remove();
|
||||||
|
result.add(token);
|
||||||
|
if (!(marker.equals(token) && !tokens.contains(marker))) {
|
||||||
|
/*
|
||||||
|
* woah, double negatives. If the token we're looking at is the
|
||||||
|
* marker, and the remainder of the tokens does not include the
|
||||||
|
* marker, we're done. Otherwise, we continue. OK?
|
||||||
|
*/
|
||||||
|
result.addAll(this.truncateAtLastInstance(tokens, marker));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
String token = tokens.remove();
|
return result;
|
||||||
result.add(token);
|
}
|
||||||
if (!(marker.equals(token) && !tokens.contains(marker))) {
|
|
||||||
/* woah, double negatives. If the token we're looking at is the
|
|
||||||
* marker, and the remainder of the tokens does not include the
|
|
||||||
* marker, we're done. Otherwise, we continue. OK? */
|
|
||||||
result.addAll(this.truncateAtLastInstance(tokens, marker));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
74
src/cc/journeyman/milkwood/Tokeniser.java
Normal file
74
src/cc/journeyman/milkwood/Tokeniser.java
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
/*
|
||||||
|
* Proprietary unpublished source code property of
|
||||||
|
* Simon Brooke <simon@journeyman.cc>.
|
||||||
|
*
|
||||||
|
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
|
||||||
|
*/
|
||||||
|
package cc.journeyman.milkwood;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.io.InputStreamReader;
|
||||||
|
import java.io.Reader;
|
||||||
|
import java.io.StreamTokenizer;
|
||||||
|
|
||||||
|
/**
 * A tokeniser which reads tokens in a manner which suits me. Although this
 * implementation is based on a StreamTokenizer, the point of separating this
 * out into its own class is that if I had more time I could reimplement.
 *
 * Syntax set up by the constructors: ASCII letters and digits are word
 * characters, control characters 8-15 and 28-32 and both quote marks are
 * white space, and every other character is returned as a single-character
 * token.
 *
 * @author simon
 */
public class Tokeniser extends StreamTokenizer {

    /**
     * Create a tokeniser reading from this reader.
     *
     * @param r
     *            the reader from which characters are read.
     */
    public Tokeniser(Reader r) {
        super(r);

        /* start from a blank table: every character is 'ordinary'. */
        this.resetSyntax();
        this.whitespaceChars(8, 15);
        this.whitespaceChars(28, 32);
        /*
         * treat quotemarks as white space. Actually it would be better if quote
         * marks were white space only if preceded or followed by whitespace, so
         * that, e.g., 'don't' and 'can't' appeared as single tokens. But that
         * means really reimplementing the parser and I don't have time.
         */
        this.whitespaceChars((int) '\"', (int) '\"');
        this.whitespaceChars((int) '\'', (int) '\'');
        /*
         * digits are declared as word characters and parseNumbers() is not
         * called, so a run of digits arrives as a TT_WORD token.
         */
        this.wordChars((int) '0', (int) '9');
        this.wordChars((int) 'A', (int) 'Z');
        this.wordChars((int) 'a', (int) 'z');
    }

    /**
     * Create a tokeniser reading from this input stream, decoded with the
     * platform default character set and buffered.
     *
     * @param in
     *            the stream from which bytes are read.
     */
    public Tokeniser(InputStream in) {
        this(new BufferedReader(new InputStreamReader(in)));
    }

    /**
     * Return the text of the token most recently read by
     * {@link #nextToken()}, whatever its type. There surely must be a better
     * way to get just the token out of a StreamTokenizer...!
     *
     * @return for a word, its text in lower case; for a number, its decimal
     *         representation; for an end of line, a placeholder; for anything
     *         else, the one-character string of the current token type.
     */
    public String readBareToken() {
        final String token;

        switch (this.ttype) {
        case StreamTokenizer.TT_EOL:
            token = "FIXME"; // TODO: fix this!
            break;
        case StreamTokenizer.TT_NUMBER:
            /* same text as boxing first, without the deprecated constructor. */
            token = Double.toString(this.nval);
            break;
        case StreamTokenizer.TT_WORD:
            /*
             * lower-case independently of the default locale, so that the
             * same input yields the same tokens on every machine (under a
             * Turkish locale the default mapping turns 'I' into dotless 'ı').
             */
            token = this.sval.toLowerCase(java.util.Locale.ROOT);
            break;
        default:
            /* an ordinary character: the token type is the character itself. */
            token = String.valueOf((char) this.ttype);
            break;
        }

        return token;
    }
}
|
Loading…
Reference in a new issue