diff --git a/src/cc/journeyman/milkwood/Digester.java b/src/cc/journeyman/milkwood/Digester.java new file mode 100644 index 0000000..8b2facb --- /dev/null +++ b/src/cc/journeyman/milkwood/Digester.java @@ -0,0 +1,60 @@ +/* + * Proprietary unpublished source code property of + * Simon Brooke . + * + * Copyright (c) 2013 Simon Brooke + */ +package cc.journeyman.milkwood; + +import java.io.IOException; +import java.io.InputStream; +import java.io.StreamTokenizer; +import java.util.LinkedList; +import java.util.Queue; + +/** + * Read an input stream of text and digest it into a set of generation rules. + * Separated out of TextGenerator mainly to declutter that class. + * + * @author simon + * + */ +public class Digester { + /** + * Read tokens from the input stream, and compile them into the rule tree + * below this root. + * + * @param in + * the input stream from which I read. + * @param tupleLength + * the length of the tuples I read. + * @param root + * the ruleset to which I shall add. + * @return the number of tokens read. + * @throws IOException if the input cannot be read. + */ + protected int read(final InputStream in, final int tupleLength, + final RuleTreeNode root) throws IOException { + int result = 0; + final Queue<WordSequence> openTuples = new LinkedList<WordSequence>(); + final Tokeniser tok = new Tokeniser(in); + + for (int type = tok.nextToken(); type != StreamTokenizer.TT_EOF; type = tok.nextToken()) { + result++; + final WordSequence newTuple = new WordSequence(); + String token = tok.readBareToken(); + + openTuples.add(newTuple); + for (WordSequence tuple : openTuples) { + tuple.add(token); + } + + if (openTuples.size() > tupleLength) { + root.addSequence(openTuples.remove()); + } + } + + return result; + } +} diff --git a/src/cc/journeyman/milkwood/Milkwood.java b/src/cc/journeyman/milkwood/Milkwood.java index f8d249e..2343614 100644 --- a/src/cc/journeyman/milkwood/Milkwood.java +++ b/src/cc/journeyman/milkwood/Milkwood.java @@ -15,60 +15,97 @@ import java.io.OutputStream; * Copyright (c) 2013 Simon Brooke */ - /** - * + * * @author Simon Brooke */ public class Milkwood { - /** - * Parse command line arguments and kick off the process. Expected - * arguments include: - * <dl>
- * <dt>-i, -input</dt> - * <dd>Input file, expected to be an English (or, frankly, other natural - * language) text. Defaults to standard in.</dd> - * <dt>-n, -tuple-length</dt> - * <dd>The length of tuples into which the file will be analysed, default 2.</dd> - * <dt>-o, -output</dt> - * <dd>Output file, to which generated text will be written. - * Defaults to standard out.</dd> - * </dl>
- * - * @param args the command line arguments - * @exception FileNotFoundException if the user specifies a file which - * isn't available. - * @exception IOException if it could not read from input or write to output. - */ - public static void main(String[] args) throws FileNotFoundException, IOException { - InputStream in = System.in; - OutputStream out = System.out; - int tupleLength = 2; - - for (int cursor = 0; cursor < args.length; cursor++) { - String arg = args[cursor]; + /** + * Parse command line arguments and kick off the process. Expected arguments + * include: + * <dl>
+ * <dt>-d, -debug</dt> + * <dd>Print debugging output to standard error</dd> + * <dt>-i, -input</dt> + * <dd>Input file, expected to be an English (or, frankly, other natural + * language) text. Defaults to standard in.</dd> + * <dt>-n, -tuple-length</dt> + * <dd>The length of tuples into which the file will be analysed, default 2. + * </dd> + * <dt>-o, -output</dt> + * <dd>Output file, to which generated text will be written. Defaults to + * standard out.</dd> + * </dl>
+ * + * @param args + * the command line arguments + * @exception FileNotFoundException + * if the user specifies a file which isn't available. + * @exception IOException + * if it could not read from input or write to output. + */ + public static void main(String[] args) throws FileNotFoundException, + IOException { + InputStream in = System.in; + OutputStream out = System.out; + int tupleLength = 2; + boolean debug = false; - if (arg.startsWith("-") && arg.length() > 1) { - switch (arg.charAt(1)) { - case 'i': - // input - in = new FileInputStream(new File(args[++cursor])); - break; - case 'o': // output - out = new FileOutputStream(new File(args[++cursor])); - break; - case 'n': - case 't': // tuple length - tupleLength = Integer.parseInt(args[++cursor]); - break; - default: - throw new IllegalArgumentException( - String.format("Unrecognised argument '%s'", arg)); - } - } - } + for (int cursor = 0; cursor < args.length; cursor++) { + String arg = args[cursor]; + + if (arg.startsWith("-") && arg.length() > 1) { + switch (arg.charAt(1)) { + case 'd': + debug = true; + break; + case 'i': + // input + in = new FileInputStream(new File(args[++cursor])); + break; + case 'o': // output + out = new FileOutputStream(new File(args[++cursor])); + break; + case 'n': + case 't': // tuple length + tupleLength = Integer.parseInt(args[++cursor]); + break; + default: + throw new IllegalArgumentException(String.format( + "Unrecognised argument '%s'", arg)); + } + } + } + + new Milkwood().readAndGenerate(in, out, tupleLength, debug); + } + + /** + * Read tokens from this input and use them to generate text on this output. + * + * @param in + * the input stream to read. + * @param out + * the output stream to write to. + * @param tupleLength + * the length of tuples to be used in generation. + * @param debug + * whether to print debugging output. + * @throws IOException + * if the file system buggers up, which is not, in the cosmic + * scheme of things, very likely. + */ + void readAndGenerate(final InputStream in, final OutputStream out, + final int tupleLength, boolean debug) throws IOException { + /* The root of the rule tree I shall build. */ + RuleTreeNode root = new RuleTreeNode(); + int length = new Digester().read(in, tupleLength, root); + + if (debug) { + System.err.println(root.toString()); + } + + new TextGenerator().generate(out, tupleLength, root, length); + } - new TextGenerator().readAndGenerate( in, out, tupleLength); - } } diff --git a/src/cc/journeyman/milkwood/RuleTreeNode.java b/src/cc/journeyman/milkwood/RuleTreeNode.java index 6d28baf..144fb2c 100644 --- a/src/cc/journeyman/milkwood/RuleTreeNode.java +++ b/src/cc/journeyman/milkwood/RuleTreeNode.java @@ -23,6 +23,10 @@ import java.util.Stack; * @author Simon Brooke */ public class RuleTreeNode { + /** + * The magic token which identifies the root node of a rule tree. + */ + public static final String ROOTMAGICTOKEN = "*ROOT*"; /** * The line separator on this platform. */ @@ -41,6 +45,13 @@ public class RuleTreeNode { * Potential successors of this node */ private Map<String, RuleTreeNode> rules = new HashMap<String, RuleTreeNode>(); + + /** + * If no argument is passed, generate a root node. + */ + public RuleTreeNode() { + this(RuleTreeNode.ROOTMAGICTOKEN); + } /** * Create me wrapping this word. 
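The sliding-window digestion which Digester.read performs is easier to see in isolation. Each incoming token opens a new tuple and is appended to every tuple still open, so the tuple retired once the queue grows beyond tupleLength holds tupleLength + 1 tokens: the context window plus the word which followed it, which is exactly the shape RuleTreeNode.addSequence needs in order to record successors. Below is a minimal, self-contained sketch of that window using plain JDK collections in place of the patch's WordSequence and RuleTreeNode types; the class and variable names are illustrative only, not part of the patch.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.List;

/** Illustration only: the sliding tuple window used by Digester.read. */
public class WindowSketch {
    public static void main(String[] args) {
        final int tupleLength = 2;
        final Deque<List<String>> openTuples = new ArrayDeque<List<String>>();
        final String[] tokens = { "to", "begin", "at", "the", "beginning" };

        for (String token : tokens) {
            // each token opens a new tuple...
            openTuples.add(new ArrayList<String>());
            // ...and joins every tuple which is still open
            for (List<String> tuple : openTuples) {
                tuple.add(token);
            }
            // the oldest tuple is now complete: tupleLength context words
            // plus the token which followed them; Digester.read hands it to
            // root.addSequence() at this point
            if (openTuples.size() > tupleLength) {
                System.out.println("complete: " + openTuples.remove());
            }
        }
        // prints [to, begin, at], [begin, at, the], [at, the, beginning];
        // tuples still open at end of input are quietly discarded, exactly
        // as in Digester.read
    }
}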
diff --git a/src/cc/journeyman/milkwood/TextGenerator.java b/src/cc/journeyman/milkwood/TextGenerator.java index 2e4e60d..b66129a 100644 --- a/src/cc/journeyman/milkwood/TextGenerator.java +++ b/src/cc/journeyman/milkwood/TextGenerator.java @@ -6,229 +6,127 @@ */ package cc.journeyman.milkwood; -import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.OutputStream; import java.io.OutputStreamWriter; -import java.io.Reader; -import java.io.StreamTokenizer; import java.util.Collection; -import java.util.LinkedList; import java.util.Locale; -import java.util.Queue; import java.util.Random; import java.util.Stack; -import java.util.logging.Level; -import java.util.logging.Logger; - - /** * * @author Simon Brooke */ class TextGenerator { + /** - * The magic token which identifies the root node of the - * rule tree. + * The magic token which is deemed to end sentences. */ - private static final String ROOTMAGICTOKEN = "*ROOT*"; + public static final String PERIOD = "."; /** - * The special magic token which is deemed to end sentences. - */ - public static final String PERIOD = "."; - - /** - * The average number of sentences in a paragraph. - */ - public static final int AVSENTENCESPERPARA = 5; - /** - * A random number generator. - */ - private static Random RANDOM = new Random(); - /** - * Dictionary of first-words we know about; each first-word maps - * onto a tuple of tuples of word sequences beginning with that - * word, so 'I' might map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]]. - */ - TupleDictionary dictionary = new TupleDictionary(); + * The average number of sentences in a paragraph. + */ + public static final int AVSENTENCESPERPARA = 5; + /** + * A random number generator. + */ + private static Random RANDOM = new Random(); + /** + * Dictionary of first-words we know about; each first-word maps onto a + * tuple of tuples of word sequences beginning with that word, so 'I' might + * map onto [[I, CAME, COMMA],[I, SAW, COMMA],[I CONQUERED COMMA]]. + */ + TupleDictionary dictionary = new TupleDictionary(); - public TextGenerator() { - } + public TextGenerator() { + } - /** - * Read tokens from this input and use them to generate text on this output. - * @param in the input stream to read. - * @param out the output stream to write to. - * @param tupleLength the length of tuples to be used in generation. - * @throws IOException if the file system buggers up, which is not, in the - * cosmic scheme of things, very likely. - */ - void readAndGenerate(InputStream in, OutputStream out, int tupleLength) throws IOException { - /* The root of the rule tree I shall build. */ - RuleTreeNode root = new RuleTreeNode( ROOTMAGICTOKEN); - int length = read(in, tupleLength, root); - - System.err.println( root.toString()); - - generate( out, tupleLength, root, length); - } - /** - * Read tokens from the input stream, and compile them into a ruleset below root. - * @param in the input stream from which I read. - * @param tupleLength the length of the tuples I read. - * @param root the ruleset to which I shall add. - * @return the number of tokens read. 
- * @throws IOException - */ - private int read(InputStream in, int tupleLength, RuleTreeNode root) throws IOException { - int result = 0; - Queue<WordSequence> openTuples = new LinkedList<WordSequence>(); - StreamTokenizer tok = prepareTokenizer(in); - - for (int type = tok.nextToken(); type != StreamTokenizer.TT_EOF; type = tok.nextToken()) { - result ++; - final WordSequence newTuple = new WordSequence(); - String token = readBareToken(tok, type); + /** + * Generate text on this output from this rule tree. + * + * @param out + * the output stream to write to. + * @param tupleLength + * the length of tuples used in generation. + * @param root + * the root of the rule tree. + * @param length + * the number of tokens to generate. + * @throws IOException + * if it is impossible to write to the output. + */ + public void generate(OutputStream out, int tupleLength, RuleTreeNode root, + int length) throws IOException { + WordSequence tokens = this.compose(root, tupleLength, length); - openTuples.add(newTuple); - for ( WordSequence tuple : openTuples) { - tuple.add(token); - } - - if (openTuples.size() > tupleLength) { - root.addSequence( openTuples.remove()); - } - } - - return result; - } - - /** - * There surely must be a better way to get just the token out of a - * StreamTokenizer...! - * @param tok the tokenizer. - * @return just the next token. - */ - private String readBareToken(StreamTokenizer tok, int type) { - final String token; - - switch (type) { - case StreamTokenizer.TT_EOL: - token = "FIXME"; // TODO: fix this! - break; - case StreamTokenizer.TT_NUMBER: - token = new Double(tok.nval).toString(); - break; - case StreamTokenizer.TT_WORD: - token = tok.sval.toLowerCase(); - break; - default: - StringBuffer buffy = new StringBuffer(); - buffy.append((char) type); - token = buffy.toString(); - break; + if (tokens.contains(PERIOD)) { + // TODO: eq = equal? + tokens = this.truncateAtLastInstance(tokens, PERIOD); } - return token; + + this.generate(out, tokens); } - /** - * Prepare a tokeniser on this input stream, set up to handle at least - * Western European natural language text. - * @param in the stream. - * @return a suitable tokeniser. - */ - private StreamTokenizer prepareTokenizer(InputStream in) { - Reader gentle = new BufferedReader(new InputStreamReader(in)); - StreamTokenizer tok = new StreamTokenizer(gentle); - - tok.resetSyntax(); - tok.whitespaceChars(8, 15); - tok.whitespaceChars(28, 32); - /* treat quotemarks as white space */ - tok.whitespaceChars((int) '\"', (int) '\"'); - tok.whitespaceChars((int) '\'', (int) '\''); - tok.wordChars((int) '0', (int) '9'); - tok.wordChars((int) 'A', (int) 'Z'); - tok.wordChars((int) 'a', (int) 'z'); - tok.parseNumbers(); - return tok; + /** + * Write this sequence of tokens on this stream, sorting out minor issues of + * orthography. + * + * @param out + * the stream. + * @param tokens + * the tokens. + * @throws IOException + * if it is impossible to write (e.g. file system full). + */ + private void generate(OutputStream out, WordSequence tokens) + throws IOException { + BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out)); + boolean capitaliseNext = true; + + try { + for (String token : tokens) { + capitaliseNext = writeToken(dickens, capitaliseNext, token); + } + } finally { + dickens.flush(); + dickens.close(); + } } - private void generate(OutputStream out, int tupleLength, RuleTreeNode root, int length) throws IOException { - WordSequence tokens = this.compose( root, tupleLength, length); - - if ( tokens.contains(PERIOD)) { - // TODO: eq = equal? - tokens = this.truncateAtLastInstance( tokens, PERIOD); - } - - this.generate( out, tokens); - } - - /** - * Write this sequence of tokens on this stream, sorting out minor - * issues of orthography. - * @param out the stream. - * @param tokens the tokens. - * @throws IOException if it is impossible to write (e.g. file system full). - */ - private void generate(OutputStream out, WordSequence tokens) throws IOException { - BufferedWriter dickens = new BufferedWriter(new OutputStreamWriter(out)); - boolean capitaliseNext = true; - - try { - for (String token : tokens) { - capitaliseNext = writeToken(dickens, capitaliseNext, token); - } - } finally { - dickens.flush(); - dickens.close(); - } - } - - /** - * Deal with end of paragraph, capital after full stop, and other - * minor orthographic conventions. - * @param dickens the scrivener who writes for us. - * @param capitalise whether or not the token should be capitalised - * @param token the token to write. - * @return true if the next token to be written should be capitalised. - * @throws IOException - */ + /** + * Deal with end of paragraph, capital after full stop, and other minor + * orthographic conventions. + * + * @param dickens + * the scrivener who writes for us. + * @param capitalise + * whether or not the token should be capitalised + * @param token + * the token to write. + * @return true if the next token to be written should be capitalised. + * @throws IOException + */ private boolean writeToken(BufferedWriter dickens, boolean capitalise, String token) throws IOException { - if ( this.spaceBefore(token)) { - dickens.write( " "); + if (this.spaceBefore(token)) { + dickens.write(" "); } - if ( capitalise) { - dickens.write(token.substring(0, 1).toUpperCase(Locale.getDefault())); + if (capitalise) { + dickens.write(token.substring(0, 1) + .toUpperCase(Locale.getDefault())); dickens.write(token.substring(1)); } else { dickens.write(token); } - this.maybeParagraph( token, dickens); - + this.maybeParagraph(token, dickens); + return (token.endsWith(PERIOD)); } - /** - * Return false if token is punctuation, else true. Wouldn't it be - * nice if Java provided Character.isPunctuation(char)? However, since it - * doesn't, I can give this slightly special semantics: return true only if - * this is punctuation which would not normally be preceded with a space. - * @param token the token to test. - * @return true if the token should be preceded by a space, else false. - */ - private boolean spaceBefore(String token) { - final boolean result; - + /** + * Return false if token is punctuation, else true. Wouldn't it be nice if + * Java provided Character.isPunctuation(char)? However, since it doesn't, I + * can give this slightly special semantics: return true only if this is + * punctuation which would not normally be preceded with a space. + * + * @param token + * the token to test. + * @return true if the token should be preceded by a space, else false. + */ + private boolean spaceBefore(String token) { + final boolean result; + if (token.length() == 1) { switch (token.charAt(0)) { case '.': @@ -241,9 +139,10 @@ class TextGenerator { * the apostrophe lost */ case 't': - /* similar; probably 'doesn't' or 'shouldn't' or other cases - * of 'not' with an elided 'o'. - */ + /* + * similar; probably 'doesn't' or 'shouldn't' or other cases of + * 'not' with an elided 'o'. + */ result = false; break; default: @@ -253,107 +152,120 @@ class TextGenerator { } else { result = false; } - - return result; - } + return result; + } - /** - * If this token is an end-of-sentence token, then, on one chance in - * some, have the writer write two new lines. NOTE: The tokeniser is treating - * PERIOD ('.') as a word character, even though it has not been told to. - * Token.endsWith( PERIOD) is a hack to get round this problem. - * TODO: investigate and fix. - * - * @param token a token - * @param dickens our scrivener - * @throws IOException if Mr Dickens has run out of ink - */ - private void maybeParagraph(String token, BufferedWriter dickens) throws IOException { - if ( token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) { - dickens.write("\n\n"); - } - } + /** + * If this token is an end-of-sentence token, then, on one chance in some, + * have the writer write two new lines. NOTE: The tokeniser is treating + * PERIOD ('.') as a word character, even though it has not been told to. + * Token.endsWith( PERIOD) is a hack to get round this problem. TODO: + * investigate and fix. + * + * @param token + * a token + * @param dickens + * our scrivener + * @throws IOException + * if Mr Dickens has run out of ink + */ + private void maybeParagraph(String token, BufferedWriter dickens) + throws IOException { + if (token.endsWith(PERIOD) && RANDOM.nextInt(AVSENTENCESPERPARA) == 0) { + dickens.write("\n\n"); + } + } - /** - * Recursive, backtracking, output generator. - * @param rules - * @param tupleLength - * @param length - * @return - */ - private WordSequence compose(RuleTreeNode rules, int tupleLength, int length) { - Stack<String> preamble = composePreamble( rules); - WordSequence result = new WordSequence(); - - // composing the preamble will have ended with *ROOT* on top of the stack; - // get rid of it. - preamble.pop(); - - result.addAll(preamble); - - result.addAll(this.compose( preamble, rules, rules, tupleLength, length)); - return result; - } + /** + * Recursive, backtracking, output generator. + * + * @param rules + * @param tupleLength + * @param length + * @return + */ + private WordSequence compose(RuleTreeNode rules, int tupleLength, int length) { + Stack<String> preamble = composePreamble(rules); + WordSequence result = new WordSequence(); + + // composing the preamble will have ended with *ROOT* on top of the + // stack; get rid of it. + preamble.pop(); + + result.addAll(preamble); + + result.addAll(this.compose(preamble, rules, rules, tupleLength, length)); + return result; + } - /** - * Recursively attempt to find sequences in the ruleset to append to - * what's been composed so far. - * @param glanceBack - * @param allRules - * @param currentRules - * @param tupleLength - * @param length - * @return - */ + /** + * Recursively attempt to find sequences in the ruleset to append to what's + * been composed so far. + * + * @param glanceBack + * @param allRules + * @param currentRules + * @param tupleLength + * @param length + * @return + */ private WordSequence compose(Stack<String> glanceBack, RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength, int length) { - assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size"; - assert (allRules.getWord() == ROOTMAGICTOKEN) : "Shouldn't happen: bad rule set"; - WordSequence result; + assert (glanceBack.size() == tupleLength) : "Shouldn't happen: bad tuple size"; + assert (RuleTreeNode.ROOTMAGICTOKEN.equals(allRules.getWord())) : "Shouldn't happen: bad rule set"; + WordSequence result; - try { - @SuppressWarnings("unchecked") - String here = currentRules.getWord((Stack<String>) glanceBack.clone()); - System.err.println( String.format( "Trying token %s", here)); + try { + @SuppressWarnings("unchecked") + String here = currentRules.getWord((Stack<String>) glanceBack + .clone()); + System.err.println(String.format("Trying token %s", here)); - result = new WordSequence(); - result.add(here); + result = new WordSequence(); + result.add(here); - if (length != 0) { - /* we're not done yet */ - Collection<String> options = allRules.getSuccessors(); + if (length != 0) { + /* we're not done yet */ + Collection<String> options = allRules.getSuccessors(); - for (String next : options) { - WordSequence rest = - this.tryOption( (Stack<String>) glanceBack.clone(), allRules, - currentRules.getRule(next), tupleLength, length - 1); + for (String next : options) { + @SuppressWarnings("unchecked") + WordSequence rest = this.tryOption( + (Stack<String>) glanceBack.clone(), allRules, + currentRules.getRule(next), tupleLength, length - 1); - if (rest != null) { - /* we have a solution */ - result.addAll(rest); - break; - } - } - } - } catch (NoSuchPathException ex) { - Logger.getLogger(TextGenerator.class.getName()).log(Level.WARNING, - String.format("No path %s: Backtracking...", glanceBack)); - result = null; - } + if (rest != null) { + /* we have a solution */ + result.addAll(rest); + break; + } + } + } + } catch (NoSuchPathException ex) { + System.err.println(String.format("No path %s: Backtracking...", glanceBack)); + result = null; + } - return result; - } - - /** - * Try composing with this ruleset - * @param glanceBack - * @param allRules all the rules there are. - * @param currentRules the current node in the rule tree. - * @param tupleLength the size of the glanceback window we're considering. - * @param length - * @return - */ + return result; + } + + /** + * Try composing with this ruleset + * + * @param glanceBack + * @param allRules + * all the rules there are. + * @param currentRules + * the current node in the rule tree. + * @param tupleLength + * the size of the glanceback window we're considering. + * @param length + * @return + */ private WordSequence tryOption(Stack<String> glanceBack, RuleTreeNode allRules, RuleTreeNode currentRules, int tupleLength, int length) { @@ -364,69 +276,76 @@ class TextGenerator { length); } - /** - * Return a new stack comprising all the items on the current stack, - * with this new string added at the bottom - * - * @param stack the stack to restack. - * @param bottom the item to place on the bottom. - * @return the restacked stack. - */ - private Stack<String> restack(Stack<String> stack, String bottom) { - final Stack<String> result; - if (stack.isEmpty()) { - result = new Stack<String>(); - result.push(bottom); - } else { - String top = stack.pop(); - result = restack(stack, bottom); - result.push(top); - } - return result; - } + /** + * Return a new stack comprising all the items on the current stack, with + * this new string added at the bottom + * + * @param stack + * the stack to restack. + * @param bottom + * the item to place on the bottom. + * @return the restacked stack. + */ + private Stack<String> restack(Stack<String> stack, String bottom) { + final Stack<String> result; + if (stack.isEmpty()) { + result = new Stack<String>(); + result.push(bottom); + } else { + String top = stack.pop(); + result = restack(stack, bottom); + result.push(top); + } + return result; + } - /** - * Random walk of the rule tree to extract (from the root) a legal sequence of words the length of our tuple. - * - * @param rules the rule tree (fragment) to walk. - * @return a sequence of words. - */ - private Stack<String> composePreamble(RuleTreeNode rules) { - final Stack<String> result; - final RuleTreeNode successor = rules.getRule(); - - if (successor == null) { - result = new Stack<String>(); - } else { - result = this.composePreamble(successor); - result.push(rules.getWord()); - } - return result; - } + /** + * Random walk of the rule tree to extract (from the root) a legal sequence + * of words the length of our tuple. + * + * @param rules + * the rule tree (fragment) to walk. + * @return a sequence of words. + */ + private Stack<String> composePreamble(RuleTreeNode rules) { + final Stack<String> result; + final RuleTreeNode successor = rules.getRule(); + + if (successor == null) { + result = new Stack<String>(); + } else { + result = this.composePreamble(successor); + result.push(rules.getWord()); + } + return result; + } - /** - * - * @param tokens a sequence of tokens - * @param marker a marker to terminate after the last occurrence of. - * @return a copy of tokens, truncated at the last occurrence of the marker. - */ - private WordSequence truncateAtLastInstance(WordSequence tokens, - String marker) { - final WordSequence result = new WordSequence(); - - if (!tokens.isEmpty()) { - String token = tokens.remove(); - result.add(token); - if (!(marker.equals(token) && !tokens.contains(marker))) { - /* woah, double negatives. If the token we're looking at is the - * marker, and the remainder of the tokens does not include the - * marker, we're done. Otherwise, we continue. OK?
- */ - result.addAll(this.truncateAtLastInstance(tokens, marker)); - } - } - - return result; - } + /** + * + * @param tokens + * a sequence of tokens + * @param marker + * a marker to terminate after the last occurrence of. + * @return a copy of tokens, truncated at the last occurrence of the marker. + */ + private WordSequence truncateAtLastInstance(WordSequence tokens, + String marker) { + final WordSequence result = new WordSequence(); + + if (!tokens.isEmpty()) { + String token = tokens.remove(); + result.add(token); + if (!(marker.equals(token) && !tokens.contains(marker))) { + /* + * woah, double negatives. If the token we're looking at is the + * marker, and the remainder of the tokens does not include the + * marker, we're done. Otherwise, we continue. OK? + */ + result.addAll(this.truncateAtLastInstance(tokens, marker)); + } + } + + return result; + } } diff --git a/src/cc/journeyman/milkwood/Tokeniser.java b/src/cc/journeyman/milkwood/Tokeniser.java new file mode 100644 index 0000000..86a279c --- /dev/null +++ b/src/cc/journeyman/milkwood/Tokeniser.java @@ -0,0 +1,74 @@ +/* + * Proprietary unpublished source code property of + * Simon Brooke . + * + * Copyright (c) 2013 Simon Brooke + */ +package cc.journeyman.milkwood; + +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StreamTokenizer; + +/** + * A tokeniser which reads tokens in a manner which suits me. Although this + * implementation is based on a StreamTokenizer, the point of separating this + * out into its own class is that if I had more time I could reimplement. + * + * @author simon + * + */ +public class Tokeniser extends StreamTokenizer { + + public Tokeniser(Reader r) { + super(r); + + this.resetSyntax(); + this.whitespaceChars(8, 15); + this.whitespaceChars(28, 32); + /* + * treat quotemarks as white space. Actually it would be better if quote + * marks were white space only if preceded or followed by whitespace, so + * that, e.g., 'don't' and 'can't' appeared as single tokens. But that + * means really reimplementing the parser and I don't have time. + */ + this.whitespaceChars((int) '\"', (int) '\"'); + this.whitespaceChars((int) '\'', (int) '\''); + this.wordChars((int) '0', (int) '9'); + this.wordChars((int) 'A', (int) 'Z'); + this.wordChars((int) 'a', (int) 'z'); + } + + public Tokeniser(InputStream in) { + this(new BufferedReader(new InputStreamReader(in))); + } + + /** + * There surely must be a better way to get just the token out of a + * StreamTokenizer...! + * + * @return just the next token. + */ + public String readBareToken() { + final String token; + + switch (this.ttype) { + case StreamTokenizer.TT_EOL: + token = "FIXME"; // TODO: fix this! + break; + case StreamTokenizer.TT_NUMBER: + token = Double.toString(this.nval); + break; + case StreamTokenizer.TT_WORD: + token = this.sval.toLowerCase(); + break; + default: + StringBuffer buffy = new StringBuffer(); + buffy.append((char) this.ttype); + token = buffy.toString(); + break; + } + return token; + } + +}
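The Tokeniser's behaviour is driven entirely by the syntax table its constructor builds: quote marks are swallowed as white space, letters and digits glue together into words, and any other character comes back from nextToken() as a single-character token, which is how '.' and ',' survive to become tokens in the rule tree. Here is a standalone sketch of the same configuration applied to a stock StreamTokenizer; the sample text and class name are illustrative, not part of the patch.

import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;

/** Illustration only: how the Tokeniser's syntax table splits text. */
public class TokeniserSketch {
    public static void main(String[] args) throws IOException {
        final StreamTokenizer tok = new StreamTokenizer(new StringReader(
                "\"Dreams,\" he said. Don't stop."));
        tok.resetSyntax();               // start with no rules at all
        tok.whitespaceChars(8, 15);      // control characters...
        tok.whitespaceChars(28, 32);     // ...up to and including space
        tok.whitespaceChars('\"', '\"'); // quote marks vanish as white space
        tok.whitespaceChars('\'', '\'');
        tok.wordChars('0', '9');
        tok.wordChars('A', 'Z');
        tok.wordChars('a', 'z');

        for (int type = tok.nextToken(); type != StreamTokenizer.TT_EOF;
                type = tok.nextToken()) {
            // words arrive in sval; any other character is its own token
            final String token = (type == StreamTokenizer.TT_WORD)
                    ? tok.sval.toLowerCase()
                    : String.valueOf((char) type);
            System.out.println(token);
        }
        // prints: dreams , he said . don t stop .
        // note how don't splits into don / t because the apostrophe is white
        // space; this is the situation TextGenerator.spaceBefore
        // special-cases in its 's' and 't' branches
    }
}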