milkwood-java/src/cc/journeyman/milkwood/Tokeniser.java

85 lines
2.7 KiB
Java

/*
* Proprietary unpublished source code property of
* Simon Brooke <simon@journeyman.cc>.
*
* Copyright (c) 2013 Simon Brooke <simon@journeyman.cc>
*/
package cc.journeyman.milkwood;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.Locale;
/**
* A tokeniser which reads tokens in a manner which suits me. Although this
* implementation is based on a StreamTokenizer, the point of separating this
* out into its own class is that if I had more time I could reimplement.
*
* @author simon
*
*/
public class Tokeniser extends StreamTokenizer {
    /**
     * Initialise me appropriately wrapping this reader. The inherited syntax
     * table is cleared and rebuilt so that ASCII letters and digits form
     * words, selected control characters, space, quote marks, underscore and
     * hyphen separate words, and every other character is returned as a
     * single-character token.
     *
     * @param r the reader to wrap.
     */
    public Tokeniser(Reader r) {
        super(r);
        /*
         * start from a blank table: after this every character is 'ordinary',
         * and number parsing is switched off, so digits only form tokens
         * because they are declared word characters below.
         */
        this.resetSyntax();
        /* backspace, tab, line feed, vertical tab, form feed, return, SO, SI */
        this.whitespaceChars(8, 15);
        /* the FS, GS, RS and US separators, and space */
        this.whitespaceChars(28, 32);
        /*
         * treat quotemarks as white space. Actually it would be better if quote
         * marks were white space only if preceded or followed by whitespace, so
         * that, e.g., 'don't' and 'can't' appeared as single tokens. But that
         * means really reimplementing the parser and I don't have time.
         */
        this.whitespaceChars((int) '\"', (int) '\"');
        this.whitespaceChars((int) '\'', (int) '\'');
        /*
         * treat underscore and hyphen as whitespace as well. Again, hyphen with
         * either leading or trailing non-whitespace probably ought to be
         * treated specially, but...
         */
        this.whitespaceChars((int) '_', (int) '_');
        this.whitespaceChars((int) '-', (int) '-');
        this.wordChars((int) '0', (int) '9');
        this.wordChars((int) 'A', (int) 'Z');
        this.wordChars((int) 'a', (int) 'z');
    }

    /**
     * Initialise me wrapping this input stream. The bytes are decoded with
     * the platform default charset, because no charset is passed to the
     * InputStreamReader.
     *
     * @param in the stream to read tokens from.
     */
    public Tokeniser(InputStream in) {
        this(new BufferedReader(new InputStreamReader(in)));
    }

    /**
     * Return the text of the current token, that is, the one found by the
     * most recent call to nextToken(). This does not itself advance the
     * stream.
     *
     * @return for a word, the word folded to lower case; for a number, its
     *         decimal representation; for an end of line (only reported if
     *         eolIsSignificant(true) has been set), a single line feed; at
     *         end of stream, or before any token has been read, the empty
     *         string; otherwise a one-character string holding the ordinary
     *         character that was read. Never null.
     */
    public String readBareToken() {
        final String token;
        switch (this.ttype) {
        case StreamTokenizer.TT_EOL:
            token = "\n";
            break;
        case StreamTokenizer.TT_EOF:
            /* nothing left to read, so there is no token text */
            token = "";
            break;
        case StreamTokenizer.TT_NUMBER:
            token = Double.toString(this.nval);
            break;
        case StreamTokenizer.TT_WORD:
            /*
             * fold case independently of the default locale; otherwise, e.g.,
             * a Turkish locale maps 'I' to a dotless i.
             */
            token = this.sval.toLowerCase(Locale.ROOT);
            break;
        default:
            /*
             * a non-negative type is the ordinary character itself; any other
             * negative type means no token has been read yet.
             */
            token = this.ttype < 0 ? "" : String.valueOf((char) this.ttype);
            break;
        }
        return token;
    }
}