Now correctly tokenises all punctuation.

This commit is contained in:
Simon Brooke 2013-11-08 11:11:30 +00:00
parent 5c6a0f0b90
commit 3315eef7b8

View file

@ -33,8 +33,6 @@
;; of the path.
true (merge-rules rules (add-rule (rules (first path)) (rest path)))))
;; (map (fn [string] (.toLowerCase string)) (re-seq #"\w+" (slurp "../milkwood/undermilkwood.txt")))
(defn analyse-tokens
"Read this sequence of tokens and process it into rules.
@ -60,4 +58,4 @@
file: the path name of a file to read;
depth: the depth of rules/length of window we're considering"
[file depth]
(analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+" (slurp file))) depth))
(analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+|\p{Punct}" (slurp file))) depth))