From 3315eef7b8a19f3443518183c8c1a3782f4902ae Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Fri, 8 Nov 2013 11:11:30 +0000 Subject: [PATCH] Now correctly tokenises all punctuation. --- src/milkwood_clj/analyse.clj | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/milkwood_clj/analyse.clj b/src/milkwood_clj/analyse.clj index 1320871..d12c09b 100644 --- a/src/milkwood_clj/analyse.clj +++ b/src/milkwood_clj/analyse.clj @@ -33,8 +33,6 @@ ;; of the path. true (merge-rules rules (add-rule (rules (first path)) (rest path))))) -;; (map (fn [string] (.toLowerCase string)) (re-seq #"\w+" (slurp "../milkwood/undermilkwood.txt"))) - (defn analyse-tokens "Read this sequence of tokens and process it into rules. @@ -60,4 +58,4 @@ file: the path name of a file to read; depth: the depth of rules/length of window we're considering" [file depth] - (analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+" (slurp file))) depth)) + (analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+|\p{Punct}" (slurp file))) depth))