Now tokenises punctuation as well as words: each ASCII punctuation character (`\p{Punct}`) becomes its own token.
This commit is contained in:
parent
5c6a0f0b90
commit
3315eef7b8
|
@ -33,8 +33,6 @@
|
|||
;; of the path.
|
||||
true (merge-rules rules (add-rule (rules (first path)) (rest path)))))
|
||||
|
||||
;; (map (fn [string] (.toLowerCase string)) (re-seq #"\w+" (slurp "../milkwood/undermilkwood.txt")))
|
||||
|
||||
(defn analyse-tokens
|
||||
"Read this sequence of tokens and process it into rules.
|
||||
|
||||
|
@ -60,4 +58,4 @@
|
|||
file: the path name of a file to read;
|
||||
depth: the depth of rules/length of window we're considering"
|
||||
[file depth]
|
||||
(analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+" (slurp file))) depth))
|
||||
(analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+|\p{Punct}" (slurp file))) depth))
|
||||
|
|
Loading…
Reference in a new issue