Now correctly tokenises punctuation: each ASCII punctuation character becomes its own token.
This commit is contained in:
parent
5c6a0f0b90
commit
3315eef7b8
|
@ -33,8 +33,6 @@
|
||||||
;; of the path.
|
;; of the path.
|
||||||
true (merge-rules rules (add-rule (rules (first path)) (rest path)))))
|
true (merge-rules rules (add-rule (rules (first path)) (rest path)))))
|
||||||
|
|
||||||
;; (map (fn [string] (.toLowerCase string)) (re-seq #"\w+" (slurp "../milkwood/undermilkwood.txt")))
|
|
||||||
|
|
||||||
(defn analyse-tokens
|
(defn analyse-tokens
|
||||||
"Read this sequence of tokens and process it into rules.
|
"Read this sequence of tokens and process it into rules.
|
||||||
|
|
||||||
|
@ -60,4 +58,4 @@
|
||||||
file: the path name of a file to read;
|
file: the path name of a file to read;
|
||||||
depth: the depth of rules/length of window we're considering"
|
depth: the depth of rules/length of window we're considering"
|
||||||
[file depth]
|
[file depth]
|
||||||
(analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+" (slurp file))) depth))
|
(analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+|\p{Punct}" (slurp file))) depth))
|
||||||
|
|
Loading…
Reference in a new issue