diff --git a/project.clj b/project.clj
index b98eaa0..db2ac24 100644
--- a/project.clj
+++ b/project.clj
@@ -3,8 +3,9 @@
   :url "http://github.com/tirkarthi/clj-wordcloud"
   :license {:name "MIT public license"
             :url "http://opensource.org/licenses/mit-license.php"}
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [com.kennycason/kumo-core "1.13"]]
+  :dependencies [[org.clojure/clojure "1.12.1"]
+                 [com.kennycason/kumo-core "1.13"]
+                 [com.taoensso/telemere "1.0.1"]]
   :plugins [[lein-codox "0.10.3"]]
   ;; TODO : Add source uri and keep them in sync for docs
   :codox {:output-path "docs"})
diff --git a/src/clj_wordcloud/core.clj b/src/clj_wordcloud/core.clj
index d000c6c..26293fa 100644
--- a/src/clj_wordcloud/core.clj
+++ b/src/clj_wordcloud/core.clj
@@ -1,15 +1,19 @@
 (ns clj-wordcloud.core
-  (:require [clojure.spec.alpha :as spec])
-  (:import (java.awt Dimension Color)
-           (java.awt.image BufferedImage)
-           (javax.imageio ImageIO)
-           (com.kennycason.kumo WordCloud WordFrequency LayeredWordCloud CollisionMode)
-           (com.kennycason.kumo.bg CircleBackground RectangleBackground PixelBoundryBackground
-                                   PixelBoundryBackground)
-           (com.kennycason.kumo.palette ColorPalette)
-           (com.kennycason.kumo.font KumoFont FontWeight)
-           (com.kennycason.kumo.font.scale LinearFontScalar)
-           (com.kennycason.kumo.nlp FrequencyAnalyzer)))
+  (:require
+    [clojure.java.io :as io]
+    [clojure.spec.alpha :as spec]
+    [taoensso.telemere :refer [log!]])
+  (:import
+    (com.kennycason.kumo CollisionMode WordCloud WordFrequency)
+    (com.kennycason.kumo.bg
+      CircleBackground
+      PixelBoundryBackground
+      PixelBoundryBackground
+      RectangleBackground)
+    (com.kennycason.kumo.font FontWeight KumoFont)
+    (com.kennycason.kumo.font.scale LinearFontScalar SqrtFontScalar)
+    (com.kennycason.kumo.palette ColorPalette)
+    (java.awt Color Dimension)))
 
 
 (defn- background-object
@@ -23,7 +27,7 @@
       :rectangle
       (RectangleBackground. dimension)
       :pixel
-      (PixelBoundryBackground. (clojure.java.io/input-stream bitmap))
+      (PixelBoundryBackground. (io/input-stream bitmap))
       (CircleBackground. size))))
 
 
@@ -31,6 +35,7 @@
   [frequency-map]
   (doto (java.util.ArrayList.)
     (#(doseq [[word count] frequency-map]
+;;        (log! (format "Adding word `%s`, used `%d` times" word count))
         (.add %1 (WordFrequency. (str word) count))))))
 
 
@@ -41,7 +46,8 @@
         y-scale (get-in options [:font :y-scale] 10)]
     (case scale-type
       :linear
-      (LinearFontScalar. x-scale y-scale))))
+      (LinearFontScalar. x-scale y-scale)
+      :sqrt (SqrtFontScalar. x-scale y-scale))))
 
 
 (defn- build-font
@@ -102,7 +108,7 @@
 ; font specs
 (spec/def :font/type string?)
 (spec/def :font/weight #{:plain :bold :italic})
-(spec/def :font/scale-type #{:linear})
+(spec/def :font/scale-type #{:linear :sqrt})
 (spec/def :font/x-scale (spec/and number? pos?))
 (spec/def :font/y-scale (spec/and number? pos?))
 (spec/def :font/padding (spec/and number? pos?))
@@ -136,9 +142,9 @@
   ([frequency-map]
    (word-cloud frequency-map {}))
   ([frequency-map options]
-   (if-not (spec/valid? ::frequency-map frequency-map)
+   (when-not (spec/valid? ::frequency-map frequency-map)
      (throw (ex-info "Invalid data : " (spec/explain-data ::frequency-map frequency-map))))
-   (if-not (spec/valid? ::options options)
+   (when-not (spec/valid? ::options options)
      (throw (ex-info "Invalid options : " (spec/explain-data ::options options))))
    (let [word-frequencies (build-word-frequency frequency-map)
          dimension (build-dimension options)
diff --git a/src/clj_wordcloud/word_identifier.clj b/src/clj_wordcloud/word_identifier.clj
new file mode 100644
index 0000000..f377acf
--- /dev/null
+++ b/src/clj_wordcloud/word_identifier.clj
@@ -0,0 +1,109 @@
+(ns clj-wordcloud.word-identifier
+  (:require [clojure.string :as s])
+  (:import [java.util.regex Pattern]))
+
+(def word-matcher
+  "What is a word? It's a sequence of alphabetical characters, for some
+  alphabet (in the case of English, a slightly extended Latin alphabet).
+  But some of those characters may be decorated with diacriticals, and
+  occasionally characters from other alphabets will be used. In my
+  writing, these are encoded as 'HTML entities'.
+
+  This regular expression attempts to match anything which is a word.
+  It does not have a catalogue of valid HTML entities, so things which have
+  the form of HTML entities but which are not valid will be accepted.
+  Entity names may contain both upper and lower case letters (`&Eacute;`).
+  Note that the empty string also matches."
+  #"([A-Za-z]*&([A-Za-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});[A-Za-z]*)*|[A-Za-z]*")
+
+(def trimable-matcher
+  "Intended to match a string which contains exactly one word
+  (doesn't work yet)."
+  #"^[A-Za-z0-9&;]([A-Za-z]*&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});[A-Za-z]*)^[A-Za-z0-9&]*")
+
+(def candidate-matcher
+  "This is inelegant. It identifies a string which MIGHT contain a word, and
+  isolates the bit that might be a word."
+  #"[^A-Za-z&;]*(.*)[^A-Za-z&;]*")
+
+(def left-trim
+  "Group 1 is the input with any leading characters that cannot start a word removed."
+  #"[^A-Za-z&]*(.*)")
+
+(def right-trim
+  "Group 1 is the longest leading run of characters that can belong to a word."
+  #"([A-Za-z0-9&;]*).*")
+
+(def whitespace-chars
+  "Regex escapes for every character treated as whitespace when splitting text."
+  ["" ;; dummy empty string for homogeneity
+   "\\u0009" ;; CHARACTER TABULATION
+   "\\u000A" ;; LINE FEED (LF)
+   "\\u000B" ;; LINE TABULATION
+   "\\u000C" ;; FORM FEED (FF)
+   "\\u000D" ;; CARRIAGE RETURN (CR)
+   "\\u0020" ;; SPACE
+   "\\u0085" ;; NEXT LINE (NEL)
+   "\\u00A0" ;; NO-BREAK SPACE
+   "\\u1680" ;; OGHAM SPACE MARK
+   "\\u180E" ;; MONGOLIAN VOWEL SEPARATOR
+   "\\u2000" ;; EN QUAD
+   "\\u2001" ;; EM QUAD
+   "\\u2002" ;; EN SPACE
+   "\\u2003" ;; EM SPACE
+   "\\u2004" ;; THREE-PER-EM SPACE
+   "\\u2005" ;; FOUR-PER-EM SPACE
+   "\\u2006" ;; SIX-PER-EM SPACE
+   "\\u2007" ;; FIGURE SPACE
+   "\\u2008" ;; PUNCTUATION SPACE
+   "\\u2009" ;; THIN SPACE
+   "\\u200A" ;; HAIR SPACE
+   "\\u2028" ;; LINE SEPARATOR
+   "\\u2029" ;; PARAGRAPH SEPARATOR
+   "\\u202F" ;; NARROW NO-BREAK SPACE
+   "\\u205F" ;; MEDIUM MATHEMATICAL SPACE
+   "\\u3000" ;; IDEOGRAPHIC SPACE
+   ])
+
+(def whitespace-matcher
+  "Pattern matching one or more consecutive characters from `whitespace-chars`."
+  (Pattern/compile (apply str (concat ["["] whitespace-chars ["]+"]))))
+
+(defn word?
+  "True predicate, returning `true` if string `s` matches the pattern
+  `word-matcher`, `false` otherwise (including when `s` is not a string)."
+  [s]
+  (boolean (and (string? s) (re-matches word-matcher s))))
+
+(defn word??
+  "Not a true predicate; if string `s` is a word, returns `s`, else `nil`.
+  A function rather than a macro, so `s` is evaluated exactly once and
+  `word??` can be passed to higher-order functions."
+  [s]
+  (when (word? s) s))
+
+(defn word???
+  "When `s` is a string which contains exactly one word, returns that word,
+  otherwise `nil`. Leading and trailing non-word characters are discarded.
+  A trailing `;` is kept when it closes an HTML entity (`caf&eacute;`) and
+  dropped when it is punctuation (`word;`)."
+  [s]
+  (let [c (when (string? s)
+            (some->> (re-matches left-trim s) ; nil when `s` has an embedded line break
+                     second
+                     (re-matches right-trim)
+                     second
+                     not-empty))] ; "" matches `word-matcher`, so reject it here
+    ;; Try the candidate as-is first: stripping the `;` unconditionally would
+    ;; break every word that ends in an entity.
+    (or (word?? c)
+        (when (and c (s/ends-with? c ";"))
+          (word?? (not-empty (subs c 0 (dec (count c)))))))))
+
+(defn words
+  "Splits `s` on whitespace and returns a lazy sequence of the words found,
+  in order; fragments which contain no word are dropped."
+  [s]
+  ;; You'd think that this would be as simple as
+  ;; (re-seq word-matcher s), but that doesn't work for me.
+  (keep word??? (s/split s whitespace-matcher)))
\ No newline at end of file