Added convenience.clj, word-identifier.clj; Upversioned

2025-07-01 09:38:08 +00:00 · 2025-06-17 08:40:52 +01:00 · 2025-06-17 08:40:52 +01:00 · efa3c1289e
parent 2a854e7f2b
commit efa3c1289e
3 changed files with 122 additions and 18 deletions
--- a/project.clj
+++ b/project.clj
@ -3,8 +3,9 @@
  :url "http://github.com/tirkarthi/clj-wordcloud"
  :license {:name "MIT public license"
            :url "http://opensource.org/licenses/mit-license.php"}
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [com.kennycason/kumo-core "1.13"]]
+  :dependencies [[org.clojure/clojure "1.12.1"]
+                 [com.kennycason/kumo-core "1.13"]
+                 [com.taoensso/telemere "1.0.1"]]
  :plugins [[lein-codox "0.10.3"]]
  ;; TODO : Add source uri and keep them in sync for docs
  :codox {:output-path "docs"})
--- a/src/clj_wordcloud/core.clj
+++ b/src/clj_wordcloud/core.clj
@ -1,15 +1,19 @@
 (ns clj-wordcloud.core
-  (:require [clojure.spec.alpha :as spec])
-  (:import (java.awt Dimension Color)
-           (java.awt.image BufferedImage)
-           (javax.imageio ImageIO)
-           (com.kennycason.kumo WordCloud WordFrequency LayeredWordCloud CollisionMode)
-           (com.kennycason.kumo.bg CircleBackground RectangleBackground PixelBoundryBackground
-                                   PixelBoundryBackground)
+  (:require
+   [clojure.java.io :as io]
+   [clojure.spec.alpha :as spec] 
+   [taoensso.telemere :refer [log!]])
+  (:import
+   (com.kennycason.kumo CollisionMode WordCloud WordFrequency)
+   (com.kennycason.kumo.bg
+    CircleBackground
+    PixelBoundryBackground
+    PixelBoundryBackground
+    RectangleBackground)
+   (com.kennycason.kumo.font FontWeight KumoFont)
+   (com.kennycason.kumo.font.scale LinearFontScalar SqrtFontScalar)
   (com.kennycason.kumo.palette ColorPalette)
-           (com.kennycason.kumo.font KumoFont FontWeight)
-           (com.kennycason.kumo.font.scale LinearFontScalar)
-           (com.kennycason.kumo.nlp FrequencyAnalyzer)))
+   (java.awt Color Dimension)))


 (defn- background-object
@ -23,7 +27,7 @@
      :rectangle
      (RectangleBackground. dimension)
      :pixel
-      (PixelBoundryBackground. (clojure.java.io/input-stream bitmap))
+      (PixelBoundryBackground. (io/input-stream bitmap))
      (CircleBackground. size))))


@ -31,6 +35,7 @@
  [frequency-map]
  (doto (java.util.ArrayList.)
    (#(doseq [[word count] frequency-map]
+;;        (log! (format "Adding word `%s`, used `%d` times" word count))
        (.add %1 (WordFrequency. (str word) count))))))


@ -41,7 +46,8 @@
        y-scale    (get-in options [:font :y-scale] 10)]
    (case scale-type
      :linear
-      (LinearFontScalar. x-scale y-scale))))
+      (LinearFontScalar. x-scale y-scale)
+      :sqrt (SqrtFontScalar. x-scale y-scale))))


 (defn- build-font
@ -102,7 +108,7 @@
 ; font specs
 (spec/def :font/type string?)
 (spec/def :font/weight #{:plain :bold :italic})
-(spec/def :font/scale-type #{:linear})
+(spec/def :font/scale-type #{:linear :sqrt})
 (spec/def :font/x-scale (spec/and number? pos?))
 (spec/def :font/y-scale (spec/and number? pos?))
 (spec/def :font/padding (spec/and number? pos?))
@ -136,9 +142,9 @@
  ([frequency-map]
   (word-cloud frequency-map {}))
  ([frequency-map options]
-   (if-not (spec/valid? ::frequency-map frequency-map)
+   (when-not (spec/valid? ::frequency-map frequency-map)
     (throw (ex-info "Invalid data : " (spec/explain-data ::frequency-map frequency-map))))
-   (if-not (spec/valid? ::options options)
+   (when-not (spec/valid? ::options options)
     (throw (ex-info "Invalid options : " (spec/explain-data ::options options))))
  (let [word-frequencies (build-word-frequency frequency-map)
        dimension        (build-dimension options)
--- a/src/clj_wordcloud/word_identifier.clj
+++ b/src/clj_wordcloud/word_identifier.clj
@ -0,0 +1,97 @@
+(ns clj-wordcloud.word-identifier
+  (:require [clojure.string :as s])
+  (:import [java.util.regex Pattern]))
+
+(def word-matcher
+  "What is a word? It's a sequence of alphabetical characters, for some
+   alphabet (in the case of English, a slightly extended Latin alphabet).
+   But some of those characters may be decorated with diacriticals, and
+   occasionally characters from other alphabets will be used. In my
+   writing, these are encoded as 'HTML entities'.
+
+   This regular expression attempts to match anything which is a word.
+   It does not have a catalogue of valid HTML entities, so things which have
+   the form of HTML entities but which are not valid will be accepted.
+
+   Note that both alternatives can match zero characters, so the empty
+   string also matches this pattern; callers that care must check for
+   emptiness themselves."
+  #"([A-Za-z]*&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});[A-Za-z]*)*|[A-Za-z]*")
+
+(def trimable-matcher
+  "Intended to match a string which contains exactly one word
+   (doesn't work yet).
+
+   NOTE(review): the `^` in the middle of the pattern sits outside a
+   character class, so it is a start-of-input anchor rather than a
+   negation; after the first character has been consumed it presumably
+   can never be satisfied - confirm before relying on this pattern."
+  #"^[A-Za-z0-9&;]([A-Za-z]*&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});[A-Za-z]*)^[A-Za-z0-9&]*")
+;
+(def candidate-matcher
+  "This is inelegant. It identifies a string which MIGHT contain a word, and
+   isolates (as capture group 1) the bit that might be a word."
+  #"[^A-Za-z&;]*(.*)[^A-Za-z&;]*")
+
+;; Skips a leading run of characters that are neither ASCII letters nor `&`;
+;; capture group 1 is whatever follows that run.
+(def left-trim
+  #"[^A-Za-z&]*(.*)")
+;; Capture group 1 is the leading run of ASCII letters, digits, `&` and `;`;
+;; anything after that run is matched but not captured.
+(def right-trim
+  #"([A-Za-z0-9&;]*).*")
+
+;; Code points treated as whitespace when splitting text into tokens.
+;; Each entry is the six-character text `\uXXXX` (the backslash is doubled so
+;; that it survives the Clojure reader); the entries are concatenated inside
+;; a regex character class by `whitespace-matcher`, and it is the regex
+;; engine that interprets the escapes.
+(def whitespace-chars [""       ;; dummy empty string for homogeneity
+                       "\\u0009" ;; CHARACTER TABULATION
+                       "\\u000A" ;; LINE FEED (LF)
+                       "\\u000B" ;; LINE TABULATION
+                       "\\u000C" ;; FORM FEED (FF)
+                       "\\u000D" ;; CARRIAGE RETURN (CR)
+                       "\\u0020" ;; SPACE
+                       "\\u0085" ;; NEXT LINE (NEL)
+                       "\\u00A0" ;; NO-BREAK SPACE
+                       "\\u1680" ;; OGHAM SPACE MARK
+                       "\\u180E" ;; MONGOLIAN VOWEL SEPARATOR
+                       "\\u2000" ;; EN QUAD
+                       "\\u2001" ;; EM QUAD
+                       "\\u2002" ;; EN SPACE
+                       "\\u2003" ;; EM SPACE
+                       "\\u2004" ;; THREE-PER-EM SPACE
+                       "\\u2005" ;; FOUR-PER-EM SPACE
+                       "\\u2006" ;; SIX-PER-EM SPACE
+                       "\\u2007" ;; FIGURE SPACE
+                       "\\u2008" ;; PUNCTUATION SPACE
+                       "\\u2009" ;; THIN SPACE
+                       "\\u200A" ;; HAIR SPACE
+                       "\\u2028" ;; LINE SEPARATOR
+                       "\\u2029" ;; PARAGRAPH SEPARATOR
+                       "\\u202F" ;; NARROW NO-BREAK SPACE
+                       "\\u205F" ;; MEDIUM MATHEMATICAL SPACE
+                       "\\u3000" ;; IDEOGRAPHIC SPACE
+                       ])
+
+;; Compiled pattern matching one or more consecutive characters drawn from
+;; `whitespace-chars`: the entries are joined into a single `[...]+` class.
+(def whitespace-matcher
+  (re-pattern (str "[" (s/join whitespace-chars) "]+")))
+
+(defn word?
+  "True predicate: returns `true` when `s` is a string that matches
+   `word-matcher` in its entirety, and `false` for anything else
+   (including non-string arguments)."
+  [s]
+  (boolean (and (string? s)
+                (re-matches word-matcher s))))
+
+(defmacro word??
+  "Not a true predicate: if string `s` is a word (according to `word?`),
+   returns `s`, else `nil`.
+
+   The argument form is evaluated exactly once and bound to a local, so an
+   expression that is expensive or has side effects is not run twice."
+  [s]
+  ;; `s#` is an auto-gensym: it cannot capture or shadow a caller's symbol.
+  `(let [s# ~s]
+     (when (word? s#) s#)))
+
+;; (defn word???
+;;   [s]
+;;   (let [c (re-matches candidate-matcher s)]
+;;      (when c (word?? (s/trim (second c))))))
+
+(defn word???
+  "When `s` is a string which contains exactly one word, returns that word;
+   otherwise returns `nil`.
+
+   Leading characters that cannot start a word are dropped using
+   `left-trim`, and the candidate is cut at the first character that cannot
+   belong to a word using `right-trim`. The candidate is then tested as-is,
+   so that a word ending in an HTML entity (for example `caf&eacute;`)
+   keeps the `;` which terminates the entity. Only if that fails is a single
+   trailing `;` treated as punctuation, removed, and the remainder tested.
+
+   Returns `nil` (rather than throwing) when `s` is not a string or when the
+   trimming patterns do not match it."
+  [s]
+  (when (string? s)
+    (let [r (re-matches left-trim s)
+          ;; `some->>` short-circuits to nil if either pattern fails to match.
+          c (some->> r second (re-matches right-trim) second)]
+      ;; `word-matcher` accepts the empty string, so reject empty candidates
+      ;; explicitly before asking whether they are words.
+      (when (seq c)
+        (or (word?? c)
+            (when (s/ends-with? c ";")
+              (let [c' (subs c 0 (dec (count c)))]
+                (when (seq c') (word?? c')))))))))
+
+(defn words
+  "Splits string `s` on runs of the characters matched by
+   `whitespace-matcher` and returns a lazy sequence of the words found,
+   one per fragment, as identified by `word???`. Fragments which do not
+   yield a word are dropped."
+  [s]
+  ;; A plain (re-seq word-matcher s) looks like the obvious approach here,
+  ;; but the original author reports that it did not work for them.
+  (->> (s/split s whitespace-matcher)
+       (keep word???)))