Added convenience.clj, word-identifier.clj; Upversioned

This commit is contained in:
Simon Brooke 2025-06-17 08:40:52 +01:00
parent 2a854e7f2b
commit efa3c1289e
3 changed files with 122 additions and 18 deletions

View file

@ -3,8 +3,9 @@
:url "http://github.com/tirkarthi/clj-wordcloud" :url "http://github.com/tirkarthi/clj-wordcloud"
:license {:name "MIT public license" :license {:name "MIT public license"
:url "http://opensource.org/licenses/mit-license.php"} :url "http://opensource.org/licenses/mit-license.php"}
:dependencies [[org.clojure/clojure "1.9.0"] :dependencies [[org.clojure/clojure "1.12.1"]
[com.kennycason/kumo-core "1.13"]] [com.kennycason/kumo-core "1.13"]
[com.taoensso/telemere "1.0.1"]]
:plugins [[lein-codox "0.10.3"]] :plugins [[lein-codox "0.10.3"]]
;; TODO : Add source uri and keep them in sync for docs ;; TODO : Add source uri and keep them in sync for docs
:codox {:output-path "docs"}) :codox {:output-path "docs"})

View file

@ -1,15 +1,19 @@
(ns clj-wordcloud.core (ns clj-wordcloud.core
(:require [clojure.spec.alpha :as spec]) (:require
(:import (java.awt Dimension Color) [clojure.java.io :as io]
(java.awt.image BufferedImage) [clojure.spec.alpha :as spec]
(javax.imageio ImageIO) [taoensso.telemere :refer [log!]])
(com.kennycason.kumo WordCloud WordFrequency LayeredWordCloud CollisionMode) (:import
(com.kennycason.kumo.bg CircleBackground RectangleBackground PixelBoundryBackground (com.kennycason.kumo CollisionMode WordCloud WordFrequency)
PixelBoundryBackground) (com.kennycason.kumo.bg
CircleBackground
PixelBoundryBackground
PixelBoundryBackground
RectangleBackground)
(com.kennycason.kumo.font FontWeight KumoFont)
(com.kennycason.kumo.font.scale LinearFontScalar SqrtFontScalar)
(com.kennycason.kumo.palette ColorPalette) (com.kennycason.kumo.palette ColorPalette)
(com.kennycason.kumo.font KumoFont FontWeight) (java.awt Color Dimension)))
(com.kennycason.kumo.font.scale LinearFontScalar)
(com.kennycason.kumo.nlp FrequencyAnalyzer)))
(defn- background-object (defn- background-object
@ -23,7 +27,7 @@
:rectangle :rectangle
(RectangleBackground. dimension) (RectangleBackground. dimension)
:pixel :pixel
(PixelBoundryBackground. (clojure.java.io/input-stream bitmap)) (PixelBoundryBackground. (io/input-stream bitmap))
(CircleBackground. size)))) (CircleBackground. size))))
@ -31,6 +35,7 @@
[frequency-map] [frequency-map]
(doto (java.util.ArrayList.) (doto (java.util.ArrayList.)
(#(doseq [[word count] frequency-map] (#(doseq [[word count] frequency-map]
;; (log! (format "Adding word `%s`, used `%d` times" word count))
(.add %1 (WordFrequency. (str word) count)))))) (.add %1 (WordFrequency. (str word) count))))))
@ -41,7 +46,8 @@
y-scale (get-in options [:font :y-scale] 10)] y-scale (get-in options [:font :y-scale] 10)]
(case scale-type (case scale-type
:linear :linear
(LinearFontScalar. x-scale y-scale)))) (LinearFontScalar. x-scale y-scale)
:sqrt (SqrtFontScalar. x-scale y-scale))))
(defn- build-font (defn- build-font
@ -102,7 +108,7 @@
; font specs ; font specs
(spec/def :font/type string?) (spec/def :font/type string?)
(spec/def :font/weight #{:plain :bold :italic}) (spec/def :font/weight #{:plain :bold :italic})
(spec/def :font/scale-type #{:linear}) (spec/def :font/scale-type #{:linear :sqrt})
(spec/def :font/x-scale (spec/and number? pos?)) (spec/def :font/x-scale (spec/and number? pos?))
(spec/def :font/y-scale (spec/and number? pos?)) (spec/def :font/y-scale (spec/and number? pos?))
(spec/def :font/padding (spec/and number? pos?)) (spec/def :font/padding (spec/and number? pos?))
@ -136,9 +142,9 @@
([frequency-map] ([frequency-map]
(word-cloud frequency-map {})) (word-cloud frequency-map {}))
([frequency-map options] ([frequency-map options]
(if-not (spec/valid? ::frequency-map frequency-map) (when-not (spec/valid? ::frequency-map frequency-map)
(throw (ex-info "Invalid data : " (spec/explain-data ::frequency-map frequency-map)))) (throw (ex-info "Invalid data : " (spec/explain-data ::frequency-map frequency-map))))
(if-not (spec/valid? ::options options) (when-not (spec/valid? ::options options)
(throw (ex-info "Invalid options : " (spec/explain-data ::options options)))) (throw (ex-info "Invalid options : " (spec/explain-data ::options options))))
(let [word-frequencies (build-word-frequency frequency-map) (let [word-frequencies (build-word-frequency frequency-map)
dimension (build-dimension options) dimension (build-dimension options)

View file

@ -0,0 +1,97 @@
(ns clj-wordcloud.word-identifier
(:require [clojure.string :as s])
(:import [java.util.regex Pattern]))
(def word-matcher
  "What is a word? It's a sequence of alphabetical characters, for some
  alphabet (in the case of English, a slightly extended Latin alphabet).
  But some of those characters may be decorated with diacriticals, and
  occasionally characters from other alphabets will be used. In my
  writing, these are encoded as 'HTML entities'.

  This regular expression attempts to match anything which is a word:
  either a run of ASCII letters, or any number of repetitions of
  letters-entity-letters. An entity is `&name;`, `&#NNN;` (decimal) or
  `&#xHHH;` (hexadecimal). Entity names may contain upper case letters
  (e.g. `&Eacute;`) and the hexadecimal marker may be `x` or `X`.

  It does not have a catalogue of valid HTML entities, so things which have
  the form of HTML entities but which are not valid will be accepted.

  Note that both alternatives can match zero characters, so the empty
  string also matches this pattern."
  #"([A-Za-z]*&([A-Za-z0-9]+|#[0-9]{1,6}|#[xX][0-9a-fA-F]{1,6});[A-Za-z]*)*|[A-Za-z]*")
(def trimable-matcher
  "Intended to match a string which contains exactly one word
  (doesn't work yet)."
  ;; NOTE(review): as written this can never match. After the leading `^` and
  ;; the first character class have consumed one character, the second `^`
  ;; (start of input, since no MULTILINE flag is set) cannot succeed. It was
  ;; presumably meant to be a negated class such as `[^A-Za-z0-9&]*` -- confirm
  ;; the intent before changing it.
  #"^[A-Za-z0-9&;]([A-Za-z]*&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});[A-Za-z]*)^[A-Za-z0-9&]*")
;
(def candidate-matcher
  "This is inelegant. It identifies a string which MIGHT contain a word, and
  isolates the bit that might be a word (as capture group 1)."
  ;; Leading characters other than ASCII letters, `&` and `;` are skipped.
  ;; Because `(.*)` is greedy, the trailing character class always matches
  ;; zero characters, so nothing is trimmed from the right-hand end.
  #"[^A-Za-z&;]*(.*)[^A-Za-z&;]*")
(def left-trim
  "Pattern which skips any leading run of characters other than ASCII
  letters and `&`, and captures the remainder of the string as group 1.
  `.` does not match line terminators, so a string containing one does
  not match this pattern as a whole."
  #"[^A-Za-z&]*(.*)")
(def right-trim
  "Pattern which captures, as group 1, the longest leading run of ASCII
  letters, digits, `&` and `;`, and discards whatever follows it. `;` is
  included so that HTML entities are kept intact; this means a trailing
  punctuation semicolon is captured as well."
  #"([A-Za-z0-9&;]*).*")
(def whitespace-chars
  "Regular expression escapes for the characters treated as whitespace.
  Each entry is written with a doubled backslash, so the string itself
  contains a literal backslash followed by `uXXXX`; the escape is left for
  the regular expression engine to interpret when `whitespace-matcher`
  joins these entries into a character class."
  ["" ;; dummy empty string for homogeneity; contributes nothing to the pattern
   "\\u0009" ;; CHARACTER TABULATION
   "\\u000A" ;; LINE FEED (LF)
   "\\u000B" ;; LINE TABULATION
   "\\u000C" ;; FORM FEED (FF)
   "\\u000D" ;; CARRIAGE RETURN (CR)
   "\\u0020" ;; SPACE
   "\\u0085" ;; NEXT LINE (NEL)
   "\\u00A0" ;; NO-BREAK SPACE
   "\\u1680" ;; OGHAM SPACE MARK
   "\\u180E" ;; MONGOLIAN VOWEL SEPARATOR
   "\\u2000" ;; EN QUAD
   "\\u2001" ;; EM QUAD
   "\\u2002" ;; EN SPACE
   "\\u2003" ;; EM SPACE
   "\\u2004" ;; THREE-PER-EM SPACE
   "\\u2005" ;; FOUR-PER-EM SPACE
   "\\u2006" ;; SIX-PER-EM SPACE
   "\\u2007" ;; FIGURE SPACE
   "\\u2008" ;; PUNCTUATION SPACE
   "\\u2009" ;; THIN SPACE
   "\\u200A" ;; HAIR SPACE
   "\\u2028" ;; LINE SEPARATOR
   "\\u2029" ;; PARAGRAPH SEPARATOR
   "\\u202F" ;; NARROW NO-BREAK SPACE
   "\\u205F" ;; MEDIUM MATHEMATICAL SPACE
   "\\u3000" ;; IDEOGRAPHIC SPACE
   ])
(def whitespace-matcher
  "Compiled pattern matching one or more consecutive characters from
  `whitespace-chars`; used to split text into candidate words."
  (Pattern/compile (str "[" (s/join whitespace-chars) "]+")))
(defn word?
  "True predicate, returning `true` if `s` is a non-empty string which
  matches the pattern `word-matcher` in its entirety, `false` otherwise
  (including when `s` is `nil` or is not a string)."
  [s]
  ;; `word-matcher` also matches the empty string, which is not a word, so
  ;; emptiness has to be excluded explicitly.
  (boolean (and (string? s)
                (seq s)
                (re-matches word-matcher s))))
(defmacro word??
  "Not a true predicate; if string `s` is a word, returns `s`, else `nil`.
  The argument expression is evaluated exactly once."
  [s]
  ;; Bind the argument to an auto-gensym'd local: splicing `~s` twice would
  ;; evaluate the caller's expression (and any side effects) twice.
  `(let [candidate# ~s]
     (when (word? candidate#) candidate#)))
;; (defn word???
;; [s]
;; (let [c (re-matches candidate-matcher s)]
;; (when c (word?? (s/trim (second c))))))
(defn word???
  "When `s` is a string which contains exactly one word, returns that word,
  with any surrounding non-word characters trimmed off; otherwise returns
  `nil`. Returns `nil` (rather than throwing) when `s` is `nil`, is not a
  string, or contains a line terminator.

  A word which ends with an HTML entity keeps the entity's closing `;`
  (e.g. `caf&eacute;`); a `;` which is merely trailing punctuation
  (e.g. `hello;`) is dropped."
  [s]
  (when (string? s)
    (let [;; strip leading junk; nil if `s` contains a line terminator,
          ;; because `.` in `left-trim` will not match it
          trimmed   (second (re-matches left-trim s))
          ;; keep only the leading run of word/entity characters
          candidate (when trimmed
                      (second (re-matches right-trim trimmed)))]
      (when (seq candidate)
        (or
         ;; Try the candidate as it stands first, so that a final `;` which
         ;; closes an entity is not cut off.
         (word?? candidate)
         ;; Otherwise a final `;` can only be punctuation: drop it and retry.
         (when (s/ends-with? candidate ";")
           (let [stripped (subs candidate 0 (dec (count candidate)))]
             (word?? stripped))))))))
(defn words
  "Splits string `s` on runs of whitespace (see `whitespace-matcher`) and
  returns a lazy sequence of the words found, one per fragment for which
  `word???` yields a word; fragments yielding `nil` are omitted."
  [s]
  ;; You'd think that this would be as simple as
  ;; (re-seq word-matcher s), but that doesn't work for me.
  (keep word??? (s/split s whitespace-matcher)))