Added convenience.clj, word-identifier.clj; Upversioned

This commit is contained in:
Simon Brooke 2025-06-17 08:40:52 +01:00
parent 2a854e7f2b
commit efa3c1289e
3 changed files with 122 additions and 18 deletions

View file

@ -3,8 +3,9 @@
:url "http://github.com/tirkarthi/clj-wordcloud"
:license {:name "MIT public license"
:url "http://opensource.org/licenses/mit-license.php"}
:dependencies [[org.clojure/clojure "1.9.0"]
[com.kennycason/kumo-core "1.13"]]
:dependencies [[org.clojure/clojure "1.12.1"]
[com.kennycason/kumo-core "1.13"]
[com.taoensso/telemere "1.0.1"]]
:plugins [[lein-codox "0.10.3"]]
;; TODO : Add source uri and keep them in sync for docs
:codox {:output-path "docs"})

View file

@ -1,15 +1,19 @@
(ns clj-wordcloud.core
(:require [clojure.spec.alpha :as spec])
(:import (java.awt Dimension Color)
(java.awt.image BufferedImage)
(javax.imageio ImageIO)
(com.kennycason.kumo WordCloud WordFrequency LayeredWordCloud CollisionMode)
(com.kennycason.kumo.bg CircleBackground RectangleBackground PixelBoundryBackground
PixelBoundryBackground)
(com.kennycason.kumo.palette ColorPalette)
(com.kennycason.kumo.font KumoFont FontWeight)
(com.kennycason.kumo.font.scale LinearFontScalar)
(com.kennycason.kumo.nlp FrequencyAnalyzer)))
(:require
[clojure.java.io :as io]
[clojure.spec.alpha :as spec]
[taoensso.telemere :refer [log!]])
(:import
(com.kennycason.kumo CollisionMode WordCloud WordFrequency)
(com.kennycason.kumo.bg
CircleBackground
PixelBoundryBackground
PixelBoundryBackground
RectangleBackground)
(com.kennycason.kumo.font FontWeight KumoFont)
(com.kennycason.kumo.font.scale LinearFontScalar SqrtFontScalar)
(com.kennycason.kumo.palette ColorPalette)
(java.awt Color Dimension)))
(defn- background-object
@ -23,7 +27,7 @@
:rectangle
(RectangleBackground. dimension)
:pixel
(PixelBoundryBackground. (clojure.java.io/input-stream bitmap))
(PixelBoundryBackground. (io/input-stream bitmap))
(CircleBackground. size))))
@ -31,6 +35,7 @@
[frequency-map]
(doto (java.util.ArrayList.)
(#(doseq [[word count] frequency-map]
;; (log! (format "Adding word `%s`, used `%d` times" word count))
(.add %1 (WordFrequency. (str word) count))))))
@ -41,7 +46,8 @@
y-scale (get-in options [:font :y-scale] 10)]
(case scale-type
:linear
(LinearFontScalar. x-scale y-scale))))
(LinearFontScalar. x-scale y-scale)
:sqrt (SqrtFontScalar. x-scale y-scale))))
(defn- build-font
@ -102,7 +108,7 @@
; font specs
(spec/def :font/type string?)
(spec/def :font/weight #{:plain :bold :italic})
(spec/def :font/scale-type #{:linear})
(spec/def :font/scale-type #{:linear :sqrt})
(spec/def :font/x-scale (spec/and number? pos?))
(spec/def :font/y-scale (spec/and number? pos?))
(spec/def :font/padding (spec/and number? pos?))
@ -136,9 +142,9 @@
([frequency-map]
(word-cloud frequency-map {}))
([frequency-map options]
(if-not (spec/valid? ::frequency-map frequency-map)
(when-not (spec/valid? ::frequency-map frequency-map)
(throw (ex-info "Invalid data : " (spec/explain-data ::frequency-map frequency-map))))
(if-not (spec/valid? ::options options)
(when-not (spec/valid? ::options options)
(throw (ex-info "Invalid options : " (spec/explain-data ::options options))))
(let [word-frequencies (build-word-frequency frequency-map)
dimension (build-dimension options)

View file

@ -0,0 +1,97 @@
(ns clj-wordcloud.word-identifier
(:require [clojure.string :as s])
(:import [java.util.regex Pattern]))
(def word-matcher
  "Regular expression describing what counts as a word.

  A word is a run of alphabetical characters from some alphabet (for
  English, a slightly extended Latin alphabet). Some characters may carry
  diacriticals, and occasionally characters from other alphabets appear; in
  the author's writing these are encoded as 'HTML entities'.

  The pattern accepts either plain ASCII letters, or ASCII letters
  interleaved with entities of the form `&name;`, `&#DDDD;` or `&#xHHHH;`.
  No catalogue of valid HTML entities is consulted, so anything that merely
  has the form of an entity is accepted.

  Both alternatives can match zero characters, so the empty string also
  matches this pattern."
  #"([A-Za-z]*&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});[A-Za-z]*)*|[A-Za-z]*")
(def trimable-matcher
  "Intended to match a string which contains exactly one word.

  Does not work yet: both `^` characters sit outside a character class, so
  they act as start-of-input anchors. The second one follows mandatory
  characters and therefore can never be satisfied, which means this pattern
  matches nothing.

  NOTE(review): presumably negated classes (`[^...]*`) were intended for the
  leading and trailing junk - confirm before changing the pattern."
  #"^[A-Za-z0-9&;]([A-Za-z]*&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});[A-Za-z]*)^[A-Za-z0-9&]*")
;
(def candidate-matcher
  "This is inelegant. It recognises a string which MIGHT contain a word and
  captures, as group 1, the part that might be that word.

  Leading characters that are not ASCII letters, `&` or `;` are skipped.
  Because the capturing `.*` is greedy and the trailing class may match
  nothing, group 1 runs to the end of the input and still includes any
  trailing non-word characters."
  #"[^A-Za-z&;]*(.*)[^A-Za-z&;]*")
(def left-trim
  "Pattern that skips every leading character which is neither an ASCII
  letter nor `&`, and captures the remainder of the input as group 1.
  The capture uses `.`, so input containing a line terminator does not
  match as a whole."
  #"[^A-Za-z&]*(.*)")
(def right-trim
  "Pattern that captures, as group 1, the longest leading run of ASCII
  letters, digits, `&` and `;`, and discards whatever follows it."
  #"([A-Za-z0-9&;]*).*")
(def whitespace-chars
  "Regex escape sequences, one per whitespace code point, in code-point
  order. Each entry is the literal six-character text of a Unicode escape
  (not the character itself), ready to be concatenated into a regular
  expression character class, as `whitespace-matcher` does. The first
  element is a dummy empty string."
  [""         ; dummy empty string for homogeneity
   "\\u0009"  ; U+0009 CHARACTER TABULATION
   "\\u000A"  ; U+000A LINE FEED (LF)
   "\\u000B"  ; U+000B LINE TABULATION
   "\\u000C"  ; U+000C FORM FEED (FF)
   "\\u000D"  ; U+000D CARRIAGE RETURN (CR)
   "\\u0020"  ; U+0020 SPACE
   "\\u0085"  ; U+0085 NEXT LINE (NEL)
   "\\u00A0"  ; U+00A0 NO-BREAK SPACE
   "\\u1680"  ; U+1680 OGHAM SPACE MARK
   "\\u180E"  ; U+180E MONGOLIAN VOWEL SEPARATOR
   "\\u2000"  ; U+2000 EN QUAD
   "\\u2001"  ; U+2001 EM QUAD
   "\\u2002"  ; U+2002 EN SPACE
   "\\u2003"  ; U+2003 EM SPACE
   "\\u2004"  ; U+2004 THREE-PER-EM SPACE
   "\\u2005"  ; U+2005 FOUR-PER-EM SPACE
   "\\u2006"  ; U+2006 SIX-PER-EM SPACE
   "\\u2007"  ; U+2007 FIGURE SPACE
   "\\u2008"  ; U+2008 PUNCTUATION SPACE
   "\\u2009"  ; U+2009 THIN SPACE
   "\\u200A"  ; U+200A HAIR SPACE
   "\\u2028"  ; U+2028 LINE SEPARATOR
   "\\u2029"  ; U+2029 PARAGRAPH SEPARATOR
   "\\u202F"  ; U+202F NARROW NO-BREAK SPACE
   "\\u205F"  ; U+205F MEDIUM MATHEMATICAL SPACE
   "\\u3000"  ; U+3000 IDEOGRAPHIC SPACE
   ])
(def whitespace-matcher
  "Compiled pattern matching one or more consecutive characters drawn from
  `whitespace-chars`; built by wrapping the concatenated escapes in a
  character class followed by `+`."
  (re-pattern (str "[" (apply str whitespace-chars) "]+")))
(defn word?
  "True predicate: returns `true` when `s` is a string that matches
  `word-matcher` in its entirety, and `false` for anything else
  (including non-strings)."
  [s]
  (boolean
   (and (string? s)
        (re-matches word-matcher s))))
(defmacro word??
  "Not a true predicate; if string `s` is a word (per `word?`), returns `s`,
  else `nil`.

  The argument form is evaluated exactly once: it is bound to a generated
  local before being tested and returned, so passing an expression with
  side effects (or an expensive one) does not run it twice."
  [s]
  `(let [s# ~s]
     (when (word? s#) s#)))
;; (defn word???
;; [s]
;; (let [c (re-matches candidate-matcher s)]
;; (when c (word?? (s/trim (second c))))))
(defn word???
  "When `s` is a string which contains exactly one word, possibly wrapped in
  leading and trailing punctuation, returns that word; otherwise `nil`.

  Leading junk is removed with `left-trim` and trailing junk with
  `right-trim`. A trailing `;` is ambiguous: it may terminate an HTML entity
  that ends the word (`caf&eacute;`) or be ordinary punctuation (`word;`).
  The candidate is therefore tested as-is first, and only if that fails is
  one trailing `;` dropped and the remainder tested.

  Returns `nil`, rather than throwing, when `s` is not a string or when
  `left-trim` does not match it (e.g. it contains a line terminator)."
  [s]
  (when (string? s)
    ;; `left-trim` fails on input containing a line terminator; bail out
    ;; instead of handing nil to the next `re-matches`.
    (when-let [[_ tail] (re-matches left-trim s)]
      (let [[_ candidate] (re-matches right-trim tail)
            ;; Candidate minus one trailing `;`, or nil when there is none.
            stripped      (when (and (string? candidate)
                                     (= (last candidate) \;))
                            (subs candidate 0 (dec (count candidate))))]
        (cond
          ;; `word?` accepts the empty string, so reject it explicitly.
          (empty? candidate)                     nil
          ;; Keeps words that legitimately end in an entity's `;`.
          (word? candidate)                      candidate
          ;; Otherwise treat the trailing `;` as punctuation.
          (and (seq stripped) (word? stripped))  stripped
          :else                                  nil)))))
(defn words
  "Splits string `s` on runs of whitespace (per `whitespace-matcher`) and
  returns a lazy sequence of the words found, i.e. the non-nil results of
  applying `word???` to each fragment."
  [s]
  ;; (re-seq word-matcher s) looks like the obvious implementation, but the
  ;; original author reports that it does not work, hence split-then-filter.
  (keep word??? (s/split s whitespace-matcher)))