mirror of
https://github.com/simon-brooke/clj-wordcloud.git
synced 2025-07-01 09:38:08 +00:00
Added convenience.clj
, word-identifier.clj
; Upversioned
This commit is contained in:
parent
2a854e7f2b
commit
efa3c1289e
|
@ -3,8 +3,9 @@
|
|||
:url "http://github.com/tirkarthi/clj-wordcloud"
|
||||
:license {:name "MIT public license"
|
||||
:url "http://opensource.org/licenses/mit-license.php"}
|
||||
:dependencies [[org.clojure/clojure "1.9.0"]
|
||||
[com.kennycason/kumo-core "1.13"]]
|
||||
:dependencies [[org.clojure/clojure "1.12.1"]
|
||||
[com.kennycason/kumo-core "1.13"]
|
||||
[com.taoensso/telemere "1.0.1"]]
|
||||
:plugins [[lein-codox "0.10.3"]]
|
||||
;; TODO : Add source uri and keep them in sync for docs
|
||||
:codox {:output-path "docs"})
|
||||
|
|
|
@ -1,15 +1,19 @@
|
|||
(ns clj-wordcloud.core
|
||||
(:require [clojure.spec.alpha :as spec])
|
||||
(:import (java.awt Dimension Color)
|
||||
(java.awt.image BufferedImage)
|
||||
(javax.imageio ImageIO)
|
||||
(com.kennycason.kumo WordCloud WordFrequency LayeredWordCloud CollisionMode)
|
||||
(com.kennycason.kumo.bg CircleBackground RectangleBackground PixelBoundryBackground
|
||||
PixelBoundryBackground)
|
||||
(:require
|
||||
[clojure.java.io :as io]
|
||||
[clojure.spec.alpha :as spec]
|
||||
[taoensso.telemere :refer [log!]])
|
||||
(:import
|
||||
(com.kennycason.kumo CollisionMode WordCloud WordFrequency)
|
||||
(com.kennycason.kumo.bg
|
||||
CircleBackground
|
||||
PixelBoundryBackground
|
||||
PixelBoundryBackground
|
||||
RectangleBackground)
|
||||
(com.kennycason.kumo.font FontWeight KumoFont)
|
||||
(com.kennycason.kumo.font.scale LinearFontScalar SqrtFontScalar)
|
||||
(com.kennycason.kumo.palette ColorPalette)
|
||||
(com.kennycason.kumo.font KumoFont FontWeight)
|
||||
(com.kennycason.kumo.font.scale LinearFontScalar)
|
||||
(com.kennycason.kumo.nlp FrequencyAnalyzer)))
|
||||
(java.awt Color Dimension)))
|
||||
|
||||
|
||||
(defn- background-object
|
||||
|
@ -23,7 +27,7 @@
|
|||
:rectangle
|
||||
(RectangleBackground. dimension)
|
||||
:pixel
|
||||
(PixelBoundryBackground. (clojure.java.io/input-stream bitmap))
|
||||
(PixelBoundryBackground. (io/input-stream bitmap))
|
||||
(CircleBackground. size))))
|
||||
|
||||
|
||||
|
@ -31,6 +35,7 @@
|
|||
[frequency-map]
|
||||
(doto (java.util.ArrayList.)
|
||||
(#(doseq [[word count] frequency-map]
|
||||
;; (log! (format "Adding word `%s`, used `%d` times" word count))
|
||||
(.add %1 (WordFrequency. (str word) count))))))
|
||||
|
||||
|
||||
|
@ -41,7 +46,8 @@
|
|||
y-scale (get-in options [:font :y-scale] 10)]
|
||||
(case scale-type
|
||||
:linear
|
||||
(LinearFontScalar. x-scale y-scale))))
|
||||
(LinearFontScalar. x-scale y-scale)
|
||||
:sqrt (SqrtFontScalar. x-scale y-scale))))
|
||||
|
||||
|
||||
(defn- build-font
|
||||
|
@ -102,7 +108,7 @@
|
|||
; font specs
|
||||
(spec/def :font/type string?)
|
||||
(spec/def :font/weight #{:plain :bold :italic})
|
||||
(spec/def :font/scale-type #{:linear})
|
||||
(spec/def :font/scale-type #{:linear :sqrt})
|
||||
(spec/def :font/x-scale (spec/and number? pos?))
|
||||
(spec/def :font/y-scale (spec/and number? pos?))
|
||||
(spec/def :font/padding (spec/and number? pos?))
|
||||
|
@ -136,9 +142,9 @@
|
|||
([frequency-map]
|
||||
(word-cloud frequency-map {}))
|
||||
([frequency-map options]
|
||||
(if-not (spec/valid? ::frequency-map frequency-map)
|
||||
(when-not (spec/valid? ::frequency-map frequency-map)
|
||||
(throw (ex-info "Invalid data : " (spec/explain-data ::frequency-map frequency-map))))
|
||||
(if-not (spec/valid? ::options options)
|
||||
(when-not (spec/valid? ::options options)
|
||||
(throw (ex-info "Invalid options : " (spec/explain-data ::options options))))
|
||||
(let [word-frequencies (build-word-frequency frequency-map)
|
||||
dimension (build-dimension options)
|
||||
|
|
97
src/clj_wordcloud/word_identifier.clj
Normal file
97
src/clj_wordcloud/word_identifier.clj
Normal file
|
@ -0,0 +1,97 @@
|
|||
(ns clj-wordcloud.word-identifier
|
||||
(:require [clojure.string :as s])
|
||||
(:import [java.util.regex Pattern]))
|
||||
|
||||
(def word-matcher
|
||||
"What is a word? It's a sequence of alphabetical characters, for some
|
||||
alphabet (in the case of English, a slightly extended Latin alphabet).
|
||||
But some of those characters may be decorated with diacriticals, and
|
||||
occasionally characters from other alphabets will be used. In my
|
||||
writing, these are encoded as 'HTML entities'.
|
||||
|
||||
This regular expression attempts to match anything which is a word.
|
||||
It does not have a catalogue of valid HTML entities, so things which have
|
||||
the form of HTML entities but which are not valid will be accepted.
Note that every part of the expression is optional, so the empty string
also matches."
|
||||
#"([A-Za-z]*&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});[A-Za-z]*)*|[A-Za-z]*")
|
||||
|
||||
(def trimable-matcher
|
||||
"Intended to match a string which contains exactly one word
|
||||
(doesn't work yet)."
|
||||
;; NOTE(review): each `^` here sits outside the brackets, so it is a
;; start-of-input anchor rather than a character-class negation; presumably
;; `[^...]*` was intended on both sides -- confirm before relying on this.
#"^[A-Za-z0-9&;]([A-Za-z]*&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-fA-F]{1,6});[A-Za-z]*)^[A-Za-z0-9&]*")
|
||||
;
|
||||
(def candidate-matcher
|
||||
"This is inelegant. It identifies a string which MIGHT contain a word, and
|
||||
isolates the bit that might be a word."
|
||||
#"[^A-Za-z&;]*(.*)[^A-Za-z&;]*")
|
||||
|
||||
;; Skips any leading run of characters that are neither ASCII letters nor `&`
;; and captures the remainder of the string as group 1. `.` does not match
;; line terminators, so a whole-string match fails if the remainder holds one.
(def left-trim
|
||||
#"[^A-Za-z&]*(.*)")
|
||||
;; Captures the leading run of ASCII letters, digits, `&` and `;` as group 1
;; and discards whatever follows it.
(def right-trim
|
||||
#"([A-Za-z0-9&;]*).*")
|
||||
|
||||
(def whitespace-chars
  "Vector of strings, each a regex escape of the form backslash, `u`, four
  upper-case hex digits, one per whitespace code point listed below. The
  vector starts with a dummy empty string. `whitespace-matcher` concatenates
  these into a character class."
  (into [""] ;; dummy empty string for homogeneity
        (map #(format "\\u%04X" %))
        [0x0009 ;; CHARACTER TABULATION
         0x000A ;; LINE FEED (LF)
         0x000B ;; LINE TABULATION
         0x000C ;; FORM FEED (FF)
         0x000D ;; CARRIAGE RETURN (CR)
         0x0020 ;; SPACE
         0x0085 ;; NEXT LINE (NEL)
         0x00A0 ;; NO-BREAK SPACE
         0x1680 ;; OGHAM SPACE MARK
         0x180E ;; MONGOLIAN VOWEL SEPARATOR
         0x2000 ;; EN QUAD
         0x2001 ;; EM QUAD
         0x2002 ;; EN SPACE
         0x2003 ;; EM SPACE
         0x2004 ;; THREE-PER-EM SPACE
         0x2005 ;; FOUR-PER-EM SPACE
         0x2006 ;; SIX-PER-EM SPACE
         0x2007 ;; FIGURE SPACE
         0x2008 ;; PUNCTUATION SPACE
         0x2009 ;; THIN SPACE
         0x200A ;; HAIR SPACE
         0x2028 ;; LINE SEPARATOR
         0x2029 ;; PARAGRAPH SEPARATOR
         0x202F ;; NARROW NO-BREAK SPACE
         0x205F ;; MEDIUM MATHEMATICAL SPACE
         0x3000 ;; IDEOGRAPHIC SPACE
         ]))
|
||||
|
||||
(def whitespace-matcher
  "Compiled pattern matching one or more consecutive characters drawn from
  `whitespace-chars`."
  (let [char-class (apply str whitespace-chars)]
    (Pattern/compile (str "[" char-class "]+"))))
|
||||
|
||||
(defn word?
  "True predicate: `true` when `s` is a string that `word-matcher` matches in
  its entirety, `false` for anything else (including non-strings)."
  [s]
  (boolean
   (and (string? s)
        (re-matches word-matcher s))))
|
||||
|
||||
(defmacro word??
  "Not a true predicate; if string `s` is a word (as judged by `word?`),
  returns `s`, else `nil`.

  The argument expression is bound once before being tested, so it is
  evaluated exactly one time even when it is costly or has side effects."
  [s]
  `(let [s# ~s]
     (when (word? s#) s#)))
|
||||
|
||||
;; (defn word???
|
||||
;; [s]
|
||||
;; (let [c (re-matches candidate-matcher s)]
|
||||
;; (when c (word?? (s/trim (second c))))))
|
||||
|
||||
(defn word???
  "When `s` is a string which contains exactly one word, returns that word;
  otherwise returns `nil`.

  Leading characters rejected by `left-trim` and everything after the run
  accepted by `right-trim` are discarded. The remaining candidate is returned
  if it is a word as it stands, which keeps words ending in an HTML entity
  such as `caf&eacute;`. Failing that, one trailing `;` is treated as
  punctuation and dropped, and the shortened candidate is tested instead.

  Returns `nil` rather than throwing when `s` is not a string or when the
  trimming patterns do not match."
  [s]
  (when (string? s)
    (let [[_ tail] (re-matches left-trim s)
          ;; left-trim fails on input containing a line terminator; guard so
          ;; re-matches is never handed nil.
          [_ candidate] (when tail (re-matches right-trim tail))]
      (when (seq candidate)
        (or (word?? candidate)
            (when (s/ends-with? candidate ";")
              ;; not-empty stops the empty string, which word-matcher
              ;; accepts, from being returned as a word.
              (word?? (not-empty
                       (subs candidate 0 (dec (count candidate)))))))))))
|
||||
|
||||
(defn words
  "Splits string `s` on `whitespace-matcher` and returns a lazy sequence of
  the non-nil results of applying `word???` to each fragment."
  [s]
  ;; (re-seq word-matcher s) looks like the obvious implementation, but it
  ;; was reported not to work here, hence the split-then-filter approach.
  (->> (s/split s whitespace-matcher)
       (keep word???)))
|
Loading…
Reference in a new issue