diff --git a/.gitignore b/.gitignore index 9ad0120..3387254 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,4 @@ pom.xml.asc .lein-failures .nrepl-port .cpcache/ -.lsp/ -.clj-kondo/ -.portal/ \ No newline at end of file + diff --git a/project.clj b/project.clj index b5d83f0..68c9eda 100644 --- a/project.clj +++ b/project.clj @@ -3,8 +3,7 @@ :url "http://example.com/FIXME" :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0" :url "https://www.eclipse.org/legal/epl-2.0/"} - :dependencies [[dev.weavejester/medley "1.9.0"] - [org.clojure/clojure "1.11.1"] + :dependencies [[org.clojure/clojure "1.11.1"] [peco "0.1.6"]] :repl-options {:init-ns cc.journeyman.elboob.core} :source-paths ["src/clj"]) diff --git a/resources/ignorable-words.en-GB.edn b/resources/ignorable-words.en-GB.edn deleted file mode 120000 index 1151b98..0000000 --- a/resources/ignorable-words.en-GB.edn +++ /dev/null @@ -1 +0,0 @@ -ignorable-words.en.edn \ No newline at end of file diff --git a/resources/ignorable-words.edn b/resources/ignorable-words.en_GB.edn similarity index 100% rename from resources/ignorable-words.edn rename to resources/ignorable-words.en_GB.edn diff --git a/src/clj/cc/journeyman/elboob/core.clj b/src/clj/cc/journeyman/elboob/core.clj index 5a7bc89..8e433ea 100644 --- a/src/clj/cc/journeyman/elboob/core.clj +++ b/src/clj/cc/journeyman/elboob/core.clj @@ -1,43 +1,9 @@ -(ns cc.journeyman.elboob.core - (:require [clojure.java.io :refer [as-relative-path file resource]] - [clojure.string :refer [ends-with?]] - [medley.core :refer [deep-merge]] - [peco.core :refer [tokenizer]])) +(ns cc.journeyman.elboob.core) -(defn get-ignorable-words [] - (let [locale (java.util.Locale/getDefault) - tag (.toLanguageTag locale) - language (.getLanguage locale)] - - (first - (map #(try (println (format "resources/ignorable-words%s.edn" %)) - (read-string - (slurp - (file (format "resources/ignorable-words%s.edn" %)))) - (catch Exception e (println (.getMessage e)) nil)) - [(str "." tag) (str "." language) ""])))) - -(defn compile-file - "Compile an index for an individual file `f`, tokenised with `tokenise` and - filtered with `ignorable?`." - [f tokenise ignorable?] - (let [f' (file f) - rel (as-relative-path f') - tokens (frequencies (remove ignorable? (tokenise (slurp f'))))] - (reduce #(assoc %1 %2 {rel (tokens %2)}) {} (keys tokens)))) - -(defn compile-index +(defn compile "scans `dir-paths` as directories of Markdown files. Returns a map which keys each lexical token occurring in each file (with Markdown formatting, common words, punctuation etc excepted) to a map which keys the relative file path of each file in which the token occurs to the frequency the token occurs within the file." [& dir-paths] - (let [ignorable-word? (set (get-ignorable-words)) - tokenise (tokenizer [:lower-case :concat-singles])] - (reduce deep-merge {} - (map (fn [dir] - (map #(compile-file % tokenise ignorable-word?) - (filter - #(ends-with? (.getName %) ".md") - (file-seq (file dir))))) dir-paths)))) - + (println "Hello, World!"))