From ea2d6927d26a4b3013fdb18a48be48a297db6dfb Mon Sep 17 00:00:00 2001
From: Simon Brooke
Date: Fri, 31 Oct 2025 15:37:36 +0000
Subject: [PATCH 1/2] Compile a token index for Markdown files and directories

This can now compile an index for a single file, and do it very quickly.
Iterating over a directory works as well: the per-directory results are
flattened with `mapcat` before being merged, so `deep-merge` is only ever
handed maps.

---
Reviewer notes (text in this section is ignored by `git am`):

 * get-ignorable-words returns the first candidate file that can actually
   be read, instead of the (possibly nil) result for the first candidate.
 * compile-index flattens per-directory results with `mapcat` and skips
   directories whose names happen to end in ".md".
 * resources/ignorable-words.en-GB.edn links to ignorable-words.en.edn,
   which this series does not create. Please confirm the target exists;
   if it does not, the lookup falls back to ignorable-words.edn.
 * The blob ids on the core.clj `index` line are those of the original
   submission and were not recomputed for the revised content.

 project.clj                                   |  3 +-
 ...le-words.en_GB.edn => ignorable-words.edn} |  0
 resources/ignorable-words.en-GB.edn           |  1 +
 src/clj/cc/journeyman/elboob/core.clj         | 51 ++++++++++++++++++--
 4 files changed, 51 insertions(+), 4 deletions(-)
 rename resources/{ignorable-words.en_GB.edn => ignorable-words.edn} (100%)
 create mode 120000 resources/ignorable-words.en-GB.edn

diff --git a/project.clj b/project.clj
index 68c9eda..b5d83f0 100644
--- a/project.clj
+++ b/project.clj
@@ -3,7 +3,8 @@
   :url "http://example.com/FIXME"
   :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
             :url "https://www.eclipse.org/legal/epl-2.0/"}
-  :dependencies [[org.clojure/clojure "1.11.1"]
+  :dependencies [[dev.weavejester/medley "1.9.0"]
+                 [org.clojure/clojure "1.11.1"]
                  [peco "0.1.6"]]
   :repl-options {:init-ns cc.journeyman.elboob.core}
   :source-paths ["src/clj"])
diff --git a/resources/ignorable-words.en_GB.edn b/resources/ignorable-words.edn
similarity index 100%
rename from resources/ignorable-words.en_GB.edn
rename to resources/ignorable-words.edn
diff --git a/resources/ignorable-words.en-GB.edn b/resources/ignorable-words.en-GB.edn
new file mode 120000
index 0000000..1151b98
--- /dev/null
+++ b/resources/ignorable-words.en-GB.edn
@@ -0,0 +1 @@
+ignorable-words.en.edn
\ No newline at end of file
diff --git a/src/clj/cc/journeyman/elboob/core.clj b/src/clj/cc/journeyman/elboob/core.clj
index 8e433ea..5a7bc89 100644
--- a/src/clj/cc/journeyman/elboob/core.clj
+++ b/src/clj/cc/journeyman/elboob/core.clj
@@ -1,9 +1,54 @@
-(ns cc.journeyman.elboob.core)
+(ns cc.journeyman.elboob.core
+  (:require [clojure.java.io :refer [as-relative-path file resource]]
+            [clojure.string :refer [ends-with?]]
+            [medley.core :refer [deep-merge]]
+            [peco.core :refer [tokenizer]]))
 
-(defn compile
+(defn get-ignorable-words
+  "Return the contents of the most specific ignorable-words file that can be
+   read for the JVM's default locale, or `nil` if none can. Candidates are
+   tried in this order, relative to the current working directory:
+   `resources/ignorable-words.<language-tag>.edn`,
+   `resources/ignorable-words.<language>.edn`, `resources/ignorable-words.edn`."
+  []
+  (let [locale (java.util.Locale/getDefault)
+        tag (.toLanguageTag locale)
+        language (.getLanguage locale)]
+    ;; `some` stops at the first candidate that yields a non-nil value, so a
+    ;; missing locale-specific file falls through to the next candidate.
+    (some #(try
+             ;; NOTE(review): `read-string` honours `*read-eval*`; consider
+             ;; `clojure.edn/read-string` if these files are plain EDN.
+             (read-string
+              (slurp (file (format "resources/ignorable-words%s.edn" %))))
+             (catch Exception e
+               (println (.getMessage e))
+               nil))
+          [(str "." tag) (str "." language) ""])))
+
+(defn compile-file
+  "Compile an index for an individual file `f`, tokenised with `tokenise` and
+   filtered with `ignorable?`. Returns a map from each retained token to a
+   single-entry map of the file's relative path to the token's frequency.
+   `f` must be a relative path: `as-relative-path` rejects absolute ones."
+  [f tokenise ignorable?]
+  (let [f' (file f)
+        rel (as-relative-path f')
+        tokens (frequencies (remove ignorable? (tokenise (slurp f'))))]
+    (reduce-kv (fn [index token n] (assoc index token {rel n})) {} tokens)))
+
+(defn compile-index
   "scans `dir-paths` as directories of Markdown files. Returns a map which keys
    each lexical token occurring in each file (with Markdown formatting, common
    words, punctuation etc excepted) to a map which keys the relative file path
    of each file in which the token occurs to the frequency the token occurs within the file."
   [& dir-paths]
-  (println "Hello, World!"))
+  (let [ignorable-word? (set (get-ignorable-words))
+        tokenise (tokenizer [:lower-case :concat-singles])
+        markdown-file? #(and (.isFile %) (ends-with? (.getName %) ".md"))]
+    ;; `mapcat` flattens the per-directory results into a single sequence of
+    ;; per-file indexes, so `deep-merge` only ever merges maps with maps.
+    (reduce deep-merge {}
+            (mapcat (fn [dir]
+                      (map #(compile-file % tokenise ignorable-word?)
+                           (filter markdown-file? (file-seq (file dir)))))
+                    dir-paths))))
From 86665db3b8006d98970cb0dfb4c0b6f854be1805 Mon Sep 17 00:00:00 2001
From: Simon Brooke
Date: Fri, 31 Oct 2025 15:41:02 +0000
Subject: [PATCH 2/2] Added more ignorables to .gitignore

---
 .gitignore | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 3387254..9ad0120 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,4 +13,6 @@ pom.xml.asc
 .lein-failures
 .nrepl-port
 .cpcache/
-
+.lsp/
+.clj-kondo/
+.portal/
\ No newline at end of file