OK, this can now compile an index for a single file, and do it very quickly;

it cannot yet iterate over a directory, and I need to work out why not.

But progress!
This commit is contained in:
Simon Brooke 2025-10-31 15:37:36 +00:00
parent 73549a5c90
commit ea2d6927d2
4 changed files with 40 additions and 4 deletions

View file

@ -3,7 +3,8 @@
:url "http://example.com/FIXME"
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
:url "https://www.eclipse.org/legal/epl-2.0/"}
:dependencies [[org.clojure/clojure "1.11.1"]
:dependencies [[dev.weavejester/medley "1.9.0"]
[org.clojure/clojure "1.11.1"]
[peco "0.1.6"]]
:repl-options {:init-ns cc.journeyman.elboob.core}
:source-paths ["src/clj"])

View file

@ -0,0 +1 @@
ignorable-words.en.edn

View file

@ -1,9 +1,43 @@
(ns cc.journeyman.elboob.core)
(ns cc.journeyman.elboob.core
(:require [clojure.java.io :refer [as-relative-path file resource]]
[clojure.string :refer [ends-with?]]
[medley.core :refer [deep-merge]]
[peco.core :refer [tokenizer]]))
(defn compile
(defn get-ignorable-words
  "Read the ignorable words for the JVM's default locale.

  Candidate files are tried from most to least specific:

  1. `resources/ignorable-words.<language-tag>.edn` (e.g. `.en-GB`);
  2. `resources/ignorable-words.<language>.edn` (e.g. `.en`);
  3. `resources/ignorable-words.edn`.

  The path of each candidate is printed before it is read, and if reading it
  throws, the exception's message is printed and the next candidate is tried.
  Returns the value read from the first candidate that yields one, or `nil`
  if none does."
  []
  (let [locale (java.util.Locale/getDefault)
        tag (.toLanguageTag locale)
        language (.getLanguage locale)]
    ;; `some` stops at the first candidate that produces a value. Taking
    ;; `first` of a `map` would instead return the result for the most
    ;; specific candidate only (`nil` when that file is missing) and, because
    ;; vectors are chunked, would still read every candidate.
    (some (fn [suffix]
            (let [path (format "resources/ignorable-words%s.edn" suffix)]
              (try
                (println path)
                (read-string (slurp (file path)))
                (catch Exception e
                  (println (.getMessage e))
                  nil))))
          [(str "." tag) (str "." language) ""])))
(defn compile-file
  "Build the index for the single file `f`.

  The content of `f` is passed to the function `tokenise`; tokens for which
  `ignorable?` returns a truthy value are discarded, and the remainder are
  counted. Returns a map from each remaining token to a single-entry map of
  the relative path of `f` to the number of times the token occurs in it.

  The path is obtained with `clojure.java.io/as-relative-path`, which is
  documented to throw `IllegalArgumentException` for an absolute path."
  [f tokenise ignorable?]
  (let [source (file f)
        path (as-relative-path source)
        counts (->> (slurp source)
                    tokenise
                    (remove ignorable?)
                    frequencies)]
    (into {}
          (map (fn [[token n]] [token {path n}]))
          counts)))
(defn compile-index
  "Scans `dir-paths` as directories of Markdown files. Returns a map which keys
  each lexical token occurring in each file (with Markdown formatting, common
  words, punctuation etc excepted) to a map which keys the relative file path
  of each file in which the token occurs to the frequency the token occurs
  within the file.

  Each directory is walked recursively with `file-seq`; only regular files
  whose names end in `.md` are indexed. The per-file indexes produced by
  `compile-file` are combined with `deep-merge`. With no `dir-paths`, or no
  matching files, the result is an empty map."
  [& dir-paths]
  (let [ignorable-word? (set (get-ignorable-words))
        tokenise (tokenizer [:lower-case :concat-singles])]
    (reduce deep-merge {}
            ;; `mapcat`, not `map`: each directory yields a sequence of
            ;; per-file indexes, and these must be flattened into a single
            ;; sequence of maps before they can be merged.
            (mapcat (fn [dir]
                      (map #(compile-file % tokenise ignorable-word?)
                           (filter
                            (fn [^java.io.File f]
                              ;; Skip directories that happen to be named
                              ;; `*.md`; they cannot be slurped.
                              (and (.isFile f)
                                   (ends-with? (.getName f) ".md")))
                            (file-seq (file dir)))))
                    dir-paths))))