OK, this can now compile an index for a single file, and do it very quickly;

it cannot yet iterate over a directory, and I need to work out why not.

But progress!
This commit is contained in:
Simon Brooke 2025-10-31 15:37:36 +00:00
parent 73549a5c90
commit ea2d6927d2
4 changed files with 40 additions and 4 deletions

View file

@ -3,7 +3,8 @@
:url "http://example.com/FIXME" :url "http://example.com/FIXME"
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0" :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
:url "https://www.eclipse.org/legal/epl-2.0/"} :url "https://www.eclipse.org/legal/epl-2.0/"}
:dependencies [[org.clojure/clojure "1.11.1"] :dependencies [[dev.weavejester/medley "1.9.0"]
[org.clojure/clojure "1.11.1"]
[peco "0.1.6"]] [peco "0.1.6"]]
:repl-options {:init-ns cc.journeyman.elboob.core} :repl-options {:init-ns cc.journeyman.elboob.core}
:source-paths ["src/clj"]) :source-paths ["src/clj"])

View file

@ -0,0 +1 @@
ignorable-words.en.edn

View file

@ -1,9 +1,43 @@
(ns cc.journeyman.elboob.core) (ns cc.journeyman.elboob.core
(:require [clojure.java.io :refer [as-relative-path file resource]]
[clojure.string :refer [ends-with?]]
[medley.core :refer [deep-merge]]
[peco.core :refer [tokenizer]]))
(defn get-ignorable-words
  "Read the list of ignorable words for the JVM's default locale.

  Candidate files are tried from most to least specific:
  `resources/ignorable-words.<language-tag>.edn`, then
  `resources/ignorable-words.<language>.edn`, then
  `resources/ignorable-words.edn`. Returns the value read from the first
  candidate that can be read successfully, or `nil` if none can.

  Each candidate path is printed before it is tried, and the message of any
  exception raised while reading a candidate is printed too."
  []
  (let [locale (java.util.Locale/getDefault)
        tag (.toLanguageTag locale)
        language (.getLanguage locale)]
    ;; `some` yields the first non-nil result and stops there. The previous
    ;; `(first (map ...))` returned the result for the FIRST candidate even
    ;; when that was nil (file missing), so the fallbacks were never used.
    (some (fn [suffix]
            (try
              (println (format "resources/ignorable-words%s.edn" suffix))
              ;; NOTE(review): `read-string` honours `*read-eval*`; if these
              ;; files could ever come from an untrusted source, switch to
              ;; `clojure.edn/read-string`.
              (read-string
               (slurp
                (file (format "resources/ignorable-words%s.edn" suffix))))
              (catch Exception e
                (println (.getMessage e))
                nil)))
          [(str "." tag) (str "." language) ""])))
(defn compile-file
  "Build the index contribution of the single file `f`.

  The content of `f` is split into tokens by the function `tokenise`; tokens
  for which `ignorable?` returns truthy are dropped. Returns a map from each
  remaining token to a one-entry map of the relative path of `f` to the
  number of times the token occurs in it."
  [f tokenise ignorable?]
  (let [source (file f)
        path (as-relative-path source)
        counts (->> (slurp source)
                    tokenise
                    (remove ignorable?)
                    frequencies)]
    (into {}
          (map (fn [[token n]] [token {path n}]))
          counts)))
(defn compile-index
  "Scans `dir-paths` as directories of Markdown files. Returns a map which keys
  each lexical token occurring in each file (with Markdown formatting, common
  words, punctuation etc excepted) to a map which keys the relative file path
  of each file in which the token occurs to the frequency the token occurs
  within the file.

  Every directory is walked recursively with `file-seq`; only regular files
  whose name ends in `.md` are indexed."
  [& dir-paths]
  (let [ignorable-word? (set (get-ignorable-words))
        tokenise (tokenizer [:lower-case :concat-singles])]
    (->> dir-paths
         ;; `mapcat` flattens the per-directory listings into ONE sequence of
         ;; files. The earlier nested `map` produced a sequence of sequences,
         ;; so `deep-merge` was handed a sequence where it expects a map.
         (mapcat #(file-seq (file %)))
         ;; `file-seq` also yields directories; skip any whose name happens
         ;; to end in `.md`, since they cannot be slurped.
         (filter (fn [^java.io.File f]
                   (and (.isFile f)
                        (ends-with? (.getName f) ".md"))))
         (map #(compile-file % tokenise ignorable-word?))
         (reduce deep-merge {}))))