Compare commits

...

2 commits

Author SHA1 Message Date
86665db3b8 Added more ignorables to .gitignore 2025-10-31 15:41:02 +00:00
ea2d6927d2 OK, this can now compile an index for a single file, and do it very quickly;
it cannot yet iterate over a directory, and I need to work out why not.

But progress!
2025-10-31 15:37:36 +00:00
5 changed files with 43 additions and 5 deletions

4
.gitignore vendored
View file

@ -13,4 +13,6 @@ pom.xml.asc
.lein-failures .lein-failures
.nrepl-port .nrepl-port
.cpcache/ .cpcache/
.lsp/
.clj-kondo/
.portal/

View file

@ -3,7 +3,8 @@
:url "http://example.com/FIXME" :url "http://example.com/FIXME"
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0" :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
:url "https://www.eclipse.org/legal/epl-2.0/"} :url "https://www.eclipse.org/legal/epl-2.0/"}
:dependencies [[org.clojure/clojure "1.11.1"] :dependencies [[dev.weavejester/medley "1.9.0"]
[org.clojure/clojure "1.11.1"]
[peco "0.1.6"]] [peco "0.1.6"]]
:repl-options {:init-ns cc.journeyman.elboob.core} :repl-options {:init-ns cc.journeyman.elboob.core}
:source-paths ["src/clj"]) :source-paths ["src/clj"])

View file

@ -0,0 +1 @@
ignorable-words.en.edn

View file

@ -1,9 +1,43 @@
(ns cc.journeyman.elboob.core) (ns cc.journeyman.elboob.core
(:require [clojure.java.io :refer [as-relative-path file resource]]
[clojure.string :refer [ends-with?]]
[medley.core :refer [deep-merge]]
[peco.core :refer [tokenizer]]))
(defn get-ignorable-words
  "Load the collection of ignorable words for the JVM's default locale.

  Candidate files are tried from most to least specific:

  1. `resources/ignorable-words.<language-tag>.edn` (e.g. `.en-GB`);
  2. `resources/ignorable-words.<language>.edn` (e.g. `.en`);
  3. `resources/ignorable-words.edn`.

  Each candidate path is printed before it is tried, and the message of any
  exception raised while reading it is printed too. Returns the value read
  from the first candidate which can be read, or `nil` if none can."
  []
  (let [locale (java.util.Locale/getDefault)
        tag (.toLanguageTag locale)
        language (.getLanguage locale)]
    ;; `some` tries the candidates one at a time and stops at the first one
    ;; that yields a value. `(first (map ...))` returned the result for the
    ;; first candidate even when that was nil, so the fallbacks were never used.
    (some (fn [suffix]
            (let [path (format "resources/ignorable-words%s.edn" suffix)]
              (println path)
              (try
                ;; NOTE(review): `clojure.core/read-string` can evaluate code
                ;; embedded in its input unless `*read-eval*` is false;
                ;; consider `clojure.edn/read-string` if these files might
                ;; ever come from an untrusted source.
                (read-string (slurp (file path)))
                (catch Exception e
                  (println (.getMessage e))
                  nil))))
          [(str "." tag) (str "." language) ""])))
(defn compile-file
  "Build the index for the single file `f`.

  The content of `f` is split into tokens by the function `tokenise`; tokens
  for which the predicate `ignorable?` returns a truthy value are dropped.
  Returns a map from each remaining token to a single-entry map of the path
  of `f` to the number of times the token occurs in it.

  `f` must denote a relative path: `as-relative-path` throws an
  `IllegalArgumentException` when given an absolute one."
  [f tokenise ignorable?]
  (let [source (file f)
        path (as-relative-path source)
        counts (->> (slurp source)
                    tokenise
                    (remove ignorable?)
                    frequencies)]
    (into {}
          (map (fn [[token n]] [token {path n}]))
          counts)))
(defn compile-index
  "Scans `dir-paths` as directories of Markdown files. Returns a map which keys
  each lexical token occurring in each file (with Markdown formatting, common
  words, punctuation etc excepted) to a map which keys the relative file path
  of each file in which the token occurs to the frequency the token occurs
  within the file.

  Each directory is walked recursively with `file-seq`; only regular files
  whose names end in `.md` are indexed. The per-file indexes produced by
  `compile-file` are combined with `deep-merge`."
  [& dir-paths]
  (let [ignorable-word? (set (get-ignorable-words))
        tokenise (tokenizer [:lower-case :concat-singles])]
    (->> dir-paths
         ;; `mapcat` flattens the per-directory listings into one sequence of
         ;; files. Nesting `map` inside `map` handed `deep-merge` sequences of
         ;; maps rather than maps, which it cannot merge.
         (mapcat #(file-seq (file %)))
         ;; `.isFile` keeps a directory which happens to be named `*.md` from
         ;; being passed to `slurp`.
         (filter (fn [^java.io.File f]
                   (and (.isFile f)
                        (ends-with? (.getName f) ".md"))))
         (map #(compile-file % tokenise ignorable-word?))
         (reduce deep-merge {}))))