From 21b6bfd67ea9bfe796f61785746e11ea022971e2 Mon Sep 17 00:00:00 2001
From: Simon Brooke <simon@journeyman.cc>
Date: Fri, 31 Oct 2025 18:46:08 +0000
Subject: [PATCH] Minor improvements to indexing.

---
 README.md                                | 15 +++++++--
 project.clj                              |  7 ++--
 src/clj/cc/journeyman/elboob/core.clj    | 42 ++++++++++++++----------
 src/clj/cc/journeyman/elboob/search.cljc |  1 +
 4 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index 56f26ac..ac569c1 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,12 @@
 # elboob
 
-A site search engine for Cryogen with search on the client side
+A site search engine for [Cryogen](http://cryogenweb.org/) with search on the client side
+
+## Justification
+
+Left, of course.
+
+More seriously, `elboob` is as near as I can get to an inversion of Google.
 
 ## Design intention
 
@@ -32,7 +38,12 @@ Then the output should be
 
  ## Implementation
 
- Has not started yet.
+ Is at an early stage. I have a working indexer, which conforms to the specification given above. There are problems with it:
+
+ 1. It contains many, many repetitions of long file path names, which results in a large data size (although it makes it efficient to search);
+ 2. It doesn't contain human readable metadata about the files, which, given this is Cryogen and the files have metadata headers, it easily could.
+
+ I could assign a gensym to each file path name, store that gensym in the main index, and add a separate dictionary map entry to the index which translates those gensyms into the full file paths. That would substantially reduce the file size without greatly increasing the cost of search.
 
  ## License
 
diff --git a/project.clj b/project.clj
index b5d83f0..5720301 100644
--- a/project.clj
+++ b/project.clj
@@ -1,10 +1,11 @@
 (defproject cc.journeyman/elboob "0.1.0-SNAPSHOT"
   :description "FIXME: write description"
-  :url "http://example.com/FIXME"
-  :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
-            :url "https://www.eclipse.org/legal/epl-2.0/"}
+  :url "https://git.journeyman.cc/simon/elboob"
+  :license {:name "GPL-2.0-or-later"
+            :url "https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html"}
   :dependencies [[dev.weavejester/medley "1.9.0"]
                  [org.clojure/clojure "1.11.1"]
                  [peco "0.1.6"]]
   :repl-options {:init-ns cc.journeyman.elboob.core}
+  :resource-paths ["resources"]
   :source-paths      ["src/clj"])
diff --git a/src/clj/cc/journeyman/elboob/core.clj b/src/clj/cc/journeyman/elboob/core.clj
index 7016490..c06d8ee 100644
--- a/src/clj/cc/journeyman/elboob/core.clj
+++ b/src/clj/cc/journeyman/elboob/core.clj
@@ -4,18 +4,20 @@
             [medley.core :refer [deep-merge]]
             [peco.core :refer [tokenizer]]))
 
-(defn get-ignorable-words []
-  (let [locale (java.util.Locale/getDefault)
-        tag (.toLanguageTag locale)
-        language (.getLanguage locale)]
-
-    (first
-     (map #(try (println (format "resources/ignorable-words%s.edn" %))
-                (read-string
-                 (slurp
-                  (file (format "resources/ignorable-words%s.edn" %))))
-                (catch Exception e (println (.getMessage e)) nil))
-          [(str "." tag) (str "." language) ""]))))
+(defn get-ignorable-words
+  "Retrieve a list of words from resources which are ignorable"
+  ([] (get-ignorable-words (java.util.Locale/getDefault)))
+  ([^java.util.Locale locale]
+   (let [tag (.toLanguageTag locale)
+         language (.getLanguage locale)]
+     (first
+      (remove
+       empty?
+       (map #(try (read-string
+                   (slurp
+                    (resource (format "ignorable-words%s.edn" %))))
+                  (catch Exception e (println (.getMessage e)) nil))
+            [(str "." tag) (str "." language) ""]))))))
 
 (defn compile-file
   "Compile an index for an individual file `f`, tokenised with `tokenise` and 
@@ -26,20 +28,24 @@
         tokens (frequencies (remove ignorable? (tokenise (slurp f'))))]
     (reduce #(assoc %1 %2 {rel (tokens %2)}) {} (keys tokens))))
 
-(defn files-with-suffix [dir suffix]
+(defn files-with-extension 
+  "Return a sequence of files from the specified `dir` which have the filename
+   extension `extn`."
+  [dir extn]
   (filter
-   #(ends-with? (.getName %) suffix)
+   #(ends-with? (.getName %) extn)
    (file-seq (file dir))))
 
 (defn compile-index
   "scans `dir-paths` as directories of Markdown files. Returns a map which keys
    each lexical token occurring in each file (with Markdown formatting, common
    words, punctuation etc excepted) to a map which keys the relative file path 
-   of each file in which the token occurs to the frequency the token occurs within the file."
+   of each file in which the token occurs to the frequency the token occurs 
+   within the file."
   [& dir-paths]
   (let [ignorable-word? (set (get-ignorable-words))
         tokenise (tokenizer [:lower-case :concat-singles])]
-    (reduce deep-merge {} 
-                   (map #(compile-file % tokenise ignorable-word?)
-                        (flatten (map #(files-with-suffix % ".md") dir-paths))))))
+    (reduce deep-merge {}
+            (map #(compile-file % tokenise ignorable-word?)
+                 (flatten (map #(files-with-extension % ".md") dir-paths))))))
 
diff --git a/src/clj/cc/journeyman/elboob/search.cljc b/src/clj/cc/journeyman/elboob/search.cljc
index e69de29..d6cbba2 100644
--- a/src/clj/cc/journeyman/elboob/search.cljc
+++ b/src/clj/cc/journeyman/elboob/search.cljc
@@ -0,0 +1 @@
+(ns cc.journeyman.elboob.search)
\ No newline at end of file