Minor improvements to indexing.

2025-10-31 18:46:08 +00:00 · 2025-10-31 18:46:08 +00:00 · 21b6bfd67e
commit 21b6bfd67e
parent 5e33f2c815
4 changed files with 42 additions and 23 deletions
--- a/README.md
+++ b/README.md
@ -1,6 +1,12 @@
 # elboob

-A site search engine for Cryogen with search on the client side
+A site search engine for [Cryogen](http://cryogenweb.org/) with search on the client side
+
+## Justification
+
+Left, of course.
+
+More seriously, `elboob` is as near as I can get to an inversion of Google.

 ## Design intention

@ -32,7 +38,12 @@ Then the output should be

 ## Implementation

- Has not started yet.
+ Is at an early stage. I have a working indexer, which conforms to the specification given above. There are problems with it:
+
+ 1. It contains many, many repetitions of long file path names, which results in a large data size (although this makes it efficient to search);
+ 2. It doesn't contain human-readable metadata about the files, which, given this is Cryogen and the files have metadata headers, it easily could.
+
+ I could assign a gensym to each file path name, store that gensym in the main index, and add a separate dictionary map entry to the index which translates those gensyms into the full file paths. That would substantially reduce the file size without greatly increasing the cost of search.

 ## License

--- a/project.clj
+++ b/project.clj
@ -1,10 +1,11 @@
 (defproject cc.journeyman/elboob "0.1.0-SNAPSHOT"
  :description "FIXME: write description"
-  :url "http://example.com/FIXME"
-  :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
-            :url "https://www.eclipse.org/legal/epl-2.0/"}
+  :url "https://git.journeyman.cc/simon/elboob"
+  :license {:name "GPL-2.0-or-later"
+            :url "https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html"}
  :dependencies [[dev.weavejester/medley "1.9.0"]
                 [org.clojure/clojure "1.11.1"]
                 [peco "0.1.6"]]
  :repl-options {:init-ns cc.journeyman.elboob.core}
+  :resource-paths ["resources"]
  :source-paths      ["src/clj"])
--- a/src/clj/cc/journeyman/elboob/core.clj
+++ b/src/clj/cc/journeyman/elboob/core.clj
@ -4,18 +4,20 @@
            [medley.core :refer [deep-merge]]
            [peco.core :refer [tokenizer]]))

-(defn get-ignorable-words []
-  (let [locale (java.util.Locale/getDefault)
-        tag (.toLanguageTag locale)
-        language (.getLanguage locale)]
-
-    (first
-     (map #(try (println (format "resources/ignorable-words%s.edn" %))
-                (read-string
-                 (slurp
-                  (file (format "resources/ignorable-words%s.edn" %))))
-                (catch Exception e (println (.getMessage e)) nil))
-          [(str "." tag) (str "." language) ""]))))
+(defn get-ignorable-words
+  "Retrieve a list of words from resources which are ignorable"
+  ([] (get-ignorable-words (java.util.Locale/getDefault)))
+  ([^java.util.Locale locale]
+   (let [tag (.toLanguageTag locale)
+         language (.getLanguage locale)]
+     (first
+      (remove
+       empty?
+       (map #(try (read-string
+                   (slurp
+                    (resource (format "ignorable-words%s.edn" %))))
+                  (catch Exception e (println (.getMessage e)) nil))
+            [(str "." tag) (str "." language) ""]))))))

 (defn compile-file
  "Compile an index for an individual file `f`, tokenised with `tokenise` and 
@ -26,20 +28,24 @@
        tokens (frequencies (remove ignorable? (tokenise (slurp f'))))]
    (reduce #(assoc %1 %2 {rel (tokens %2)}) {} (keys tokens))))

-(defn files-with-suffix [dir suffix]
+(defn files-with-extension 
+  "Return a sequence of files from the specified `dir` which have the filename
+   extension `extn`."
+  [dir extn]
  (filter
-   #(ends-with? (.getName %) suffix)
+   #(ends-with? (.getName %) extn)
   (file-seq (file dir))))

 (defn compile-index
  "scans `dir-paths` as directories of Markdown files. Returns a map which keys
   each lexical token occurring in each file (with Markdown formatting, common
   words, punctuation etc excepted) to a map which keys the relative file path 
-   of each file in which the token occurs to the frequency the token occurs within the file."
+   of each file in which the token occurs to the frequency the token occurs 
+   within the file."
  [& dir-paths]
  (let [ignorable-word? (set (get-ignorable-words))
        tokenise (tokenizer [:lower-case :concat-singles])]
-    (reduce deep-merge {} 
-                   (map #(compile-file % tokenise ignorable-word?)
-                        (flatten (map #(files-with-suffix % ".md") dir-paths))))))
+    (reduce deep-merge {}
+            (map #(compile-file % tokenise ignorable-word?)
+                 (flatten (map #(files-with-extension % ".md") dir-paths))))))

--- a/src/clj/cc/journeyman/elboob/search.cljc
+++ b/src/clj/cc/journeyman/elboob/search.cljc
@ -0,0 +1 @@
+(ns cc.journeyman.elboob.search)