From 21b6bfd67ea9bfe796f61785746e11ea022971e2 Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Fri, 31 Oct 2025 18:46:08 +0000 Subject: [PATCH] Minor improvements to indexing. --- README.md | 15 +++++++-- project.clj | 7 ++-- src/clj/cc/journeyman/elboob/core.clj | 42 ++++++++++++++---------- src/clj/cc/journeyman/elboob/search.cljc | 1 + 4 files changed, 42 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index 56f26ac..ac569c1 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,12 @@ # elboob -A site search engine for Cryogen with search on the client side +A site search engine for [Cryogen](http://cryogenweb.org/) with search on the client side + +## Justification + +Left, of course. + +More seriously, `elboob` is as near as I can get to an inversion of Google. ## Design intention @@ -32,7 +38,12 @@ Then the output should be ## Implementation - Has not started yet. + Is at an early stage. I have a working indexer, which conforms to the specification given above. There are problems with it: + + 1. It contains many, many repetitions of long file path names, which results in a large data size (although it makes it efficient to search); + 2. It doesn't contain human-readable metadata about the files, which, given this is Cryogen and the files have metadata headers, it easily could. + + I could assign a gensym to each file path name, store that gensym in the main index, and add a separate dictionary map entry to the index which translates those gensyms into the full file paths. That would substantially reduce the file size without greatly increasing the cost of search. 
## License diff --git a/project.clj b/project.clj index b5d83f0..5720301 100644 --- a/project.clj +++ b/project.clj @@ -1,10 +1,11 @@ (defproject cc.journeyman/elboob "0.1.0-SNAPSHOT" :description "FIXME: write description" - :url "http://example.com/FIXME" - :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0" - :url "https://www.eclipse.org/legal/epl-2.0/"} + :url "https://git.journeyman.cc/simon/elboob" + :license {:name "GPL-2.0-or-later" + :url "https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html"} :dependencies [[dev.weavejester/medley "1.9.0"] [org.clojure/clojure "1.11.1"] [peco "0.1.6"]] :repl-options {:init-ns cc.journeyman.elboob.core} + :resource-paths ["resources"] :source-paths ["src/clj"]) diff --git a/src/clj/cc/journeyman/elboob/core.clj b/src/clj/cc/journeyman/elboob/core.clj index 7016490..c06d8ee 100644 --- a/src/clj/cc/journeyman/elboob/core.clj +++ b/src/clj/cc/journeyman/elboob/core.clj @@ -4,18 +4,20 @@ [medley.core :refer [deep-merge]] [peco.core :refer [tokenizer]])) -(defn get-ignorable-words [] - (let [locale (java.util.Locale/getDefault) - tag (.toLanguageTag locale) - language (.getLanguage locale)] - - (first - (map #(try (println (format "resources/ignorable-words%s.edn" %)) - (read-string - (slurp - (file (format "resources/ignorable-words%s.edn" %)))) - (catch Exception e (println (.getMessage e)) nil)) - [(str "." tag) (str "." language) ""])))) +(defn get-ignorable-words + "Retrieve a list of words from resources which are ignorable" + ([] (get-ignorable-words (java.util.Locale/getDefault))) + ([^java.util.Locale locale] + (let [tag (.toLanguageTag locale) + language (.getLanguage locale)] + (first + (remove + empty? + (map #(try (read-string + (slurp + (resource (format "ignorable-words%s.edn" %)))) + (catch Exception e (println (.getMessage e)) nil)) + [(str "." tag) (str "." 
language) ""])))))) (defn compile-file "Compile an index for an individual file `f`, tokenised with `tokenise` and @@ -26,20 +28,24 @@ tokens (frequencies (remove ignorable? (tokenise (slurp f'))))] (reduce #(assoc %1 %2 {rel (tokens %2)}) {} (keys tokens)))) -(defn files-with-suffix [dir suffix] +(defn files-with-extension + "Return a sequence of files from the specified `dir` which have the filename + extension `extn`." + [dir extn] (filter - #(ends-with? (.getName %) suffix) + #(ends-with? (.getName %) extn) (file-seq (file dir)))) (defn compile-index "scans `dir-paths` as directories of Markdown files. Returns a map which keys each lexical token occurring in each file (with Markdown formatting, common words, punctuation etc excepted) to a map which keys the relative file path - of each file in which the token occurs to the frequency the token occurs within the file." + of each file in which the token occurs to the frequency the token occurs + within the file." [& dir-paths] (let [ignorable-word? (set (get-ignorable-words)) tokenise (tokenizer [:lower-case :concat-singles])] - (reduce deep-merge {} - (map #(compile-file % tokenise ignorable-word?) - (flatten (map #(files-with-suffix % ".md") dir-paths)))))) + (reduce deep-merge {} + (map #(compile-file % tokenise ignorable-word?) + (flatten (map #(files-with-extension % ".md") dir-paths)))))) diff --git a/src/clj/cc/journeyman/elboob/search.cljc b/src/clj/cc/journeyman/elboob/search.cljc index e69de29..d6cbba2 100644 --- a/src/clj/cc/journeyman/elboob/search.cljc +++ b/src/clj/cc/journeyman/elboob/search.cljc @@ -0,0 +1 @@ +(ns cc.journeyman.elboob.search) \ No newline at end of file