Minor improvements to indexing.

This commit is contained in:
Simon Brooke 2025-10-31 18:46:08 +00:00
parent 5e33f2c815
commit 21b6bfd67e
4 changed files with 42 additions and 23 deletions

View file

@ -1,6 +1,12 @@
# elboob
A site search engine for Cryogen with search on the client side
A site search engine for [Cryogen](http://cryogenweb.org/) with search on the client side
## Justification
Left, of course.
More seriously `elboob` is as near as I can get to an inversion of Google.
## Design intention
@ -32,7 +38,12 @@ Then the output should be
## Implementation
Has not started yet.
Is at an early stage. I have a working indexer, which conforms to the specification given above. There are problems with it:
1. It contains many, many repetitions of long file path names, which results in a large data size (although it makes it efficient to search);
2. It doesn't contain human readable metadata about the files, which, given this is Cryogen and the files have metadata headers, it easily could.
I could assign a gensym to each file path name, store that gensym in the main index, and add a separate dictionary map entry to the index which translates those gensyms into the full file paths. That would substantially reduce the file size without greatly increasing the cost of search.
## License

View file

@ -1,10 +1,11 @@
;; Leiningen project definition for cc.journeyman/elboob.
;; NOTE(review): :url and :license each appear twice in this form; the later
;; pair (git.journeyman.cc / GPL-2.0-or-later) looks like the intended one —
;; confirm and drop the earlier pair.
(defproject cc.journeyman/elboob "0.1.0-SNAPSHOT"
;; NOTE(review): :description is still the template placeholder.
:description "FIXME: write description"
:url "http://example.com/FIXME"
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
:url "https://www.eclipse.org/legal/epl-2.0/"}
:url "https://git.journeyman.cc/simon/elboob"
:license {:name "GPL-2.0-or-later"
:url "https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html"}
;; Libraries the project depends on.
:dependencies [[dev.weavejester/medley "1.9.0"]
[org.clojure/clojure "1.11.1"]
[peco "0.1.6"]]
;; Namespace the REPL starts in.
:repl-options {:init-ns cc.journeyman.elboob.core}
:resource-paths ["resources"]
:source-paths ["src/clj"])

View file

@ -4,18 +4,20 @@
[medley.core :refer [deep-merge]]
[peco.core :refer [tokenizer]]))
(defn get-ignorable-words
  "Return the ignorable-word collection for the JVM's default locale, or `nil`
  when no candidate file yields one.

  Candidate files are tried from most to least specific:
  `resources/ignorable-words.<language-tag>.edn`, then
  `resources/ignorable-words.<language>.edn`, then
  `resources/ignorable-words.edn`. Each candidate path is printed before it
  is read. A candidate which cannot be read has its exception message printed
  and is skipped; the first candidate yielding a non-empty collection wins."
  []
  (let [locale   (java.util.Locale/getDefault)
        suffixes [(str "." (.toLanguageTag locale))
                  (str "." (.getLanguage locale))
                  ""]
        read-candidate
        (fn [suffix]
          (let [path (format "resources/ignorable-words%s.edn" suffix)]
            (println path)
            (try
              ;; NOTE(review): `read-string` can evaluate code embedded in
              ;; the text it reads (see `*read-eval*`); consider
              ;; `clojure.edn/read-string` for plain data files.
              (not-empty (read-string (slurp (file path))))
              (catch Exception e
                (println (.getMessage e))
                nil))))]
    ;; `some`, not `first`: taking the first result returned nil whenever the
    ;; most specific file was missing, so the less specific files were never
    ;; used as fallbacks.
    (some read-candidate suffixes)))
(defn get-ignorable-words
  "Retrieve the collection of ignorable words for `locale` (by default the
  JVM's default locale) from resources, or `nil` if no candidate resource
  yields a non-empty collection.

  Candidate resources are tried from most to least specific:
  `ignorable-words.<language-tag>.edn`, then `ignorable-words.<language>.edn`,
  then `ignorable-words.edn`. Candidates which do not exist are skipped
  silently; a candidate which exists but cannot be read or parsed has its
  exception message printed and is skipped."
  ([] (get-ignorable-words (java.util.Locale/getDefault)))
  ([^java.util.Locale locale]
   (let [suffixes [(str "." (.toLanguageTag locale))
                   (str "." (.getLanguage locale))
                   ""]
         read-candidate
         (fn [suffix]
           (try
             ;; `resource` returns nil for a name it cannot find. Guard here
             ;; so an expected absence is not handed to `slurp` and reported
             ;; as an error.
             (when-let [url (resource (format "ignorable-words%s.edn" suffix))]
               ;; NOTE(review): `read-string` can evaluate code embedded in
               ;; the text it reads (see `*read-eval*`); consider
               ;; `clojure.edn/read-string` for plain data files.
               (not-empty (read-string (slurp url))))
             (catch Exception e
               (println (.getMessage e))
               nil)))]
     ;; `some` stops at the first usable candidate instead of reading all three.
     (some read-candidate suffixes))))
(defn compile-file
"Compile an index for an individual file `f`, tokenised with `tokenise` and
@ -26,20 +28,24 @@
tokens (frequencies (remove ignorable? (tokenise (slurp f'))))]
(reduce #(assoc %1 %2 {rel (tokens %2)}) {} (keys tokens))))
(defn files-with-suffix [dir suffix]
(defn files-with-extension
"Return a sequence of those files found by `file-seq` under the specified
`dir` whose names end with the filename extension `extn`. `extn` is matched
against the end of the file name with `ends-with?`, so it should include
the leading dot."
[dir extn]
(filter
#(ends-with? (.getName %) suffix)
#(ends-with? (.getName %) extn)
(file-seq (file dir))))
(defn compile-index
"scans `dir-paths` as directories of Markdown files. Returns a map which keys
each lexical token occurring in each file (with Markdown formatting, common
words, punctuation etc excepted) to a map which keys the relative file path
of each file in which the token occurs to the frequency the token occurs within the file."
of each file in which the token occurs to the frequency the token occurs
within the file."
[& dir-paths]
(let [ignorable-word? (set (get-ignorable-words))
tokenise (tokenizer [:lower-case :concat-singles])]
(reduce deep-merge {}
(map #(compile-file % tokenise ignorable-word?)
(flatten (map #(files-with-suffix % ".md") dir-paths))))))
(reduce deep-merge {}
(map #(compile-file % tokenise ignorable-word?)
(flatten (map #(files-with-extension % ".md") dir-paths))))))

View file

@ -0,0 +1 @@
(ns cc.journeyman.elboob.search)