Minor improvements to indexing.
This commit is contained in:
parent
5e33f2c815
commit
21b6bfd67e
4 changed files with 42 additions and 23 deletions
15
README.md
15
README.md
|
|
@ -1,6 +1,12 @@
|
|||
# elboob
|
||||
|
||||
A site search engine for Cryogen with search on the client side
|
||||
A site search engine for [Cryogen](http://cryogenweb.org/) with search on the client side
|
||||
|
||||
## Justification
|
||||
|
||||
Left, of course.
|
||||
|
||||
More seriously, `elboob` is as near as I can get to an inversion of Google.
|
||||
|
||||
## Design intention
|
||||
|
||||
|
|
@ -32,7 +38,12 @@ Then the output should be
|
|||
|
||||
## Implementation
|
||||
|
||||
Has not started yet.
|
||||
Is at an early stage. I have a working indexer, which conforms to the specification given above. There are problems with it:
|
||||
|
||||
1. It contains many, many repetitions of long file path names, which results in a large data size (although it makes it efficient to search);
|
||||
2. It doesn't contain human-readable metadata about the files, which, given this is Cryogen and the files have metadata headers, it easily could.
|
||||
|
||||
I could assign a gensym to each file path name, store that gensym in the main index, and add a separate dictionary map entry to the index which translates those gensyms into the full file paths. That would substantially reduce the file size without greatly increasing the cost of search.
|
||||
|
||||
## License
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,11 @@
|
|||
(defproject cc.journeyman/elboob "0.1.0-SNAPSHOT"
|
||||
:description "FIXME: write description"
|
||||
:url "http://example.com/FIXME"
|
||||
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
|
||||
:url "https://www.eclipse.org/legal/epl-2.0/"}
|
||||
:url "https://git.journeyman.cc/simon/elboob"
|
||||
:license {:name "GPL-2.0-or-later"
|
||||
:url "https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html"}
|
||||
:dependencies [[dev.weavejester/medley "1.9.0"]
|
||||
[org.clojure/clojure "1.11.1"]
|
||||
[peco "0.1.6"]]
|
||||
:repl-options {:init-ns cc.journeyman.elboob.core}
|
||||
:resource-paths ["resources"]
|
||||
:source-paths ["src/clj"])
|
||||
|
|
|
|||
|
|
@ -4,18 +4,20 @@
|
|||
[medley.core :refer [deep-merge]]
|
||||
[peco.core :refer [tokenizer]]))
|
||||
|
||||
(defn get-ignorable-words []
|
||||
(let [locale (java.util.Locale/getDefault)
|
||||
tag (.toLanguageTag locale)
|
||||
language (.getLanguage locale)]
|
||||
|
||||
(first
|
||||
(map #(try (println (format "resources/ignorable-words%s.edn" %))
|
||||
(read-string
|
||||
(slurp
|
||||
(file (format "resources/ignorable-words%s.edn" %))))
|
||||
(catch Exception e (println (.getMessage e)) nil))
|
||||
[(str "." tag) (str "." language) ""]))))
|
||||
(defn get-ignorable-words
|
||||
"Retrieve a list of words from resources which are ignorable"
|
||||
([] (get-ignorable-words (java.util.Locale/getDefault)))
|
||||
([^java.util.Locale locale]
|
||||
(let [tag (.toLanguageTag locale)
|
||||
language (.getLanguage locale)]
|
||||
(first
|
||||
(remove
|
||||
empty?
|
||||
(map #(try (read-string
|
||||
(slurp
|
||||
(resource (format "ignorable-words%s.edn" %))))
|
||||
(catch Exception e (println (.getMessage e)) nil))
|
||||
[(str "." tag) (str "." language) ""]))))))
|
||||
|
||||
(defn compile-file
|
||||
"Compile an index for an individual file `f`, tokenised with `tokenise` and
|
||||
|
|
@ -26,20 +28,24 @@
|
|||
tokens (frequencies (remove ignorable? (tokenise (slurp f'))))]
|
||||
(reduce #(assoc %1 %2 {rel (tokens %2)}) {} (keys tokens))))
|
||||
|
||||
(defn files-with-suffix [dir suffix]
|
||||
(defn files-with-extension
|
||||
"Return a sequence of files from the specified `dir` which have the filename
|
||||
extension `extn`."
|
||||
[dir extn]
|
||||
(filter
|
||||
#(ends-with? (.getName %) suffix)
|
||||
#(ends-with? (.getName %) extn)
|
||||
(file-seq (file dir))))
|
||||
|
||||
(defn compile-index
|
||||
"scans `dir-paths` as directories of Markdown files. Returns a map which keys
|
||||
each lexical token occurring in each file (with Markdown formatting, common
|
||||
words, punctuation etc excepted) to a map which keys the relative file path
|
||||
of each file in which the token occurs to the frequency the token occurs within the file."
|
||||
of each file in which the token occurs to the frequency the token occurs
|
||||
within the file."
|
||||
[& dir-paths]
|
||||
(let [ignorable-word? (set (get-ignorable-words))
|
||||
tokenise (tokenizer [:lower-case :concat-singles])]
|
||||
(reduce deep-merge {}
|
||||
(map #(compile-file % tokenise ignorable-word?)
|
||||
(flatten (map #(files-with-suffix % ".md") dir-paths))))))
|
||||
(reduce deep-merge {}
|
||||
(map #(compile-file % tokenise ignorable-word?)
|
||||
(flatten (map #(files-with-extension % ".md") dir-paths))))))
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
(ns cc.journeyman.elboob.search)
|
||||
Loading…
Add table
Add a link
Reference in a new issue