Minor improvements to indexing.
This commit is contained in:
		
							parent
							
								
									5e33f2c815
								
							
						
					
					
						commit
						21b6bfd67e
					
				
					 4 changed files with 42 additions and 23 deletions
				
			
		
							
								
								
									
										15
									
								
								README.md
									
										
									
									
									
								
							
							
						
						
									
										15
									
								
								README.md
									
										
									
									
									
								
							| 
						 | 
					@ -1,6 +1,12 @@
 | 
				
			||||||
# elboob
 | 
					# elboob
 | 
				
			||||||
 | 
					
 | 
				
			||||||
A site search engine for Cryogen with search on the client side
 | 
					A site search engine for [Cryogen](http://cryogenweb.org/) with search on the client side
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Justification
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Left, of course.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					More seriously, `elboob` is as near as I can get to an inversion of Google.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Design intention
 | 
					## Design intention
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -32,7 +38,12 @@ Then the output should be
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 ## Implementation
 | 
					 ## Implementation
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 Has not started yet.
 | 
					 Is at an early stage. I have a working indexer, which conforms to the specification given above. There are problems with it:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 1. It contains many many repetitions of long file path names, which results in a large data size (although it makes it efficient to search);
 | 
				
			||||||
 | 
					 2. It doesn't contain human-readable metadata about the files, which, given this is Cryogen and the files have metadata headers, it easily could.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					 I could assign a gensym to each file path name, store that gensym in the main index, and add a separate dictionary map entry to the index which translates those gensyms into the full file paths. That would substantially reduce the file size without greatly increasing the cost of search.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 ## License
 | 
					 ## License
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,10 +1,11 @@
 | 
				
			||||||
(defproject cc.journeyman/elboob "0.1.0-SNAPSHOT"
 | 
					(defproject cc.journeyman/elboob "0.1.0-SNAPSHOT"
 | 
				
			||||||
  :description "FIXME: write description"
 | 
					  :description "FIXME: write description"
 | 
				
			||||||
  :url "http://example.com/FIXME"
 | 
					  :url "https://git.journeyman.cc/simon/elboob"
 | 
				
			||||||
  :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
 | 
					  :license {:name "GPL-2.0-or-later"
 | 
				
			||||||
            :url "https://www.eclipse.org/legal/epl-2.0/"}
 | 
					            :url "https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html"}
 | 
				
			||||||
  :dependencies [[dev.weavejester/medley "1.9.0"]
 | 
					  :dependencies [[dev.weavejester/medley "1.9.0"]
 | 
				
			||||||
                 [org.clojure/clojure "1.11.1"]
 | 
					                 [org.clojure/clojure "1.11.1"]
 | 
				
			||||||
                 [peco "0.1.6"]]
 | 
					                 [peco "0.1.6"]]
 | 
				
			||||||
  :repl-options {:init-ns cc.journeyman.elboob.core}
 | 
					  :repl-options {:init-ns cc.journeyman.elboob.core}
 | 
				
			||||||
 | 
					  :resource-paths ["resources"]
 | 
				
			||||||
  :source-paths      ["src/clj"])
 | 
					  :source-paths      ["src/clj"])
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -4,18 +4,20 @@
 | 
				
			||||||
            [medley.core :refer [deep-merge]]
 | 
					            [medley.core :refer [deep-merge]]
 | 
				
			||||||
            [peco.core :refer [tokenizer]]))
 | 
					            [peco.core :refer [tokenizer]]))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
(defn get-ignorable-words []
 | 
					(defn get-ignorable-words
 | 
				
			||||||
  (let [locale (java.util.Locale/getDefault)
 | 
					  "Retrieve a list of ignorable words from resources"
 | 
				
			||||||
        tag (.toLanguageTag locale)
 | 
					  ([] (get-ignorable-words (java.util.Locale/getDefault)))
 | 
				
			||||||
        language (.getLanguage locale)]
 | 
					  ([^java.util.Locale locale]
 | 
				
			||||||
 | 
					   (let [tag (.toLanguageTag locale)
 | 
				
			||||||
    (first
 | 
					         language (.getLanguage locale)]
 | 
				
			||||||
     (map #(try (println (format "resources/ignorable-words%s.edn" %))
 | 
					     (first
 | 
				
			||||||
                (read-string
 | 
					      (remove
 | 
				
			||||||
                 (slurp
 | 
					       empty?
 | 
				
			||||||
                  (file (format "resources/ignorable-words%s.edn" %))))
 | 
					       (map #(try (read-string
 | 
				
			||||||
                (catch Exception e (println (.getMessage e)) nil))
 | 
					                   (slurp
 | 
				
			||||||
          [(str "." tag) (str "." language) ""]))))
 | 
					                    (resource (format "ignorable-words%s.edn" %))))
 | 
				
			||||||
 | 
					                  (catch Exception e (println (.getMessage e)) nil))
 | 
				
			||||||
 | 
					            [(str "." tag) (str "." language) ""]))))))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
(defn compile-file
 | 
					(defn compile-file
 | 
				
			||||||
  "Compile an index for an individual file `f`, tokenised with `tokenise` and 
 | 
					  "Compile an index for an individual file `f`, tokenised with `tokenise` and 
 | 
				
			||||||
| 
						 | 
					@ -26,20 +28,24 @@
 | 
				
			||||||
        tokens (frequencies (remove ignorable? (tokenise (slurp f'))))]
 | 
					        tokens (frequencies (remove ignorable? (tokenise (slurp f'))))]
 | 
				
			||||||
    (reduce #(assoc %1 %2 {rel (tokens %2)}) {} (keys tokens))))
 | 
					    (reduce #(assoc %1 %2 {rel (tokens %2)}) {} (keys tokens))))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
(defn files-with-suffix [dir suffix]
 | 
					(defn files-with-extension 
 | 
				
			||||||
 | 
					  "Return a sequence of files from the specified `dir` which have the filename
 | 
				
			||||||
 | 
					   extension `extn`."
 | 
				
			||||||
 | 
					  [dir extn]
 | 
				
			||||||
  (filter
 | 
					  (filter
 | 
				
			||||||
   #(ends-with? (.getName %) suffix)
 | 
					   #(ends-with? (.getName %) extn)
 | 
				
			||||||
   (file-seq (file dir))))
 | 
					   (file-seq (file dir))))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
(defn compile-index
 | 
					(defn compile-index
 | 
				
			||||||
  "scans `dir-paths` as directories of Markdown files. Returns a map which keys
 | 
					  "scans `dir-paths` as directories of Markdown files. Returns a map which keys
 | 
				
			||||||
   each lexical token occurring in each file (with Markdown formatting, common
 | 
					   each lexical token occurring in each file (with Markdown formatting, common
 | 
				
			||||||
   words, punctuation etc excepted) to a map which keys the relative file path 
 | 
					   words, punctuation etc excepted) to a map which keys the relative file path 
 | 
				
			||||||
   of each file in which the token occurs to the frequency the token occurs within the file."
 | 
					   of each file in which the token occurs to the frequency the token occurs 
 | 
				
			||||||
 | 
					   within the file."
 | 
				
			||||||
  [& dir-paths]
 | 
					  [& dir-paths]
 | 
				
			||||||
  (let [ignorable-word? (set (get-ignorable-words))
 | 
					  (let [ignorable-word? (set (get-ignorable-words))
 | 
				
			||||||
        tokenise (tokenizer [:lower-case :concat-singles])]
 | 
					        tokenise (tokenizer [:lower-case :concat-singles])]
 | 
				
			||||||
    (reduce deep-merge {}
 | 
					    (reduce deep-merge {}
 | 
				
			||||||
                   (map #(compile-file % tokenise ignorable-word?)
 | 
					            (map #(compile-file % tokenise ignorable-word?)
 | 
				
			||||||
                        (flatten (map #(files-with-suffix % ".md") dir-paths))))))
 | 
					                 (flatten (map #(files-with-extension % ".md") dir-paths))))))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					(ns cc.journeyman.elboob.search)
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue