OK, this can now compile an index for a single file, and do it very quickly;
it cannot yet iterate over a directory, and I need to work out why not. But progress!
This commit is contained in:
		
							parent
							
								
									73549a5c90
								
							
						
					
					
						commit
						ea2d6927d2
					
				
					 4 changed files with 40 additions and 4 deletions
				
			
		| 
						 | 
					@ -3,7 +3,8 @@
 | 
				
			||||||
  :url "http://example.com/FIXME"
 | 
					  :url "http://example.com/FIXME"
 | 
				
			||||||
  :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
 | 
					  :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
 | 
				
			||||||
            :url "https://www.eclipse.org/legal/epl-2.0/"}
 | 
					            :url "https://www.eclipse.org/legal/epl-2.0/"}
 | 
				
			||||||
  :dependencies [[org.clojure/clojure "1.11.1"]
 | 
					  :dependencies [[dev.weavejester/medley "1.9.0"]
 | 
				
			||||||
 | 
					                 [org.clojure/clojure "1.11.1"]
 | 
				
			||||||
                 [peco "0.1.6"]]
 | 
					                 [peco "0.1.6"]]
 | 
				
			||||||
  :repl-options {:init-ns cc.journeyman.elboob.core}
 | 
					  :repl-options {:init-ns cc.journeyman.elboob.core}
 | 
				
			||||||
  :source-paths      ["src/clj"])
 | 
					  :source-paths      ["src/clj"])
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										1
									
								
								resources/ignorable-words.en-GB.edn
									
										
									
									
									
										Symbolic link
									
								
							
							
						
						
									
										1
									
								
								resources/ignorable-words.en-GB.edn
									
										
									
									
									
										Symbolic link
									
								
							| 
						 | 
					@ -0,0 +1 @@
 | 
				
			||||||
 | 
					ignorable-words.en.edn
 | 
				
			||||||
| 
						 | 
					@ -1,9 +1,43 @@
 | 
				
			||||||
(ns cc.journeyman.elboob.core)
 | 
					(ns cc.journeyman.elboob.core
 | 
				
			||||||
 | 
					  (:require [clojure.java.io :refer [as-relative-path file resource]]
 | 
				
			||||||
 | 
					            [clojure.string :refer [ends-with?]]
 | 
				
			||||||
 | 
					            [medley.core :refer [deep-merge]]
 | 
				
			||||||
 | 
					            [peco.core :refer [tokenizer]]))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
(defn compile
 | 
					(defn get-ignorable-words
					  "Load the collection of ignorable words for the JVM default locale.

					   Candidate files are tried from most to least specific:
					   `resources/ignorable-words.<language-tag>.edn`, then
					   `resources/ignorable-words.<language>.edn`, then
					   `resources/ignorable-words.edn`. The paths are relative, so they are
					   resolved against the current working directory.

					   Returns the data read from the first candidate that can be read, or
					   `nil` if none of them can. A candidate that cannot be read has its
					   exception message printed and is then skipped."
					  []
					  (let [locale (java.util.Locale/getDefault)
					        tag (.toLanguageTag locale)
					        language (.getLanguage locale)]
					    ;; `some` applies the loader to one candidate at a time and stops at the
					    ;; first non-nil result. `(first (map ...))` would instead return the
					    ;; result for the most specific candidate even when that was nil, so the
					    ;; less specific files were never used as a fallback.
					    (some (fn [suffix]
					            (try
					              ;; NOTE(review): `read-string` honours `*read-eval*`; only
					              ;; trusted project resources should be placed at these paths.
					              (read-string
					               (slurp
					                (file (format "resources/ignorable-words%s.edn" suffix))))
					              (catch Exception e
					                (println (.getMessage e))
					                nil)))
					          [(str "." tag) (str "." language) ""])))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(defn compile-file
					  "Build the index for the single file `f`.

					   The file's text is split into tokens by the function `tokenise`; tokens
					   for which `ignorable?` returns truthy are dropped. Returns a map from
					   each remaining token to a one-entry map of the file's relative path to
					   the number of times the token occurs in the file."
					  [f tokenise ignorable?]
					  (let [source (file f)
					        path (as-relative-path source)
					        counts (->> (slurp source)
					                    tokenise
					                    (remove ignorable?)
					                    frequencies)]
					    ;; Re-shape {token n} into {token {path n}}.
					    (into {}
					          (map (fn [[token n]] [token {path n}]))
					          counts)))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					(defn compile-index
					  "Scans `dir-paths` as directories of Markdown files. Returns a map which keys
					   each lexical token occurring in each file (with Markdown formatting, common
					   words, punctuation etc excepted) to a map which keys the relative file path
					   of each file in which the token occurs to the frequency the token occurs
					   within the file.

					   Each directory is walked recursively with `file-seq`; only regular files
					   whose name ends in `.md` are indexed. Called with no directories, returns
					   an empty map."
					  [& dir-paths]
					  (let [ignorable-word? (set (get-ignorable-words))
					        tokenise (tokenizer [:lower-case :concat-singles])
					        ;; The `.isFile` check keeps a directory that happens to be named
					        ;; `something.md` from being passed to `slurp`.
					        markdown-file? (fn [f]
					                         (and (.isFile f)
					                              (ends-with? (.getName f) ".md")))]
					    ;; `mapcat` flattens the per-directory sequences of per-file indexes into
					    ;; one sequence of maps. With a plain `map` here, `reduce` would hand
					    ;; `deep-merge` whole sequences instead of maps, so nothing was merged
					    ;; and the result was not an index.
					    (reduce deep-merge {}
					            (mapcat (fn [dir]
					                      (map #(compile-file % tokenise ignorable-word?)
					                           (filter markdown-file? (file-seq (file dir)))))
					                    dir-paths))))
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue