Now successfully creating indexes. Started work on a search function, but out of steam.
commit 5e33f2c815
parent 86665db3b8
2 changed files with 15 additions and 13 deletions
@@ -9,13 +9,13 @@
         tag (.toLanguageTag locale)
         language (.getLanguage locale)]
 
-     (first
-      (map #(try (println (format "resources/ignorable-words%s.edn" %))
-                 (read-string
-                  (slurp
-                   (file (format "resources/ignorable-words%s.edn" %))))
-                 (catch Exception e (println (.getMessage e)) nil))
-           [(str "." tag) (str "." language) ""]))))
+    (first
+     (map #(try (println (format "resources/ignorable-words%s.edn" %))
+                (read-string
+                 (slurp
+                  (file (format "resources/ignorable-words%s.edn" %))))
+                (catch Exception e (println (.getMessage e)) nil))
+          [(str "." tag) (str "." language) ""]))))
 
 (defn compile-file
   "Compile an index for an individual file `f`, tokenised with `tokenise` and
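This first hunk only re-indents the resource-fallback lookup, which appears to be the body of `get-ignorable-words` (called in the last hunk below): it tries a locale-tag-qualified ignorable-words file, then a language-qualified one, then the unqualified default. One thing worth noting: `(first (map ...))` returns the first mapped result even when the `try` yielded nil, so a missing tag-specific file does not fall through to the next candidate; `some` would. A minimal sketch of that variant, under a hypothetical name and assuming `clojure.java.io/file` is referred to as `file` as in the diff:

    (defn load-ignorable-words
      ;; Hypothetical sketch, not the committed code: returns the value read
      ;; from the first candidate resource that can actually be slurped.
      [tag language]
      (some #(try (read-string
                   (slurp (file (format "resources/ignorable-words%s.edn" %))))
                  (catch Exception _ nil))
            [(str "." tag) (str "." language) ""]))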
@@ -26,6 +26,11 @@
         tokens (frequencies (remove ignorable? (tokenise (slurp f'))))]
     (reduce #(assoc %1 %2 {rel (tokens %2)}) {} (keys tokens))))
 
+(defn files-with-suffix [dir suffix]
+  (filter
+   #(ends-with? (.getName %) suffix)
+   (file-seq (file dir))))
+
 (defn compile-index
   "scans `dir-paths` as directories of Markdown files. Returns a map which keys
    each lexical token occurring in each file (with Markdown formatting, common
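The second hunk extracts the suffix filter into a reusable helper: `files-with-suffix` walks `dir` recursively via `file-seq` and keeps only entries whose names end with `suffix`. A hedged REPL sketch of its behaviour (the path is illustrative, and `ends-with?` is assumed to come from `clojure.string`):

    (files-with-suffix "resources/docs" ".md")
    ;; => a lazy seq of java.io.File objects for every entry under
    ;;    resources/docs whose name ends in ".md", found recursively.
    ;;    Note a directory named e.g. "old.md" would also pass the filter.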
@@ -35,9 +40,6 @@
   (let [ignorable-word? (set (get-ignorable-words))
         tokenise (tokenizer [:lower-case :concat-singles])]
     (reduce deep-merge {}
-            (map (fn [dir]
                    (map #(compile-file % tokenise ignorable-word?)
-                        (filter
-                         #(ends-with? (.getName %) ".md")
-                         (file-seq (file dir))))) dir-paths))))
+                        (flatten (map #(files-with-suffix % ".md") dir-paths))))))
 
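The third hunk rewrites `compile-index` to use the new helper, flattening the per-directory file seqs into one sequence before folding the per-file maps together with `deep-merge`. Assuming `compile-file` returns maps shaped `{token {relative-path frequency}}` as the second hunk suggests, and a conventional recursive `deep-merge`, the fold would combine them like this:

    (deep-merge {"wiki" {"a.md" 3}}
                {"wiki" {"b.md" 1}, "index" {"b.md" 2}})
    ;; => {"wiki" {"a.md" 3, "b.md" 1}, "index" {"b.md" 2}}
    ;; i.e. each token ends up mapped to every file containing it,
    ;; together with its frequency in that file.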
src/clj/cc/journeyman/elboob/search.cljc (new file)
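The new search.cljc is listed but its contents are not rendered in this capture, and the commit message says the search function is unfinished. Purely as an illustration of a lookup over the index shape sketched above, not the committed code:

    (defn search
      ;; Hypothetical sketch: rank files matching `tokens` by summed frequency.
      [index tokens]
      (->> tokens
           (keep index)          ;; one {file frequency} map per known token
           (apply merge-with +)  ;; sum the frequencies per file
           (sort-by val >)))     ;; best-matching files first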