Added first sketch of ignorable words

This commit is contained in:
Simon Brooke 2025-10-31 13:52:57 +00:00
parent e5875a2a19
commit f2fc1acc80
3 changed files with 108 additions and 1 deletions

View file

@ -3,6 +3,7 @@
:url "http://example.com/FIXME"
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
:url "https://www.eclipse.org/legal/epl-2.0/"}
:dependencies [[org.clojure/clojure "1.11.1"]]
:dependencies [[org.clojure/clojure "1.11.1"]
[peco "0.1.6"]]
:repl-options {:init-ns elboob.core}
:source-paths ["src/clj"])

View file

@ -0,0 +1,105 @@
;; List of English-language words that should not be indexed (stop words).
;; Taken from the first hundred words in [Peter Norvig's analysis of the
;; frequency of English words](https://norvig.com/ngrams/count_1w.txt).
;; Words which, although common, people may reasonably want to search for
;; have been commented out rather than deleted, so they remain visible in
;; place and can be re-enabled by removing the leading `;;`.
;;
;; The data is a single EDN vector of strings, one word per line.
;;
;; NOTE(review): every entry is lower case, so whatever consumes this list
;; presumably needs to lower-case tokens before comparing — confirm.
;; NOTE(review): the list includes single-letter entries ("c", "e", "s",
;; "x") and "pm"; these presumably reflect how the source corpus was
;; tokenised — confirm they are intended to be ignored.
["the"
"of"
"and"
"to"
"a"
"in"
"for"
"is"
"on"
"that"
"by"
"this"
"with"
"i"
"you"
"it"
"not"
"or"
"be"
"are"
"from"
"at"
"as"
"your"
"all"
"have"
"new"
"more"
"an"
"was"
"we"
"will"
"home"
"can"
"us"
"about"
"if"
"page"
"my"
"has"
"search"
"free"
"but"
"our"
"one"
"other"
"do"
"no"
;; "information"
"time"
"they"
"site"
"he"
"up"
"may"
"what"
"which"
"their"
"news"
"out"
"use"
"any"
"there"
"see"
"only"
"so"
"his"
"when"
;; "contact"
"here"
;; "business"
"who"
"web"
"also"
"now"
;; "help"
"get"
"pm"
"view"
;; "online"
"c"
"e"
"first"
"am"
"been"
"would"
"how"
"were"
"me"
"s"
;; "services"
"some"
"these"
"click"
"its"
"like"
;; "service"
"x"
"than"
"find"]

View file

@ -0,0 +1 @@
ignorable-words.en.edn