Added first sketch of ignorable words
This commit is contained in:
parent
e5875a2a19
commit
f2fc1acc80
3 changed files with 108 additions and 1 deletion
|
|
@ -3,6 +3,7 @@
|
||||||
:url "http://example.com/FIXME"
|
:url "http://example.com/FIXME"
|
||||||
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
|
:license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
|
||||||
:url "https://www.eclipse.org/legal/epl-2.0/"}
|
:url "https://www.eclipse.org/legal/epl-2.0/"}
|
||||||
:dependencies [[org.clojure/clojure "1.11.1"]]
|
:dependencies [[org.clojure/clojure "1.11.1"]
|
||||||
|
[peco "0.1.6"]]
|
||||||
:repl-options {:init-ns elboob.core}
|
:repl-options {:init-ns elboob.core}
|
||||||
:source-paths ["src/clj"])
|
:source-paths ["src/clj"])
|
||||||
|
|
|
||||||
105
resources/ignorable-words.en.edn
Normal file
105
resources/ignorable-words.en.edn
Normal file
|
|
@ -0,0 +1,105 @@
|
||||||
|
;; List of English-language words that should not be indexed.
|
||||||
|
;; Taken from the first hundred words in [Peter Norvig's analysis of the
|
||||||
|
;; frequency of English words](https://norvig.com/ngrams/count_1w.txt).
|
||||||
|
;; I have then commented out of the list those words which, although
|
||||||
|
;; common, people might reasonably want to search for.
|
||||||
|
["the"
|
||||||
|
"of"
|
||||||
|
"and"
|
||||||
|
"to"
|
||||||
|
"a"
|
||||||
|
"in"
|
||||||
|
"for"
|
||||||
|
"is"
|
||||||
|
"on"
|
||||||
|
"that"
|
||||||
|
"by"
|
||||||
|
"this"
|
||||||
|
"with"
|
||||||
|
"i"
|
||||||
|
"you"
|
||||||
|
"it"
|
||||||
|
"not"
|
||||||
|
"or"
|
||||||
|
"be"
|
||||||
|
"are"
|
||||||
|
"from"
|
||||||
|
"at"
|
||||||
|
"as"
|
||||||
|
"your"
|
||||||
|
"all"
|
||||||
|
"have"
|
||||||
|
"new"
|
||||||
|
"more"
|
||||||
|
"an"
|
||||||
|
"was"
|
||||||
|
"we"
|
||||||
|
"will"
|
||||||
|
"home"
|
||||||
|
"can"
|
||||||
|
"us"
|
||||||
|
"about"
|
||||||
|
"if"
|
||||||
|
"page"
|
||||||
|
"my"
|
||||||
|
"has"
|
||||||
|
"search"
|
||||||
|
"free"
|
||||||
|
"but"
|
||||||
|
"our"
|
||||||
|
"one"
|
||||||
|
"other"
|
||||||
|
"do"
|
||||||
|
"no"
|
||||||
|
;; "information"
|
||||||
|
"time"
|
||||||
|
"they"
|
||||||
|
"site"
|
||||||
|
"he"
|
||||||
|
"up"
|
||||||
|
"may"
|
||||||
|
"what"
|
||||||
|
"which"
|
||||||
|
"their"
|
||||||
|
"news"
|
||||||
|
"out"
|
||||||
|
"use"
|
||||||
|
"any"
|
||||||
|
"there"
|
||||||
|
"see"
|
||||||
|
"only"
|
||||||
|
"so"
|
||||||
|
"his"
|
||||||
|
"when"
|
||||||
|
;; "contact"
|
||||||
|
"here"
|
||||||
|
;; "business"
|
||||||
|
"who"
|
||||||
|
"web"
|
||||||
|
"also"
|
||||||
|
"now"
|
||||||
|
;; "help"
|
||||||
|
"get"
|
||||||
|
"pm"
|
||||||
|
"view"
|
||||||
|
;; "online"
|
||||||
|
"c"
|
||||||
|
"e"
|
||||||
|
"first"
|
||||||
|
"am"
|
||||||
|
"been"
|
||||||
|
"would"
|
||||||
|
"how"
|
||||||
|
"were"
|
||||||
|
"me"
|
||||||
|
"s"
|
||||||
|
;; "services"
|
||||||
|
"some"
|
||||||
|
"these"
|
||||||
|
"click"
|
||||||
|
"its"
|
||||||
|
"like"
|
||||||
|
;; "service"
|
||||||
|
"x"
|
||||||
|
"than"
|
||||||
|
"find"]
|
||||||
1
resources/ignorable-words.en_GB.edn
Symbolic link
1
resources/ignorable-words.en_GB.edn
Symbolic link
|
|
@ -0,0 +1 @@
|
||||||
|
ignorable-words.en.edn
|
||||||
Loading…
Add table
Add a link
Reference in a new issue