From f2fc1acc803a1d58c2e0cc35689a3434d36bbfb9 Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Fri, 31 Oct 2025 13:52:57 +0000 Subject: [PATCH] Added first sketch of ignorable words --- project.clj | 3 +- resources/ignorable-words.en.edn | 105 ++++++++++++++++++++++++++++ resources/ignorable-words.en_GB.edn | 1 + 3 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 resources/ignorable-words.en.edn create mode 120000 resources/ignorable-words.en_GB.edn diff --git a/project.clj b/project.clj index 068ca5d..8349b55 100644 --- a/project.clj +++ b/project.clj @@ -3,6 +3,7 @@ :url "http://example.com/FIXME" :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0" :url "https://www.eclipse.org/legal/epl-2.0/"} - :dependencies [[org.clojure/clojure "1.11.1"]] + :dependencies [[org.clojure/clojure "1.11.1"] + [peco "0.1.6"]] :repl-options {:init-ns elboob.core} :source-paths ["src/clj"]) diff --git a/resources/ignorable-words.en.edn b/resources/ignorable-words.en.edn new file mode 100644 index 0000000..1875f91 --- /dev/null +++ b/resources/ignorable-words.en.edn @@ -0,0 +1,105 @@ +;; list of English language words that should not be indexed. +;; taken from the first hundred words in [Peter Norvig's analysis of the +;; frequency of English words](https://norvig.com/ngrams/count_1w.txt); +;; I've then commented out from the list those words which, although +;; common, I think it may be reasonable for people to search for. +["the" +"of" +"and" +"to" +"a" +"in" +"for" +"is" +"on" +"that" +"by" +"this" +"with" +"i" +"you" +"it" +"not" +"or" +"be" +"are" +"from" +"at" +"as" +"your" +"all" +"have" +"new" +"more" +"an" +"was" +"we" +"will" +"home" +"can" +"us" +"about" +"if" +"page" +"my" +"has" +"search" +"free" +"but" +"our" +"one" +"other" +"do" +"no" +;; "information" +"time" +"they" +"site" +"he" +"up" +"may" +"what" +"which" +"their" +"news" +"out" +"use" +"any" +"there" +"see" +"only" +"so" +"his" +"when" +;; "contact" +"here" +;; "business" +"who" +"web" +"also" +"now" +;; "help" +"get" +"pm" +"view" +;; "online" +"c" +"e" +"first" +"am" +"been" +"would" +"how" +"were" +"me" +"s" +;; "services" +"some" +"these" +"click" +"its" +"like" +;; "service" +"x" +"than" +"find"] \ No newline at end of file diff --git a/resources/ignorable-words.en_GB.edn b/resources/ignorable-words.en_GB.edn new file mode 120000 index 0000000..1151b98 --- /dev/null +++ b/resources/ignorable-words.en_GB.edn @@ -0,0 +1 @@ +ignorable-words.en.edn \ No newline at end of file