001 (ns wwui.propositions
002 (:require [clojure.math.combinatorics :as combi]
003 [clojure.pprint :refer [pprint]]
004 [clojure.string :as s]
005 [opennlp.nlp :as nlp]
006 [opennlp.treebank :as tb]
007 [taoensso.timbre :as log :refer [debug error info spy]]
008 [wildwood.knowledge-accessor :refer [Accessor]]))
009
010 ;; Position tags used by OpenNLP for English are documented here:
011 ;; https://dpdearing.com/posts/2011/12/opennlp-part-of-speech-pos-tags-penn-english-treebank/
012
013 (def get-sentences (nlp/make-sentence-detector "models/en-sent.bin"))
014 (def tokenize (nlp/make-tokenizer "models/en-token.bin"))
015 (def pos-tag (nlp/make-pos-tagger "models/en-pos-maxent.bin"))
016 (def name-find (nlp/make-name-finder "models/namefind/en-ner-person.bin"))
017 ;; (def chunker (make-treebank-chunker "models/en-chunker.bin"))
018
019 (def grammar
020 "The objective of this grammar is to allow us to take a sequence of tagged symbols, and
021 produce a higher-level tagging of parts of speech, and ultimately propositions, from them.
022
023 *NOTE THAT* tags in this grammar are always keywords, to distinguish them from OpenNLP
024 tags, which tag individual tokens and are represented as strings."
025 {:contextual-reference [["PRP"]] ;; the documentation says PRP is 'peronal pronoun',
026 ;; but it seems to be all pronouns.
027 :noun [["NN"]["NNS"]["NNP"]["NNPS"]]
028 :full-name [["NNP"]
029 ["NNP" :full-name]] ;; an unpunctuated sequence of proper nouns
030 ;; probably represents a full name
031 :noun-phrase [[:contextual-reference]
032 [:noun]
033 [:full-name]
034 ["DT" :noun]
035 [:adjectives :noun]
036 ["DT" :adjectives :noun]]
037 :noun-phrases [[:noun-phrase]
038 [:noun-phrase "CC" :noun-phrases]
039 [:noun-phrase "," :noun-phrases]]
040 :adjective [["JJ"]["JJR"]["JJS"]]
041 :adjectives [[:adjective]
042 [:adjective :adjectives]
043 [:adjective "," :adjectives]
044 [:adjective "CC" :adjectives]]
045 :verb [["VB"]["VBD"]["VBG"]["VBN"]["VBP"]["VBZ"]]
046 :adverb [["RB"]["RBR"]["RBS"]] ;; beware here that negation and qualification show up only as adverbs
047 :adverbs [[:adverb]
048 [:adverb "," :adverbs]
049 [:adverb "CC" :adverbs]]
050 :verb-phrase [[:verb]
051 [:adverbs :verb]
052 [:verb :adverbs :verb]
053 [:verb :adverbs]
054 [:verb :adverbs :verb "TO"]]
055 :locator [["IN" :noun-phrases]]
056 :locators [[:locator]
057 [:locator :locators]
058 [:locator "," :locators]]
059 :location [[:locators]]
060 :subject [[:noun-phrases]]
061 :object [[:noun-phrases]]
062 :proposition [[:subject :verb-phrase :object]
063 [:location "," :subject :verb-phrase :object]
064 [:subject "," :location "," :verb-phrase :object]
065 [:subject :verb-phrase :object :location]]
066 :propositions [[:proposition]
067 [:proposition "CC" :propositions]
068 [:proposition "," "CC" :propositions]]})
069
070 (declare reparse rdp-seek)
071
072 (defn rdp-seek
073 "Seek a phrase which satisfies this `goal` (expected to be a keyword) in
074 this `tagged-sentence` using this `grammar`.
075
076 Return a cons comprising
077 1. the first matching phrase for the goal, tagged with the goal, or `nil` if
078 no match;
079 2. the tail of the sentence when the parts comprising the phrase are removed."
080 [tagged-sentence grammar goal]
081 (if (keyword? goal)
082 (when (not (empty? tagged-sentence))
083 (when-let [result (first
084 (sort
085 #(< (count %1) (count %2))
086 (remove
087 empty?
088 (map
089 #(reparse tagged-sentence grammar %)
090 (goal grammar)))))]
091 (cons (cons (first result) (list goal)) (rest result))))
092 (throw (Exception. (str "Non-keyword passed to rdp-seek: `" goal "` (type " (or (type goal) "nil") ")")))))
093
094 (defmacro tag
095 "The tag, on a `tagged-token`, is just the second element. Written as a macro
096 for readability."
097 [tagged-token]
098 `(nth ~tagged-token 1))
099
100 (defmacro coll-or-nil?
101 [o]
102 "For fuck's sake, `nil` isn't a collection? What planet are these people on?"
103 `(or (nil? ~o) (coll? ~o)))
104
105 (defn rdp-extend
106 "Seek a phrase which satisfies this `goal` (expected to be a collection of tags) in
107 this `tagged-sentence` using this `grammar`.
108
109 Return a cons comprising
110 1. the first matching phrase for the goal, or `nil` if no match;
111 2. the tail of the sentence when the parts comprising the phrase are removed."
112 [tagged-sentence grammar goal]
113 (cond
114 (not (coll-or-nil? goal))
115 (throw (Exception. (str "Non-collection passed to rdp-extend: `" goal "` (type " (or (type goal) "nil") ")")))
116 (empty? goal)
117 (cons (list) tagged-sentence)
118 (not (empty? tagged-sentence))
119 (let [[tt & st] tagged-sentence
120 [target & gt] goal]
121 (cond
122 (keyword? target)
123 (when-let [[h & t](reparse tagged-sentence grammar target)]
124 (when-let [[dh & dt] (reparse t grammar gt)]
125 (cons (cons h dh) dt)))
126 (= target (tag tt))
127 (when-let [[dh & dt] (reparse st grammar gt)]
128 (cons (cons tt dh) dt))))))
129
130 (defn reparse
131 "Reparse this `tagged-sentence` using this grammar to seek this `goal`.
132 Parse greedily, seeking the most extended goal.
133
134 Return a sequence comprising
135 1. the first matching phrase for the goal, tagged with the goal, or `nil`
136 if no match;
137 2. the tail of the sentence when the parts comprising the phrase are removed.
138
139 This function is called `reparse` because:
140 1. it is designed to parse sentences which have already been parsed by
141 OpenNLP: it will not work on raw sentences;
142 2. it is a recursive descent parser."
143 [tagged-sentence grammar goal]
144 (log/debug "=> Choosing strategy for "
145 goal " in " (with-out-str (pprint tagged-sentence)))
146 (let [r (cond
147 (keyword? goal) (rdp-seek tagged-sentence grammar goal)
148 (coll-or-nil? goal) (rdp-extend tagged-sentence grammar goal))]
149 (log/debug "<= " goal " in "
150 (s/trim (with-out-str (pprint tagged-sentence)))
151 " returned " (s/trim (with-out-str (pprint r))))
152 r))
153
154 (defn identify
155 [parse-tree knowledge-accessor]
156 ;; TODO: we don't yet have a working knowledge accessor. When we do,
157 ;; construct a query from the contents of this parse-tree, and pass it
158 ;; to the knowledge accessor in the hope of finding a true name.
159 parse-tree)
160
161 (defn normalise
162 [parse-tree ka]
163 (if
164 (and (coll? parse-tree) (= (count parse-tree) 2)(keyword? (nth parse-tree 1)))
165 (case (nth parse-tree 1)
166 :proposition (list
167 (reduce
168 merge
169 {}
170 (map
171 ;; TODO: use combinatorics to extract all propositions from
172 ;; a proposition having multiple locations, multiple subject,
173 ;; objects and/or verbs
174 #(assoc {} (nth % 1) (identify (first %) ka))
175 (map #(normalise % ka) (first parse-tree)))))
176 (:location :subject :object)
177 (cons
178 (reduce
179 concat
180 (remove
181 empty?
182 (map #(normalise % ka) (first parse-tree))))
183 (list (nth parse-tree 1)))
184 (:propositions :locators :noun-phrases :verbs)
185 (reduce
186 concat
187 (remove
188 empty?
189 (map #(normalise % ka) (first parse-tree))))
190 ;; else
191 parse-tree)
192 parse-tree))
193
194 (defn propositions
195 "Given a `tagged-sentence`, return a list of propositions detected in that
196 sentence; if `knowledge-accessor` is passed, try to resolve names and noun
197 phrases to entities known to that knowledge accessor.
198
199 TODO: Note that if `:subject`, `:object` or `:locator` resolves to multiple
200 objects, then that is essentially one proposition for each unique
201 combination. This is not yet implemented!"
202 ([tagged-sentence]
203 (propositions tagged-sentence nil))
204 ([tagged-sentence ;; ^wildwood.knowledge-accessor.Accessor
205 knowledge-accessor]
206 ;; TODO: doesn't work yet.
207 (reduce
208 concat
209 (remove
210 empty?
211 (map
212 #(normalise % knowledge-accessor)
213 (first (first (reparse tagged-sentence grammar :propositions))))))))
214
215 (defn propositions-from-file
216 [file-path]
217 (reduce
218 concat
219 (remove
220 empty?
221 (map
222 #(propositions (pos-tag (tokenize %)))
223 (get-sentences (slurp file-path))))))
224
225 ;; (reparse (pos-tag (tokenize "True love is the daughter of temperance, and temperance is utterly opposed to the madness of bodily pleasure.")) grammar :propositions)
226 ;; (reparse [["temperance" "NN"] ["is" "VBZ"] ["utterly" "RB"] ["opposed" "VBN"] ["to" "TO"] ["the" "DT"] ["madness" "NN"] ["of" "IN"] ["bodily" "JJ"] ["pleasure" "NN"]] grammar :subject)
227 ;; (reparse [["is" "VBZ"] ["utterly" "RB"] ["opposed" "VBN"] ["to" "TO"] ["the" "DT"] ["madness" "NN"] ["of" "IN"] ["bodily" "JJ"] ["pleasure" "NN"]] grammar :verb-phrase)
228 ;; (reparse [["is" "VBZ"] ["utterly" "RB"] ["opposed" "VBN"] ["to" "TO"] ["the" "DT"] ["madness" "NN"] ["of" "IN"] ["bodily" "JJ"] ["pleasure" "NN"]] grammar :verb-phrase)