From f98fac03a9e274cf24157767bb34e9d2244715ba Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Tue, 28 Apr 2020 13:37:27 +0100 Subject: [PATCH] This more or less works - still fine tuning to do. --- src/wwui/propositions.clj | 145 ++++++++++++-------------------- test/wwui/propositions_test.clj | 88 ++++++++++++++++--- 2 files changed, 133 insertions(+), 100 deletions(-) diff --git a/src/wwui/propositions.clj b/src/wwui/propositions.clj index c3c09dd..25a45cf 100644 --- a/src/wwui/propositions.clj +++ b/src/wwui/propositions.clj @@ -1,8 +1,9 @@ (ns wwui.propositions (:require [clojure.pprint :refer [pprint]] + [clojure.string :as s] [opennlp.nlp :as nlp] [opennlp.treebank :as tb] - [taoensso.timbre :as l :refer [info error spy]] + [taoensso.timbre :as log :refer [debug error info spy]] [wildwood.knowledge-accessor :refer [Accessor]])) ;; Position tags used by OpenNLP for English are documented here: @@ -23,15 +24,19 @@ {:contextual-reference [["PRP"]] ;; the documentation says PRP is 'peronal pronoun', ;; but it seems to be all pronouns. :noun [["NN"]["NNS"]["NNP"]["NNPS"]] + :full-name [["NNP"] + ["NNP" :full-name]] ;; an unpunctuated sequence of proper nouns + ;; probably represents a full name :noun-phrase [[:contextual-reference] [:noun] + [:full-name] ["DT" :noun] [:adjectives :noun] ["DT" :adjectives :noun]] -;; :noun-phrases [[:noun-phrase] -;; [:noun-phrase "CC" :noun-phrases] -;; [:noun-phrase "IN" :noun-phrases] -;; [:noun-phrase "," :noun-phrases]] + :noun-phrases [[:noun-phrase] + [:noun-phrase "CC" :noun-phrases] + [:noun-phrase "IN" :noun-phrases] + [:noun-phrase "," :noun-phrases]] :adjective [["JJ"]["JJR"]["JJS"]] :adjectives [[:adjective] [:adjective :adjectives] @@ -46,16 +51,16 @@ [:adverbs :verb] [:verb :adverb :verb] [:verb :adverbs]] - :locator [["IN" :noun-phrase]] + :locator [["IN" :noun-phrases]] :locators [[:locator] - [:locators :locator] - [:locators "," :locator]] - :subject [[:noun-phrase]] - :object [[:noun-phrase]] - :proposition [[:subject :verb :object] - [:locator "," :subject :verb :object] - [:subject "," :locator "," :verb :object] - [:subject :verb-phrase :object :locator]] + [:locator :locators] + [:locator "," :locators]] + :subject [[:noun-phrases]] + :object [[:noun-phrases]] + :proposition [[:subject :verb-phrase :object] + [:locators "," :subject :verb-phrase :object] + [:subject "," :locators "," :verb-phrase :object] + [:subject :verb-phrase :object :locators]] :propositions [[:proposition] [:proposition "CC" :propositions]]}) @@ -70,22 +75,19 @@ no match; 2. the tail of the sentence when the parts comprising the phrase are removed." [tagged-sentence grammar goal] - (l/info "Seeking " goal " in " (with-out-str (pprint tagged-sentence))) (if (keyword? goal) (when (not (empty? tagged-sentence)) (when-let [result (first - (sort - #(> (count %1) (count %2)) - (map - #(reparse tagged-sentence grammar %) - (goal grammar))))] - (cons (cons (first result) (list goal)) (rest result)))) + (sort + #(< (count %1) (count %2)) + (remove + empty? + (map + #(reparse tagged-sentence grammar %) + (goal grammar)))))] + (cons (cons (first result) (list goal)) (rest result)))) (throw (Exception. (str "Non-keyword passed to rdp-seek: `" goal "` (type " (or (type goal) "nil") ")"))))) -;; (rdp-seek [["The" "DT"] ["Forum" "NNP"]] grammar :noun-phrase) -;; (reparse [["The" "DT"] ["Forum" "NNP"]] grammar ["DT" "NNP"]) -;; (:noun-phrase grammar) - (defmacro tag "The tag, on a `tagged-token`, is just the second element. Written as a macro for readability." @@ -94,7 +96,7 @@ (defmacro coll-or-nil? [o] - "For fuck's sake, nil isn't a collection? What planet are these people on?" + "For fuck's sake, `nil` isn't a collection? What planet are these people on?" `(or (nil? ~o) (coll? ~o))) (defn rdp-extend @@ -105,7 +107,6 @@ 1. the first matching phrase for the goal, or `nil` if no match; 2. the tail of the sentence when the parts comprising the phrase are removed." [tagged-sentence grammar goal] - (l/info "Extending " goal " in " (with-out-str (pprint tagged-sentence))) (cond (not (coll-or-nil? goal)) (throw (Exception. (str "Non-collection passed to rdp-extend: `" goal "` (type " (or (type goal) "nil") ")"))) @@ -115,30 +116,13 @@ (let [[tt & st] tagged-sentence [target & gt] goal] (cond - (= target (tag tt)) - (when-let [[dh & dt] (rdp-extend st grammar gt)] - (cons (cons tt dh) dt)) (keyword? target) - (when-let [[ph & pt] (rdp-seek tagged-sentence grammar target)] ;; it's this branch which is wrong. Getting it right, however, is not easy - (let [[dh & dt] (rdp-extend st grammar pt)] - (cons (cons (cons ph dh) (list target)) dt))))))) - -;; As of 23:00 on 20160427, I'm getting -;; (rdp-seek [["the" "DT"] ["forum" "NN"]] grammar :noun-phrase) -;; => (((["the" "DT"] (((["forum" "NN"]) :noun)) :noun) :noun-phrase)) -;; it's wrong, but it's interestingly and probably tractably wrong. - -;; Seeking a keyword target whose definition in the grammar comprises only -;; string targets seems to produce the right result... -;; but seeking a keyword target whose definition expands to further keyword -;; targets (e.g. :noun-phrase being defined in terms of :noun) blows up. - - - -;; (rdp-extend [["The" "DT"] ["Forum" "NNP"]] grammar []) -;; (rdp-extend [["The" "DT"] ["Forum" "NNP"]] grammar ["DT"]) -;; (rdp-extend '(["The" "DT"] ["Forum" "NNP"]) grammar ["DT" "NNP"]) -;; (rdp-extend '(["The" "DT"] ["Forum" "NNP"]) grammar ["DT" "FOO"]) + (when-let [[h & t](reparse tagged-sentence grammar target)] + (when-let [[dh & dt] (reparse t grammar gt)] + (cons (cons h dh) dt))) + (= target (tag tt)) + (when-let [[dh & dt] (reparse st grammar gt)] + (cons (cons tt dh) dt)))))) (defn reparse "Reparse this `tagged-sentence` using this grammar to seek this `goal`. @@ -151,24 +135,34 @@ This function is called `reparse` because: 1. it is designed to parse sentences which have already been parsed by - OpenNLP: it will not work on raw sentences; + OpenNLP: it will not work on raw sentences; 2. it is a recursive descent parser." [tagged-sentence grammar goal] - (l/info "Choosing strategy for " goal " in " (with-out-str (pprint tagged-sentence))) - (cond - ;; (empty? tagged-sentence) - ;; nil - (keyword? goal) - (rdp-seek tagged-sentence grammar goal) - (coll-or-nil? goal) - (rdp-extend tagged-sentence grammar goal))) + (log/debug "=> Choosing strategy for " + goal " in " (with-out-str (pprint tagged-sentence))) + (let [r (cond + (keyword? goal) (rdp-seek tagged-sentence grammar goal) + (coll-or-nil? goal) (rdp-extend tagged-sentence grammar goal))] + (log/debug "<= " goal " in " + (s/trim (with-out-str (pprint tagged-sentence))) + " returned " (s/trim (with-out-str (pprint r)))) + r)) (defn propositions "Given a `tagged-sentence`, return a list of propositions detected in that sentence; if `knowledge-accessor` is passed, try to resolve names and noun - phrases to entities known to that knowledge accessor." + phrases to entities known to that knowledge accessor. + + TODO: Note that if `:subject`, `:object` or `:locator` resolves to multiple + objects, then that is essentially one proposition for each unique + combination. This is not yet implemented!" ([tagged-sentence] - (reparse tagged-sentence grammar :propositions)) + (reduce + merge + {} + (map + #(assoc {} (nth % 1) (first %)) + (first (first (first (reparse tagged-sentence grammar :propositions))))))) ([tagged-sentence ;; ^wildwood.knowledge-accessor.Accessor knowledge-accessor] ;; TODO: doesn't work yet. @@ -184,32 +178,3 @@ #(propositions (pos-tag (tokenize %))) (get-sentences (slurp file-path)))))) -;; (reparse [] grammar :noun) -;; (rdp-seek (pos-tag (tokenize "Brutus killed Caesar")) grammar :noun) -;; (coll? ["NPP"]) -;; (reparse (pos-tag (tokenize "killed Caesar")) grammar :verb) -;; (reparse (pos-tag (tokenize "The Forum")) grammar :noun-phrase) -;; (reparse (pos-tag (tokenize "The Forum")) grammar ["DT" "NNP"]) - -;; (reparse [["Forum" "NNP"]] grammar :noun-phrase) - -;; (map -;; #(reparse (pos-tag (tokenize "Forum")) grammar %) -;; (:noun-phrase grammar)) - -;; (rdp-extend (pos-tag (tokenize "The Forum")) grammar ["DT" "NNP"]) - -;; (nil nil -;; ((["The" "DT"]) ["Forum" "NNP"]) -;; nil -;; ((["The" "DT"]) ["Forum" "NNP"]) nil nil nil) - -;; (reparse (pos-tag (tokenize "in the Forum")) grammar :locator) - -;; (reparse (pos-tag (tokenize "The Forum")) grammar ["DT" "NNP"]) - -;; (rdp-extend (pos-tag (tokenize "The Forum")) grammar ["DT" :noun]) -;; (let [deeper (rdp-extend (pos-tag (tokenize "Forum on Sunday")) grammar ["NNP"])] -;; (cons (cons ["The" "DT"] (first deeper)) (rest deeper))) -;; (let [deeper (rdp-extend (pos-tag (tokenize "The Forum on Sunday")) grammar ["DT" "NNP"])] -;; deeper) diff --git a/test/wwui/propositions_test.clj b/test/wwui/propositions_test.clj index 0c8affa..20131ef 100644 --- a/test/wwui/propositions_test.clj +++ b/test/wwui/propositions_test.clj @@ -1,27 +1,95 @@ (ns wwui.propositions-test (:require [clojure.test :refer :all] - [wwui.propositions :refer :all])) + [wwui.propositions :refer :all] + [taoensso.timbre :as log :refer [set-level!]])) + +(log/set-level! :error) (deftest reparser-tests (testing "Simplest constructs" - (is (= (recursive-descent-parser [] grammar :noun) nil)) + (is (= (reparse [] grammar :noun) nil)) (is (= - (recursive-descent-parser (pos-tag (tokenize "Brutus killed Caesar")) grammar :noun) + (reparse [["Brutus" "NNP"] ["killed" "VBD"] ["Caesar" "NNP"]] grammar :noun) '(((["Brutus" "NNP"]) :noun) ["killed" "VBD"] ["Caesar" "NNP"]))) (is (= - (recursive-descent-parser (pos-tag (tokenize "Brutus killed Caesar")) grammar :noun-phrase) + (reparse [["Brutus" "NNP"] ["killed" "VBD"] ["Caesar" "NNP"]] grammar :noun-phrase) '((((["Brutus" "NNP"]) :noun) :nown-phrase) ["killed" "VBD"] ["Caesar" "NNP"]))) (is (= - (recursive-descent-parser (pos-tag (tokenize "The Forum")) grammar :noun-phrase) - (((["The" "DT"]["Forum" "NNP"]) :noun-phrase)))) + (reparse [["The" "DT"] ["Forum" "NNP"]] grammar :noun-phrase) + '(((["The" "DT"]["Forum" "NNP"]) :noun-phrase)))) (is (= - (recursive-descent-parser (pos-tag (tokenize "killed Caesar")) grammar :verb) - (((["killed" "VBN"]) :verb) ["Caesar" "NNP"]))) + (reparse [["killed" "VBD"] ["Caesar" "NNP"]] grammar :verb) + '(((["killed" "VBD"]) :verb) ["Caesar" "NNP"]))) (is (= - (recursive-descent-parser (pos-tag (tokenize "in the Forum")) grammar :locator) - (((["in" "IN"]["the" "DT"]["Forum" "NNP"]) :locator) ))))) + (reparse [["in" "IN"] ["the" "DT"] ["Forum" "NNP"]] grammar :locator) + '(((["in" "IN"] ((["the" "DT"] ((["Forum" "NNP"]) :noun)) :noun-phrase)) :locator)))) + (is + (= + (reparse [["in" "IN"] ["the" "DT"] ["forum" "NN"]] grammar :locator) + '(((["in" "IN"] ((["the" "DT"] ((["forum" "NN"]) :noun)) :noun-phrase)) :locator)))) + ) + (testing "collections" + (is + (= (count (reparse (pos-tag (tokenize "brave, noble")) grammar :adjectives)) 1) + "Currently, lists of adjectives are not being recognised, and this fails.") + (is + (= (count (reparse (pos-tag (tokenize "cruelly and wickedly")) grammar :adverbs)) 1) + "Currently, lists of adverbs are not being recognised, and this fails.")) + (testing "locators" + (is + (= + (reparse [["in" "IN"] ["the" "DT"] ["forum" "NN"]] grammar :locator) + '(((["in" "IN"] ((["the" "DT"] ((["forum" "NN"]) :noun)) :noun-phrase)) :locator))) + "Positional locator") + (is + (= + (count + (reparse + [["on" "IN"] ["the" "DT"] ["ides" "NNS"] ["of" "IN"] ["March" "NNP"]] + grammar + :locator)) + 1) + "Temporal locator: currently, 'of March' is not being recognised as part of the locator, so this is failing.") + (is + (= + (reparse [["in" "IN"] ["the" "DT"] ["forum" "NN"]] grammar :locator) + '(((((["in" "IN"] ((["the" "DT"] ((["forum" "NN"]) :noun)) :noun-phrase)) :locator)) :locators))) + "Single locator as locators") + ) + (testing "propositions" + (is + (= + (reparse [["Brutus" "NNP"] ["killed" "VBD"] ["Caesar" "NNP"]] grammar :proposition) + '(((((((((["Brutus" "NNP"]) :noun)) :noun-phrase)) :subject) + ((((["killed" "VBD"]) :verb)) :verb-phrase) + ((((((["Caesar" "NNP"]) :noun)) :noun-phrase)) :object)) :proposition)) + )) + (is + (= + (reparse + [["Proud" "JJ"] ["Brutus" "NNP"] ["killed" "VBD"] ["noble" "JJ"] ["Caesar" "NNP"]] + grammar :proposition) + '(((((((((((["Proud" "JJ"]) :adjective)) :adjectives) + ((["Brutus" "NNP"]) :noun)) :noun-phrase)) :subject) + ((((["killed" "VBD"]) :verb)) :verb-phrase) + ((((((((["noble" "JJ"]) :adjective)) :adjectives) + ((["Caesar" "NNP"]) :noun)) :noun-phrase)) :object)) :proposition)) + ) "Single adjectives") + (is + (= + (reparse + [["Proud" "JJ"] ["Brutus" "NNP"] ["brutally" "RB"] ["killed" "VBD"] ["noble" "JJ"] ["Caesar" "NNP"]] + grammar :proposition) + '(((((((((((["Proud" "JJ"]) :adjective)) :adjectives) + ((["Brutus" "NNP"]) :noun)) :noun-phrase)) :subject) + ((((((["brutally" "RB"]) :adverb)) :adverbs) + ((["killed" "VBD"]) :verb)) :verb-phrase) + ((((((((["noble" "JJ"]) :adjective)) :adjectives) + ((["Caesar" "NNP"]) :noun)) :noun-phrase)) :object)) :proposition)) + ) "Single adverb") + ))