This more or less works - still fine tuning to do.

This commit is contained in:
Simon Brooke 2020-04-28 13:37:27 +01:00
parent fb00ec31a0
commit f98fac03a9
No known key found for this signature in database
GPG key ID: A7A4F18D1D4DF987
2 changed files with 133 additions and 100 deletions

View file

@ -1,8 +1,9 @@
(ns wwui.propositions (ns wwui.propositions
(:require [clojure.pprint :refer [pprint]] (:require [clojure.pprint :refer [pprint]]
[clojure.string :as s]
[opennlp.nlp :as nlp] [opennlp.nlp :as nlp]
[opennlp.treebank :as tb] [opennlp.treebank :as tb]
[taoensso.timbre :as l :refer [info error spy]] [taoensso.timbre :as log :refer [debug error info spy]]
[wildwood.knowledge-accessor :refer [Accessor]])) [wildwood.knowledge-accessor :refer [Accessor]]))
;; Position tags used by OpenNLP for English are documented here: ;; Position tags used by OpenNLP for English are documented here:
@ -23,15 +24,19 @@
{:contextual-reference [["PRP"]] ;; the documentation says PRP is 'peronal pronoun', {:contextual-reference [["PRP"]] ;; the documentation says PRP is 'peronal pronoun',
;; but it seems to be all pronouns. ;; but it seems to be all pronouns.
:noun [["NN"]["NNS"]["NNP"]["NNPS"]] :noun [["NN"]["NNS"]["NNP"]["NNPS"]]
:full-name [["NNP"]
["NNP" :full-name]] ;; an unpunctuated sequence of proper nouns
;; probably represents a full name
:noun-phrase [[:contextual-reference] :noun-phrase [[:contextual-reference]
[:noun] [:noun]
[:full-name]
["DT" :noun] ["DT" :noun]
[:adjectives :noun] [:adjectives :noun]
["DT" :adjectives :noun]] ["DT" :adjectives :noun]]
;; :noun-phrases [[:noun-phrase] :noun-phrases [[:noun-phrase]
;; [:noun-phrase "CC" :noun-phrases] [:noun-phrase "CC" :noun-phrases]
;; [:noun-phrase "IN" :noun-phrases] [:noun-phrase "IN" :noun-phrases]
;; [:noun-phrase "," :noun-phrases]] [:noun-phrase "," :noun-phrases]]
:adjective [["JJ"]["JJR"]["JJS"]] :adjective [["JJ"]["JJR"]["JJS"]]
:adjectives [[:adjective] :adjectives [[:adjective]
[:adjective :adjectives] [:adjective :adjectives]
@ -46,16 +51,16 @@
[:adverbs :verb] [:adverbs :verb]
[:verb :adverb :verb] [:verb :adverb :verb]
[:verb :adverbs]] [:verb :adverbs]]
:locator [["IN" :noun-phrase]] :locator [["IN" :noun-phrases]]
:locators [[:locator] :locators [[:locator]
[:locators :locator] [:locator :locators]
[:locators "," :locator]] [:locator "," :locators]]
:subject [[:noun-phrase]] :subject [[:noun-phrases]]
:object [[:noun-phrase]] :object [[:noun-phrases]]
:proposition [[:subject :verb :object] :proposition [[:subject :verb-phrase :object]
[:locator "," :subject :verb :object] [:locators "," :subject :verb-phrase :object]
[:subject "," :locator "," :verb :object] [:subject "," :locators "," :verb-phrase :object]
[:subject :verb-phrase :object :locator]] [:subject :verb-phrase :object :locators]]
:propositions [[:proposition] :propositions [[:proposition]
[:proposition "CC" :propositions]]}) [:proposition "CC" :propositions]]})
@ -70,22 +75,19 @@
no match; no match;
2. the tail of the sentence when the parts comprising the phrase are removed." 2. the tail of the sentence when the parts comprising the phrase are removed."
[tagged-sentence grammar goal] [tagged-sentence grammar goal]
(l/info "Seeking " goal " in " (with-out-str (pprint tagged-sentence)))
(if (keyword? goal) (if (keyword? goal)
(when (not (empty? tagged-sentence)) (when (not (empty? tagged-sentence))
(when-let [result (first (when-let [result (first
(sort (sort
#(> (count %1) (count %2)) #(< (count %1) (count %2))
(map (remove
#(reparse tagged-sentence grammar %) empty?
(goal grammar))))] (map
(cons (cons (first result) (list goal)) (rest result)))) #(reparse tagged-sentence grammar %)
(goal grammar)))))]
(cons (cons (first result) (list goal)) (rest result))))
(throw (Exception. (str "Non-keyword passed to rdp-seek: `" goal "` (type " (or (type goal) "nil") ")"))))) (throw (Exception. (str "Non-keyword passed to rdp-seek: `" goal "` (type " (or (type goal) "nil") ")")))))
;; (rdp-seek [["The" "DT"] ["Forum" "NNP"]] grammar :noun-phrase)
;; (reparse [["The" "DT"] ["Forum" "NNP"]] grammar ["DT" "NNP"])
;; (:noun-phrase grammar)
(defmacro tag (defmacro tag
"The tag, on a `tagged-token`, is just the second element. Written as a macro "The tag, on a `tagged-token`, is just the second element. Written as a macro
for readability." for readability."
@ -94,7 +96,7 @@
(defmacro coll-or-nil? (defmacro coll-or-nil?
[o] [o]
"For fuck's sake, nil isn't a collection? What planet are these people on?" "For fuck's sake, `nil` isn't a collection? What planet are these people on?"
`(or (nil? ~o) (coll? ~o))) `(or (nil? ~o) (coll? ~o)))
(defn rdp-extend (defn rdp-extend
@ -105,7 +107,6 @@
1. the first matching phrase for the goal, or `nil` if no match; 1. the first matching phrase for the goal, or `nil` if no match;
2. the tail of the sentence when the parts comprising the phrase are removed." 2. the tail of the sentence when the parts comprising the phrase are removed."
[tagged-sentence grammar goal] [tagged-sentence grammar goal]
(l/info "Extending " goal " in " (with-out-str (pprint tagged-sentence)))
(cond (cond
(not (coll-or-nil? goal)) (not (coll-or-nil? goal))
(throw (Exception. (str "Non-collection passed to rdp-extend: `" goal "` (type " (or (type goal) "nil") ")"))) (throw (Exception. (str "Non-collection passed to rdp-extend: `" goal "` (type " (or (type goal) "nil") ")")))
@ -115,30 +116,13 @@
(let [[tt & st] tagged-sentence (let [[tt & st] tagged-sentence
[target & gt] goal] [target & gt] goal]
(cond (cond
(= target (tag tt))
(when-let [[dh & dt] (rdp-extend st grammar gt)]
(cons (cons tt dh) dt))
(keyword? target) (keyword? target)
(when-let [[ph & pt] (rdp-seek tagged-sentence grammar target)] ;; it's this branch which is wrong. Getting it right, however, is not easy (when-let [[h & t](reparse tagged-sentence grammar target)]
(let [[dh & dt] (rdp-extend st grammar pt)] (when-let [[dh & dt] (reparse t grammar gt)]
(cons (cons (cons ph dh) (list target)) dt))))))) (cons (cons h dh) dt)))
(= target (tag tt))
;; As of 23:00 on 20160427, I'm getting (when-let [[dh & dt] (reparse st grammar gt)]
;; (rdp-seek [["the" "DT"] ["forum" "NN"]] grammar :noun-phrase) (cons (cons tt dh) dt))))))
;; => (((["the" "DT"] (((["forum" "NN"]) :noun)) :noun) :noun-phrase))
;; it's wrong, but it's interestingly and probably tractably wrong.
;; Seeking a keyword target whose definition in the grammar comprises only
;; string targets seems to produce the right result...
;; but seeking a keyword target whose definition expands to further keyword
;; targets (e.g. :noun-phrase being defined in terms of :noun) blows up.
;; (rdp-extend [["The" "DT"] ["Forum" "NNP"]] grammar [])
;; (rdp-extend [["The" "DT"] ["Forum" "NNP"]] grammar ["DT"])
;; (rdp-extend '(["The" "DT"] ["Forum" "NNP"]) grammar ["DT" "NNP"])
;; (rdp-extend '(["The" "DT"] ["Forum" "NNP"]) grammar ["DT" "FOO"])
(defn reparse (defn reparse
"Reparse this `tagged-sentence` using this grammar to seek this `goal`. "Reparse this `tagged-sentence` using this grammar to seek this `goal`.
@ -151,24 +135,34 @@
This function is called `reparse` because: This function is called `reparse` because:
1. it is designed to parse sentences which have already been parsed by 1. it is designed to parse sentences which have already been parsed by
OpenNLP: it will not work on raw sentences; OpenNLP: it will not work on raw sentences;
2. it is a recursive descent parser." 2. it is a recursive descent parser."
[tagged-sentence grammar goal] [tagged-sentence grammar goal]
(l/info "Choosing strategy for " goal " in " (with-out-str (pprint tagged-sentence))) (log/debug "=> Choosing strategy for "
(cond goal " in " (with-out-str (pprint tagged-sentence)))
;; (empty? tagged-sentence) (let [r (cond
;; nil (keyword? goal) (rdp-seek tagged-sentence grammar goal)
(keyword? goal) (coll-or-nil? goal) (rdp-extend tagged-sentence grammar goal))]
(rdp-seek tagged-sentence grammar goal) (log/debug "<= " goal " in "
(coll-or-nil? goal) (s/trim (with-out-str (pprint tagged-sentence)))
(rdp-extend tagged-sentence grammar goal))) " returned " (s/trim (with-out-str (pprint r))))
r))
(defn propositions (defn propositions
"Given a `tagged-sentence`, return a list of propositions detected in that "Given a `tagged-sentence`, return a list of propositions detected in that
sentence; if `knowledge-accessor` is passed, try to resolve names and noun sentence; if `knowledge-accessor` is passed, try to resolve names and noun
phrases to entities known to that knowledge accessor." phrases to entities known to that knowledge accessor.
TODO: Note that if `:subject`, `:object` or `:locator` resolves to multiple
objects, then that is essentially one proposition for each unique
combination. This is not yet implemented!"
([tagged-sentence] ([tagged-sentence]
(reparse tagged-sentence grammar :propositions)) (reduce
merge
{}
(map
#(assoc {} (nth % 1) (first %))
(first (first (first (reparse tagged-sentence grammar :propositions)))))))
([tagged-sentence ;; ^wildwood.knowledge-accessor.Accessor ([tagged-sentence ;; ^wildwood.knowledge-accessor.Accessor
knowledge-accessor] knowledge-accessor]
;; TODO: doesn't work yet. ;; TODO: doesn't work yet.
@ -184,32 +178,3 @@
#(propositions (pos-tag (tokenize %))) #(propositions (pos-tag (tokenize %)))
(get-sentences (slurp file-path)))))) (get-sentences (slurp file-path))))))
;; (reparse [] grammar :noun)
;; (rdp-seek (pos-tag (tokenize "Brutus killed Caesar")) grammar :noun)
;; (coll? ["NPP"])
;; (reparse (pos-tag (tokenize "killed Caesar")) grammar :verb)
;; (reparse (pos-tag (tokenize "The Forum")) grammar :noun-phrase)
;; (reparse (pos-tag (tokenize "The Forum")) grammar ["DT" "NNP"])
;; (reparse [["Forum" "NNP"]] grammar :noun-phrase)
;; (map
;; #(reparse (pos-tag (tokenize "Forum")) grammar %)
;; (:noun-phrase grammar))
;; (rdp-extend (pos-tag (tokenize "The Forum")) grammar ["DT" "NNP"])
;; (nil nil
;; ((["The" "DT"]) ["Forum" "NNP"])
;; nil
;; ((["The" "DT"]) ["Forum" "NNP"]) nil nil nil)
;; (reparse (pos-tag (tokenize "in the Forum")) grammar :locator)
;; (reparse (pos-tag (tokenize "The Forum")) grammar ["DT" "NNP"])
;; (rdp-extend (pos-tag (tokenize "The Forum")) grammar ["DT" :noun])
;; (let [deeper (rdp-extend (pos-tag (tokenize "Forum on Sunday")) grammar ["NNP"])]
;; (cons (cons ["The" "DT"] (first deeper)) (rest deeper)))
;; (let [deeper (rdp-extend (pos-tag (tokenize "The Forum on Sunday")) grammar ["DT" "NNP"])]
;; deeper)

View file

@ -1,27 +1,95 @@
(ns wwui.propositions-test (ns wwui.propositions-test
(:require [clojure.test :refer :all] (:require [clojure.test :refer :all]
[wwui.propositions :refer :all])) [wwui.propositions :refer :all]
[taoensso.timbre :as log :refer [set-level!]]))
(log/set-level! :error)
(deftest reparser-tests (deftest reparser-tests
(testing "Simplest constructs" (testing "Simplest constructs"
(is (= (recursive-descent-parser [] grammar :noun) nil)) (is (= (reparse [] grammar :noun) nil))
(is (is
(= (=
(recursive-descent-parser (pos-tag (tokenize "Brutus killed Caesar")) grammar :noun) (reparse [["Brutus" "NNP"] ["killed" "VBD"] ["Caesar" "NNP"]] grammar :noun)
'(((["Brutus" "NNP"]) :noun) ["killed" "VBD"] ["Caesar" "NNP"]))) '(((["Brutus" "NNP"]) :noun) ["killed" "VBD"] ["Caesar" "NNP"])))
(is (is
(= (=
(recursive-descent-parser (pos-tag (tokenize "Brutus killed Caesar")) grammar :noun-phrase) (reparse [["Brutus" "NNP"] ["killed" "VBD"] ["Caesar" "NNP"]] grammar :noun-phrase)
'((((["Brutus" "NNP"]) :noun) :nown-phrase) ["killed" "VBD"] ["Caesar" "NNP"]))) '((((["Brutus" "NNP"]) :noun) :nown-phrase) ["killed" "VBD"] ["Caesar" "NNP"])))
(is (is
(= (=
(recursive-descent-parser (pos-tag (tokenize "The Forum")) grammar :noun-phrase) (reparse [["The" "DT"] ["Forum" "NNP"]] grammar :noun-phrase)
(((["The" "DT"]["Forum" "NNP"]) :noun-phrase)))) '(((["The" "DT"]["Forum" "NNP"]) :noun-phrase))))
(is (is
(= (=
(recursive-descent-parser (pos-tag (tokenize "killed Caesar")) grammar :verb) (reparse [["killed" "VBD"] ["Caesar" "NNP"]] grammar :verb)
(((["killed" "VBN"]) :verb) ["Caesar" "NNP"]))) '(((["killed" "VBD"]) :verb) ["Caesar" "NNP"])))
(is (is
(= (=
(recursive-descent-parser (pos-tag (tokenize "in the Forum")) grammar :locator) (reparse [["in" "IN"] ["the" "DT"] ["Forum" "NNP"]] grammar :locator)
(((["in" "IN"]["the" "DT"]["Forum" "NNP"]) :locator) ))))) '(((["in" "IN"] ((["the" "DT"] ((["Forum" "NNP"]) :noun)) :noun-phrase)) :locator))))
(is
(=
(reparse [["in" "IN"] ["the" "DT"] ["forum" "NN"]] grammar :locator)
'(((["in" "IN"] ((["the" "DT"] ((["forum" "NN"]) :noun)) :noun-phrase)) :locator))))
)
(testing "collections"
(is
(= (count (reparse (pos-tag (tokenize "brave, noble")) grammar :adjectives)) 1)
"Currently, lists of adjectives are not being recognised, and this fails.")
(is
(= (count (reparse (pos-tag (tokenize "cruelly and wickedly")) grammar :adverbs)) 1)
"Currently, lists of adverbs are not being recognised, and this fails."))
(testing "locators"
(is
(=
(reparse [["in" "IN"] ["the" "DT"] ["forum" "NN"]] grammar :locator)
'(((["in" "IN"] ((["the" "DT"] ((["forum" "NN"]) :noun)) :noun-phrase)) :locator)))
"Positional locator")
(is
(=
(count
(reparse
[["on" "IN"] ["the" "DT"] ["ides" "NNS"] ["of" "IN"] ["March" "NNP"]]
grammar
:locator))
1)
"Temporal locator: currently, 'of March' is not being recognised as part of the locator, so this is failing.")
(is
(=
(reparse [["in" "IN"] ["the" "DT"] ["forum" "NN"]] grammar :locator)
'(((((["in" "IN"] ((["the" "DT"] ((["forum" "NN"]) :noun)) :noun-phrase)) :locator)) :locators)))
"Single locator as locators")
)
(testing "propositions"
(is
(=
(reparse [["Brutus" "NNP"] ["killed" "VBD"] ["Caesar" "NNP"]] grammar :proposition)
'(((((((((["Brutus" "NNP"]) :noun)) :noun-phrase)) :subject)
((((["killed" "VBD"]) :verb)) :verb-phrase)
((((((["Caesar" "NNP"]) :noun)) :noun-phrase)) :object)) :proposition))
))
(is
(=
(reparse
[["Proud" "JJ"] ["Brutus" "NNP"] ["killed" "VBD"] ["noble" "JJ"] ["Caesar" "NNP"]]
grammar :proposition)
'(((((((((((["Proud" "JJ"]) :adjective)) :adjectives)
((["Brutus" "NNP"]) :noun)) :noun-phrase)) :subject)
((((["killed" "VBD"]) :verb)) :verb-phrase)
((((((((["noble" "JJ"]) :adjective)) :adjectives)
((["Caesar" "NNP"]) :noun)) :noun-phrase)) :object)) :proposition))
) "Single adjectives")
(is
(=
(reparse
[["Proud" "JJ"] ["Brutus" "NNP"] ["brutally" "RB"] ["killed" "VBD"] ["noble" "JJ"] ["Caesar" "NNP"]]
grammar :proposition)
'(((((((((((["Proud" "JJ"]) :adjective)) :adjectives)
((["Brutus" "NNP"]) :noun)) :noun-phrase)) :subject)
((((((["brutally" "RB"]) :adverb)) :adverbs)
((["killed" "VBD"]) :verb)) :verb-phrase)
((((((((["noble" "JJ"]) :adjective)) :adjectives)
((["Caesar" "NNP"]) :noun)) :noun-phrase)) :object)) :proposition))
) "Single adverb")
))