This more or less works - still fine tuning to do.

This commit is contained in:
Simon Brooke 2020-04-28 13:37:27 +01:00
parent fb00ec31a0
commit f98fac03a9
No known key found for this signature in database
GPG key ID: A7A4F18D1D4DF987
2 changed files with 133 additions and 100 deletions

View file

@ -1,8 +1,9 @@
(ns wwui.propositions
(:require [clojure.pprint :refer [pprint]]
[clojure.string :as s]
[opennlp.nlp :as nlp]
[opennlp.treebank :as tb]
[taoensso.timbre :as l :refer [info error spy]]
[taoensso.timbre :as log :refer [debug error info spy]]
[wildwood.knowledge-accessor :refer [Accessor]]))
;; Position tags used by OpenNLP for English are documented here:
@ -23,15 +24,19 @@
{:contextual-reference [["PRP"]] ;; the documentation says PRP is 'peronal pronoun',
;; but it seems to be all pronouns.
:noun [["NN"]["NNS"]["NNP"]["NNPS"]]
:full-name [["NNP"]
["NNP" :full-name]] ;; an unpunctuated sequence of proper nouns
;; probably represents a full name
:noun-phrase [[:contextual-reference]
[:noun]
[:full-name]
["DT" :noun]
[:adjectives :noun]
["DT" :adjectives :noun]]
;; :noun-phrases [[:noun-phrase]
;; [:noun-phrase "CC" :noun-phrases]
;; [:noun-phrase "IN" :noun-phrases]
;; [:noun-phrase "," :noun-phrases]]
:noun-phrases [[:noun-phrase]
[:noun-phrase "CC" :noun-phrases]
[:noun-phrase "IN" :noun-phrases]
[:noun-phrase "," :noun-phrases]]
:adjective [["JJ"]["JJR"]["JJS"]]
:adjectives [[:adjective]
[:adjective :adjectives]
@ -46,16 +51,16 @@
[:adverbs :verb]
[:verb :adverb :verb]
[:verb :adverbs]]
:locator [["IN" :noun-phrase]]
:locator [["IN" :noun-phrases]]
:locators [[:locator]
[:locators :locator]
[:locators "," :locator]]
:subject [[:noun-phrase]]
:object [[:noun-phrase]]
:proposition [[:subject :verb :object]
[:locator "," :subject :verb :object]
[:subject "," :locator "," :verb :object]
[:subject :verb-phrase :object :locator]]
[:locator :locators]
[:locator "," :locators]]
:subject [[:noun-phrases]]
:object [[:noun-phrases]]
:proposition [[:subject :verb-phrase :object]
[:locators "," :subject :verb-phrase :object]
[:subject "," :locators "," :verb-phrase :object]
[:subject :verb-phrase :object :locators]]
:propositions [[:proposition]
[:proposition "CC" :propositions]]})
@ -70,22 +75,19 @@
no match;
2. the tail of the sentence when the parts comprising the phrase are removed."
[tagged-sentence grammar goal]
(l/info "Seeking " goal " in " (with-out-str (pprint tagged-sentence)))
(if (keyword? goal)
(when (not (empty? tagged-sentence))
(when-let [result (first
(sort
#(> (count %1) (count %2))
#(< (count %1) (count %2))
(remove
empty?
(map
#(reparse tagged-sentence grammar %)
(goal grammar))))]
(goal grammar)))))]
(cons (cons (first result) (list goal)) (rest result))))
(throw (Exception. (str "Non-keyword passed to rdp-seek: `" goal "` (type " (or (type goal) "nil") ")")))))
;; (rdp-seek [["The" "DT"] ["Forum" "NNP"]] grammar :noun-phrase)
;; (reparse [["The" "DT"] ["Forum" "NNP"]] grammar ["DT" "NNP"])
;; (:noun-phrase grammar)
(defmacro tag
"The tag, on a `tagged-token`, is just the second element. Written as a macro
for readability."
@ -94,7 +96,7 @@
(defmacro coll-or-nil?
[o]
"For fuck's sake, nil isn't a collection? What planet are these people on?"
"For fuck's sake, `nil` isn't a collection? What planet are these people on?"
`(or (nil? ~o) (coll? ~o)))
(defn rdp-extend
@ -105,7 +107,6 @@
1. the first matching phrase for the goal, or `nil` if no match;
2. the tail of the sentence when the parts comprising the phrase are removed."
[tagged-sentence grammar goal]
(l/info "Extending " goal " in " (with-out-str (pprint tagged-sentence)))
(cond
(not (coll-or-nil? goal))
(throw (Exception. (str "Non-collection passed to rdp-extend: `" goal "` (type " (or (type goal) "nil") ")")))
@ -115,30 +116,13 @@
(let [[tt & st] tagged-sentence
[target & gt] goal]
(cond
(= target (tag tt))
(when-let [[dh & dt] (rdp-extend st grammar gt)]
(cons (cons tt dh) dt))
(keyword? target)
(when-let [[ph & pt] (rdp-seek tagged-sentence grammar target)] ;; it's this branch which is wrong. Getting it right, however, is not easy
(let [[dh & dt] (rdp-extend st grammar pt)]
(cons (cons (cons ph dh) (list target)) dt)))))))
;; As of 23:00 on 20160427, I'm getting
;; (rdp-seek [["the" "DT"] ["forum" "NN"]] grammar :noun-phrase)
;; => (((["the" "DT"] (((["forum" "NN"]) :noun)) :noun) :noun-phrase))
;; it's wrong, but it's interestingly and probably tractably wrong.
;; Seeking a keyword target whose definition in the grammar comprises only
;; string targets seems to produce the right result...
;; but seeking a keyword target whose definition expands to further keyword
;; targets (e.g. :noun-phrase being defined in terms of :noun) blows up.
;; (rdp-extend [["The" "DT"] ["Forum" "NNP"]] grammar [])
;; (rdp-extend [["The" "DT"] ["Forum" "NNP"]] grammar ["DT"])
;; (rdp-extend '(["The" "DT"] ["Forum" "NNP"]) grammar ["DT" "NNP"])
;; (rdp-extend '(["The" "DT"] ["Forum" "NNP"]) grammar ["DT" "FOO"])
(when-let [[h & t](reparse tagged-sentence grammar target)]
(when-let [[dh & dt] (reparse t grammar gt)]
(cons (cons h dh) dt)))
(= target (tag tt))
(when-let [[dh & dt] (reparse st grammar gt)]
(cons (cons tt dh) dt))))))
(defn reparse
"Reparse this `tagged-sentence` using this grammar to seek this `goal`.
@ -154,21 +138,31 @@
OpenNLP: it will not work on raw sentences;
2. it is a recursive descent parser."
[tagged-sentence grammar goal]
(l/info "Choosing strategy for " goal " in " (with-out-str (pprint tagged-sentence)))
(cond
;; (empty? tagged-sentence)
;; nil
(keyword? goal)
(rdp-seek tagged-sentence grammar goal)
(coll-or-nil? goal)
(rdp-extend tagged-sentence grammar goal)))
(log/debug "=> Choosing strategy for "
goal " in " (with-out-str (pprint tagged-sentence)))
(let [r (cond
(keyword? goal) (rdp-seek tagged-sentence grammar goal)
(coll-or-nil? goal) (rdp-extend tagged-sentence grammar goal))]
(log/debug "<= " goal " in "
(s/trim (with-out-str (pprint tagged-sentence)))
" returned " (s/trim (with-out-str (pprint r))))
r))
(defn propositions
"Given a `tagged-sentence`, return a list of propositions detected in that
sentence; if `knowledge-accessor` is passed, try to resolve names and noun
phrases to entities known to that knowledge accessor."
phrases to entities known to that knowledge accessor.
TODO: Note that if `:subject`, `:object` or `:locator` resolves to multiple
objects, then that is essentially one proposition for each unique
combination. This is not yet implemented!"
([tagged-sentence]
(reparse tagged-sentence grammar :propositions))
(reduce
merge
{}
(map
#(assoc {} (nth % 1) (first %))
(first (first (first (reparse tagged-sentence grammar :propositions)))))))
([tagged-sentence ;; ^wildwood.knowledge-accessor.Accessor
knowledge-accessor]
;; TODO: doesn't work yet.
@ -184,32 +178,3 @@
#(propositions (pos-tag (tokenize %)))
(get-sentences (slurp file-path))))))
;; (reparse [] grammar :noun)
;; (rdp-seek (pos-tag (tokenize "Brutus killed Caesar")) grammar :noun)
;; (coll? ["NPP"])
;; (reparse (pos-tag (tokenize "killed Caesar")) grammar :verb)
;; (reparse (pos-tag (tokenize "The Forum")) grammar :noun-phrase)
;; (reparse (pos-tag (tokenize "The Forum")) grammar ["DT" "NNP"])
;; (reparse [["Forum" "NNP"]] grammar :noun-phrase)
;; (map
;; #(reparse (pos-tag (tokenize "Forum")) grammar %)
;; (:noun-phrase grammar))
;; (rdp-extend (pos-tag (tokenize "The Forum")) grammar ["DT" "NNP"])
;; (nil nil
;; ((["The" "DT"]) ["Forum" "NNP"])
;; nil
;; ((["The" "DT"]) ["Forum" "NNP"]) nil nil nil)
;; (reparse (pos-tag (tokenize "in the Forum")) grammar :locator)
;; (reparse (pos-tag (tokenize "The Forum")) grammar ["DT" "NNP"])
;; (rdp-extend (pos-tag (tokenize "The Forum")) grammar ["DT" :noun])
;; (let [deeper (rdp-extend (pos-tag (tokenize "Forum on Sunday")) grammar ["NNP"])]
;; (cons (cons ["The" "DT"] (first deeper)) (rest deeper)))
;; (let [deeper (rdp-extend (pos-tag (tokenize "The Forum on Sunday")) grammar ["DT" "NNP"])]
;; deeper)

View file

@ -1,27 +1,95 @@
(ns wwui.propositions-test
(:require [clojure.test :refer :all]
[wwui.propositions :refer :all]))
[wwui.propositions :refer :all]
[taoensso.timbre :as log :refer [set-level!]]))
(log/set-level! :error)
(deftest reparser-tests
(testing "Simplest constructs"
(is (= (recursive-descent-parser [] grammar :noun) nil))
(is (= (reparse [] grammar :noun) nil))
(is
(=
(recursive-descent-parser (pos-tag (tokenize "Brutus killed Caesar")) grammar :noun)
(reparse [["Brutus" "NNP"] ["killed" "VBD"] ["Caesar" "NNP"]] grammar :noun)
'(((["Brutus" "NNP"]) :noun) ["killed" "VBD"] ["Caesar" "NNP"])))
(is
(=
(recursive-descent-parser (pos-tag (tokenize "Brutus killed Caesar")) grammar :noun-phrase)
(reparse [["Brutus" "NNP"] ["killed" "VBD"] ["Caesar" "NNP"]] grammar :noun-phrase)
'((((["Brutus" "NNP"]) :noun) :nown-phrase) ["killed" "VBD"] ["Caesar" "NNP"])))
(is
(=
(recursive-descent-parser (pos-tag (tokenize "The Forum")) grammar :noun-phrase)
(((["The" "DT"]["Forum" "NNP"]) :noun-phrase))))
(reparse [["The" "DT"] ["Forum" "NNP"]] grammar :noun-phrase)
'(((["The" "DT"]["Forum" "NNP"]) :noun-phrase))))
(is
(=
(recursive-descent-parser (pos-tag (tokenize "killed Caesar")) grammar :verb)
(((["killed" "VBN"]) :verb) ["Caesar" "NNP"])))
(reparse [["killed" "VBD"] ["Caesar" "NNP"]] grammar :verb)
'(((["killed" "VBD"]) :verb) ["Caesar" "NNP"])))
(is
(=
(recursive-descent-parser (pos-tag (tokenize "in the Forum")) grammar :locator)
(((["in" "IN"]["the" "DT"]["Forum" "NNP"]) :locator) )))))
(reparse [["in" "IN"] ["the" "DT"] ["Forum" "NNP"]] grammar :locator)
'(((["in" "IN"] ((["the" "DT"] ((["Forum" "NNP"]) :noun)) :noun-phrase)) :locator))))
(is
(=
(reparse [["in" "IN"] ["the" "DT"] ["forum" "NN"]] grammar :locator)
'(((["in" "IN"] ((["the" "DT"] ((["forum" "NN"]) :noun)) :noun-phrase)) :locator))))
)
(testing "collections"
(is
(= (count (reparse (pos-tag (tokenize "brave, noble")) grammar :adjectives)) 1)
"Currently, lists of adjectives are not being recognised, and this fails.")
(is
(= (count (reparse (pos-tag (tokenize "cruelly and wickedly")) grammar :adverbs)) 1)
"Currently, lists of adverbs are not being recognised, and this fails."))
(testing "locators"
(is
(=
(reparse [["in" "IN"] ["the" "DT"] ["forum" "NN"]] grammar :locator)
'(((["in" "IN"] ((["the" "DT"] ((["forum" "NN"]) :noun)) :noun-phrase)) :locator)))
"Positional locator")
(is
(=
(count
(reparse
[["on" "IN"] ["the" "DT"] ["ides" "NNS"] ["of" "IN"] ["March" "NNP"]]
grammar
:locator))
1)
"Temporal locator: currently, 'of March' is not being recognised as part of the locator, so this is failing.")
(is
(=
(reparse [["in" "IN"] ["the" "DT"] ["forum" "NN"]] grammar :locator)
'(((((["in" "IN"] ((["the" "DT"] ((["forum" "NN"]) :noun)) :noun-phrase)) :locator)) :locators)))
"Single locator as locators")
)
(testing "propositions"
(is
(=
(reparse [["Brutus" "NNP"] ["killed" "VBD"] ["Caesar" "NNP"]] grammar :proposition)
'(((((((((["Brutus" "NNP"]) :noun)) :noun-phrase)) :subject)
((((["killed" "VBD"]) :verb)) :verb-phrase)
((((((["Caesar" "NNP"]) :noun)) :noun-phrase)) :object)) :proposition))
))
(is
(=
(reparse
[["Proud" "JJ"] ["Brutus" "NNP"] ["killed" "VBD"] ["noble" "JJ"] ["Caesar" "NNP"]]
grammar :proposition)
'(((((((((((["Proud" "JJ"]) :adjective)) :adjectives)
((["Brutus" "NNP"]) :noun)) :noun-phrase)) :subject)
((((["killed" "VBD"]) :verb)) :verb-phrase)
((((((((["noble" "JJ"]) :adjective)) :adjectives)
((["Caesar" "NNP"]) :noun)) :noun-phrase)) :object)) :proposition))
) "Single adjectives")
(is
(=
(reparse
[["Proud" "JJ"] ["Brutus" "NNP"] ["brutally" "RB"] ["killed" "VBD"] ["noble" "JJ"] ["Caesar" "NNP"]]
grammar :proposition)
'(((((((((((["Proud" "JJ"]) :adjective)) :adjectives)
((["Brutus" "NNP"]) :noun)) :noun-phrase)) :subject)
((((((["brutally" "RB"]) :adverb)) :adverbs)
((["killed" "VBD"]) :verb)) :verb-phrase)
((((((((["noble" "JJ"]) :adjective)) :adjectives)
((["Caesar" "NNP"]) :noun)) :noun-phrase)) :object)) :proposition))
) "Single adverb")
))