diff --git a/README.md b/README.md index f7c44f7..b26bfab 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ Download from http://example.com/FIXME. ## Usage -**WARNING**: does not even nearly work yet, and may nver do so. The value of this archive is probably mainly as something to mine for algorithms and ideas, especially for those doing natural language interpreteation and generation. +**WARNING**: does not even nearly work yet, and may never do so. The value of this archive is probably mainly as something to mine for algorithms and ideas, especially for those doing natural language interpretation and generation. **WARNING**: nothing in this is stable. Nothing should be treated as an API. Feel free to steal ideas and code, but don't depend on `wwui` as a library. diff --git a/docs/cloverage/index.html b/docs/cloverage/index.html index ed8ae42..529264a 100644 --- a/docs/cloverage/index.html +++ b/docs/cloverage/index.html @@ -30,7 +30,7 @@ 713 - wwui.propositions
wwui.parser
484
+ wwui/parser.clj + + + + 001  (ns wwui.parser +
+ + 002    (:require [clojure.math.combinatorics :as combi] +
+ + 003              [clojure.pprint :refer [pprint]] +
+ + 004              [clojure.string :as s] +
+ + 005              [opennlp.nlp :as nlp] +
+ + 006              [opennlp.treebank :as tb] +
+ + 007              [taoensso.timbre :as log :refer [debug error info spy]] +
+ + 008              [wildwood.knowledge-accessor :refer [Accessor]])) +
+ + 009   +
+ + 010  ;; Position tags used by OpenNLP for English are documented here: +
+ + 011  ;; https://dpdearing.com/posts/2011/12/opennlp-part-of-speech-pos-tags-penn-english-treebank/ +
+ + 012   +
+ + 013  (def get-sentences (nlp/make-sentence-detector "models/en-sent.bin")) +
+ + 014  (def tokenize (nlp/make-tokenizer "models/en-token.bin")) +
+ + 015  (def pos-tag (nlp/make-pos-tagger "models/en-pos-maxent.bin")) +
+ + 016  (def name-find (nlp/make-name-finder "models/namefind/en-ner-person.bin")) +
+ + 017  ;; (def chunker (make-treebank-chunker "models/en-chunker.bin")) +
+ + 018   +
+ + 019  (def grammar +
+ + 020    "The objective of this grammar is to allow us to take a sequence of tagged symbols, and +
+ + 021    produce a higher-level tagging of parts of speech, and ultimately propositions, from them. +
+ + 022   +
+ + 023    *NOTE THAT* tags in this grammar are always keywords, to distinguish them from OpenNLP +
+ + 024    tags, which tag individual tokens and are represented as strings." +
+ + 025    {:contextual-reference [["PRP"]] ;; the documentation says PRP is 'peronal pronoun', +
+ + 026     ;; but it seems to be all pronouns. +
+ + 027     :noun [["NN"]["NNS"]["NNP"]["NNPS"]] +
+ + 028     :full-name [["NNP"] +
+ + 029                 ["NNP" :full-name]] ;; an unpunctuated sequence of proper nouns +
+ + 030                                     ;; probably represents a full name +
+ + 031     :noun-phrase [[:contextual-reference] +
+ + 032                   [:noun] +
+ + 033                   [:full-name] +
+ + 034                   ["DT" :noun] +
+ + 035                   [:adjectives :noun] +
+ + 036                   ["DT" :adjectives :noun]] +
+ + 037     :noun-phrases [[:noun-phrase] +
+ + 038                   [:noun-phrase "CC" :noun-phrases] +
+ + 039                   [:noun-phrase "," :noun-phrases]] +
+ + 040     :adjective [["JJ"]["JJR"]["JJS"]] +
+ + 041     :adjectives [[:adjective] +
+ + 042                  [:adjective :adjectives] +
+ + 043                  [:adjective "," :adjectives] +
+ + 044                  [:adjective "CC" :adjectives]] +
+ + 045     :verb [["VB"]["VBD"]["VBG"]["VBN"]["VBP"]["VBZ"]] +
+ + 046     :adverb [["RB"]["RBR"]["RBS"]] ;; beware here that negation and qualification show up only as adverbs +
+ + 047     :adverbs [[:adverb] +
+ + 048               [:adverb "," :adverbs] +
+ + 049               [:adverb "CC" :adverbs]] +
+ + 050     :verb-phrase [[:verb] +
+ + 051                   [:adverbs :verb] +
+ + 052                   [:verb :adverbs :verb] +
+ + 053                   [:verb :adverbs] +
+ + 054                   [:verb :adverbs :verb "TO"]] +
+ + 055     :locator [["IN" :noun-phrases]] +
+ + 056     :locators [[:locator] +
+ + 057                [:locator :locators] +
+ + 058                [:locator "," :locators]] +
+ + 059     :location [[:locators]] +
+ + 060     :subject [[:noun-phrases]] +
+ + 061     :object [[:noun-phrases]] +
+ + 062     :proposition [[:subject :verb-phrase :object] +
+ + 063                   [:location "," :subject :verb-phrase :object] +
+ + 064                   [:subject "," :location "," :verb-phrase :object] +
+ + 065                   [:subject :verb-phrase :object :location]] +
+ + 066     :propositions [[:proposition] +
+ + 067                    [:proposition "CC" :propositions] +
+ + 068                    [:proposition "," "CC" :propositions]]}) +
+ + 069   +
+ + 070  (declare reparse rdp-seek) +
+ + 071   +
+ + 072  (defn rdp-seek +
+ + 073    "Seek a phrase which satisfies this `goal` (expected to be a keyword) in +
+ + 074    this `tagged-sentence` using this `grammar`. +
+ + 075   +
+ + 076    Return a cons comprising +
+ + 077    1. the first matching phrase for the goal, tagged with the goal, or `nil` if +
+ + 078    no match; +
+ + 079    2. the tail of the sentence when the parts comprising the phrase are removed." +
+ + 080    [tagged-sentence grammar goal] +
+ + 081    (if (keyword? goal) +
+ + 082      (when (not (empty? tagged-sentence)) +
+ + 083        (when-let [result (first +
+ + 084                            (sort +
+ + 085                              #(< (count %1) (count %2)) +
+ + 086                              (remove +
+ + 087                                empty? +
+ + 088                                (map +
+ + 089                                  #(reparse tagged-sentence grammar %) +
+ + 090                                  (goal grammar)))))] +
+ + 091          (cons (cons (first result) (list goal)) (rest result)))) +
+ + 092      (throw (Exception. (str "Non-keyword passed to rdp-seek: `" goal "` (type " (or (type goal) "nil") ")"))))) +
+ + 093   +
+ + 094  (defmacro tag +
+ + 095    "The tag, on a `tagged-token`, is just the second element. Written as a macro +
+ + 096    for readability." +
+ + 097    [tagged-token] +
+ + 098    `(nth ~tagged-token 1)) +
+ + 099   +
+ + 100  (defmacro coll-or-nil? +
+ + 101    [o] +
+ + 102    "For fuck's sake, `nil` isn't a collection? What planet are these people on?" +
+ + 103    `(or (nil? ~o) (coll? ~o))) +
+ + 104   +
+ + 105  (defn rdp-extend +
+ + 106    "Seek a phrase which satisfies this `goal` (expected to be a collection of tags) in +
+ + 107    this `tagged-sentence` using this `grammar`. +
+ + 108   +
+ + 109    Return a cons comprising +
+ + 110    1. the first matching phrase for the goal, or `nil` if no match; +
+ + 111    2. the tail of the sentence when the parts comprising the phrase are removed." +
+ + 112    [tagged-sentence grammar goal] +
+ + 113    (cond +
+ + 114      (not (coll-or-nil? goal)) +
+ + 115      (throw (Exception. (str "Non-collection passed to rdp-extend: `" goal "` (type " (or (type goal) "nil") ")"))) +
+ + 116      (empty? goal) +
+ + 117      (cons (list) tagged-sentence) +
+ + 118      (not (empty? tagged-sentence)) +
+ + 119      (let [[tt & st] tagged-sentence +
+ + 120            [target & gt] goal] +
+ + 121        (cond +
+ + 122          (keyword? target) +
+ + 123          (when-let [[h & t](reparse tagged-sentence grammar target)] +
+ + 124            (when-let [[dh & dt] (reparse t grammar gt)] +
+ + 125              (cons (cons h dh) dt))) +
+ + 126          (= target (tag tt)) +
+ + 127            (when-let [[dh & dt] (reparse st grammar gt)] +
+ + 128          (cons (cons tt dh) dt)))))) +
+ + 129   +
+ + 130  (defn reparse +
+ + 131    "Reparse this `tagged-sentence` using this grammar to seek this `goal`. +
+ + 132    Parse greedily, seeking the most extended goal. +
+ + 133   +
+ + 134    Return a sequence comprising +
+ + 135    1. the first matching phrase for the goal, tagged with the goal, or `nil` +
+ + 136    if no match; +
+ + 137    2. the tail of the sentence when the parts comprising the phrase are removed. +
+ + 138   +
+ + 139    This function is called `reparse` because: +
+ + 140    1. it is designed to parse sentences which have already been parsed by +
+ + 141    OpenNLP: it will not work on raw sentences; +
+ + 142    2. it is a recursive descent parser." +
+ + 143    [tagged-sentence grammar goal] +
+ + 144    (log/debug "=> Choosing strategy for " +
+ + 145               goal " in " (with-out-str (pprint tagged-sentence))) +
+ + 146    (let [r (cond +
+ + 147              (keyword? goal) (rdp-seek tagged-sentence grammar goal) +
+ + 148              (coll-or-nil? goal) (rdp-extend tagged-sentence grammar goal))] +
+ + 149      (log/debug "<= " goal " in " +
+ + 150                 (s/trim (with-out-str (pprint tagged-sentence))) +
+ + 151                 " returned " (s/trim (with-out-str (pprint r)))) +
+ + 152      r)) +
+ + 153   +
+ + 154  (defn identify +
+ + 155    [parse-tree knowledge-accessor] +
+ + 156    ;; TODO: we don't yet have a working knowledge accessor. When we do, +
+ + 157    ;; construct a query from the contents of this parse-tree, and pass it +
+ + 158    ;; to the knowledge accessor in the hope of finding a true name. +
+ + 159    parse-tree) +
+ + 160   +
+ + 161  (defn normalise +
+ + 162    [parse-tree ka] +
+ + 163    (if +
+ + 164      (and (coll? parse-tree) (= (count parse-tree) 2)(keyword? (nth parse-tree 1))) +
+ + 165      (case (nth parse-tree 1) +
+ + 166        :proposition (list +
+ + 167                       (reduce +
+ + 168                         merge +
+ + 169                         {} +
+ + 170                         (map +
+ + 171                           ;; TODO: use combinatorics to extract all propositions from +
+ + 172                           ;; a proposition having multiple locations, multiple subject, +
+ + 173                           ;; objects and/or verbs +
+ + 174                           #(assoc {} (nth % 1) (identify (first %) ka)) +
+ + 175                           (map #(normalise % ka) (first parse-tree))))) +
+ + 176        (:location :subject :object) +
+ + 177        (cons +
+ + 178          (reduce +
+ + 179            concat +
+ + 180            (remove +
+ + 181              empty? +
+ + 182              (map #(normalise % ka) (first parse-tree)))) +
+ + 183          (list (nth parse-tree 1))) +
+ + 184        (:propositions :locators :noun-phrases :verbs) +
+ + 185        (reduce +
+ + 186          concat +
+ + 187          (remove +
+ + 188            empty? +
+ + 189            (map #(normalise % ka) (first parse-tree)))) +
+ + 190        ;; else +
+ + 191        parse-tree) +
+ + 192      parse-tree)) +
+ + 193   +
+ + 194  (defn propositions +
+ + 195    "Given a `tagged-sentence`, return a list of propositions detected in that +
+ + 196    sentence; if `knowledge-accessor` is passed, try to resolve names and noun +
+ + 197    phrases to entities known to that knowledge accessor. +
+ + 198   +
+ + 199    TODO: Note that if `:subject`, `:object` or `:locator` resolves to multiple +
+ + 200    objects, then that is essentially one proposition for each unique +
+ + 201    combination. This is not yet implemented!" +
+ + 202    ([tagged-sentence] +
+ + 203     (propositions tagged-sentence nil)) +
+ + 204    ([tagged-sentence ;; ^wildwood.knowledge-accessor.Accessor +
+ + 205      knowledge-accessor] +
+ + 206     ;; TODO: doesn't work yet. +
+ + 207     (reduce +
+ + 208       concat +
+ + 209       (remove +
+ + 210         empty? +
+ + 211         (map +
+ + 212           #(normalise % knowledge-accessor) +
+ + 213           (first (first (reparse tagged-sentence grammar :propositions)))))))) +
+ + 214   +
+ + 215  (defn propositions-from-file +
+ + 216    [file-path] +
+ + 217    (reduce +
+ + 218      concat +
+ + 219      (remove +
+ + 220        empty? +
+ + 221        (map +
+ + 222          #(propositions (pos-tag (tokenize %))) +
+ + 223          (get-sentences (slurp file-path)))))) +
+ + 224   +
+ + 225  ;; (reparse (pos-tag (tokenize "True love is the daughter of temperance, and temperance is utterly opposed to the madness of bodily pleasure.")) grammar :propositions) +
+ + 226  ;; (reparse [["temperance" "NN"] ["is" "VBZ"] ["utterly" "RB"] ["opposed" "VBN"] ["to" "TO"] ["the" "DT"] ["madness" "NN"] ["of" "IN"] ["bodily" "JJ"] ["pleasure" "NN"]] grammar :subject) +
+ + 227  ;; (reparse [["is" "VBZ"] ["utterly" "RB"] ["opposed" "VBN"] ["to" "TO"] ["the" "DT"] ["madness" "NN"] ["of" "IN"] ["bodily" "JJ"] ["pleasure" "NN"]] grammar :verb-phrase) +
+ + 228  ;; (reparse [["is" "VBZ"] ["utterly" "RB"] ["opposed" "VBN"] ["to" "TO"] ["the" "DT"] ["madness" "NN"] ["of" "IN"] ["bodily" "JJ"] ["pleasure" "NN"]] grammar :verb-phrase) +
+ + diff --git a/docs/codox/index.html b/docs/codox/index.html index 130425c..b235299 100644 --- a/docs/codox/index.html +++ b/docs/codox/index.html @@ -1,3 +1,3 @@ -Wwui 0.1.0-SNAPSHOT

Wwui 0.1.0-SNAPSHOT

Released under the GNU General Public License,version 2.0 or (at your option) any later version

Experimental work towards a conversational interface to Wildwood.

Installation

To install, add the following dependency to your project or build file:

[wwui "0.1.0-SNAPSHOT"]

Topics

Namespaces

wwui.core

TODO: write docs

Public variables and functions:

\ No newline at end of file +Wwui 0.1.0-SNAPSHOT

Wwui 0.1.0-SNAPSHOT

Released under the GNU General Public License, version 2.0 or (at your option) any later version

Experimental work towards a conversational interface to Wildwood.

Installation

To install, add the following dependency to your project or build file:

[wwui "0.1.0-SNAPSHOT"]

Topics

Namespaces

wwui.core

TODO: write docs

Public variables and functions:

\ No newline at end of file diff --git a/docs/codox/intro.html b/docs/codox/intro.html index b758443..dd4a5fb 100644 --- a/docs/codox/intro.html +++ b/docs/codox/intro.html @@ -1,4 +1,18 @@ -Introduction to wwui

Introduction to wwui

-

TODO: write great documentation

\ No newline at end of file +Introduction to wwui

Introduction to wwui

+

The long term objective for WWUI is to provide a conversational, natural language interface to Wildwood: that is, that you should be able to question Wildwood agents in normal, everyday natural language - initially only English - and have it reply in the same way. It may possibly also display on a screen diagrams and documents which support its answer and reasoning, but the primary intention is that it should converse.

+

This boils down to two things: parsing, and idiomatic language generation. The intention here is not to have full general comprehension of a natural language. Questions which may sensibly be asked of an agent essentially query the truth value of a proposition, or the value of some fact, such as:

+
    +
  • Is Brutus honourable?
  • +
  • Where did Caesar die?
  • +
  • Did Brutus kill Caesar?
  • +
+

and it’s perfectly OK, if a question doesn’t conform to this general form, for the interface to respond with some text of the general form ‘I don’t understand’, or ‘I don’t know about…’

+

In the past, other conversational artificial intelligence systems, e.g. Mycin and APES, have allowed the user to query how a result was achieved. I’m hoping that the working of Wildwood will be sufficiently transparent that this question is largely redundant; however, it’s entirely possible that I shall implement a response to both the primitive how question (‘How do you know that?’, where the proposition being asked about is inferred from the conversational context) and a more general one (‘How do you know that P?’, where P is any proposition). Indeed, the former is a special case of the latter.

+

Major components

+

Parser

+

Initial work so far is on parsing propositions out of continuous English text. At present (1st May 2020), I’m finding 63 propositions in the 5,285 sentences of Jowett’s translation into English of Plato’s Republic, which is not a good strike rate. More discussion of the parser is found here.

+

A critical point about the parser is that it must be able not only to parse user input, but also to parse documents in order to extract knowledge from them.

+

Generator

+

The generator is in principle much simpler than the parser; it merely needs to be able to output, as idiomatic natural language, the trace of the moves in the inference game, and the proposition which is the conclusion of the inference. To make this flow naturally, several templates will need to be stored for each potential output form, in order that these can be varied to prevent unduly repetitive output; and there may be some heuristics which guide when to use particular templates.

\ No newline at end of file diff --git a/docs/codox/parsing.html b/docs/codox/parsing.html new file mode 100644 index 0000000..7b6aa5f --- /dev/null +++ b/docs/codox/parsing.html @@ -0,0 +1,34 @@ + +Parsing

Parsing

+

Generally, the wwui parser needs to be able to recognise, and to extract in a form usable by wildwood, two general forms of utterances:

+
    +
  1. Propositions, and
  2. +
  3. Questions;
  4. +
+

where questions can be further subdivided:

+
    +
  1. is (proposition) true at present?
  2. +
  3. was (proposition) true in the past?
  4. +
  5. will (proposition) be true in the future?
  6. +
  7. is (proposition) true at this (time or time range specification)?
  8. +
  9. what is the value of (property) of (entity)?
  10. +
  11. how do you know that (proposition) has (truth-value)?
  12. +
  13. how do you know that (property) of (entity) has (value)?
  14. +
+

So the key things we need to know about and identify in natural language input are

+
    +
  1. Propositions;
  2. +
  3. Entities;
  4. +
  5. Properties;
  6. +
  7. Values
  8. +
+

At the current stage of development of the current iteration, it is anticipated that the key constructs that wildwood will reason with are located two-position propositions; that is to say propositions having

+
    +
  1. A verb;
  2. +
  3. A subject, being an entity;
  4. +
  5. An object, being an entity;
  6. +
  7. Optionally, a spatial location;
  8. +
  9. Optionally, a temporal location.
  10. +
+

The principle behind wildwood - at least in its current iteration - is that for the inference game to work, there has to be, for every entity, a ‘true name’ or unique identifier consensually agreed by each agent for each entity. The parser, obviously, can only parse noun phrases, so to be able to resolve noun phrases to true names there must be an API for the parser to pass a noun phrase, possibly with some context, to a knowledge accessor and receive a true name back.

\ No newline at end of file diff --git a/docs/codox/wwui.core.html b/docs/codox/wwui.core.html index 457c841..cd42eb1 100644 --- a/docs/codox/wwui.core.html +++ b/docs/codox/wwui.core.html @@ -1,3 +1,3 @@ -wwui.core documentation

wwui.core

TODO: write docs

-main

(-main & args)

I don’t do a whole lot … yet.

\ No newline at end of file +wwui.core documentation

wwui.core

TODO: write docs

-main

(-main & args)

I don’t do a whole lot … yet.

\ No newline at end of file diff --git a/docs/codox/wwui.parser.html b/docs/codox/wwui.parser.html new file mode 100644 index 0000000..fbb52e2 --- /dev/null +++ b/docs/codox/wwui.parser.html @@ -0,0 +1,9 @@ + +wwui.parser documentation

wwui.parser

TODO: write docs

coll-or-nil?

macro

(coll-or-nil? o)

TODO: write docs

get-sentences

TODO: write docs

grammar

The objective of this grammar is to allow us to take a sequence of tagged symbols, and produce a higher-level tagging of parts of speech, and ultimately propositions, from them.

+

NOTE THAT tags in this grammar are always keywords, to distinguish them from OpenNLP tags, which tag individual tokens and are represented as strings.

identify

(identify parse-tree knowledge-accessor)

TODO: write docs

name-find

TODO: write docs

normalise

(normalise parse-tree ka)

TODO: write docs

pos-tag

TODO: write docs

propositions

(propositions tagged-sentence)(propositions tagged-sentence knowledge-accessor)

Given a tagged-sentence, return a list of propositions detected in that sentence; if knowledge-accessor is passed, try to resolve names and noun phrases to entities known to that knowledge accessor.

+

TODO: Note that if :subject, :object or :locator resolves to multiple objects, then that is essentially one proposition for each unique combination. This is not yet implemented!

propositions-from-file

(propositions-from-file file-path)

TODO: write docs

rdp-extend

(rdp-extend tagged-sentence grammar goal)

Seek a phrase which satisfies this goal (expected to be a collection of tags) in this tagged-sentence using this grammar.

+

Return a cons comprising 1. the first matching phrase for the goal, or nil if no match; 2. the tail of the sentence when the parts comprising the phrase are removed.

rdp-seek

(rdp-seek tagged-sentence grammar goal)

Seek a phrase which satisfies this goal (expected to be a keyword) in this tagged-sentence using this grammar.

+

Return a cons comprising 1. the first matching phrase for the goal, tagged with the goal, or nil if no match; 2. the tail of the sentence when the parts comprising the phrase are removed.

reparse

(reparse tagged-sentence grammar goal)

Reparse this tagged-sentence using this grammar to seek this goal. Parse greedily, seeking the most extended goal.

+

Return a sequence comprising 1. the first matching phrase for the goal, tagged with the goal, or nil if no match; 2. the tail of the sentence when the parts comprising the phrase are removed.

+

This function is called reparse because: 1. it is designed to parse sentences which have already been parsed by OpenNLP: it will not work on raw sentences; 2. it is a recursive descent parser.

tag

macro

(tag tagged-token)

The tag, on a tagged-token, is just the second element. Written as a macro for readability.

tokenize

TODO: write docs

\ No newline at end of file diff --git a/test/wwui/propositions_test.clj b/test/wwui/parser_test.clj similarity index 98% rename from test/wwui/propositions_test.clj rename to test/wwui/parser_test.clj index 20131ef..85b2e2c 100644 --- a/test/wwui/propositions_test.clj +++ b/test/wwui/parser_test.clj @@ -1,6 +1,6 @@ -(ns wwui.propositions-test +(ns wwui.parser-test (:require [clojure.test :refer :all] - [wwui.propositions :refer :all] + [wwui.parser :refer :all] [taoensso.timbre :as log :refer [set-level!]])) (log/set-level! :error)