From fffaf0cca048736d6794da0d27416dfe6d62fcef Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Sat, 9 Nov 2013 00:39:20 +0000 Subject: [PATCH] Basically finished. Not capitalising initial words of sentences; not printing output to an output file even if one is specified; not honouring the help flag. --- project.clj | 3 +- src/milkwood_clj/analyse.clj | 10 ++-- src/milkwood_clj/core.clj | 29 ++++++++--- src/milkwood_clj/synthesise.clj | 89 ++++++++++++++++++++++++++++++--- src/milkwood_clj/utils.clj | 11 +++- test/milkwood_clj/core_test.clj | 6 +-- 6 files changed, 126 insertions(+), 22 deletions(-) diff --git a/project.clj b/project.clj index cd06700..4f1709e 100644 --- a/project.clj +++ b/project.clj @@ -3,7 +3,8 @@ :url "http://example.com/FIXME" :license {:name "Eclipse Public License" :url "http://www.eclipse.org/legal/epl-v10.html"} - :dependencies [[org.clojure/clojure "1.5.1"]] + :dependencies [[org.clojure/clojure "1.5.1"] + [org.clojure/tools.cli "0.2.4"]] :main milkwood-clj.core :jvm-opts ["-Xss4m"] :profiles {:uberjar {:aot :all}}) diff --git a/src/milkwood_clj/analyse.clj b/src/milkwood_clj/analyse.clj index 05f5bf5..86664f6 100644 --- a/src/milkwood_clj/analyse.clj +++ b/src/milkwood_clj/analyse.clj @@ -14,7 +14,12 @@ true (hash-map (first path) (compose-rule (rest path))))) -(defn merge-rules [these those] +(defn merge-rules + "Merge two rule trees. + + these: a rule tree; + those: a rule tree." + [these those] (utils/deep-merge-with set/union these those)) (defn add-rule @@ -51,5 +56,4 @@ file: the path name of a file to read; depth: the depth of rules/length of window we're considering" [file depth] - (analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+\'s|\w+|\p{Punct}" (slurp file))) depth)) - + (analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+\'[st]|\w+|\p{Punct}" (slurp file))) depth)) diff --git a/src/milkwood_clj/core.clj b/src/milkwood_clj/core.clj index 2c7a1a3..f7085da 100644 --- a/src/milkwood_clj/core.clj +++ b/src/milkwood_clj/core.clj @@ -1,18 +1,33 @@ (ns milkwood-clj.core (require [milkwood-clj.analyse :as analyse] - [milkwood-clj.synthesise :as sythesise] + [milkwood-clj.synthesise :as synthesise] [clojure.set :as set]) + (:use [clojure.tools.cli :only [cli]]) (:gen-class)) (defn -main - "I don't do a whole lot ... yet." + "Parse command line arguments and kick off the process." [& args] - (println "Hello, World!")) + (let [[arguments _ banner] (cli args ["-f" "--file" "The path name of the file to analyse (string)"] + ["-l" "--output-length" + "The length in tokens of the output to generate (integer)" + :parse-fn #(Integer. %) + :default 100] + ["-h" "--help" "Print this text and exit" :flag true] + ["-o" "--output" "The path name of the file to write to, if any (string)"] + ["-t" "--tuple-length" + "The length of the sequences to analyse it into (integer)" + :parse-fn #(Integer. %) + :default 2]) file (arguments :file)] + (cond + (= file nil) (print banner) + (arguments :help) (print banner) + true (synthesise/write-output + (synthesise/compose-nonsense + (analyse/analyse-file file (arguments :tuple-length)) + (arguments :output-length)))) + (prn "\n"))) -;;;; read side - also probably a separate file - - -;;;; write side diff --git a/src/milkwood_clj/synthesise.clj b/src/milkwood_clj/synthesise.clj index e20a73c..b503913 100644 --- a/src/milkwood_clj/synthesise.clj +++ b/src/milkwood_clj/synthesise.clj @@ -4,6 +4,8 @@ (:gen-class)) +(def end-magic-token "END") + (defn next-tokens "Given these rules and this path, return a list of valid next tokens to emit. @@ -11,14 +13,87 @@ path: a flat sequence of tokens." [rules path] (cond - (empty? (rest path)) (shuffle (keys (rules (first path)))) + (empty? (rest path)) (keys (rules (first path))) (empty? (rules (first path))) nil true (next-tokens (rules (first path)) (rest path)))) -(defn compose-nonsense - [window token depth length] - (cond - (= length 0) nil - ) +(defn compose-prologue + "Generate a prologue before entering the rule driven part of the synthesis + process. The prologue needs to be as long as the rule-tree depth, to allow + it to be used as a glance back window. + TODO: Should really generate something which is a gramatically legal start + of an English sentence. - ) + rules: a rule tree; + depth: the depth of that rule tree." + [rules depth] + (cond + (= depth 0) nil + true (let [token (first (shuffle (keys rules)))] + (cons token (compose-prologue (rules token) (- depth 1)))))) + + + +(defn compose-nonsense + "Try to compose this much nonsense given these rules and this glance-back window. + This is a rather messy hack around Clojure's curiously awkward rules on mutually + recursive functions. The second and third arities of this function (4 args and + 5 args respectively) are in effect mutually recursive functions. The 4 args form + extends the output, the five args form hunts sideways among the options. The first, + two arg arity is simply a convenient entry point. + + rules: a rule tree; + window: a glance back at the last tokens emitted; + depth: the target size of the glance back window, which should be one less than + the depth of the rule tree; + length: the number of tokens of nonsense which should be generated; + options: candidates to be the next token" + ([rules length] + (compose-nonsense rules nil (- (utils/rule-tree-depth rules) 1) length)) + ([rules window depth length] + (let [window-size (count window) options (next-tokens rules window)] + (cond + ;; if no more length, we're done. We need to return something non-null to indicate we haven't failed. + (= length 0) (list end-magic-token) + ;; if we've no window, we need to choose one and continue. + (= window nil) + (let [prologue (compose-prologue rules depth)] + (flatten (list prologue (compose-nonsense rules prologue depth (- length depth))))) + ;; if we've a window but no options, we're stuffed. + (= options nil) nil + true (compose-nonsense rules window depth (- length 1) (shuffle options))))) + ([rules window depth length options] + (cond + ;; if I've run out of options, I've failed. + (empty? options) nil + ;; if I still have some options... + true + ;; try the first of them... + (let [nonsense (compose-nonsense rules (utils/slide-window window (first options) depth) depth length)] + (cond + ;; if that fails, try the others... + (empty? nonsense) (compose-nonsense rules window depth length (rest options)) + ;; but if it succeeds we're good. + true (cons (first options) nonsense)) + )))) + +(defn write-token + [token] + "Write a single token to the output, doing some basic orthographic tricks. + + token: the token to write." + (cond + (= token end-magic-token) nil + (re-find #"^[.!?]$" token) (do (print token) (cond (= (rand-int 5) 0) (print "\n\n"))) + (re-find #"^\p{Punct}$" token) (print token) + true (print (str " " token)))) + +(defn write-output + "Write this output, doing little orthographic tricks to make it look superficially + like real English text. + TODO: does not yet work. Should take an optional second argument, + the file to write to if any (default to standard out). + + output: a sequence of tokens to write." + [output] + (dorun (map write-token output))) diff --git a/src/milkwood_clj/utils.clj b/src/milkwood_clj/utils.clj index 3a1df5d..19eba96 100644 --- a/src/milkwood_clj/utils.clj +++ b/src/milkwood_clj/utils.clj @@ -3,7 +3,16 @@ [clojure.set :as set]) (:gen-class)) -;;;; utilities - probably in the fullness of time a separate file +(defn rule-tree-depth + "Return the depth of this rule tree. + + rules: a rule tree. + NOTE: it is possible that there might be some short rules in a rule tree, so this + implementation is not entirely safe." + [rules] + (cond + (empty? rules) 0 + true (+ 1 (apply min (map (fn [key] (rule-tree-depth (rules key))) (keys rules)))))) (defn slide-window diff --git a/test/milkwood_clj/core_test.clj b/test/milkwood_clj/core_test.clj index 1c0e0d4..fb0aacc 100644 --- a/test/milkwood_clj/core_test.clj +++ b/test/milkwood_clj/core_test.clj @@ -2,6 +2,6 @@ (:require [clojure.test :refer :all] [milkwood-clj.core :refer :all])) -(deftest a-test - (testing "Nothing worth testing in core, yet." - (is (= 1 1)))) +;;(deftest a-test +;; (testing "Nothing worth testing in core, yet." +;; (is (= 1 1))))