Basically finished. Not capitalising initial words of sentences; not
printing output to an output file even if one is specified; not honouring the help flag.
This commit is contained in:
parent
5141419efa
commit
fffaf0cca0
|
@ -3,7 +3,8 @@
|
||||||
:url "http://example.com/FIXME"
|
:url "http://example.com/FIXME"
|
||||||
:license {:name "Eclipse Public License"
|
:license {:name "Eclipse Public License"
|
||||||
:url "http://www.eclipse.org/legal/epl-v10.html"}
|
:url "http://www.eclipse.org/legal/epl-v10.html"}
|
||||||
:dependencies [[org.clojure/clojure "1.5.1"]]
|
:dependencies [[org.clojure/clojure "1.5.1"]
|
||||||
|
[org.clojure/tools.cli "0.2.4"]]
|
||||||
:main milkwood-clj.core
|
:main milkwood-clj.core
|
||||||
:jvm-opts ["-Xss4m"]
|
:jvm-opts ["-Xss4m"]
|
||||||
:profiles {:uberjar {:aot :all}})
|
:profiles {:uberjar {:aot :all}})
|
||||||
|
|
|
@ -14,7 +14,12 @@
|
||||||
true (hash-map (first path) (compose-rule (rest path)))))
|
true (hash-map (first path) (compose-rule (rest path)))))
|
||||||
|
|
||||||
|
|
||||||
(defn merge-rules [these those]
|
(defn merge-rules
|
||||||
|
"Merge two rule trees.
|
||||||
|
|
||||||
|
these: a rule tree;
|
||||||
|
those: a rule tree."
|
||||||
|
[these those]
|
||||||
(utils/deep-merge-with set/union these those))
|
(utils/deep-merge-with set/union these those))
|
||||||
|
|
||||||
(defn add-rule
|
(defn add-rule
|
||||||
|
@ -51,5 +56,4 @@
|
||||||
file: the path name of a file to read;
|
file: the path name of a file to read;
|
||||||
depth: the depth of rules/length of window we're considering"
|
depth: the depth of rules/length of window we're considering"
|
||||||
[file depth]
|
[file depth]
|
||||||
(analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+\'s|\w+|\p{Punct}" (slurp file))) depth))
|
(analyse-tokens nil nil (map (fn [string] (.toLowerCase string)) (re-seq #"\w+\'[st]|\w+|\p{Punct}" (slurp file))) depth))
|
||||||
|
|
||||||
|
|
|
@ -1,18 +1,33 @@
|
||||||
(ns milkwood-clj.core
|
(ns milkwood-clj.core
|
||||||
(require
|
(require
|
||||||
[milkwood-clj.analyse :as analyse]
|
[milkwood-clj.analyse :as analyse]
|
||||||
[milkwood-clj.synthesise :as sythesise]
|
[milkwood-clj.synthesise :as synthesise]
|
||||||
[clojure.set :as set])
|
[clojure.set :as set])
|
||||||
|
(:use [clojure.tools.cli :only [cli]])
|
||||||
(:gen-class))
|
(:gen-class))
|
||||||
|
|
||||||
|
|
||||||
(defn -main
|
(defn -main
|
||||||
"I don't do a whole lot ... yet."
|
"Parse command line arguments and kick off the process."
|
||||||
[& args]
|
[& args]
|
||||||
(println "Hello, World!"))
|
(let [[arguments _ banner] (cli args ["-f" "--file" "The path name of the file to analyse (string)"]
|
||||||
|
["-l" "--output-length"
|
||||||
|
"The length in tokens of the output to generate (integer)"
|
||||||
|
:parse-fn #(Integer. %)
|
||||||
|
:default 100]
|
||||||
|
["-h" "--help" "Print this text and exit" :flag true]
|
||||||
|
["-o" "--output" "The path name of the file to write to, if any (string)"]
|
||||||
|
["-t" "--tuple-length"
|
||||||
|
"The length of the sequences to analyse it into (integer)"
|
||||||
|
:parse-fn #(Integer. %)
|
||||||
|
:default 2]) file (arguments :file)]
|
||||||
|
(cond
|
||||||
|
(= file nil) (print banner)
|
||||||
|
(arguments :help) (print banner)
|
||||||
|
true (synthesise/write-output
|
||||||
|
(synthesise/compose-nonsense
|
||||||
|
(analyse/analyse-file file (arguments :tuple-length))
|
||||||
|
(arguments :output-length))))
|
||||||
|
(prn "\n")))
|
||||||
|
|
||||||
|
|
||||||
;;;; read side - also probably a separate file
|
|
||||||
|
|
||||||
|
|
||||||
;;;; write side
|
|
||||||
|
|
|
@ -4,6 +4,8 @@
|
||||||
(:gen-class))
|
(:gen-class))
|
||||||
|
|
||||||
|
|
||||||
|
(def end-magic-token "END")
|
||||||
|
|
||||||
(defn next-tokens
|
(defn next-tokens
|
||||||
"Given these rules and this path, return a list of valid next tokens to emit.
|
"Given these rules and this path, return a list of valid next tokens to emit.
|
||||||
|
|
||||||
|
@ -11,14 +13,87 @@
|
||||||
path: a flat sequence of tokens."
|
path: a flat sequence of tokens."
|
||||||
[rules path]
|
[rules path]
|
||||||
(cond
|
(cond
|
||||||
(empty? (rest path)) (shuffle (keys (rules (first path))))
|
(empty? (rest path)) (keys (rules (first path)))
|
||||||
(empty? (rules (first path))) nil
|
(empty? (rules (first path))) nil
|
||||||
true (next-tokens (rules (first path)) (rest path))))
|
true (next-tokens (rules (first path)) (rest path))))
|
||||||
|
|
||||||
(defn compose-nonsense
|
(defn compose-prologue
|
||||||
[window token depth length]
|
"Generate a prologue before entering the rule driven part of the synthesis
|
||||||
(cond
|
process. The prologue needs to be as long as the rule-tree depth, to allow
|
||||||
(= length 0) nil
|
it to be used as a glance back window.
|
||||||
)
|
TODO: Should really generate something which is a gramatically legal start
|
||||||
|
of an English sentence.
|
||||||
|
|
||||||
)
|
rules: a rule tree;
|
||||||
|
depth: the depth of that rule tree."
|
||||||
|
[rules depth]
|
||||||
|
(cond
|
||||||
|
(= depth 0) nil
|
||||||
|
true (let [token (first (shuffle (keys rules)))]
|
||||||
|
(cons token (compose-prologue (rules token) (- depth 1))))))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
(defn compose-nonsense
|
||||||
|
"Try to compose this much nonsense given these rules and this glance-back window.
|
||||||
|
This is a rather messy hack around Clojure's curiously awkward rules on mutually
|
||||||
|
recursive functions. The second and third arities of this function (4 args and
|
||||||
|
5 args respectively) are in effect mutually recursive functions. The 4 args form
|
||||||
|
extends the output, the five args form hunts sideways among the options. The first,
|
||||||
|
two arg arity is simply a convenient entry point.
|
||||||
|
|
||||||
|
rules: a rule tree;
|
||||||
|
window: a glance back at the last tokens emitted;
|
||||||
|
depth: the target size of the glance back window, which should be one less than
|
||||||
|
the depth of the rule tree;
|
||||||
|
length: the number of tokens of nonsense which should be generated;
|
||||||
|
options: candidates to be the next token"
|
||||||
|
([rules length]
|
||||||
|
(compose-nonsense rules nil (- (utils/rule-tree-depth rules) 1) length))
|
||||||
|
([rules window depth length]
|
||||||
|
(let [window-size (count window) options (next-tokens rules window)]
|
||||||
|
(cond
|
||||||
|
;; if no more length, we're done. We need to return something non-null to indicate we haven't failed.
|
||||||
|
(= length 0) (list end-magic-token)
|
||||||
|
;; if we've no window, we need to choose one and continue.
|
||||||
|
(= window nil)
|
||||||
|
(let [prologue (compose-prologue rules depth)]
|
||||||
|
(flatten (list prologue (compose-nonsense rules prologue depth (- length depth)))))
|
||||||
|
;; if we've a window but no options, we're stuffed.
|
||||||
|
(= options nil) nil
|
||||||
|
true (compose-nonsense rules window depth (- length 1) (shuffle options)))))
|
||||||
|
([rules window depth length options]
|
||||||
|
(cond
|
||||||
|
;; if I've run out of options, I've failed.
|
||||||
|
(empty? options) nil
|
||||||
|
;; if I still have some options...
|
||||||
|
true
|
||||||
|
;; try the first of them...
|
||||||
|
(let [nonsense (compose-nonsense rules (utils/slide-window window (first options) depth) depth length)]
|
||||||
|
(cond
|
||||||
|
;; if that fails, try the others...
|
||||||
|
(empty? nonsense) (compose-nonsense rules window depth length (rest options))
|
||||||
|
;; but if it succeeds we're good.
|
||||||
|
true (cons (first options) nonsense))
|
||||||
|
))))
|
||||||
|
|
||||||
|
(defn write-token
|
||||||
|
[token]
|
||||||
|
"Write a single token to the output, doing some basic orthographic tricks.
|
||||||
|
|
||||||
|
token: the token to write."
|
||||||
|
(cond
|
||||||
|
(= token end-magic-token) nil
|
||||||
|
(re-find #"^[.!?]$" token) (do (print token) (cond (= (rand-int 5) 0) (print "\n\n")))
|
||||||
|
(re-find #"^\p{Punct}$" token) (print token)
|
||||||
|
true (print (str " " token))))
|
||||||
|
|
||||||
|
(defn write-output
|
||||||
|
"Write this output, doing little orthographic tricks to make it look superficially
|
||||||
|
like real English text.
|
||||||
|
TODO: does not yet work. Should take an optional second argument,
|
||||||
|
the file to write to if any (default to standard out).
|
||||||
|
|
||||||
|
output: a sequence of tokens to write."
|
||||||
|
[output]
|
||||||
|
(dorun (map write-token output)))
|
||||||
|
|
|
@ -3,7 +3,16 @@
|
||||||
[clojure.set :as set])
|
[clojure.set :as set])
|
||||||
(:gen-class))
|
(:gen-class))
|
||||||
|
|
||||||
;;;; utilities - probably in the fullness of time a separate file
|
(defn rule-tree-depth
|
||||||
|
"Return the depth of this rule tree.
|
||||||
|
|
||||||
|
rules: a rule tree.
|
||||||
|
NOTE: it is possible that there might be some short rules in a rule tree, so this
|
||||||
|
implementation is not entirely safe."
|
||||||
|
[rules]
|
||||||
|
(cond
|
||||||
|
(empty? rules) 0
|
||||||
|
true (+ 1 (apply min (map (fn [key] (rule-tree-depth (rules key))) (keys rules))))))
|
||||||
|
|
||||||
|
|
||||||
(defn slide-window
|
(defn slide-window
|
||||||
|
|
|
@ -2,6 +2,6 @@
|
||||||
(:require [clojure.test :refer :all]
|
(:require [clojure.test :refer :all]
|
||||||
[milkwood-clj.core :refer :all]))
|
[milkwood-clj.core :refer :all]))
|
||||||
|
|
||||||
(deftest a-test
|
;;(deftest a-test
|
||||||
(testing "Nothing worth testing in core, yet."
|
;; (testing "Nothing worth testing in core, yet."
|
||||||
(is (= 1 1))))
|
;; (is (= 1 1))))
|
||||||
|
|
Loading…
Reference in a new issue