Yet more polishing and primping. Added function top-and-tail, which
seeks to ensure that, if possible, the output starts at the beginning
of a sentence and ends at the end of one.
This commit is contained in:
Simon Brooke 2013-11-09 15:04:22 +00:00
parent 953259b379
commit fd36f8e1ca
3 changed files with 83 additions and 9 deletions

View file

@ -4,8 +4,10 @@
[clojure.set :as set]) [clojure.set :as set])
(:gen-class)) (:gen-class))
(def token-pattern (def ^:const token-pattern
"Regular expression used to split input into tokens." "Regular expression used to split input into tokens.
TODO: note that backslash-w captures underscores as word characters.
Probably better to use [a-zA-Z]*."
;; #"\w+\'[stdm]|\w+|\p{Punct}" ;; #"\w+\'[stdm]|\w+|\p{Punct}"
#"\w+['-]\w+|\w+|\p{Punct}" #"\w+['-]\w+|\w+|\p{Punct}"
) )

View file

@ -3,12 +3,30 @@
[milkwood-clj.utils :as utils]) [milkwood-clj.utils :as utils])
(:gen-class)) (:gen-class))
(def ^{:const true
       :doc "Mean paragraph length, expressed as a number of sentences."}
  av-sentences-per-para
  5)
(def end-magic-token (def ^:const end-magic-token
"A token to mark the end of the generated test, used to "A token to mark the end of the generated test, used to
distinguish completion from failure." distinguish completion from failure."
"ENDMAGICTOKEN") "ENDMAGICTOKEN")
(def ^{:const true
       :doc "Regular expression matching a token which consists solely of one
  sentence-terminating character: full stop, exclamation mark or
  question mark."}
  end-of-sentence-pattern
  #"^[.!?]$")
(def ^{:const true
       :doc "Regular expression matching a token which consists solely of one
  punctuation character (the \\p{Punct} character class)."}
  punctuation-pattern
  #"^\p{Punct}$")
(defn end-of-sentence?
  "Test whether this token marks the end of a sentence.
  token: the token (a string) to test.
  Returns the matched text (truthy) if the token matches
  end-of-sentence-pattern, otherwise nil. Note that the result is not
  coerced to a boolean."
  [token]
  (->> token
       (re-find end-of-sentence-pattern)))
(defn punctuation?
  "Test whether this token is a single punctuation character.
  token: the token (a string) to test.
  Returns the matched text (truthy) if the token matches
  punctuation-pattern, otherwise nil. Note that the result is not
  coerced to a boolean."
  [token]
  (->> token
       (re-find punctuation-pattern)))
(defn next-tokens (defn next-tokens
"Given these rules and this path, return a list of valid next tokens to emit. "Given these rules and this path, return a list of valid next tokens to emit.
@ -80,16 +98,60 @@
true (cons (first options) nonsense)) true (cons (first options) nonsense))
)))) ))))
(defn top-and-tail
  "Top and tail this sequence of tokens so that it starts at the beginning of a sentence
  and ends at the end of one.

  Everything up to and including the first end-of-sentence token is discarded
  ('topping'); what remains is then returned up to and including its last
  end-of-sentence token ('tailing').

  Edge cases:
  * empty output returns nil;
  * output containing no end-of-sentence token at all is returned unchanged;
  * if no end-of-sentence token remains after topping, whatever follows the
    first end-of-sentence token is returned as it stands (nil if nothing follows).

  output: a flat sequence of tokens;
  topped?: true if the leading sentence fragment has already been discarded;
  end-in-sight?: true if an end-of-sentence token is known to remain in output.

  Callers should normally use the single-argument form; the three-argument form
  carries state through the recursion."
  ([output]
    (top-and-tail output false (boolean (some end-of-sentence? output))))
  ([output topped? end-in-sight?]
    (cond
      ;; if there is no output, we're done.
      (empty? output) nil
      ;; if there are no end-of-sentence markers left in the output, return
      ;; the output and we're done.
      (not end-in-sight?) output
      ;; if we've topped the output, we're copying tokens through...
      topped?
      (if (end-of-sentence? (first output))
        ;; ...and at each end-of-sentence marker we check whether there's another to come.
        (let [another? (boolean (some end-of-sentence? (rest output)))]
          (if another?
            ;; there is another end-of-sentence yet to find, so continue.
            (cons (first output) (top-and-tail (rest output) topped? another?))
            ;; otherwise this marker is the tail, and we're done.
            (list (first output))))
        ;; any other token is just copied through.
        (cons (first output) (top-and-tail (rest output) topped? end-in-sight?)))
      ;; not yet topped, and the first thing in the output is an end-of-sentence
      ;; marker: we've 'topped' and want the rest. Tail call, so recur.
      (end-of-sentence? (first output))
      (recur (rest output) true (boolean (some end-of-sentence? (rest output))))
      ;; else discard the head and continue, carrying the flags through
      ;; rather than rescanning the whole remainder. Tail call, so recur.
      :else
      (recur (rest output) topped? end-in-sight?))))
(defn write-token (defn write-token
[token] [token]
"Write a single token to the output, doing some basic orthographic tricks. "Write a single token to the output, performing some basic orthographic tricks.
token: the token to write." token: the token to write."
(cond (cond
(= token end-magic-token) nil (= token end-magic-token)
(re-find #"^[.!?]$" token) (do (print token) (cond (= (rand-int 5) 0) (print "\n\n"))) ;; suppress the end magic token.
(re-find #"^\p{Punct}$" token) (print token) nil
true (print (str " " token)))) (end-of-sentence? token)
;; end of sentence: suppress leading space and possibly terminate paragraph.
(do (print token)
(cond
(= (rand-int av-sentences-per-para) 0) (print "\n\n")))
(punctuation? token)
;; other punctuation: suppress leading whitespace.
(print token)
true
;; everything else, print leading space and token.
(print (str " " token))))
(defn write-output (defn write-output
"Write this output, doing little orthographic tricks to make it look superficially "Write this output, doing little orthographic tricks to make it look superficially
@ -99,4 +161,5 @@
output: a sequence of tokens to write." output: a sequence of tokens to write."
[output] [output]
(dorun (map write-token output))) (dorun (map write-token (top-and-tail output)))
(print "\n\n"))

View file

@ -0,0 +1,9 @@
(ns milkwood-clj.synthesise-test
(:require [clojure.test :refer :all]
[milkwood-clj.synthesise :refer :all]))
(deftest top-and-tail-test
  ;; Checks top-and-tail against two token lists: one which already ends on an
  ;; end-of-sentence token, and one which ends with a trailing fragment.
  (testing "Test top and tailing of output"
    (let [ends-cleanly '("a" "b" "c" "?" "d" "e" "f" "." "g" "h" "i" "!")
          ends-ragged  '("a" "b" "c" "?" "d" "e" "f" "." "g" "h" "i")]
      ;; the leading fragment up to and including "?" is dropped.
      (is (= (top-and-tail ends-cleanly)
             '("d" "e" "f" "." "g" "h" "i" "!")))
      ;; the trailing fragment after the last "." is dropped as well.
      (is (= (top-and-tail ends-ragged)
             '("d" "e" "f" "."))))))