From fd36f8e1ca4d48977ec66da49c027df1e2902245 Mon Sep 17 00:00:00 2001
From: Simon Brooke <simon@journeyman.cc>
Date: Sat, 9 Nov 2013 15:04:22 +0000
Subject: [PATCH] Yet more polishing and primping. Added function top-and-tail,
 which seeks to ensure that if possible the output starts at the beginning of
 a sentence and ends at the end of one.

---
 src/milkwood_clj/analyse.clj          |  6 ++-
 src/milkwood_clj/synthesise.clj       | 77 ++++++++++++++++++++++++---
 test/milkwood_clj/synthesise_test.clj |  9 ++++
 3 files changed, 83 insertions(+), 9 deletions(-)
 create mode 100644 test/milkwood_clj/synthesise_test.clj

diff --git a/src/milkwood_clj/analyse.clj b/src/milkwood_clj/analyse.clj
index 41f7e02..168c2be 100644
--- a/src/milkwood_clj/analyse.clj
+++ b/src/milkwood_clj/analyse.clj
@@ -4,8 +4,10 @@
    [clojure.set :as set])
   (:gen-class))
 
-(def token-pattern
-  "Regular expression used to split input into tokens."
+(def ^:const token-pattern
+  "Regular expression used to split input into tokens.
+   TODO: note that backslash-w captures underscores as word characters.
+   Probably better to use [a-zA-Z]*."
 ;;  #"\w+\'[stdm]|\w+|\p{Punct}"
     #"\w+['-]\w+|\w+|\p{Punct}"
   )
diff --git a/src/milkwood_clj/synthesise.clj b/src/milkwood_clj/synthesise.clj
index cb25510..9aa8ea8 100644
--- a/src/milkwood_clj/synthesise.clj
+++ b/src/milkwood_clj/synthesise.clj
@@ -3,12 +3,30 @@
    [milkwood-clj.utils :as utils])
   (:gen-class))
 
+(def ^:const av-sentences-per-para
+     "Average number of sentences in a paragraph"
+     5)
 
-(def end-magic-token
+(def ^:const end-magic-token
   "A token to mark the end of the generated test, used to
   distinguish completion from failure."
   "ENDMAGICTOKEN")
 
+(def ^:const end-of-sentence-pattern
+  "Pattern which matches end of sentence tokens."
+  #"^[.!?]$")
+
+(def ^:const punctuation-pattern
+  "Pattern which matches punctuation."
+  #"^\p{Punct}$")
+
+
+(defn end-of-sentence? [token]
+  (re-find end-of-sentence-pattern token))
+
+(defn punctuation? [token]
+  (re-find punctuation-pattern token))
+
 (defn next-tokens
   "Given these rules and this path, return a list of valid next tokens to emit.
 
@@ -80,16 +98,60 @@
           true (cons (first options) nonsense))
          ))))
 
+(defn top-and-tail
+  "Top and tail this sequence of tokens so that it starts at the beginning of a sentence
+   and ends at the end of one.
+
+   output: a flat sequence of tokens"
+   ([output]
+    (top-and-tail output false (not (empty? (remove nil? (map end-of-sentence? output))))))
+   ([output topped? end-in-sight?]
+    (cond
+     ;; if there is no output, we're done.
+     (empty? output) nil
+     ;; if there are no end-of-sentence markers in the output, return the output and we're done.
+     (not end-in-sight?) output
+     ;; if we've topped the output...
+     topped?
+     (cond
+       ;; if the first thing in the output is an end-of-sentence marker, continue, checking whether there's another.
+      (end-of-sentence? (first output))
+      (let [another? (not (empty? (remove nil? (map end-of-sentence? (rest output)))))]
+        (cond
+         ;; if there is another end-of-sentence yet to find, continue.
+         another? (cons (first output) (top-and-tail (rest output) topped? another?))
+         ;; otherwise, we're done.
+         true (list (first output))))
+      ;; otherwise just continue.
+      true (cons (first output) (top-and-tail (rest output) topped? end-in-sight?)))
+     ;; if the first thing in the output is an end-of-sentence marker, we've 'topped' and want the rest.
+     (end-of-sentence? (first output))
+     (top-and-tail (rest output) true (not (empty? (remove nil? (map end-of-sentence? (rest output))))))
+     ;; else discard the head and continue
+     true
+     (top-and-tail (rest output)) topped? end-in-sight?)))
+
+
 (defn write-token
   [token]
-  "Write a single token to the output, doing some basic orthographic tricks.
+  "Write a single token to the output, performing some basic orthographic tricks.
 
    token: the token to write."
   (cond
-   (= token end-magic-token) nil
-   (re-find #"^[.!?]$" token) (do (print token) (cond (= (rand-int 5) 0) (print "\n\n")))
-   (re-find #"^\p{Punct}$" token) (print token)
-   true (print (str " " token))))
+   (= token end-magic-token)
+   ;; suppress the end magic token.
+   nil
+   (end-of-sentence? token)
+   ;; end of sentence: suppress leading space and possibly terminate paragraph.
+   (do (print token)
+     (cond
+      (= (rand-int av-sentences-per-para) 0) (print "\n\n")))
+   (punctuation? token)
+   ;; other punctuation: suppress leading whitespace.
+   (print token)
+   true
+   ;; everything else, print leading space and token.
+   (print (str " " token))))
 
 (defn write-output
   "Write this output, doing little orthographic tricks to make it look superficially
@@ -99,4 +161,5 @@
 
    output: a sequence of tokens to write."
   [output]
-  (dorun (map write-token output)))
+  (dorun (map write-token (top-and-tail output)))
+  (print "\n\n"))
diff --git a/test/milkwood_clj/synthesise_test.clj b/test/milkwood_clj/synthesise_test.clj
new file mode 100644
index 0000000..71b161a
--- /dev/null
+++ b/test/milkwood_clj/synthesise_test.clj
@@ -0,0 +1,9 @@
+(ns milkwood-clj.synthesise-test
+  (:require [clojure.test :refer :all]
+            [milkwood-clj.synthesise :refer :all]))
+
+(deftest top-and-tail-test
+  (testing "Test top and tailing of output"
+    (is (= (top-and-tail '("a" "b" "c" "?" "d" "e" "f" "." "g" "h" "i" "!")) '("d" "e" "f" "." "g" "h" "i" "!")))
+    (is (= (top-and-tail '("a" "b" "c" "?" "d" "e" "f" "." "g" "h" "i")) '("d" "e" "f" ".")))
+    ))