From 80cc2e4335c17da39ada23eec8ecc587a0787057 Mon Sep 17 00:00:00 2001
From: Simon Brooke <simon@journeyman.cc>
Date: Tue, 30 Apr 2019 20:11:44 +0100
Subject: [PATCH] Blogger scraper tidied up and documented.

---
 .gitignore                       |  2 ++
 README.md                        | 18 +++++++++++++-
 doc/intro.md                     | 19 +++++++++++----
 src/html_to_md/blogger_to_md.clj | 41 ++++++++++++++++++++++++++++++++
 src/html_to_md/core.clj          | 12 +++++++++-
 src/html_to_md/html_to_md.clj    |  2 +-
 6 files changed, 87 insertions(+), 7 deletions(-)
 create mode 100644 src/html_to_md/blogger_to_md.clj

diff --git a/.gitignore b/.gitignore
index 1094326..9c58742 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,5 @@ pom.xml.asc
 .hgignore
 .hg/
 *~
+
+test\.md
diff --git a/README.md b/README.md
index f025d14..7807864 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,7 @@
 # html-to-md
 
-A Clojure library designed to convert (Enlivened) HTML to markdown; but, more
+A Clojure library designed to convert
+([Enlive](https://github.com/cgrand/enlive)ned) HTML to markdown; but, more
 generally, a framework for [HT|SG|X]ML transformation.
 
 ## Introduction
@@ -46,6 +47,21 @@ This will read (X)HTML from `url` and write Markdown to `output-file`. If
 (def md (html-to-md url))
 ```
 
+If you are specifically scraping [blogger.com](https://www.blogger.com/)
+pages, you may *try* the following recipe:
+
+```clojure
+(require '[html-to-md.core :refer [blogger-to-md]])
+
+(blogger-to-md url output-file)
+```
+
+It works for my blogger pages. However, I'm not sure to what extent the
+skinning of blogger pages is pure CSS (in which case my recipe should work
+for yours) and to what extent it's HTML templating (in which case it
+probably won't). Results are not guaranteed; if it doesn't work you get to
+keep all the pieces.
+
 ## Extending the transformer
 
 In principle, the transformer can transform any [HT|SG|X]ML markup into any
diff --git a/doc/intro.md b/doc/intro.md
index 43afb64..05d012c 100644
--- a/doc/intro.md
+++ b/doc/intro.md
@@ -1,9 +1,5 @@
 # Introduction to html-to-md
 
-TODO: write [great documentation](http://jacobian.org/writing/what-to-write/)
-
-## Introduction
-
 The itch I'm trying to scratch at present is to transform
 [Blogger.com](http://www.blogger.com)'s dreadful tag-soup markup into markdown;
 but my architecture for doing this is to build a completely general [HT|SG|X]ML
@@ -45,6 +41,21 @@ This will read (X)HTML from `url` and write Markdown to `output-file`. If
 (def md (html-to-md url))
 ```
 
+If you are specifically scraping [blogger.com](https://www.blogger.com/)
+pages, you may *try* the following recipe:
+
+```clojure
+(require '[html-to-md.core :refer [blogger-to-md]])
+
+(blogger-to-md url output-file)
+```
+
+It works for my blogger pages. However, I'm not sure to what extent the
+skinning of blogger pages is pure CSS (in which case my recipe should work
+for yours) and to what extent it's HTML templating (in which case it
+probably won't). Results are not guaranteed; if it doesn't work you get to
+keep all the pieces.
+
 ## Extending the transformer
 
 In principle, the transformer can transform any [HT|SG|X]ML markup into any
diff --git a/src/html_to_md/blogger_to_md.clj b/src/html_to_md/blogger_to_md.clj
new file mode 100644
index 0000000..2d0c236
--- /dev/null
+++ b/src/html_to_md/blogger_to_md.clj
@@ -0,0 +1,41 @@
+(ns html-to-md.blogger-to-md
+    (:require [clojure.string :as s]
+              [html-to-md.html-to-md :refer [markdown-dispatcher markdown-header]]
+              [html-to-md.transformer :refer [process]]
+              [net.cgrand.enlive-html :as html]))
+
+(defn blogger-scraper
+    "Processor which scrapes the actual post content out of a blogger page:
+    the first `h3.post-title` via `markdown-header`, then all `div.post-body`.
+    Returns `nil` unless both are found. *NOTE:* written for *my* pages!"
+    [e d]
+    (let [title (first (html/select e [:h3.post-title]))
+          ;; `seq` because an empty selection is still truthy in Clojure
+          content (seq (html/select e [:div.post-body]))]
+        (when (and title content)
+            (apply
+                str
+                (cons (markdown-header title d 1)
+                      (process content d))))))
+
+(defn image-table-processor
+    "Handle a `table` element `e`. If `e` contains an `img`, emit a markdown
+    image whose source is that img's `src` attribute and whose alt text is
+    the trimmed, processed content of the first `td.tr-caption` cell;
+    otherwise process `e` with the plain `markdown-dispatcher`."
+    [e d]
+    (let [caption-md (process (first (html/select e [:td.tr-caption])) d)
+          alt-text (when caption-md (s/trim (apply str caption-md)))
+          img (first (html/select e [:img]))
+          url (some-> img :attrs :src)]
+        (if-not img
+            (process e markdown-dispatcher)
+            (str "![image: " alt-text "](" url ")"))))
+
+
+(def blogger-dispatcher
+    "Adaptation of `markdown-dispatcher`, q.v., with the `:html` and `:table`
+    dispatches overridden by `blogger-scraper` and `image-table-processor`."
+    (assoc markdown-dispatcher
+        :html blogger-scraper
+        :table image-table-processor))
diff --git a/src/html_to_md/core.clj b/src/html_to_md/core.clj
index ede8350..6434218 100644
--- a/src/html_to_md/core.clj
+++ b/src/html_to_md/core.clj
@@ -1,6 +1,7 @@
 (ns html-to-md.core
     (:require [html-to-md.transformer :refer [transform process]]
-              [html-to-md.html-to-md :refer [markdown-dispatcher]]))
+              [html-to-md.html-to-md :refer [markdown-dispatcher]]
+              [html-to-md.blogger-to-md :refer [blogger-dispatcher]]))
 
 (defn html-to-md
     "Transform the HTML document referenced by `url` into Markdown, and write
@@ -9,3 +10,12 @@
      (apply str (transform url markdown-dispatcher)))
     ([url output]
      (spit output (html-to-md url))))
+
+(defn blogger-to-md
+    "Convert the Blogger post at `url` to Markdown using `blogger-dispatcher`.
+    With one argument, return the Markdown as a string; with `output` as
+    well, `spit` that string to `output`. Tuned to the author's own pages."
+    ([url]
+     (->> (transform url blogger-dispatcher) (apply str)))
+    ([url output]
+     (->> url blogger-to-md (spit output))))
diff --git a/src/html_to_md/html_to_md.clj b/src/html_to_md/html_to_md.clj
index c4d6ea7..27d3650 100644
--- a/src/html_to_md/html_to_md.clj
+++ b/src/html_to_md/html_to_md.clj
@@ -18,7 +18,7 @@
     "Process the line-break element `e`, so beloved of tag-soupers, into
     markdown"
     [e d]
-    "\n\n")
+    "\n")
 
 (defn markdown-code
     "Process the code or samp `e` into markdown, using dispatcher `d`."