Blogger scraper tidied up and documented.

This commit is contained in:
Simon Brooke 2019-04-30 20:11:44 +01:00
parent cb801b193f
commit 80cc2e4335
6 changed files with 87 additions and 7 deletions

View file

@ -0,0 +1,41 @@
(ns html-to-md.blogger-to-md
(:require [clojure.string :as s]
[html-to-md.html-to-md :refer [markdown-dispatcher markdown-header]]
[html-to-md.transformer :refer [process]]
[net.cgrand.enlive-html :as html]))
(defn blogger-scraper
  "Processor which scrapes the actual post content out of a blogger page.

  Selects the first `h3.post-title` element within `e` as the title and
  every `div.post-body` element within `e` as the content. If both are
  present, returns a single string made of the title rendered by
  `markdown-header` (called with dispatcher `d` and `1`) followed by the
  result of processing the content with `d`. Returns `nil` if either the
  title or the post body is missing.

  *NOTE:* This was written to scrape *my* blogger pages, yours may be
  different!"
  [e d]
  (let [title (first (html/select e [:h3.post-title]))
        content (html/select e [:div.post-body])]
    ;; `html/select` yields a (possibly empty) sequence rather than `nil`,
    ;; and an empty sequence is truthy in Clojure, so test `(seq content)`
    ;; to really detect a page with no post body.
    (when (and title (seq content))
      (apply
        str
        (cons
          (markdown-header title d 1)
          (process content d))))))
(defn image-table-processor
  "Blogger's tag soup wraps images in tables; this processor handles a
  table element `e` on that assumption.

  The first `td.tr-caption` cell found in `e` is processed with dispatcher
  `d`; if that yields anything, it is concatenated and trimmed to become
  the image's alt text. If `e` contains an `img` element, the result is a
  markdown image link built from that alt text and the `src` attribute of
  the first `img`. Otherwise `e` is processed with `markdown-dispatcher`,
  i.e. whatever the stock dispatcher would do with the table."
  [e d]
  (let [caption-md (-> e
                       (html/select [:td.tr-caption])
                       first
                       (process d))
        alt-text (when caption-md
                   (s/trim (apply str caption-md)))
        img (first (html/select e [:img]))
        src (get-in img [:attrs :src])]
    (if-not img
      ;; No image: fall back on the stock dispatcher rather than `d`.
      ;; NOTE(review): presumably this avoids dispatching `:table` straight
      ;; back to this function - confirm against `process`.
      (process e markdown-dispatcher)
      (str "![image: " alt-text "](" src ")"))))
(def blogger-dispatcher
  "Adaptation of `markdown-dispatcher`, q.v., in which the `:html` entry is
  replaced by `blogger-scraper` and the `:table` entry by
  `image-table-processor`; every other entry is inherited unchanged."
  (-> markdown-dispatcher
      (assoc :html blogger-scraper)
      (assoc :table image-table-processor)))

View file

@ -1,6 +1,7 @@
(ns html-to-md.core
(:require [html-to-md.transformer :refer [transform process]]
[html-to-md.html-to-md :refer [markdown-dispatcher]]))
[html-to-md.html-to-md :refer [markdown-dispatcher]]
[html-to-md.blogger-to-md :refer [blogger-dispatcher]]))
(defn html-to-md
"Transform the HTML document referenced by `url` into Markdown, and write
@ -9,3 +10,12 @@
(apply str (transform url markdown-dispatcher)))
([url output]
(spit output (html-to-md url))))
(defn blogger-to-md
  "Convert the Blogger post referenced by `url` into Markdown using
  `blogger-dispatcher`.

  With one argument, returns the Markdown as a single string. With two
  arguments, writes that string to `output` using `spit`.

  *NOTE:* This was written to scrape *my* blogger pages, yours may be
  different!"
  ([url]
   (->> (transform url blogger-dispatcher)
        (apply str)))
  ([url output]
   (->> url
        blogger-to-md
        (spit output))))

View file

@ -18,7 +18,7 @@
"Process the line-break element `e`, so beloved of tag-soupers, into
markdown"
[e d]
"\n\n")
"\n")
(defn markdown-code
"Process the code or samp `e` into markdown, using dispatcher `d`."