Blogger scraper tidied up and documented.
This commit is contained in:
parent
cb801b193f
commit
80cc2e4335
6 changed files with 87 additions and 7 deletions
41
src/html_to_md/blogger_to_md.clj
Normal file
41
src/html_to_md/blogger_to_md.clj
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
(ns html-to-md.blogger-to-md
|
||||
(:require [clojure.string :as s]
|
||||
[html-to-md.html-to-md :refer [markdown-dispatcher markdown-header]]
|
||||
[html-to-md.transformer :refer [process]]
|
||||
[net.cgrand.enlive-html :as html]))
|
||||
|
||||
(defn blogger-scraper
  "Processor which scrapes the actual post content out of a blogger page.
  *NOTE:* This was written to scrape *my* blogger pages, yours may be
  different!

  `e` is the enlive node (or nodes) to search; `d` is the dispatcher handed
  on to `markdown-header` and `process`.

  Returns the string formed by concatenating `(markdown-header title d 1)`
  with the results of processing the post body, or `nil` when either the
  `h3.post-title` element or the `div.post-body` element is not found."
  [e d]
  (let [title (first (html/select e [:h3.post-title]))
        content (html/select e [:div.post-body])]
    ;; `html/select` yields a sequence, and an empty sequence is truthy in
    ;; Clojure, so `seq` is needed to turn 'nothing selected' into nil.
    (when (and title (seq content))
      (apply str
             (cons (markdown-header title d 1)
                   (process content d))))))
|
||||
|
||||
(defn image-table-processor
  "Blogger's horrible tag soup wraps images in tables. Is this table such
  a table? If so extract the image from it and process it to markdown;
  otherwise, fall back on what `markdown-dispatcher` would do with the
  table (which is currently nothing, but that will change).

  `e` is the table element; `d` is the dispatcher used to process the
  caption cell (`td.tr-caption`), whose trimmed text becomes the image's
  alt text.

  Returns a Markdown image reference `![alt](src)` when the table contains
  an `img` element, otherwise the result of processing `e` with
  `markdown-dispatcher`."
  [e d]
  (let [caption (process (first (html/select e [:td.tr-caption])) d)
        alt (when caption (s/trim (apply str caption)))
        image (first (html/select e [:img]))
        src (when image (-> image :attrs :src))]
    (if image
      ;; Emit the Markdown image syntax; a missing caption simply leaves the
      ;; alt text empty because `str` renders nil as the empty string.
      (str "![" alt "](" src ")")
      ;; NOTE(review): the fallback uses `markdown-dispatcher` rather than
      ;; `d`, presumably so that a dispatcher which maps `:table` to this
      ;; function does not re-enter it -- confirm against `process`.
      (process e markdown-dispatcher))))
|
||||
|
||||
|
||||
(def blogger-dispatcher
  "Adaptation of `markdown-dispatcher`, q.v., with the `:html` and `:table`
  dispatches overridden: `:html` is handled by `blogger-scraper` and
  `:table` by `image-table-processor`. All other entries are inherited
  unchanged from `markdown-dispatcher`."
  (assoc markdown-dispatcher
         :html blogger-scraper
         :table image-table-processor))
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
(ns html-to-md.core
|
||||
(:require [html-to-md.transformer :refer [transform process]]
|
||||
[html-to-md.html-to-md :refer [markdown-dispatcher]]))
|
||||
[html-to-md.html-to-md :refer [markdown-dispatcher]]
|
||||
[html-to-md.blogger-to-md :refer [blogger-dispatcher]]))
|
||||
|
||||
(defn html-to-md
|
||||
"Transform the HTML document referenced by `url` into Markdown, and write
|
||||
|
|
@ -9,3 +10,12 @@
|
|||
(apply str (transform url markdown-dispatcher)))
|
||||
([url output]
|
||||
(spit output (html-to-md url))))
|
||||
|
||||
(defn blogger-to-md
  "Convert the Blogger post found at `url` to Markdown using
  `blogger-dispatcher`. With one argument the Markdown is returned as a
  string; with a second argument, `output`, the Markdown is written there
  with `spit` instead. *NOTE:* This was written to scrape *my* blogger
  pages, yours may be different!"
  ([url]
   (->> (transform url blogger-dispatcher)
        (apply str)))
  ([url output]
   (->> (blogger-to-md url)
        (spit output))))
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@
|
|||
"Process the line-break element `e`, so beloved of tag-soupers, into
|
||||
markdown"
|
||||
[e d]
|
||||
"\n\n")
|
||||
"\n")
|
||||
|
||||
(defn markdown-code
|
||||
"Process the code or samp `e` into markdown, using dispatcher `d`."
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue