diff --git a/.gitignore b/.gitignore index 1094326..9c58742 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ pom.xml.asc .hgignore .hg/ *~ + +test\.md diff --git a/README.md b/README.md index f025d14..7807864 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # html-to-md -A Clojure library designed to convert (Enlivened) HTML to markdown; but, more +A Clojure library designed to convert +([Enlive](https://github.com/cgrand/enlive)ned) HTML to markdown; but, more generally, a framework for [HT|SG|X]ML transformation. ## Introduction @@ -46,6 +47,21 @@ This will read (X)HTML from `url` and write Markdown to `output-file`. If (def md (html-to-md url)) ``` +If you are specifically scraping [blogger.com](https://www.blogger.com/) +pages, you may *try* the following recipe: + +```clojure +(require '[html-to-md.core :refer [blogger-to-md]]) + +(blogger-to-md url output-file) +``` + +It works for my blogger pages. However, I'm not sure to what extent the +skinning of blogger pages is pure CSS (in which case my recipe should work +for yours) and to what extent it's HTML templating (in which case it +probably won't). Results not guaranteed, if it doesn't work you get to +keep all the pieces. + ## Extending the transformer In principle, the transformer can transform any [HT|SG|X]ML markup into any diff --git a/doc/intro.md b/doc/intro.md index 43afb64..05d012c 100644 --- a/doc/intro.md +++ b/doc/intro.md @@ -1,9 +1,5 @@ # Introduction to html-to-md -TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) - -## Introduction - The itch I'm trying to scratch at present is to transform [Blogger.com](http://www.blogger.com)'s dreadful tag-soup markup into markdown; but my architecture for doing this is to build a completely general [HT|SG|X]ML @@ -45,6 +41,21 @@ This will read (X)HTML from `url` and write Markdown to `output-file`.
If (def md (html-to-md url)) ``` +If you are specifically scraping [blogger.com](https://www.blogger.com/) +pages, you may *try* the following recipe: + +```clojure +(require '[html-to-md.core :refer [blogger-to-md]]) + +(blogger-to-md url output-file) +``` + +It works for my blogger pages. However, I'm not sure to what extent the +skinning of blogger pages is pure CSS (in which case my recipe should work +for yours) and to what extent it's HTML templating (in which case it +probably won't). Results not guaranteed, if it doesn't work you get to +keep all the pieces. + ## Extending the transformer In principle, the transformer can transform any [HT|SG|X]ML markup into any diff --git a/src/html_to_md/blogger_to_md.clj b/src/html_to_md/blogger_to_md.clj new file mode 100644 index 0000000..2d0c236 --- /dev/null +++ b/src/html_to_md/blogger_to_md.clj @@ -0,0 +1,41 @@ +(ns html-to-md.blogger-to-md + (:require [clojure.string :as s] + [html-to-md.html-to-md :refer [markdown-dispatcher markdown-header]] + [html-to-md.transformer :refer [process]] + [net.cgrand.enlive-html :as html])) + +(defn blogger-scraper + "Processor which scrapes the actual post content out of a blogger page. + *NOTE:* This was written to scrape *my* blogger pages, yours may be + different!" + [e d] + (let [title (first (html/select e [:h3.post-title])) + content (html/select e [:div.post-body])] + (if (and title content) + (apply + str + (cons + (markdown-header title d 1) + (process content d)))))) + +(defn image-table-processor + "Blogger's horrible tag soup wraps images in tables. Is this table such + a table? If so extract the image from it and process it to markdown; + otherwise, fall back on what `markdown-dispatcher` would do with the + table (which is currently nothing, but that will change)."
+ [e d] + (let [caption (process (first (html/select e [:td.tr-caption])) d) + alt (if caption (s/trim (apply str caption))) + image (first (html/select e [:img])) + src (if image (-> image :attrs :src))] + (if image + (str "![image: " alt "](" src ")") + (process e markdown-dispatcher)))) + + +(def blogger-dispatcher + "Adaptation of `markdown-dispatcher`, q.v., with the `:table` and + `:html` dispatches overridden." + (assoc markdown-dispatcher + :html blogger-scraper + :table image-table-processor)) diff --git a/src/html_to_md/core.clj b/src/html_to_md/core.clj index ede8350..6434218 100644 --- a/src/html_to_md/core.clj +++ b/src/html_to_md/core.clj @@ -1,6 +1,7 @@ (ns html-to-md.core (:require [html-to-md.transformer :refer [transform process]] - [html-to-md.html-to-md :refer [markdown-dispatcher]])) + [html-to-md.html-to-md :refer [markdown-dispatcher]] + [html-to-md.blogger-to-md :refer [blogger-dispatcher]])) (defn html-to-md "Transform the HTML document referenced by `url` into Markdown, and write @@ -9,3 +10,12 @@ (apply str (transform url markdown-dispatcher))) ([url output] (spit output (html-to-md url)))) + +(defn blogger-to-md + "Transform the Blogger post referenced by `url` into Markdown, and write + it to `output`, if supplied. *NOTE:* This was written to scrape *my* + blogger pages, yours may be different!" + ([url] + (apply str (transform url blogger-dispatcher))) + ([url output] + (spit output (blogger-to-md url)))) diff --git a/src/html_to_md/html_to_md.clj b/src/html_to_md/html_to_md.clj index c4d6ea7..27d3650 100644 --- a/src/html_to_md/html_to_md.clj +++ b/src/html_to_md/html_to_md.clj @@ -18,7 +18,7 @@ "Process the line-break element `e`, so beloved of tag-soupers, into markdown" [e d] - "\n\n") + "\n") (defn markdown-code "Process the code or samp `e` into markdown, using dispatcher `d`."