From c9f7f29f0ff9a12a6cb2d18c5f5d22248cb94d3b Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Tue, 30 Apr 2019 16:44:28 +0100 Subject: [PATCH 1/6] Upversioned to 0.2.0-SNAPSHOT --- project.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/project.clj b/project.clj index fb5716d..654fd11 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject html-to-md "0.1.0" +(defproject html-to-md "0.2.0-SNAPSHOT" :description "Convert (Enlivened) HTML to markdown; but, more generally, a framework for [HT|SG|X]ML transformation." :url "https://github.com/simon-brooke/html-to-md" :license {:name "Eclipse Public License" From 7bc60a0bbb99cd4b947d73e6995f03d46e2a2072 Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Tue, 30 Apr 2019 17:51:36 +0100 Subject: [PATCH 2/6] Woohoo! Transform works. --- src/html_to_md/transformer.clj | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/html_to_md/transformer.clj b/src/html_to_md/transformer.clj index 6a42fc6..b08ef9d 100644 --- a/src/html_to_md/transformer.clj +++ b/src/html_to_md/transformer.clj @@ -26,26 +26,35 @@ (if processor (apply processor (list element dispatcher)) (map #(process % dispatcher) (:content element)))) - (string? element) element)) + + (string? element) element + (or (seq? element) (vector? element)) + (map #(process % dispatcher) element))) + +(defn- transformer-dispatch + [a _] + (class a)) (defmulti transform "Transform the `obj` which is my first argument using the `dispatcher` which is my second argument." 
- [class class] :default :default) + #'transformer-dispatch + :default :default) (defmethod transform :default [obj dispatcher] (process obj dispatcher)) -(defmethod transform [java.net.URI Object] [uri dispatcher] +(defmethod transform java.net.URI [uri dispatcher] (process (html/html-resource uri) dispatcher)) -(defmethod transform [java.net.URL Object] [url dispatcher] +(defmethod transform java.net.URL [url dispatcher] (transform (.toURI url) dispatcher)) -(defmethod transform [String Object] [s dispatcher] +(defmethod transform String [s dispatcher] (let [url (try (java.net.URL. s) (catch Exception any))] (if url (transform url dispatcher) ;; otherwise, if s is not a URL, consider it as an HTML fragment, ;; parse and process it (process (tagsoup/parser (java.io.StringReader s)) dispatcher) ))) + From 81a7337eb3d9866f85fef30465466027b4bd7ce5 Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Tue, 30 Apr 2019 18:35:34 +0100 Subject: [PATCH 3/6] Lots of improvements from running against real live tag soup --- project.clj | 3 +- src/html_to_md/html_to_md.clj | 50 +++++++++++++++++------------ src/html_to_md/transformer.clj | 2 +- test/html_to_md/html_to_md_test.clj | 2 +- 4 files changed, 33 insertions(+), 24 deletions(-) diff --git a/project.clj b/project.clj index 654fd11..4edbf3a 100644 --- a/project.clj +++ b/project.clj @@ -5,6 +5,7 @@ :url "http://www.eclipse.org/legal/epl-v10.html"} :dependencies [[org.clojure/clojure "1.8.0"] [enlive "1.1.6"]] - :plugins [[lein-codox "0.10.3"]] + :plugins [[lein-codox "0.10.3"] + [lein-release "1.0.5"]] :lein-release {:deploy-via :clojars} :signing {:gpg-key "Simon Brooke (Stultus in monte) "}) diff --git a/src/html_to_md/html_to_md.clj b/src/html_to_md/html_to_md.clj index 5bc9716..662a23d 100644 --- a/src/html_to_md/html_to_md.clj +++ b/src/html_to_md/html_to_md.clj @@ -7,15 +7,18 @@ (defn markdown-a "Process the anchor element `e` into markdown, using dispatcher `d`." 
[e d] - (apply - str - (flatten - (list - "[" - (map #(process % d) (:content e)) - "](" - (-> e :attrs :href) - ")")))) + (str + "[" + (s/trim (apply str (process (:content e) d))) + "](" + (-> e :attrs :href) + ")")) + +(defn markdown-br + "Process the line-break element `e`, so beloved of tag-soupers, into + markdown" + [e d] + "\n\n") (defn markdown-code "Process the code or samp `e` into markdown, using dispatcher `d`." @@ -51,15 +54,12 @@ "Process the header element `e` into markdown, with level `level`, using dispatcher `d`." [e d level] - (apply - str - (flatten - (list - "\n" - (take level (repeat "#")) - " " - (map #(process % d) (:content e)) - "\n")))) + (str + "\n" + (apply str (take level (repeat "#"))) + " " + (s/trim (apply str (process (:content e) d))) + "\n")) (defn markdown-h1 "Process the header element `e` into markdown, with level 1, using @@ -105,7 +105,7 @@ (defn markdown-img "Process this image element `e` into markdown, using dispatcher `d`." [e d] - (str "![" (-> e :attrs :alt) "](" (-> e :attrs :src) ")")) + (str "![image: " (-> e :attrs :alt) "](" (-> e :attrs :src) ")")) (defn markdown-ol "Process this ordered list element `e` into markdown, using dispatcher @@ -120,10 +120,15 @@ str (flatten (list "\n" (inc %2) ". " (process %1 d)))) - (:content e) + (html/select e [:li]) (range)))) "\n\n")) +(defn markdown-omit + "Don't process the element `e` into markdown, but return `nil`." + [e d] + nil) + (defn markdown-pre "Process the preformatted emphasis element `e` into markdown, using dispatcher `d`." 
@@ -155,13 +160,14 @@ str (flatten (list "\n* " (process % d)))) - (:content e)))) + (html/select e [:li])))) "\n\n")) (def markdown-dispatcher {:a markdown-a :b markdown-strong + :br markdown-br :code markdown-code :body markdown-default :div markdown-div @@ -179,8 +185,10 @@ :p markdown-div :pre markdown-pre :samp markdown-code + :script markdown-omit :span markdown-default :strong markdown-strong + :style markdown-omit :ul markdown-ul }) diff --git a/src/html_to_md/transformer.clj b/src/html_to_md/transformer.clj index b08ef9d..931343c 100644 --- a/src/html_to_md/transformer.clj +++ b/src/html_to_md/transformer.clj @@ -29,7 +29,7 @@ (string? element) element (or (seq? element) (vector? element)) - (map #(process % dispatcher) element))) + (doall (map #(process % dispatcher) element)))) (defn- transformer-dispatch [a _] diff --git a/test/html_to_md/html_to_md_test.clj b/test/html_to_md/html_to_md_test.clj index 0778ec7..b328976 100644 --- a/test/html_to_md/html_to_md_test.clj +++ b/test/html_to_md/html_to_md_test.clj @@ -73,7 +73,7 @@ (deftest img-test (testing "Image tag." - (let [expected "![Hello dere!](http://foo.bar/image.png)" + (let [expected "![image: Hello dere!](http://foo.bar/image.png)" actual (process {:tag :img :attrs {:src "http://foo.bar/image.png" From cb801b193f5d115a89a4cfe75fce94f5e6bfbcfe Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Tue, 30 Apr 2019 20:05:46 +0100 Subject: [PATCH 4/6] Added the blogger scraper. 
--- README.md | 27 ++++---- doc/intro.md | 113 +++++++++++++++++++++++++++++++++ src/html_to_md/core.clj | 15 +++-- src/html_to_md/html_to_md.clj | 1 + src/html_to_md/transformer.clj | 4 +- 5 files changed, 140 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 8060139..f025d14 100644 --- a/README.md +++ b/README.md @@ -21,29 +21,29 @@ To use this library in your project, add the following leiningen dependency: To use it in your namespace, require: - [html-to-md/transformer :refer [transform process]] - [html-to-md/html-to-md :refer [markdown-dispatcher]] + [html-to-md.core :refer [html-to-md]] + +For default usage, that's all you need. To play more sophisticated tricks, +consider: + + [html-to-md.transformer :refer [transform process]] + [html-to-md.html-to-md :refer [markdown-dispatcher]] The intended usage is as follows: ```clojure -(require '[html-to-md.transformer :refer [transform]]) -(require '[html-to-md.html-to-md :refer [markdown-dispatcher]]) +(require '[html-to-md.core :refer [html-to-md]]) -(transform URL markdown-dispatcher) +(html-to-md url output-file) ``` -Where URL is any URL that references an HTML, SGML, XHTML or XML document. -However, my fancy multi-method doesn't work yet and may well be the wrong -approach, so for now use +This will read (X)HTML from `url` and write Markdown to `output-file`. If +`output-file` is not supplied, it will return the markdown as a string: ```clojure +(require '[html-to-md.core :refer [html-to-md]]) -(require '[html-to-md.transformer :refer [process]]) -(require '[html-to-md.html-to-md :refer [markdown-dispatcher]]) -(require '[net.cgrand.enlive-html :as html]) - -(process (html/html-resource URL) markdown-dispatcher) +(def md (html-to-md url)) ``` ## Extending the transformer @@ -66,3 +66,4 @@ Copyright © 2019 Simon Brooke Distributed under the Eclipse Public License either version 1.0 or (at your option) any later version. 
+ diff --git a/doc/intro.md b/doc/intro.md index df2a804..43afb64 100644 --- a/doc/intro.md +++ b/doc/intro.md @@ -1,3 +1,116 @@ # Introduction to html-to-md TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) + +## Introduction + +The itch I'm trying to scratch at present is to transform +[Blogger.com](http://www.blogger.com)'s dreadful tag-soup markup into markdown; +but my architecture for doing this is to build a completely general [HT|SG|X]ML +transformation framework and then specialise it. + +**WARNING:** this is presently alpha-quality code, although it does have fair +unit test coverage. + +## Usage + +To use this library in your project, add the following leiningen dependency: + + [org.clojars.simon_brooke/html-to-md "0.1.0"] + +To use it in your namespace, require: + + [html-to-md.core :refer [html-to-md]] + +For default usage, that's all you need. To play more sophisticated tricks, +consider: + + [html-to-md.transformer :refer [transform process]] + [html-to-md.html-to-md :refer [markdown-dispatcher]] + +The intended usage is as follows: + +```clojure +(require '[html-to-md.core :refer [html-to-md]]) + +(html-to-md url output-file) +``` + +This will read (X)HTML from `url` and write Markdown to `output-file`. If +`output-file` is not supplied, it will return the markdown as a string: + +```clojure +(require '[html-to-md.core :refer [html-to-md]]) + +(def md (html-to-md url)) +``` + +## Extending the transformer + +In principle, the transformer can transform any [HT|SG|X]ML markup into any +other, or into any textual form. To extend it to do something other than +markdown, supply a **dispatcher**. A dispatcher is essentially a function of one +argument, a [HT|SG|X]ML tag represented as a Clojure keyword, which returns +a **processor,** which should be a function of two arguments, an element assumed +to have that tag, and a dispatcher. The processor should return the value that +you want elements of that tag transformed into. 
+ +Thus the `html-to-md.html-to-md` namespace comprises a number of *processor* +functions, such as this one: + +```clojure +(defn markdown-a + "Process the anchor element `e` into markdown, using dispatcher `d`." + [e d] + (str + "[" + (s/trim (apply str (process (:content e) d))) + "](" + (-> e :attrs :href) + ")")) +``` + +and a *dispatcher* map: + +```clojure +(def markdown-dispatcher + "A despatcher for transforming (X)HTML into Markdown." + {:a markdown-a + :b markdown-strong + :br markdown-br + :code markdown-code + :body markdown-default + :div markdown-div + :em markdown-em + :h1 markdown-h1 + :h2 markdown-h2 + :h3 markdown-h3 + :h4 markdown-h4 + :h5 markdown-h5 + :h6 markdown-h6 + :html markdown-html + :i markdown-em + :img markdown-img + :ol markdown-ol + :p markdown-div + :pre markdown-pre + :samp markdown-code + :script markdown-omit + :span markdown-default + :strong markdown-strong + :style markdown-omit + :ul markdown-ul + }) +``` + +Obviously it is convenient to write dispatchers as maps, but it isn't required +that you do so: anything which, given a keyword, will return a processor, will +work. + +## License + +Copyright © 2019 Simon Brooke + +Distributed under the Eclipse Public License either version 1.0 or (at +your option) any later version. + diff --git a/src/html_to_md/core.clj b/src/html_to_md/core.clj index 27460d3..ede8350 100644 --- a/src/html_to_md/core.clj +++ b/src/html_to_md/core.clj @@ -1,6 +1,11 @@ -(ns html-to-md.core) +(ns html-to-md.core + (:require [html-to-md.transformer :refer [transform process]] + [html-to-md.html-to-md :refer [markdown-dispatcher]])) -(defn foo - "I don't do a whole lot." - [x] - (println x "Hello, World!")) +(defn html-to-md + "Transform the HTML document referenced by `url` into Markdown, and write + it to `output`, if supplied." 
+ ([url] + (apply str (transform url markdown-dispatcher))) + ([url output] + (spit output (html-to-md url)))) diff --git a/src/html_to_md/html_to_md.clj b/src/html_to_md/html_to_md.clj index 662a23d..c4d6ea7 100644 --- a/src/html_to_md/html_to_md.clj +++ b/src/html_to_md/html_to_md.clj @@ -165,6 +165,7 @@ (def markdown-dispatcher + "A despatcher for transforming (X)HTML into Markdown." {:a markdown-a :b markdown-strong :br markdown-br diff --git a/src/html_to_md/transformer.clj b/src/html_to_md/transformer.clj index 931343c..5a15981 100644 --- a/src/html_to_md/transformer.clj +++ b/src/html_to_md/transformer.clj @@ -29,7 +29,7 @@ (string? element) element (or (seq? element) (vector? element)) - (doall (map #(process % dispatcher) element)))) + (remove nil? (map #(process % dispatcher) element)))) (defn- transformer-dispatch [a _] @@ -45,7 +45,7 @@ (process obj dispatcher)) (defmethod transform java.net.URI [uri dispatcher] - (process (html/html-resource uri) dispatcher)) + (remove nil? (process (html/html-resource uri) dispatcher))) (defmethod transform java.net.URL [url dispatcher] (transform (.toURI url) dispatcher)) From 80cc2e4335c17da39ada23eec8ecc587a0787057 Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Tue, 30 Apr 2019 20:11:44 +0100 Subject: [PATCH 5/6] Blogger scraper tidied up and documented. 
--- .gitignore | 2 ++ README.md | 18 +++++++++++++- doc/intro.md | 19 +++++++++++---- src/html_to_md/blogger_to_md.clj | 41 ++++++++++++++++++++++++++++++++ src/html_to_md/core.clj | 12 +++++++++- src/html_to_md/html_to_md.clj | 2 +- 6 files changed, 87 insertions(+), 7 deletions(-) create mode 100644 src/html_to_md/blogger_to_md.clj diff --git a/.gitignore b/.gitignore index 1094326..9c58742 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ pom.xml.asc .hgignore .hg/ *~ + +test\.md diff --git a/README.md b/README.md index f025d14..7807864 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # html-to-md -A Clojure library designed to convert (Enlivened) HTML to markdown; but, more +A Clojure library designed to convert +([Enlive](https://github.com/cgrand/enlive)ned) HTML to markdown; but, more generally, a framework for [HT|SG|X]ML transformation. ## Introduction @@ -46,6 +47,21 @@ This will read (X)HTML from `url` and write Markdown to `output-file`. If (def md (html-to-md url)) ``` +If you are specifically scraping [blogger.com](https://www.blogger.com/) +pages, you may *try* the following recipe: + +```clojure +(require '[html-to-md.core :refer [blogger-to-md]]) + +(blogger-to-md url output-file) +``` + +It works for my blogger pages. However, I'm not sure to what extent the +skinning of blogger pages is pure CSS (in which case my recipe should work +for yours) and to what extent it's HTML templating (in which case it +probably won't). Results not guaranteed, if it doesn't work you get to +keep all the pieces.
+ ## Extending the transformer In principle, the transformer can transform any [HT|SG|X]ML markup into any diff --git a/doc/intro.md b/doc/intro.md index 43afb64..05d012c 100644 --- a/doc/intro.md +++ b/doc/intro.md @@ -1,9 +1,5 @@ # Introduction to html-to-md -TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) - -## Introduction - The itch I'm trying to scratch at present is to transform [Blogger.com](http://www.blogger.com)'s dreadful tag-soup markup into markdown; but my architecture for doing this is to build a completely general [HT|SG|X]ML @@ -45,6 +41,21 @@ This will read (X)HTML from `url` and write Markdown to `output-file`. If (def md (html-to-md url)) ``` +If you are specifically scraping [blogger.com](https://www.blogger.com/) +pages, you may *try* the following recipe: + +```clojure +(require '[html-to-md.core :refer [blogger-to-md]]) + +(blogger-to-md url output-file) +``` + +It works for my blogger pages. However, I'm not sure to what extent the +skinning of blogger pages is pure CSS (in which case my recipe should work +for yours) and to what extent it's HTML templating (in which case it +probably won't). Results not guaranteed, if it doesn't work you get to +keep all the pieces. + ## Extending the transformer In principle, the transformer can transform any [HT|SG|X]ML markup into any diff --git a/src/html_to_md/blogger_to_md.clj b/src/html_to_md/blogger_to_md.clj new file mode 100644 index 0000000..2d0c236 --- /dev/null +++ b/src/html_to_md/blogger_to_md.clj @@ -0,0 +1,41 @@ +(ns html-to-md.blogger-to-md + (:require [clojure.string :as s] + [html-to-md.html-to-md :refer [markdown-dispatcher markdown-header]] + [html-to-md.transformer :refer [process]] + [net.cgrand.enlive-html :as html])) + +(defn blogger-scraper + "Processor which scrapes the actual post content out of a blogger page. + *NOTE:* This was written to scrape *my* blogger pages, yours may be + different!"
+ [e d] + (let [title (first (html/select e [:h3.post-title])) + content (html/select e [:div.post-body])] + (if (and title content) + (apply + str + (cons + (markdown-header title d 1) + (process content d)))))) + +(defn image-table-processor + "Blogger's horrible tag soup wraps images in tables. Is this table such + a table? If so extract the image from it and process it to markdown; + otherwise, fall back on what `markdown-dispatcher` would do with the + table (which is currently nothing, but that will change)." + [e d] + (let [caption (process (first (html/select e [:td.tr-caption])) d) + alt (if caption (s/trim (apply str caption))) + image (first (html/select e [:img])) + src (if image (-> image :attrs :src))] + (if image + (str "![image: " alt "](" src ")") + (process e markdown-dispatcher)))) + + +(def blogger-dispatcher + "Adaptation of `markdown-dispatcher`, q.v., with the `:table` and + `:html` dispatches overridden." + (assoc markdown-dispatcher + :html blogger-scraper + :table image-table-processor)) diff --git a/src/html_to_md/core.clj b/src/html_to_md/core.clj index ede8350..6434218 100644 --- a/src/html_to_md/core.clj +++ b/src/html_to_md/core.clj @@ -1,6 +1,7 @@ (ns html-to-md.core (:require [html-to-md.transformer :refer [transform process]] - [html-to-md.html-to-md :refer [markdown-dispatcher]])) + [html-to-md.html-to-md :refer [markdown-dispatcher]] + [html-to-md.blogger-to-md :refer [blogger-dispatcher]])) (defn html-to-md "Transform the HTML document referenced by `url` into Markdown, and write @@ -9,3 +10,12 @@ (apply str (transform url markdown-dispatcher))) ([url output] (spit output (html-to-md url)))) + +(defn blogger-to-md + "Transform the Blogger post referenced by `url` into Markdown, and write + it to `output`, if supplied. *NOTE:* This was written to scrape *my* + blogger pages, yours may be different!"
+ ([url] + (apply str (transform url blogger-dispatcher))) + ([url output] + (spit output (blogger-to-md url)))) diff --git a/src/html_to_md/html_to_md.clj b/src/html_to_md/html_to_md.clj index c4d6ea7..27d3650 100644 --- a/src/html_to_md/html_to_md.clj +++ b/src/html_to_md/html_to_md.clj @@ -18,7 +18,7 @@ "Process the line-break element `e`, so beloved of tag-soupers, into markdown" [e d] - "\n\n") + "\n") (defn markdown-code "Process the code or samp `e` into markdown, using dispatcher `d`." From d7015dc68cc5e749a44a3f12ca48049e846c7fde Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Tue, 30 Apr 2019 20:14:38 +0100 Subject: [PATCH 6/6] Upversioned to 0.2.0 --- README.md | 2 +- doc/intro.md | 2 +- project.clj | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 7807864..4a3714a 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ unit test coverage. To use this library in your project, add the following leiningen dependency: - [org.clojars.simon_brooke/html-to-md "0.1.0"] + [org.clojars.simon_brooke/html-to-md "0.2.0"] To use it in your namespace, require: diff --git a/doc/intro.md b/doc/intro.md index 05d012c..f0d81be 100644 --- a/doc/intro.md +++ b/doc/intro.md @@ -12,7 +12,7 @@ unit test coverage. To use this library in your project, add the following leiningen dependency: - [org.clojars.simon_brooke/html-to-md "0.1.0"] + [org.clojars.simon_brooke/html-to-md "0.2.0"] To use it in your namespace, require: diff --git a/project.clj b/project.clj index 4edbf3a..01c1c4c 100644 --- a/project.clj +++ b/project.clj @@ -1,4 +1,4 @@ -(defproject html-to-md "0.2.0-SNAPSHOT" +(defproject html-to-md "0.2.0" :description "Convert (Enlivened) HTML to markdown; but, more generally, a framework for [HT|SG|X]ML transformation." :url "https://github.com/simon-brooke/html-to-md" :license {:name "Eclipse Public License"