HTML to Markdown very largely working.

This commit is contained in:
Simon Brooke 2019-04-30 15:18:30 +01:00
parent b406ef92c0
commit 7f50863d83
4 changed files with 269 additions and 90 deletions

View file

@ -0,0 +1,157 @@
(ns html-to-md.html-to-md
(:require
[clojure.string :as s]
[net.cgrand.enlive-html :as html]
[html-to-md.transformer :refer [process]]))
(defn markdown-a
  "Process the anchor element `e` into markdown, using dispatcher `d`.

  The children of `e` are rendered with `process` and concatenated to form
  the link text; the `:href` attribute of `e` is the link target. An anchor
  with a missing or blank `:href` (for example a named anchor) has nothing
  to link to, so only its rendered content is returned instead of a link
  with an empty target."
  [e d]
  (let [text (apply str (flatten (map #(process % d) (:content e))))
        href (-> e :attrs :href)]
    ;; `str` guards `blank?` against a nil (absent) attribute.
    (if (s/blank? (str href))
      text
      (str "[" text "](" href ")"))))
(defn markdown-strong
  "Process the strong emphasis element `e` into markdown, using dispatcher
  `d`.

  The children of `e` are rendered with `process`, concatenated and trimmed,
  then wrapped in `**`. If nothing is left after trimming, the empty string
  is returned: a bare `****` is not bold markup and would show up in the
  output as stray asterisks (or as a thematic break on a line of its own)."
  [e d]
  (let [text (s/trim (apply str (map #(process % d) (:content e))))]
    (if (empty? text)
      ""
      (str "**" text "**"))))
(defn markdown-div
  "Render the division element `e` as markdown using dispatcher `d`: the
  processed children of `e`, concatenated, with a newline before and a
  newline after."
  [e d]
  (let [children (map #(process % d) (:content e))]
    (->> ["\n" children "\n"]
         flatten
         (apply str))))
(defn markdown-em
  "Process the emphasis element `e` into markdown, using dispatcher `d`.

  The children of `e` are rendered with `process`, concatenated and trimmed,
  then wrapped in `*`. If nothing is left after trimming, the empty string
  is returned: a bare `**` is not emphasis markup and would show up in the
  output as stray asterisks."
  [e d]
  (let [text (s/trim (apply str (map #(process % d) (:content e))))]
    (if (empty? text)
      ""
      (str "*" text "*"))))
(defn markdown-header
  "Render the header element `e` as a markdown heading of depth `level`,
  using dispatcher `d`. The result is a newline, `level` `#` characters, a
  space, the processed children of `e`, and a closing newline."
  [e d level]
  (let [hashes (take level (repeat "#"))
        title (map #(process % d) (:content e))]
    (->> ["\n" hashes " " title "\n"]
         flatten
         (apply str))))
(defn markdown-h1
  "Render the element `e` as a level 1 markdown heading, using dispatcher
  `d`; delegates to `markdown-header`."
  [e d]
  (markdown-header e d 1))
(defn markdown-h2
  "Render the element `e` as a level 2 markdown heading, using dispatcher
  `d`; delegates to `markdown-header`."
  [e d]
  (markdown-header e d 2))
(defn markdown-h3
  "Render the element `e` as a level 3 markdown heading, using dispatcher
  `d`; delegates to `markdown-header`."
  [e d]
  (markdown-header e d 3))
(defn markdown-h4
  "Render the element `e` as a level 4 markdown heading, using dispatcher
  `d`; delegates to `markdown-header`."
  [e d]
  (markdown-header e d 4))
(defn markdown-h5
  "Render the element `e` as a level 5 markdown heading, using dispatcher
  `d`; delegates to `markdown-header`."
  [e d]
  (markdown-header e d 5))
(defn markdown-h6
  "Render the element `e` as a level 6 markdown heading, using dispatcher
  `d`; delegates to `markdown-header`."
  [e d]
  (markdown-header e d 6))
(defn markdown-html
  "Render the HTML element `e` as markdown using dispatcher `d`: whatever
  the Enlive selector `[:body]` matches within `e` is handed to `process`,
  and the result is concatenated into a single string."
  [e d]
  (let [body (html/select e [:body])]
    (apply str (process body d))))
(defn markdown-img
  "Render the image element `e` as a markdown image: the `:alt` attribute
  becomes the bracketed text and the `:src` attribute the target. The
  dispatcher `d` is accepted so the signature matches the other handlers,
  but it is not used."
  [e d]
  (let [{:keys [alt src]} (:attrs e)]
    (str "![" alt "](" src ")")))
(defn markdown-ol
  "Process this ordered list element `e` into markdown, using dispatcher
  `d`.

  Each child of `e` is rendered with `process` and emitted on its own line
  as `N. text`, numbered from 1. The whole list is preceded by a newline and
  followed by two newlines.

  Whitespace-only text children (the line breaks and indentation that sit
  between `<li>` elements in parsed HTML) are dropped first, so that they
  neither produce empty items nor consume item numbers."
  [e d]
  (let [items (remove #(and (string? %) (s/blank? %)) (:content e))]
    (str
      "\n"
      (apply str
             (map-indexed
               (fn [index item]
                 ;; `flatten` copes with `process` returning nested seqs.
                 (apply str
                        (flatten
                          (list "\n" (inc index) ". " (process item d)))))
               items))
      "\n\n")))
(defn markdown-ul
  "Process this unordered list element `e` into markdown, using dispatcher
  `d`.

  Each child of `e` is rendered with `process` and emitted on its own line
  as `* text`. The whole list is preceded by a newline and followed by two
  newlines.

  Whitespace-only text children (the line breaks and indentation that sit
  between `<li>` elements in parsed HTML) are dropped first, so that they do
  not produce empty bullets."
  [e d]
  (let [items (remove #(and (string? %) (s/blank? %)) (:content e))]
    (str
      "\n"
      (apply str
             (map
               (fn [item]
                 ;; `flatten` copes with `process` returning nested seqs.
                 (apply str (flatten (list "\n* " (process item d)))))
               items))
      "\n\n")))
(def markdown-dispatcher
  "Dispatch table from HTML tag keyword to the function in this namespace
  which renders an element with that tag as markdown. Every function takes
  the element and the dispatcher as its two arguments."
  {;; headings
   :h1     markdown-h1
   :h2     markdown-h2
   :h3     markdown-h3
   :h4     markdown-h4
   :h5     markdown-h5
   :h6     markdown-h6
   ;; inline emphasis; `:b`/`:strong` and `:i`/`:em` share a renderer
   :b      markdown-strong
   :strong markdown-strong
   :em     markdown-em
   :i      markdown-em
   ;; links and images
   :a      markdown-a
   :img    markdown-img
   ;; lists
   :ol     markdown-ol
   :ul     markdown-ul
   ;; containers
   :div    markdown-div
   :html   markdown-html})

View file

@ -1,86 +1,8 @@
(ns html-to-md.transformer
(:require
[clojure.string :as s]
[net.cgrand.enlive-html :as html]
[net.cgrand.tagsoup :as tagsoup]))
(declare process)
(defn markdown-a
  "Process the anchor element `e` into markdown, using dispatcher `d`.

  The children of `e` are rendered with `process` and concatenated to form
  the link text; the `:href` attribute of `e` is the link target. An anchor
  with a missing or blank `:href` (for example a named anchor) has nothing
  to link to, so only its rendered content is returned instead of a link
  with an empty target."
  [e d]
  (let [text (apply str (flatten (map #(process % d) (:content e))))
        href (-> e :attrs :href)]
    ;; `str` guards `blank?` against a nil (absent) attribute.
    (if (s/blank? (str href))
      text
      (str "[" text "](" href ")"))))
(defn markdown-strong
  "Process the bold / strong emphasis element `e` into markdown, using
  dispatcher `d`.

  The children of `e` are rendered with `process`, concatenated and trimmed,
  then wrapped in `**`. If nothing is left after trimming, the empty string
  is returned: a bare `****` is not bold markup and would show up in the
  output as stray asterisks (or as a thematic break on a line of its own)."
  [e d]
  (let [text (s/trim (apply str (map #(process % d) (:content e))))]
    (if (empty? text)
      ""
      (str "**" text "**"))))
(defn markdown-div
  "Render the division element `e` as markdown using dispatcher `d`: the
  processed children of `e`, concatenated, with a newline before and a
  newline after."
  [e d]
  (let [children (map #(process % d) (:content e))]
    (->> ["\n" children "\n"]
         flatten
         (apply str))))
(def markdown-dispatcher
  "Dispatch table from HTML tag keyword to a function of the element and
  the dispatcher which renders that element as markdown."
  (let [;; Render every child of `e` with `process`.
        children (fn [e d] (map #(process % d) (:content e)))
        ;; Handler wrapping the trimmed, concatenated children in `marker`.
        wrapped (fn [marker]
                  (fn [e d]
                    (str marker
                         (s/trim (apply str (children e d)))
                         marker)))
        ;; Handler emitting `prefix`, the children, then a newline.
        header (fn [prefix]
                 (fn [e d]
                   (apply str
                          (flatten (list prefix (children e d) "\n")))))]
    {:a      markdown-a
     :b      markdown-strong
     :div    markdown-div
     :em     (wrapped "*")
     :h1     (header "\n# ")
     :h2     (header "\n## ")
     :h3     (header "\n### ")
     :h4     (header "\n#### ")
     :h5     (header "\n##### ")
     :h6     (header "\n###### ")
     :html   (fn [e d] (apply str (process (html/select e [:body]) d)))
     :i      (wrapped "*")
     :img    (fn [e d]
               (let [{:keys [alt src]} (:attrs e)]
                 (str "![" alt "](" src ")")))
     :strong (wrapped "**")}))
(defn process
"Process this `element`, assumed to be a [HT|SG|X]ML element in Enlive
@ -109,26 +31,21 @@
(defmulti transform
  "Transform the `obj` which is my first argument using the `dispatcher`
  which is my second argument.

  Dispatch is on the type of `obj` alone; anything without a method of its
  own falls through to the `:default` method."
  (fn [obj dispatcher] (type obj))
  :default :default)

;; Anything not matched below is passed straight to `process`.
(defmethod transform :default [obj dispatcher]
  (process obj dispatcher))

;; A URI is loaded with Enlive's `html-resource` and the result processed.
(defmethod transform java.net.URI [uri dispatcher]
  (process (html/html-resource uri) dispatcher))

;; A URL is converted to a URI and handled by the method above.
(defmethod transform java.net.URL [url dispatcher]
  (transform (.toURI url) dispatcher))

;; A string which parses as a URL is treated as one; any other string is
;; treated as an HTML fragment to be parsed and processed.
(defmethod transform String [s dispatcher]
  ;; Failure to construct a URL is expected for fragments, so the exception
  ;; is deliberately discarded and `url` is left nil.
  (let [url (try (java.net.URL. s) (catch Exception _ nil))]
    (if url
      (transform url dispatcher)
      ;; Note the trailing dot: `StringReader.` invokes the constructor,
      ;; whereas `(java.io.StringReader s)` tries to call the class itself.
      (process (tagsoup/parser (java.io.StringReader. s)) dispatcher))))
;; REPL scratch expressions. They are kept inside `comment` so that they
;; are not evaluated every time the namespace is loaded; evaluate them by
;; hand when experimenting. The second form previously referred to the
;; misspelt, undefined symbol `markdown-despatcher`.
(comment
  (process {:tag :h1 :content ["Hello dere!"]} markdown-dispatcher)
  (transform "<h1>Hello dere!</h1>" markdown-dispatcher))

View file

@ -2,6 +2,6 @@
(:require [clojure.test :refer :all]
[html-to-md.core :refer :all]))
;; Placeholder test: it asserts that 0 equals 1, so it fails whenever it is
;; run. NOTE(review): presumably left over from project scaffolding --
;; confirm before relying on it.
(deftest a-test
(testing "FIXME, I fail."
(is (= 0 1))))
;; (deftest a-test
;; (testing "FIXME, I fail."
;; (is (= 0 1))))

View file

@ -0,0 +1,105 @@
(ns html-to-md.html-to-md-test
(:require [clojure.test :refer :all]
[html-to-md.transformer :refer [process]]
[html-to-md.html-to-md :refer :all]))
;; An anchor element should render as `[text](href)`.
(deftest a-test
  (testing "Anchor tag."
    (let [element {:tag :a
                   :attrs {:href "http://foo.bar"}
                   :content ["Hello dere!"]}
          expected "[Hello dere!](http://foo.bar)"
          actual (process element markdown-dispatcher)]
      (is (= expected actual)))))
;; `:b` and `:strong` should both render as `**text**`.
(deftest b-test
  (doseq [[description tag] [["Bold tag." :b]
                             ["STRONG emphasis tag." :strong]]]
    (testing description
      (let [expected "**Hello dere!**"
            actual (process {:tag tag :content ["Hello dere!"]}
                            markdown-dispatcher)]
        (is (= expected actual))))))
;; A division should render as its content between two newlines.
(deftest div-test
  (testing "DIVision tag."
    (let [element {:tag :div :content ["Hello dere!"]}
          expected "\nHello dere!\n"
          actual (process element markdown-dispatcher)]
      (is (= expected actual)))))
;; `:em` and `:i` should both render as `*text*`.
(deftest em-test
  (doseq [[description tag] [["EMphasis tag." :em]
                             ["Italics tag" :i]]]
    (testing description
      (let [expected "*Hello dere!*"
            actual (process {:tag tag :content ["Hello dere!"]}
                            markdown-dispatcher)]
        (is (= expected actual))))))
;; A level 1 header should render with a single `#`.
(deftest h1-test
  (testing "Level 1 header tag."
    (let [element {:tag :h1 :content ["Hello dere!"]}
          expected "\n# Hello dere!\n"
          actual (process element markdown-dispatcher)]
      (is (= expected actual)))))
;; A level 2 header should render with `##`.
(deftest h2-test
  (testing "Level 2 header tag."
    (let [element {:tag :h2 :content ["Hello dere!"]}
          expected "\n## Hello dere!\n"
          actual (process element markdown-dispatcher)]
      (is (= expected actual)))))
;; A level 3 header should render with `###`.
(deftest h3-test
  (testing "Level 3 header tag."
    (let [element {:tag :h3 :content ["Hello dere!"]}
          expected "\n### Hello dere!\n"
          actual (process element markdown-dispatcher)]
      (is (= expected actual)))))
;; A level 4 header should render with `####`.
(deftest h4-test
  (testing "Level 4 header tag."
    (let [element {:tag :h4 :content ["Hello dere!"]}
          expected "\n#### Hello dere!\n"
          actual (process element markdown-dispatcher)]
      (is (= expected actual)))))
;; A level 5 header should render with `#####`.
(deftest h5-test
  (testing "Level 5 header tag."
    (let [element {:tag :h5 :content ["Hello dere!"]}
          expected "\n##### Hello dere!\n"
          actual (process element markdown-dispatcher)]
      (is (= expected actual)))))
;; A level 6 header should render with `######`.
(deftest h6-test
  (testing "Level 6 header tag."
    (let [element {:tag :h6 :content ["Hello dere!"]}
          expected "\n###### Hello dere!\n"
          actual (process element markdown-dispatcher)]
      (is (= expected actual)))))
;; An image should render as `![alt](src)`.
(deftest img-test
  (testing "Image tag."
    (let [element {:tag :img
                   :attrs {:src "http://foo.bar/image.png"
                           :alt "Hello dere!"}}
          expected "![Hello dere!](http://foo.bar/image.png)"
          actual (process element markdown-dispatcher)]
      (is (= expected actual)))))
;; Ordered and unordered lists: each `:li` child should come out on its own
;; line, numbered from 1 for `:ol` and bulleted with `*` for `:ul`, with a
;; blank line before the list and after it.
(deftest list-test
  (testing "ordered list tag."
    (let [expected "\n\n1. foo\n2. bar\n3. ban\n\n"
          actual (process
                   {:tag :ol
                    :content
                    [{:tag :li :content ["foo"]}
                     {:tag :li :content ["bar"]}
                     {:tag :li :content ["ban"]}]}
                   markdown-dispatcher)]
      (is (= expected actual))))
  ;; Description previously misspelt as "umordered".
  (testing "unordered list tag."
    (let [expected "\n\n* foo\n* bar\n* ban\n\n"
          actual (process
                   {:tag :ul
                    :content
                    [{:tag :li :content ["foo"]}
                     {:tag :li :content ["bar"]}
                     {:tag :li :content ["ban"]}]}
                   markdown-dispatcher)]
      (is (= expected actual)))))