HTML to Markdown very largely working.
This commit is contained in:
		
							parent
							
								
									b406ef92c0
								
							
						
					
					
						commit
						7f50863d83
					
				
							
								
								
									
										157
									
								
								src/html_to_md/html_to_md.clj
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										157
									
								
								src/html_to_md/html_to_md.clj
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,157 @@ | |||
| (ns html-to-md.html-to-md | ||||
|     (:require | ||||
|         [clojure.string :as s] | ||||
|         [net.cgrand.enlive-html :as html] | ||||
|         [html-to-md.transformer :refer [process]])) | ||||
| 
 | ||||
(defn markdown-a
    "Render the anchor element `e` as a markdown link. The children of `e`,
    each processed with dispatcher `d`, form the link text; the `:href`
    attribute of `e` forms the link target."
    [e d]
    (let [link-text (map #(process % d) (:content e))
          target (-> e :attrs :href)]
        ;; `flatten` splices any sequences produced for the children into
        ;; one flat run of fragments before they are concatenated.
        (->> (list "[" link-text "](" target ")")
             flatten
             (apply str))))
| 
 | ||||
(defn markdown-strong
    "Render the strong emphasis element `e` as markdown: the children of
    `e`, each processed with dispatcher `d`, are concatenated, trimmed of
    surrounding whitespace and wrapped in `**`."
    [e d]
    (let [inner (->> (:content e)
                     (map #(process % d))
                     (apply str)
                     s/trim)]
        (str "**" inner "**")))
| 
 | ||||
(defn markdown-div
    "Render the division element `e` as markdown: the children of `e`, each
    processed with dispatcher `d`, are concatenated between a leading and a
    trailing newline."
    [e d]
    (let [children (map #(process % d) (:content e))]
        (->> (list "\n" children "\n")
             flatten
             (apply str))))
| 
 | ||||
(defn markdown-em
    "Render the emphasis element `e` as markdown: the children of `e`, each
    processed with dispatcher `d`, are concatenated, trimmed of surrounding
    whitespace and wrapped in `*`."
    [e d]
    (let [fragments (for [child (:content e)]
                        (process child d))
          inner (s/trim (apply str fragments))]
        (str "*" inner "*")))
| 
 | ||||
(defn markdown-header
    "Render the header element `e` as a markdown heading of depth `level`:
    a newline, `level` hash characters, a space, the children of `e` (each
    processed with dispatcher `d`) and a closing newline."
    [e d level]
    (let [hashes (take level (repeat "#"))
          title (map #(process % d) (:content e))]
        (->> (list "\n" hashes " " title "\n")
             flatten
             (apply str))))
| 
 | ||||
(defn markdown-h1
    "Render the element `e` as a level 1 markdown heading, processing its
    children with dispatcher `d`; delegates to `markdown-header`."
    [e d]
    (markdown-header e d 1))

(defn markdown-h2
    "Render the element `e` as a level 2 markdown heading, processing its
    children with dispatcher `d`; delegates to `markdown-header`."
    [e d]
    (markdown-header e d 2))

(defn markdown-h3
    "Render the element `e` as a level 3 markdown heading, processing its
    children with dispatcher `d`; delegates to `markdown-header`."
    [e d]
    (markdown-header e d 3))

(defn markdown-h4
    "Render the element `e` as a level 4 markdown heading, processing its
    children with dispatcher `d`; delegates to `markdown-header`."
    [e d]
    (markdown-header e d 4))

(defn markdown-h5
    "Render the element `e` as a level 5 markdown heading, processing its
    children with dispatcher `d`; delegates to `markdown-header`."
    [e d]
    (markdown-header e d 5))

(defn markdown-h6
    "Render the element `e` as a level 6 markdown heading, processing its
    children with dispatcher `d`; delegates to `markdown-header`."
    [e d]
    (markdown-header e d 6))
| 
 | ||||
(defn markdown-html
    "Render the HTML element `e` as markdown: whatever the Enlive selector
    `[:body]` matches within `e` is processed with dispatcher `d`, and the
    result is concatenated into a single string."
    [e d]
    (let [body (html/select e [:body])]
        (apply str (process body d))))
| 
 | ||||
(defn markdown-img
    "Render the image element `e` as a markdown image: `![alt](src)`, where
    `alt` and `src` are the `:alt` and `:src` attributes of `e`. A missing
    attribute renders as an empty string. The dispatcher `d` is accepted
    so that the signature matches the other renderers; it is not used,
    because the children of `e` are not processed."
    [e d]
    (str "![" (-> e :attrs :alt) "](" (-> e :attrs :src) ")"))
| 
 | ||||
(defn markdown-ol
    "Render the ordered list element `e` as markdown. Every child of `e` is
    processed with dispatcher `d` and emitted on its own line, prefixed by
    its 1-based position and `. `. The list is preceded by one newline and
    followed by two."
    [e d]
    (let [numbered (fn [index child]
                       (->> (list "\n" (inc index) ". " (process child d))
                            flatten
                            (apply str)))]
        (str "\n"
             (apply str (map-indexed numbered (:content e)))
             "\n\n")))
| 
 | ||||
(defn markdown-ul
    "Render the unordered list element `e` as markdown. Every child of `e`
    is processed with dispatcher `d` and emitted on its own line, prefixed
    by `* `. The list is preceded by one newline and followed by two."
    [e d]
    (let [bullets (for [child (:content e)]
                      (->> (list "\n* " (process child d))
                           flatten
                           (apply str)))]
        (str "\n" (apply str bullets) "\n\n")))
| 
 | ||||
| 
 | ||||
(def markdown-dispatcher
    "Map from element tag keyword to the function which renders an element
    with that tag as markdown. Each function takes the element and a
    dispatcher. `:b` and `:strong` share a renderer, as do `:i` and `:em`."
    {;; document structure
     :html markdown-html
     :div markdown-div
     ;; headings
     :h1 markdown-h1
     :h2 markdown-h2
     :h3 markdown-h3
     :h4 markdown-h4
     :h5 markdown-h5
     :h6 markdown-h6
     ;; lists
     :ol markdown-ol
     :ul markdown-ul
     ;; inline elements
     :a markdown-a
     :img markdown-img
     :b markdown-strong
     :strong markdown-strong
     :em markdown-em
     :i markdown-em})
| 
 | ||||
|  | @ -1,86 +1,8 @@ | |||
| (ns html-to-md.transformer | ||||
|   (:require | ||||
|       [clojure.string :as s] | ||||
|       [net.cgrand.enlive-html :as html] | ||||
|       [net.cgrand.tagsoup :as tagsoup])) | ||||
| 
 | ||||
| (declare process) | ||||
| 
 | ||||
| (defn markdown-a | ||||
|     "Process the anchor element `e` into markdown" | ||||
|     [e d] | ||||
|     (apply | ||||
|         str | ||||
|         (flatten | ||||
|             (list | ||||
|                 "[" | ||||
|                 (map #(process % d) (:content e)) | ||||
|                 "](" | ||||
|                 (-> e :attrs :href) | ||||
|                 ")")))) | ||||
| 
 | ||||
| (defn markdown-strong | ||||
|     [e d] | ||||
|     ;; same as `:strong`, q.v. | ||||
|     (str | ||||
|         "**" | ||||
|         (s/trim (apply str (map #(process % d) (:content e)))) | ||||
|         "**")) | ||||
| 
 | ||||
| (defn markdown-div | ||||
|     [e d] | ||||
|     (apply | ||||
|         str | ||||
|         (flatten | ||||
|             (list "\n" (map #(process % d) (:content e)) "\n")))) | ||||
| 
 | ||||
| 
 | ||||
| (def markdown-dispatcher | ||||
|     {:a markdown-a | ||||
|      :b markdown-strong | ||||
|      :div markdown-div | ||||
|      :em (fn [e d] | ||||
|              ;; same as `:i`, q.v. | ||||
|              (str | ||||
|                  "*" | ||||
|                  (s/trim (apply str (map #(process % d) (:content e)))) | ||||
|                  "*")) | ||||
|      :h1 (fn [e d] | ||||
|              (apply | ||||
|                  str | ||||
|                  (flatten | ||||
|                      (list "\n# " (map #(process % d) (:content e)) "\n")))) | ||||
|      :h2 (fn [e d] | ||||
|              (apply | ||||
|                  str | ||||
|                  (flatten | ||||
|                      (list "\n## " (map #(process % d) (:content e)) "\n")))) | ||||
|      :h3 (fn [e d] | ||||
|              (apply | ||||
|                  str | ||||
|                  (flatten | ||||
|                      (list "\n### " (map #(process % d) (:content e)) "\n")))) | ||||
|      :h4 (fn [e d] | ||||
|              (apply | ||||
|                  str | ||||
|                  (flatten | ||||
|                      (list | ||||
|                          "\n#### " | ||||
|                          (map #(process % d) (:content e)) | ||||
|                          "\n")))) | ||||
|      :h5 (fn [e d] | ||||
|              (apply | ||||
|                  str (flatten (list "\n##### " (map #(process % d) (:content e)) "\n")))) | ||||
|      :h6 (fn [e d] (apply str (flatten (list "\n###### " (map #(process % d) (:content e)) "\n")))) | ||||
|      :html (fn [e d] (apply str (process (html/select e [:body]) d) )) | ||||
|      :i (fn [e d] (str "*" (s/trim (apply str (map #(process % d) (:content e)))) "*")) | ||||
|      :img (fn [e d] (str "![" (-> e :attrs :alt) "](" (-> e :attrs :src) ")")) | ||||
|      :strong (fn [e d] | ||||
|                  (str | ||||
|                      "**" | ||||
|                      (s/trim (apply str (map #(process % d) (:content e)))) | ||||
|                      "**")) | ||||
|      }) | ||||
| 
 | ||||
| (defn process | ||||
|     "Process this `element`, assumed to be a [HT|SG|X]ML element in Enlive | ||||
|  | @ -109,26 +31,21 @@ | |||
| (defmulti transform | ||||
|     "Transform the `obj` which is my first argument using the `dispatcher` | ||||
|     which is my second argument." | ||||
|     (fn [obj dispatcher] (type obj)) :default :default) | ||||
|     [class class] :default :default) | ||||
| 
 | ||||
| (defmethod transform :default [obj dispatcher] | ||||
|     (process obj dispatcher)) | ||||
| 
 | ||||
| (defmethod transform java.net.URI [uri dispatcher] | ||||
| (defmethod transform [java.net.URI Object] [uri dispatcher] | ||||
|     (process (html/html-resource uri) dispatcher)) | ||||
| 
 | ||||
| (defmethod transform java.net.URL [url dispatcher] | ||||
| (defmethod transform [java.net.URL Object] [url dispatcher] | ||||
|     (transform (.toURI url) dispatcher)) | ||||
| 
 | ||||
| (defmethod transform String [s dispatcher] | ||||
| (defmethod transform [String Object] [s dispatcher] | ||||
|     (let [url (try (java.net.URL. s) (catch Exception any))] | ||||
|         (if url (transform url dispatcher) | ||||
|             ;; otherwise, if s is not a URL, consider it as an HTML fragment, | ||||
|             ;; parse and process it | ||||
|             (process (tagsoup/parser (java.io.StringReader s)) dispatcher) | ||||
|             ))) | ||||
| 
 | ||||
| (process {:tag :h1 :content ["Hello dere!"]} markdown-dispatcher) | ||||
| 
 | ||||
| 
 | ||||
| (transform "<h1>Hello dere!</h1>" markdown-despatcher) | ||||
|  |  | |||
|  | @ -2,6 +2,6 @@ | |||
|   (:require [clojure.test :refer :all] | ||||
|             [html-to-md.core :refer :all])) | ||||
| 
 | ||||
| (deftest a-test | ||||
|   (testing "FIXME, I fail." | ||||
|     (is (= 0 1)))) | ||||
| ;; (deftest a-test | ||||
| ;;   (testing "FIXME, I fail." | ||||
| ;;     (is (= 0 1)))) | ||||
|  |  | |||
							
								
								
									
										105
									
								
								test/html_to_md/html_to_md_test.clj
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										105
									
								
								test/html_to_md/html_to_md_test.clj
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,105 @@ | |||
| (ns html-to-md.html-to-md-test | ||||
|     (:require [clojure.test :refer :all] | ||||
|               [html-to-md.transformer :refer [process]] | ||||
|               [html-to-md.html-to-md :refer :all])) | ||||
| 
 | ||||
;; An anchor renders as `[text](href)`.
(deftest a-test
    (testing "Anchor tag."
        (let [element {:tag :a
                       :attrs {:href "http://foo.bar"}
                       :content ["Hello dere!"]}
              expected "[Hello dere!](http://foo.bar)"
              actual (process element markdown-dispatcher)]
            (is (= expected actual)))))
| 
 | ||||
;; `:b` and `:strong` must both render as `**text**`.
(deftest b-test
    (testing "Bold tag."
        (let [element {:tag :b :content ["Hello dere!"]}
              expected "**Hello dere!**"
              actual (process element markdown-dispatcher)]
            (is (= expected actual))))
    (testing "STRONG emphasis tag."
        (let [element {:tag :strong :content ["Hello dere!"]}
              expected "**Hello dere!**"
              actual (process element markdown-dispatcher)]
            (is (= expected actual)))))
| 
 | ||||
;; A division renders as its content between two newlines.
(deftest div-test
    (testing "DIVision tag."
        (let [element {:tag :div :content ["Hello dere!"]}
              expected "\nHello dere!\n"
              actual (process element markdown-dispatcher)]
            (is (= expected actual)))))
| 
 | ||||
;; `:em` and `:i` must both render as `*text*`.
(deftest em-test
    (testing "EMphasis tag."
        (let [element {:tag :em :content ["Hello dere!"]}
              expected "*Hello dere!*"
              actual (process element markdown-dispatcher)]
            (is (= expected actual))))
    (testing "Italics tag"
        (let [element {:tag :i :content ["Hello dere!"]}
              expected "*Hello dere!*"
              actual (process element markdown-dispatcher)]
            (is (= expected actual)))))
| 
 | ||||
;; Headings: level N renders as a newline, N hashes, a space, the text
;; and a closing newline.
(deftest h1-test
    (testing "Level 1 header tag."
        (let [element {:tag :h1 :content ["Hello dere!"]}
              expected "\n# Hello dere!\n"
              actual (process element markdown-dispatcher)]
            (is (= expected actual)))))

(deftest h2-test
    (testing "Level 2 header tag."
        (let [element {:tag :h2 :content ["Hello dere!"]}
              expected "\n## Hello dere!\n"
              actual (process element markdown-dispatcher)]
            (is (= expected actual)))))

(deftest h3-test
    (testing "Level 3 header tag."
        (let [element {:tag :h3 :content ["Hello dere!"]}
              expected "\n### Hello dere!\n"
              actual (process element markdown-dispatcher)]
            (is (= expected actual)))))

(deftest h4-test
    (testing "Level 4 header tag."
        (let [element {:tag :h4 :content ["Hello dere!"]}
              expected "\n#### Hello dere!\n"
              actual (process element markdown-dispatcher)]
            (is (= expected actual)))))

(deftest h5-test
    (testing "Level 5 header tag."
        (let [element {:tag :h5 :content ["Hello dere!"]}
              expected "\n##### Hello dere!\n"
              actual (process element markdown-dispatcher)]
            (is (= expected actual)))))

(deftest h6-test
    (testing "Level 6 header tag."
        (let [element {:tag :h6 :content ["Hello dere!"]}
              expected "\n###### Hello dere!\n"
              actual (process element markdown-dispatcher)]
            (is (= expected actual)))))
| 
 | ||||
;; An image renders as `![alt](src)`, built from the `:alt` and `:src`
;; attributes of the element.
(deftest img-test
    (testing "Image tag."
        (let [expected "![Hello dere!](http://foo.bar/image.png)"
              actual (process
                         {:tag :img
                          :attrs {:src "http://foo.bar/image.png"
                                  :alt "Hello dere!"}}
                         markdown-dispatcher)]
            (is (= expected actual)))))
| 
 | ||||
;; Lists: every item goes on its own line, numbered from 1 for `:ol` and
;; bulleted with `* ` for `:ul`; the list is preceded by one extra newline
;; and followed by two.
(deftest list-test
    (testing "ordered list tag."
        (let [expected "\n\n1. foo\n2. bar\n3. ban\n\n"
              actual (process
                         {:tag :ol
                          :content
                          [{:tag :li :content ["foo"]}
                           {:tag :li :content ["bar"]}
                           {:tag :li :content ["ban"]}]}
                         markdown-dispatcher)]
            (is (= expected actual))))
    (testing "unordered list tag."
        (let [expected "\n\n* foo\n* bar\n* ban\n\n"
              actual (process
                         {:tag :ul
                          :content
                          [{:tag :li :content ["foo"]}
                           {:tag :li :content ["bar"]}
                           {:tag :li :content ["ban"]}]}
                         markdown-dispatcher)]
            (is (= expected actual)))))
| 
 | ||||
		Loading…
	
		Reference in a new issue