From cb801b193f5d115a89a4cfe75fce94f5e6bfbcfe Mon Sep 17 00:00:00 2001 From: Simon Brooke Date: Tue, 30 Apr 2019 20:05:46 +0100 Subject: [PATCH] Added the blogger scraper. --- README.md | 27 ++++---- doc/intro.md | 113 +++++++++++++++++++++++++++++++++ src/html_to_md/core.clj | 15 +++-- src/html_to_md/html_to_md.clj | 1 + src/html_to_md/transformer.clj | 4 +- 5 files changed, 140 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 8060139..f025d14 100644 --- a/README.md +++ b/README.md @@ -21,29 +21,29 @@ To use this library in your project, add the following leiningen dependency: To use it in your namespace, require: - [html-to-md/transformer :refer [transform process]] - [html-to-md/html-to-md :refer [markdown-dispatcher]] + [html-to-md.core :refer [html-to-md]] + +For default usage, that's all you need. To play more sophisticated tricks, +consider: + + [html-to-md.transformer :refer [transform process]] + [html-to-md.html-to-md :refer [markdown-dispatcher]] The intended usage is as follows: ```clojure -(require '[html-to-md.transformer :refer [transform]]) -(require '[html-to-md.html-to-md :refer [markdown-dispatcher]]) +(require '[html-to-md.core :refer [html-to-md]]) -(transform URL markdown-dispatcher) +(html-to-md url output-file) ``` -Where URL is any URL that references an HTML, SGML, XHTML or XML document. -However, my fancy multi-method doesn't work yet and may well be the wrong -approach, so for now use +This will read (X)HTML from `url` and write Markdown to `output-file`. 
If +`output-file` is not supplied, it will return the markdown as a string: ```clojure +(require '[html-to-md.core :refer [html-to-md]]) -(require '[html-to-md.transformer :refer [process]]) -(require '[html-to-md.html-to-md :refer [markdown-dispatcher]]) -(require '[net.cgrand.enlive-html :as html]) - -(process (html/html-resource URL) markdown-dispatcher) +(def md (html-to-md url)) ``` ## Extending the transformer @@ -66,3 +66,4 @@ Copyright © 2019 Simon Brooke Distributed under the Eclipse Public License either version 1.0 or (at your option) any later version. + diff --git a/doc/intro.md b/doc/intro.md index df2a804..43afb64 100644 --- a/doc/intro.md +++ b/doc/intro.md @@ -1,3 +1,116 @@ # Introduction to html-to-md TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) + +## Introduction + +The itch I'm trying to scratch at present is to transform +[Blogger.com](http://www.blogger.com)'s dreadful tag-soup markup into markdown; +but my architecture for doing this is to build a completely general [HT|SG|X]ML +transformation framework and then specialise it. + +**WARNING:** this is presently alpha-quality code, although it does have fair +unit test coverage. + +## Usage + +To use this library in your project, add the following leiningen dependency: + + [org.clojars.simon_brooke/html-to-md "0.1.0"] + +To use it in your namespace, require: + + [html-to-md.core :refer [html-to-md]] + +For default usage, that's all you need. To play more sophisticated tricks, +consider: + + [html-to-md.transformer :refer [transform process]] + [html-to-md.html-to-md :refer [markdown-dispatcher]] + +The intended usage is as follows: + +```clojure +(require '[html-to-md.core :refer [html-to-md]]) + +(html-to-md url output-file) +``` + +This will read (X)HTML from `url` and write Markdown to `output-file`. 
If +`output-file` is not supplied, it will return the markdown as a string: + +```clojure +(require '[html-to-md.core :refer [html-to-md]]) + +(def md (html-to-md url)) +``` + +## Extending the transformer + +In principle, the transformer can transform any [HT|SG|X]ML markup into any +other, or into any textual form. To extend it to do something other than +markdown, supply a **dispatcher**. A dispatcher is essentially a function of one +argument, a [HT|SG|X]ML tag represented as a Clojure keyword, which returns +a **processor**, which should be a function of two arguments: an element assumed +to have that tag, and a dispatcher. The processor should return the value that +you want elements of that tag transformed into. + +Thus the `html-to-md.html-to-md` namespace comprises a number of *processor* +functions, such as this one: + +```clojure +(defn markdown-a + "Process the anchor element `e` into markdown, using dispatcher `d`." + [e d] + (str + "[" + (s/trim (apply str (process (:content e) d))) + "](" + (-> e :attrs :href) + ")")) +``` + +and a *dispatcher* map: + +```clojure +(def markdown-dispatcher + "A despatcher for transforming (X)HTML into Markdown." + {:a markdown-a + :b markdown-strong + :br markdown-br + :code markdown-code + :body markdown-default + :div markdown-div + :em markdown-em + :h1 markdown-h1 + :h2 markdown-h2 + :h3 markdown-h3 + :h4 markdown-h4 + :h5 markdown-h5 + :h6 markdown-h6 + :html markdown-html + :i markdown-em + :img markdown-img + :ol markdown-ol + :p markdown-div + :pre markdown-pre + :samp markdown-code + :script markdown-omit + :span markdown-default + :strong markdown-strong + :style markdown-omit + :ul markdown-ul + }) +``` + +Obviously it is convenient to write dispatchers as maps, but it isn't required +that you do so: anything which, given a keyword, will return a processor, will +work.
+ +## License + +Copyright © 2019 Simon Brooke + +Distributed under the Eclipse Public License either version 1.0 or (at +your option) any later version. + diff --git a/src/html_to_md/core.clj b/src/html_to_md/core.clj index 27460d3..ede8350 100644 --- a/src/html_to_md/core.clj +++ b/src/html_to_md/core.clj @@ -1,6 +1,11 @@ -(ns html-to-md.core) +(ns html-to-md.core + (:require [html-to-md.transformer :refer [transform process]] + [html-to-md.html-to-md :refer [markdown-dispatcher]])) -(defn foo - "I don't do a whole lot." - [x] - (println x "Hello, World!")) +(defn html-to-md + "Transform the HTML document referenced by `url` into Markdown; return it as + a string, or, if `output` is supplied, `spit` it to `output` instead." + ([url] + (apply str (transform url markdown-dispatcher))) + ([url output] + (spit output (html-to-md url)))) diff --git a/src/html_to_md/html_to_md.clj b/src/html_to_md/html_to_md.clj index 662a23d..c4d6ea7 100644 --- a/src/html_to_md/html_to_md.clj +++ b/src/html_to_md/html_to_md.clj @@ -165,6 +165,7 @@ (def markdown-dispatcher + "A despatcher for transforming (X)HTML into Markdown." {:a markdown-a :b markdown-strong :br markdown-br diff --git a/src/html_to_md/transformer.clj b/src/html_to_md/transformer.clj index 931343c..5a15981 100644 --- a/src/html_to_md/transformer.clj +++ b/src/html_to_md/transformer.clj @@ -29,7 +29,7 @@ (string? element) element (or (seq? element) (vector? element)) - (doall (map #(process % dispatcher) element)))) + (remove nil? (map #(process % dispatcher) element)))) (defn- transformer-dispatch [a _] @@ -45,7 +45,7 @@ (process obj dispatcher)) (defmethod transform java.net.URI [uri dispatcher] - (process (html/html-resource uri) dispatcher)) + (remove nil? (process (html/html-resource uri) dispatcher))) (defmethod transform java.net.URL [url dispatcher] (transform (.toURI url) dispatcher))