Blogger scraper tidied up and documented.
This commit is contained in:
parent
cb801b193f
commit
80cc2e4335
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -11,3 +11,5 @@ pom.xml.asc
|
|||
.hgignore
|
||||
.hg/
|
||||
*~
|
||||
|
||||
test\.md
|
||||
|
|
18
README.md
18
README.md
|
@ -1,6 +1,7 @@
|
|||
# html-to-md
|
||||
|
||||
A Clojure library designed to convert (Enlivened) HTML to markdown; but, more
|
||||
A Clojure library designed to convert
|
||||
([Enlive](https://github.com/cgrand/enlive)ned) HTML to markdown; but, more
|
||||
generally, a framework for [HT|SG|X]ML transformation.
|
||||
|
||||
## Introduction
|
||||
|
@ -46,6 +47,21 @@ This will read (X)HTML from `url` and write Markdown to `output-file`. If
|
|||
(def md (html-to-md url))
|
||||
```
|
||||
|
||||
If you are specifically scraping [blogger.com](https://www.blogger.com/)
|
||||
pages, you may *try* the following recipe:
|
||||
|
||||
```clojure
|
||||
(require '[html-to-md.core :refer [blogger-to-md]])
|
||||
|
||||
(blogger-to-md url output-file)
|
||||
```
|
||||
|
||||
It works for my blogger pages. However, I'm not sure to what extent the
|
||||
skinning of blogger pages is pure CSS (in which case my recipe should work
|
||||
for yours) and to what extent it's HTML templating (in which case it
|
||||
probably won't). Results are not guaranteed; if it doesn't work, you get to
|
||||
keep all the pieces.
|
||||
|
||||
## Extending the transformer
|
||||
|
||||
In principle, the transformer can transform any [HT|SG|X]ML markup into any
|
||||
|
|
19
doc/intro.md
19
doc/intro.md
|
@ -1,9 +1,5 @@
|
|||
# Introduction to html-to-md
|
||||
|
||||
TODO: write [great documentation](http://jacobian.org/writing/what-to-write/)
|
||||
|
||||
## Introduction
|
||||
|
||||
The itch I'm trying to scratch at present is to transform
|
||||
[Blogger.com](http://www.blogger.com)'s dreadful tag-soup markup into markdown;
|
||||
but my architecture for doing this is to build a completely general [HT|SG|X]ML
|
||||
|
@ -45,6 +41,21 @@ This will read (X)HTML from `url` and write Markdown to `output-file`. If
|
|||
(def md (html-to-md url))
|
||||
```
|
||||
|
||||
If you are specifically scraping [blogger.com](https://www.blogger.com/)
|
||||
pages, you may *try* the following recipe:
|
||||
|
||||
```clojure
|
||||
(require '[html-to-md.core :refer [blogger-to-md]])
|
||||
|
||||
(blogger-to-md url output-file)
|
||||
```
|
||||
|
||||
It works for my blogger pages. However, I'm not sure to what extent the
|
||||
skinning of blogger pages is pure CSS (in which case my recipe should work
|
||||
for yours) and to what extent it's HTML templating (in which case it
|
||||
probably won't). Results are not guaranteed; if it doesn't work, you get to
|
||||
keep all the pieces.
|
||||
|
||||
## Extending the transformer
|
||||
|
||||
In principle, the transformer can transform any [HT|SG|X]ML markup into any
|
||||
|
|
41
src/html_to_md/blogger_to_md.clj
Normal file
41
src/html_to_md/blogger_to_md.clj
Normal file
|
@ -0,0 +1,41 @@
|
|||
(ns html-to-md.blogger-to-md
|
||||
(:require [clojure.string :as s]
|
||||
[html-to-md.html-to-md :refer [markdown-dispatcher markdown-header]]
|
||||
[html-to-md.transformer :refer [process]]
|
||||
[net.cgrand.enlive-html :as html]))
|
||||
|
||||
(defn blogger-scraper
  "Processor which scrapes the actual post content out of a blogger page.

  Selects the first `h3.post-title` element and all `div.post-body`
  elements beneath `e`. When both are present, returns a single string:
  the title passed through `markdown-header` (called with `1`, presumably
  the heading level) followed by the result of processing the post body
  with dispatcher `d`. Returns `nil` when either is missing.

  *NOTE:* This was written to scrape *my* blogger pages, yours may be
  different!"
  [e d]
  (let [title (first (html/select e [:h3.post-title]))
        ;; `html/select` yields a (possibly empty) sequence, and an empty
        ;; sequence is truthy in Clojure; `seq` turns "no match" into nil
        ;; so that the guard below really does detect a missing post body.
        content (seq (html/select e [:div.post-body]))]
    (when (and title content)
      (apply
        str
        (cons
          (markdown-header title d 1)
          (process content d))))))
|
||||
|
||||
(defn image-table-processor
  "Blogger's horrible tag soup wraps images in tables. Is this table such
  a table? If so extract the image from it and process it to markdown;
  otherwise, fall back on what `markdown-dispatcher` would do with the
  table (which is currently nothing, but that will change).

  `e` is the table element. `d` is the dispatcher used to process the
  caption cell (`td.tr-caption`); the trimmed caption text becomes the
  image's alt text, and the `:src` attribute of the first `img` found in
  the table becomes the link target."
  [e d]
  (let [caption (process (first (html/select e [:td.tr-caption])) d)
        alt (when caption (s/trim (apply str caption)))
        image (first (html/select e [:img]))
        src (when image (-> image :attrs :src))]
    (if image
      ;; Emit a markdown image: exclamation mark, alt text in square
      ;; brackets, source URL in parentheses.
      (str "![" alt "](" src ")")
      ;; NOTE(review): the fallback deliberately passes `markdown-dispatcher`
      ;; rather than `d` -- presumably because `d` maps `:table` back to this
      ;; function, which would recurse on the same element. Confirm against
      ;; `process`.
      (process e markdown-dispatcher))))
|
||||
|
||||
|
||||
(def blogger-dispatcher
  "Dispatcher for blogger pages: a copy of `markdown-dispatcher`, q.v., in
  which `:html` is handled by `blogger-scraper` and `:table` is handled by
  `image-table-processor`. All other dispatches are inherited unchanged."
  (-> markdown-dispatcher
      (assoc :html blogger-scraper)
      (assoc :table image-table-processor)))
|
|
@ -1,6 +1,7 @@
|
|||
(ns html-to-md.core
|
||||
(:require [html-to-md.transformer :refer [transform process]]
|
||||
[html-to-md.html-to-md :refer [markdown-dispatcher]]))
|
||||
[html-to-md.html-to-md :refer [markdown-dispatcher]]
|
||||
[html-to-md.blogger-to-md :refer [blogger-dispatcher]]))
|
||||
|
||||
(defn html-to-md
|
||||
"Transform the HTML document referenced by `url` into Markdown, and write
|
||||
|
@ -9,3 +10,12 @@
|
|||
(apply str (transform url markdown-dispatcher)))
|
||||
([url output]
|
||||
(spit output (html-to-md url))))
|
||||
|
||||
(defn blogger-to-md
  "Convert the Blogger post found at `url` to Markdown using
  `blogger-dispatcher`. Called with `url` alone, returns the Markdown as
  a single string; called with `url` and `output`, `spit`s that string to
  `output` instead. *NOTE:* This was written to scrape *my* blogger
  pages, yours may be different!"
  ([url]
   ;; `transform` yields the pieces of the document; concatenate them.
   (->> blogger-dispatcher
        (transform url)
        (apply str)))
  ([url output]
   (->> (blogger-to-md url)
        (spit output))))
|
||||
|
|
|
@ -18,7 +18,7 @@
|
|||
"Process the line-break element `e`, so beloved of tag-soupers, into
|
||||
markdown"
|
||||
[e d]
|
||||
"\n\n")
|
||||
"\n")
|
||||
|
||||
(defn markdown-code
|
||||
"Process the code or samp `e` into markdown, using dispatcher `d`."
|
||||
|
|
Loading…
Reference in a new issue