Blogger scraper tidied up and documented.
This commit is contained in:
parent
cb801b193f
commit
80cc2e4335
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -11,3 +11,5 @@ pom.xml.asc
|
||||||
.hgignore
|
.hgignore
|
||||||
.hg/
|
.hg/
|
||||||
*~
|
*~
|
||||||
|
|
||||||
|
test\.md
|
||||||
|
|
18
README.md
18
README.md
|
@ -1,6 +1,7 @@
|
||||||
# html-to-md
|
# html-to-md
|
||||||
|
|
||||||
A Clojure library designed to convert (Enlivened) HTML to markdown; but, more
|
A Clojure library designed to convert
|
||||||
|
([Enlive](https://github.com/cgrand/enlive)ned) HTML to markdown; but, more
|
||||||
generally, a framework for [HT|SG|X]ML transformation.
|
generally, a framework for [HT|SG|X]ML transformation.
|
||||||
|
|
||||||
## Introduction
|
## Introduction
|
||||||
|
@ -46,6 +47,21 @@ This will read (X)HTML from `url` and write Markdown to `output-file`. If
|
||||||
(def md (html-to-md url))
|
(def md (html-to-md url))
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If you are specifically scraping [blogger.com](https://www.blogger.com/)
|
||||||
|
pages, you may *try* the following recipe:
|
||||||
|
|
||||||
|
```clojure
|
||||||
|
(require '[html-to-md.core :refer [blogger-to-md]])
|
||||||
|
|
||||||
|
(blogger-to-md url output-file)
|
||||||
|
```
|
||||||
|
|
||||||
|
It works for my blogger pages. However, I'm not sure to what extent the
|
||||||
|
skinning of blogger pages is pure CSS (in which case my recipe should work
|
||||||
|
for yours) and to what extent it's HTML templating (in which case it
|
||||||
|
probably won't). Results not guaranteed; if it doesn't work, you get to
|
||||||
|
keep all the pieces.
|
||||||
|
|
||||||
## Extending the transformer
|
## Extending the transformer
|
||||||
|
|
||||||
In principle, the transformer can transform any [HT|SG|X]ML markup into any
|
In principle, the transformer can transform any [HT|SG|X]ML markup into any
|
||||||
|
|
19
doc/intro.md
19
doc/intro.md
|
@ -1,9 +1,5 @@
|
||||||
# Introduction to html-to-md
|
# Introduction to html-to-md
|
||||||
|
|
||||||
TODO: write [great documentation](http://jacobian.org/writing/what-to-write/)
|
|
||||||
|
|
||||||
## Introduction
|
|
||||||
|
|
||||||
The itch I'm trying to scratch at present is to transform
|
The itch I'm trying to scratch at present is to transform
|
||||||
[Blogger.com](http://www.blogger.com)'s dreadful tag-soup markup into markdown;
|
[Blogger.com](http://www.blogger.com)'s dreadful tag-soup markup into markdown;
|
||||||
but my architecture for doing this is to build a completely general [HT|SG|X]ML
|
but my architecture for doing this is to build a completely general [HT|SG|X]ML
|
||||||
|
@ -45,6 +41,21 @@ This will read (X)HTML from `url` and write Markdown to `output-file`. If
|
||||||
(def md (html-to-md url))
|
(def md (html-to-md url))
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If you are specifically scraping [blogger.com](https://www.blogger.com/)
|
||||||
|
pages, you may *try* the following recipe:
|
||||||
|
|
||||||
|
```clojure
|
||||||
|
(require '[html-to-md.core :refer [blogger-to-md]])
|
||||||
|
|
||||||
|
(blogger-to-md url output-file)
|
||||||
|
```
|
||||||
|
|
||||||
|
It works for my blogger pages. However, I'm not sure to what extent the
|
||||||
|
skinning of blogger pages is pure CSS (in which case my recipe should work
|
||||||
|
for yours) and to what extent it's HTML templating (in which case it
|
||||||
|
probably won't). Results not guaranteed; if it doesn't work, you get to
|
||||||
|
keep all the pieces.
|
||||||
|
|
||||||
## Extending the transformer
|
## Extending the transformer
|
||||||
|
|
||||||
In principle, the transformer can transform any [HT|SG|X]ML markup into any
|
In principle, the transformer can transform any [HT|SG|X]ML markup into any
|
||||||
|
|
41
src/html_to_md/blogger_to_md.clj
Normal file
41
src/html_to_md/blogger_to_md.clj
Normal file
|
@ -0,0 +1,41 @@
|
||||||
|
(ns html-to-md.blogger-to-md
|
||||||
|
(:require [clojure.string :as s]
|
||||||
|
[html-to-md.html-to-md :refer [markdown-dispatcher markdown-header]]
|
||||||
|
[html-to-md.transformer :refer [process]]
|
||||||
|
[net.cgrand.enlive-html :as html]))
|
||||||
|
|
||||||
|
(defn blogger-scraper
|
||||||
|
"Processor which scrapes the actual post content out of a blogger page.
|
||||||
|
*NOTE:* This was written to scrape *my* blogger pages, yours may be
|
||||||
|
different!"
|
||||||
|
[e d]
|
||||||
|
(let [title (first (html/select e [:h3.post-title]))
|
||||||
|
content (html/select e [:div.post-body])]
|
||||||
|
(if (and title content)
|
||||||
|
(apply
|
||||||
|
str
|
||||||
|
(cons
|
||||||
|
(markdown-header title d 1)
|
||||||
|
(process content d))))))
|
||||||
|
|
||||||
|
(defn image-table-processor
|
||||||
|
"Blogger's horrible tag soup wraps images in tables. Is this table such
|
||||||
|
a table? If so extract the image from it and process it to markdown;
|
||||||
|
otherwise, fall back on what `markdown-dispatcher` would do with the
|
||||||
|
table (which is currently nothing, but that will change)."
|
||||||
|
[e d]
|
||||||
|
(let [caption (process (first (html/select e [:td.tr-caption])) d)
|
||||||
|
alt (if caption (s/trim (apply str caption)))
|
||||||
|
image (first (html/select e [:img]))
|
||||||
|
src (if image (-> image :attrs :src))]
|
||||||
|
(if image
|
||||||
|
(str "")
|
||||||
|
(process e markdown-dispatcher))))
|
||||||
|
|
||||||
|
|
||||||
|
(def blogger-dispatcher
|
||||||
|
"Adaptation of `markdown-dispatcher`, q.v., with the `:table` and
|
||||||
|
`:html` dispatches overridden."
|
||||||
|
(assoc markdown-dispatcher
|
||||||
|
:html blogger-scraper
|
||||||
|
:table image-table-processor))
|
|
@ -1,6 +1,7 @@
|
||||||
(ns html-to-md.core
|
(ns html-to-md.core
|
||||||
(:require [html-to-md.transformer :refer [transform process]]
|
(:require [html-to-md.transformer :refer [transform process]]
|
||||||
[html-to-md.html-to-md :refer [markdown-dispatcher]]))
|
[html-to-md.html-to-md :refer [markdown-dispatcher]]
|
||||||
|
[html-to-md.blogger-to-md :refer [blogger-dispatcher]]))
|
||||||
|
|
||||||
(defn html-to-md
|
(defn html-to-md
|
||||||
"Transform the HTML document referenced by `url` into Markdown, and write
|
"Transform the HTML document referenced by `url` into Markdown, and write
|
||||||
|
@ -9,3 +10,12 @@
|
||||||
(apply str (transform url markdown-dispatcher)))
|
(apply str (transform url markdown-dispatcher)))
|
||||||
([url output]
|
([url output]
|
||||||
(spit output (html-to-md url))))
|
(spit output (html-to-md url))))
|
||||||
|
|
||||||
|
(defn blogger-to-md
|
||||||
|
"Transform the Blogger post referenced by `url` into Markdown, and write
|
||||||
|
it to `output`, if supplied. *NOTE:* This was written to scrape *my*
|
||||||
|
blogger pages, yours may be different!"
|
||||||
|
([url]
|
||||||
|
(apply str (transform url blogger-dispatcher)))
|
||||||
|
([url output]
|
||||||
|
(spit output (blogger-to-md url))))
|
||||||
|
|
|
@ -18,7 +18,7 @@
|
||||||
"Process the line-break element `e`, so beloved of tag-soupers, into
|
"Process the line-break element `e`, so beloved of tag-soupers, into
|
||||||
markdown"
|
markdown"
|
||||||
[e d]
|
[e d]
|
||||||
"\n\n")
|
"\n")
|
||||||
|
|
||||||
(defn markdown-code
|
(defn markdown-code
|
||||||
"Process the code or samp `e` into markdown, using dispatcher `d`."
|
"Process the code or samp `e` into markdown, using dispatcher `d`."
|
||||||
|
|
Loading…
Reference in a new issue