Merge branch 'release/0.3.0'

This commit is contained in:
Simon Brooke 2019-05-01 14:10:30 +01:00
commit b00a4e4890
18 changed files with 943 additions and 12 deletions

View file

@ -18,7 +18,7 @@ unit test coverage.
To use this library in your project, add the following leiningen dependency: To use this library in your project, add the following leiningen dependency:
[org.clojars.simon_brooke/html-to-md "0.2.0"] [org.clojars.simon_brooke/html-to-md "0.3.0"]
To use it in your namespace, require: To use it in your namespace, require:

View file

@ -12,7 +12,7 @@ unit test coverage.
To use this library in your project, add the following leiningen dependency: To use this library in your project, add the following leiningen dependency:
[org.clojars.simon_brooke/html-to-md "0.2.0"] [org.clojars.simon_brooke/html-to-md "0.3.0"]
To use it in your namespace, require: To use it in your namespace, require:

551
docs/css/default.css Normal file
View file

@ -0,0 +1,551 @@
body {
font-family: Helvetica, Arial, sans-serif;
font-size: 15px;
}
/* Monospace stack shared by preformatted blocks and inline code.
   Family names containing spaces are quoted so they are parsed as one name. */
pre, code {
    font-family: Monaco, "DejaVu Sans Mono", Consolas, monospace;
    font-size: 9pt;
    margin: 15px 0;
}
h1 {
font-weight: normal;
font-size: 29px;
margin: 10px 0 2px 0;
padding: 0;
}
h2 {
font-weight: normal;
font-size: 25px;
}
h5.license {
margin: 9px 0 22px 0;
color: #555;
font-weight: normal;
font-size: 12px;
font-style: italic;
}
.document h1, .namespace-index h1 {
font-size: 32px;
margin-top: 12px;
}
#header, #content, .sidebar {
position: fixed;
}
#header {
top: 0;
left: 0;
right: 0;
height: 22px;
color: #f5f5f5;
padding: 5px 7px;
}
#content {
top: 32px;
right: 0;
bottom: 0;
overflow: auto;
background: #fff;
color: #333;
padding: 0 18px;
}
.sidebar {
position: fixed;
top: 32px;
bottom: 0;
overflow: auto;
}
.sidebar.primary {
background: #e2e2e2;
border-right: solid 1px #cccccc;
left: 0;
width: 250px;
}
.sidebar.secondary {
background: #f2f2f2;
border-right: solid 1px #d7d7d7;
left: 251px;
width: 200px;
}
#content.namespace-index, #content.document {
left: 251px;
}
#content.namespace-docs {
left: 452px;
}
#content.document {
padding-bottom: 10%;
}
#header {
background: #3f3f3f;
box-shadow: 0 0 8px rgba(0, 0, 0, 0.4);
z-index: 100;
}
#header h1 {
margin: 0;
padding: 0;
font-size: 18px;
font-weight: lighter;
text-shadow: -1px -1px 0px #333;
}
#header h1 .project-version {
font-weight: normal;
}
.project-version {
padding-left: 0.15em;
}
#header a, .sidebar a {
display: block;
text-decoration: none;
}
#header a {
color: #f5f5f5;
}
.sidebar a {
color: #333;
}
#header h2 {
float: right;
font-size: 9pt;
font-weight: normal;
margin: 4px 3px;
padding: 0;
color: #bbb;
}
#header h2 a {
display: inline;
}
.sidebar h3 {
margin: 0;
padding: 10px 13px 0 13px;
font-size: 19px;
font-weight: lighter;
}
.sidebar h3 a {
color: #444;
}
.sidebar h3.no-link {
color: #636363;
}
.sidebar ul {
padding: 7px 0 6px 0;
margin: 0;
}
.sidebar ul.index-link {
padding-bottom: 4px;
}
.sidebar li {
display: block;
vertical-align: middle;
}
.sidebar li a, .sidebar li .no-link {
border-left: 3px solid transparent;
padding: 0 10px;
white-space: nowrap;
}
.sidebar li .no-link {
display: block;
color: #777;
font-style: italic;
}
.sidebar li .inner {
display: inline-block;
padding-top: 7px;
height: 24px;
}
.sidebar li a, .sidebar li .tree {
height: 31px;
}
.depth-1 .inner { padding-left: 2px; }
.depth-2 .inner { padding-left: 6px; }
.depth-3 .inner { padding-left: 20px; }
.depth-4 .inner { padding-left: 34px; }
.depth-5 .inner { padding-left: 48px; }
.depth-6 .inner { padding-left: 62px; }
.sidebar li .tree {
display: block;
float: left;
position: relative;
top: -10px;
margin: 0 4px 0 0;
padding: 0;
}
.sidebar li.depth-1 .tree {
display: none;
}
.sidebar li .tree .top, .sidebar li .tree .bottom {
display: block;
margin: 0;
padding: 0;
width: 7px;
}
.sidebar li .tree .top {
border-left: 1px solid #aaa;
border-bottom: 1px solid #aaa;
height: 19px;
}
.sidebar li .tree .bottom {
height: 22px;
}
.sidebar li.branch .tree .bottom {
border-left: 1px solid #aaa;
}
.sidebar.primary li.current a {
border-left: 3px solid #a33;
color: #a33;
}
.sidebar.secondary li.current a {
border-left: 3px solid #33a;
color: #33a;
}
.namespace-index h2 {
margin: 30px 0 0 0;
}
.namespace-index h3 {
font-size: 16px;
font-weight: bold;
margin-bottom: 0;
}
.namespace-index .topics {
padding-left: 30px;
margin: 11px 0 0 0;
}
.namespace-index .topics li {
padding: 5px 0;
}
.namespace-docs h3 {
font-size: 18px;
font-weight: bold;
}
.public h3 {
margin: 0;
float: left;
}
.usage {
clear: both;
}
.public {
margin: 0;
border-top: 1px solid #e0e0e0;
padding-top: 14px;
padding-bottom: 6px;
}
.public:last-child {
margin-bottom: 20%;
}
.members .public:last-child {
margin-bottom: 0;
}
.members {
margin: 15px 0;
}
.members h4 {
color: #555;
font-weight: normal;
font-variant: small-caps;
margin: 0 0 5px 0;
}
.members .inner {
padding-top: 5px;
padding-left: 12px;
margin-top: 2px;
margin-left: 7px;
border-left: 1px solid #bbb;
}
#content .members .inner h3 {
font-size: 12pt;
}
.members .public {
border-top: none;
margin-top: 0;
padding-top: 6px;
padding-bottom: 0;
}
.members .public:first-child {
padding-top: 0;
}
h4.type,
h4.dynamic,
h4.added,
h4.deprecated {
float: left;
margin: 3px 10px 15px 0;
font-size: 15px;
font-weight: bold;
font-variant: small-caps;
}
.public h4.type,
.public h4.dynamic,
.public h4.added,
.public h4.deprecated {
font-size: 13px;
font-weight: bold;
margin: 3px 0 0 10px;
}
.members h4.type,
.members h4.added,
.members h4.deprecated {
margin-top: 1px;
}
h4.type {
color: #717171;
}
h4.dynamic {
color: #9933aa;
}
h4.added {
color: #508820;
}
h4.deprecated {
color: #880000;
}
.namespace {
margin-bottom: 30px;
}
.namespace:last-child {
margin-bottom: 10%;
}
.index {
padding: 0;
font-size: 80%;
margin: 15px 0;
line-height: 16px;
}
.index * {
display: inline;
}
.index p {
padding-right: 3px;
}
.index li {
padding-right: 5px;
}
.index ul {
padding-left: 0;
}
.type-sig {
clear: both;
color: #088;
}
.type-sig pre {
padding-top: 10px;
margin: 0;
}
.usage code {
display: block;
color: #008;
margin: 2px 0;
}
.usage code:first-child {
padding-top: 10px;
}
p {
margin: 15px 0;
}
.public p:first-child, .public pre.plaintext {
margin-top: 12px;
}
.doc {
margin: 0 0 26px 0;
clear: both;
}
.public .doc {
margin: 0;
}
.namespace-index .doc {
margin-bottom: 20px;
}
.namespace-index .namespace .doc {
margin-bottom: 10px;
}
.markdown p, .markdown li, .markdown dt, .markdown dd, .markdown td {
line-height: 22px;
}
.markdown li {
padding: 2px 0;
}
.markdown h2 {
font-weight: normal;
font-size: 25px;
margin: 30px 0 10px 0;
}
.markdown h3 {
font-weight: normal;
font-size: 20px;
margin: 30px 0 0 0;
}
.markdown h4 {
font-size: 15px;
margin: 22px 0 -4px 0;
}
.doc, .public, .namespace .index {
max-width: 680px;
overflow-x: visible;
}
.markdown pre > code {
display: block;
padding: 10px;
}
.markdown pre > code, .src-link a {
border: 1px solid #e4e4e4;
border-radius: 2px;
}
.markdown code:not(.hljs), .src-link a {
background: #f6f6f6;
}
pre.deps {
display: inline-block;
margin: 0 10px;
border: 1px solid #e4e4e4;
border-radius: 2px;
padding: 10px;
background-color: #f6f6f6;
}
.markdown hr {
border-style: solid;
border-top: none;
color: #ccc;
}
.doc ul, .doc ol {
padding-left: 30px;
}
.doc table {
border-collapse: collapse;
margin: 0 10px;
}
.doc table td, .doc table th {
border: 1px solid #dddddd;
padding: 4px 6px;
}
.doc table th {
background: #f2f2f2;
}
.doc dl {
margin: 0 10px 20px 10px;
}
.doc dl dt {
font-weight: bold;
margin: 0;
padding: 3px 0;
border-bottom: 1px solid #ddd;
}
.doc dl dd {
padding: 5px 0;
margin: 0 0 5px 10px;
}
.doc abbr {
border-bottom: 1px dotted #333;
font-variant: none;
cursor: help;
}
.src-link {
margin-bottom: 15px;
}
.src-link a {
font-size: 70%;
padding: 1px 4px;
text-decoration: none;
color: #5555bb;
}

97
docs/css/highlight.css Normal file
View file

@ -0,0 +1,97 @@
/*
github.com style (c) Vasily Polovnyov <vast@whiteants.net>
*/
.hljs {
display: block;
overflow-x: auto;
padding: 0.5em;
color: #333;
background: #f8f8f8;
}
.hljs-comment,
.hljs-quote {
color: #998;
font-style: italic;
}
.hljs-keyword,
.hljs-selector-tag,
.hljs-subst {
color: #333;
font-weight: bold;
}
.hljs-number,
.hljs-literal,
.hljs-variable,
.hljs-template-variable,
.hljs-tag .hljs-attr {
color: #008080;
}
.hljs-string,
.hljs-doctag {
color: #d14;
}
.hljs-title,
.hljs-section,
.hljs-selector-id {
color: #900;
font-weight: bold;
}
.hljs-subst {
font-weight: normal;
}
.hljs-type,
.hljs-class .hljs-title {
color: #458;
font-weight: bold;
}
.hljs-tag,
.hljs-name,
.hljs-attribute {
color: #000080;
font-weight: normal;
}
.hljs-regexp,
.hljs-link {
color: #009926;
}
.hljs-symbol,
.hljs-bullet {
color: #990073;
}
.hljs-built_in,
.hljs-builtin-name {
color: #0086b3;
}
.hljs-meta {
color: #999;
font-weight: bold;
}
.hljs-deletion {
background: #fdd;
}
.hljs-addition {
background: #dfd;
}
.hljs-emphasis {
font-style: italic;
}
.hljs-strong {
font-weight: bold;
}

View file

@ -0,0 +1,3 @@
<!DOCTYPE html PUBLIC ""
"">
<html><head><meta charset="UTF-8" /><title>html-to-md.blogger-to-md documentation</title><link rel="stylesheet" type="text/css" href="css/default.css" /><link rel="stylesheet" type="text/css" href="css/highlight.css" /><script type="text/javascript" src="js/highlight.min.js"></script><script type="text/javascript" src="js/jquery.min.js"></script><script type="text/javascript" src="js/page_effects.js"></script><script>hljs.initHighlightingOnLoad();</script></head><body><div id="header"><h2>Generated by <a href="https://github.com/weavejester/codox">Codox</a></h2><h1><a href="index.html"><span class="project-title"><span class="project-name">Html-to-md</span> <span class="project-version">0.2.0</span></span></a></h1></div><div class="sidebar primary"><h3 class="no-link"><span class="inner">Project</span></h3><ul class="index-link"><li class="depth-1 "><a href="index.html"><div class="inner">Index</div></a></li></ul><h3 class="no-link"><span class="inner">Topics</span></h3><ul><li class="depth-1 "><a href="intro.html"><div class="inner"><span>Introduction to html-to-md</span></div></a></li></ul><h3 class="no-link"><span class="inner">Namespaces</span></h3><ul><li class="depth-1"><div class="no-link"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>html-to-md</span></div></div></li><li class="depth-2 branch current"><a href="html-to-md.blogger-to-md.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>blogger-to-md</span></div></a></li><li class="depth-2 branch"><a href="html-to-md.core.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>core</span></div></a></li><li class="depth-2 branch"><a href="html-to-md.html-to-md.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>html-to-md</span></div></a></li><li class="depth-2"><a 
href="html-to-md.transformer.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>transformer</span></div></a></li></ul></div><div class="sidebar secondary"><h3><a href="#top"><span class="inner">Public Vars</span></a></h3><ul><li class="depth-1"><a href="html-to-md.blogger-to-md.html#var-blogger-dispatcher"><div class="inner"><span>blogger-dispatcher</span></div></a></li><li class="depth-1"><a href="html-to-md.blogger-to-md.html#var-blogger-scraper"><div class="inner"><span>blogger-scraper</span></div></a></li><li class="depth-1"><a href="html-to-md.blogger-to-md.html#var-image-table-processor"><div class="inner"><span>image-table-processor</span></div></a></li></ul></div><div class="namespace-docs" id="content"><h1 class="anchor" id="top">html-to-md.blogger-to-md</h1><div class="doc"><div class="markdown"><p>Convert blogger posts to Markdown format, omitting all the Blogger chrome and navigation.</p></div></div><div class="public anchor" id="var-blogger-dispatcher"><h3>blogger-dispatcher</h3><div class="usage"></div><div class="doc"><div class="markdown"><p>Adaptation of <code>markdown-dispatcher</code>, q.v., with the <code>:table</code> and <code>:html</code> dispatches overridden.</p></div></div><div class="src-link"><a href="https://github.com/simon-brooke/html-to-md/blob/master/src/html_to_md/blogger_to_md.clj#L38">view source</a></div></div><div class="public anchor" id="var-blogger-scraper"><h3>blogger-scraper</h3><div class="usage"><code>(blogger-scraper e d)</code></div><div class="doc"><div class="markdown"><p>Processor which scrapes the actual post content out of a blogger page. 
<em>NOTE:</em> This was written to scrape <em>my</em> blogger pages, yours may be different!</p></div></div><div class="src-link"><a href="https://github.com/simon-brooke/html-to-md/blob/master/src/html_to_md/blogger_to_md.clj#L9">view source</a></div></div><div class="public anchor" id="var-image-table-processor"><h3>image-table-processor</h3><div class="usage"><code>(image-table-processor e d)</code></div><div class="doc"><div class="markdown"><p>Bloggers horrible tag soup wraps images in tables. Is this table such a table? If so extract the image from it and process it to markdown; otherwise, fall back on what <code>markdown-dispatcher</code> would do with the table (which is currently nothing, but that will change).</p></div></div><div class="src-link"><a href="https://github.com/simon-brooke/html-to-md/blob/master/src/html_to_md/blogger_to_md.clj#L23">view source</a></div></div></div></body></html>

View file

@ -0,0 +1,3 @@
<!DOCTYPE html PUBLIC ""
"">
<html><head><meta charset="UTF-8" /><title>html-to-md.core documentation</title><link rel="stylesheet" type="text/css" href="css/default.css" /><link rel="stylesheet" type="text/css" href="css/highlight.css" /><script type="text/javascript" src="js/highlight.min.js"></script><script type="text/javascript" src="js/jquery.min.js"></script><script type="text/javascript" src="js/page_effects.js"></script><script>hljs.initHighlightingOnLoad();</script></head><body><div id="header"><h2>Generated by <a href="https://github.com/weavejester/codox">Codox</a></h2><h1><a href="index.html"><span class="project-title"><span class="project-name">Html-to-md</span> <span class="project-version">0.2.0</span></span></a></h1></div><div class="sidebar primary"><h3 class="no-link"><span class="inner">Project</span></h3><ul class="index-link"><li class="depth-1 "><a href="index.html"><div class="inner">Index</div></a></li></ul><h3 class="no-link"><span class="inner">Topics</span></h3><ul><li class="depth-1 "><a href="intro.html"><div class="inner"><span>Introduction to html-to-md</span></div></a></li></ul><h3 class="no-link"><span class="inner">Namespaces</span></h3><ul><li class="depth-1"><div class="no-link"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>html-to-md</span></div></div></li><li class="depth-2 branch"><a href="html-to-md.blogger-to-md.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>blogger-to-md</span></div></a></li><li class="depth-2 branch current"><a href="html-to-md.core.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>core</span></div></a></li><li class="depth-2 branch"><a href="html-to-md.html-to-md.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>html-to-md</span></div></a></li><li class="depth-2"><a 
href="html-to-md.transformer.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>transformer</span></div></a></li></ul></div><div class="sidebar secondary"><h3><a href="#top"><span class="inner">Public Vars</span></a></h3><ul><li class="depth-1"><a href="html-to-md.core.html#var-blogger-to-md"><div class="inner"><span>blogger-to-md</span></div></a></li><li class="depth-1"><a href="html-to-md.core.html#var-html-to-md"><div class="inner"><span>html-to-md</span></div></a></li></ul></div><div class="namespace-docs" id="content"><h1 class="anchor" id="top">html-to-md.core</h1><div class="doc"><div class="markdown"><p>Top level functions intended for very simple use.</p></div></div><div class="public anchor" id="var-blogger-to-md"><h3>blogger-to-md</h3><div class="usage"><code>(blogger-to-md url)</code><code>(blogger-to-md url output)</code></div><div class="doc"><div class="markdown"><p>Transform the Blogger post referenced by <code>url</code> into Markdown, and write it to <code>output</code>, if supplied. <em>NOTE:</em> This was written to scrape <em>my</em> blogger pages, yours may be different!</p></div></div><div class="src-link"><a href="https://github.com/simon-brooke/html-to-md/blob/master/src/html_to_md/core.clj#L15">view source</a></div></div><div class="public anchor" id="var-html-to-md"><h3>html-to-md</h3><div class="usage"><code>(html-to-md url)</code><code>(html-to-md url output)</code></div><div class="doc"><div class="markdown"><p>Transform the HTML document referenced by <code>url</code> into Markdown, and write it to <code>output</code>, if supplied.</p></div></div><div class="src-link"><a href="https://github.com/simon-brooke/html-to-md/blob/master/src/html_to_md/core.clj#L7">view source</a></div></div></div></body></html>

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,26 @@
<!DOCTYPE html PUBLIC ""
"">
<html><head><meta charset="UTF-8" /><title>html-to-md.transformer documentation</title><link rel="stylesheet" type="text/css" href="css/default.css" /><link rel="stylesheet" type="text/css" href="css/highlight.css" /><script type="text/javascript" src="js/highlight.min.js"></script><script type="text/javascript" src="js/jquery.min.js"></script><script type="text/javascript" src="js/page_effects.js"></script><script>hljs.initHighlightingOnLoad();</script></head><body><div id="header"><h2>Generated by <a href="https://github.com/weavejester/codox">Codox</a></h2><h1><a href="index.html"><span class="project-title"><span class="project-name">Html-to-md</span> <span class="project-version">0.2.0</span></span></a></h1></div><div class="sidebar primary"><h3 class="no-link"><span class="inner">Project</span></h3><ul class="index-link"><li class="depth-1 "><a href="index.html"><div class="inner">Index</div></a></li></ul><h3 class="no-link"><span class="inner">Topics</span></h3><ul><li class="depth-1 "><a href="intro.html"><div class="inner"><span>Introduction to html-to-md</span></div></a></li></ul><h3 class="no-link"><span class="inner">Namespaces</span></h3><ul><li class="depth-1"><div class="no-link"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>html-to-md</span></div></div></li><li class="depth-2 branch"><a href="html-to-md.blogger-to-md.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>blogger-to-md</span></div></a></li><li class="depth-2 branch"><a href="html-to-md.core.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>core</span></div></a></li><li class="depth-2 branch"><a href="html-to-md.html-to-md.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>html-to-md</span></div></a></li><li class="depth-2 current"><a 
href="html-to-md.transformer.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>transformer</span></div></a></li></ul></div><div class="sidebar secondary"><h3><a href="#top"><span class="inner">Public Vars</span></a></h3><ul><li class="depth-1"><a href="html-to-md.transformer.html#var-process"><div class="inner"><span>process</span></div></a></li><li class="depth-1"><a href="html-to-md.transformer.html#var-transform"><div class="inner"><span>transform</span></div></a></li></ul></div><div class="namespace-docs" id="content"><h1 class="anchor" id="top">html-to-md.transformer</h1><div class="doc"><div class="markdown"><p>The actual transformation engine, which is actually far more general than just something to generate <a href="https://daringfireball.net/projects/markdown/">Markdown</a>. It isnt as general as <a href="https://www.w3.org/standards/xml/transformation">XSL-T</a> but can nevertheless do a great deal of transformation on [HT|SG|X]ML documents.</p>
<h2><a href="#terminology" name="terminology"></a>Terminology</h2>
<p>In this documentation the following terminology is used:</p>
<ul>
<li><strong>dispatcher</strong>: a <code>dispatcher</code> is a function (or more probably a map) which takes one argument, the tag of the element as a keyword, and returns a <code>processor</code>, q.v.</li>
<li><strong>processor</strong>: a <code>processor</code> is a function of two arguments, an <a href="https://github.com/cgrand/enlive">Enlive</a> encoded (X)HTML element and a <code>dispatcher</code> as described above, which processes elements into the desired format.</li>
</ul>
<h2><a href="#generality" name="generality"></a>Generality</h2>
<p><strong>NOTE</strong> that while <code>processors</code> within the <code>html-to-md</code> package generally process elements into strings (since Markdown is a text format), when processing into an XML format it will generally be preferable that <code>processors</code> should return Enlive style elements.</p></div></div><div class="public anchor" id="var-process"><h3>process</h3><div class="usage"><code>(process element dispatcher)</code></div><div class="doc"><div class="markdown"><p>Process this <code>element</code>, assumed to be a [HT|SG|X]ML element in <a href="https://github.com/cgrand/enlive">Enlive</a> encoding, using this <code>dispatcher</code>,</p>
<p>Such a function should take two arguments, the <code>element</code> itself and a dispatcher which will normally (but not necessarily) be the <code>dispatcher</code> supplied to this function.</p>
<p>If the dispatcher returns <code>nil</code>, the default behaviour is that <code>process</code> is mapped over the content of the element.</p>
<p>If <code>element</code> is not an [HT|SG|X]ML element in Enlive encoding as described above, then</p>
<ol>
<li>if the <code>element</code> is a string, returns that string unaltered;</li>
<li>if the <code>element</code> is a sequence or vector, maps <code>process</code> across the members of the sequence;</li>
<li>otherwise, returns <code>nil</code>.</li>
</ol></div></div><div class="src-link"><a href="https://github.com/simon-brooke/html-to-md/blob/master/src/html_to_md/transformer.clj#L32">view source</a></div></div><div class="public anchor" id="var-transform"><h3>transform</h3><h4 class="type">multimethod</h4><div class="usage"></div><div class="doc"><div class="markdown"><p>Transform the <code>obj</code> which is my first argument using the <code>dispatcher</code> which is my second argument. <code>obj</code> can be:</p>
<ol>
<li>A URL or URI;</li>
<li>A string representation of a URL or URI;</li>
<li>A string representation of an (X)HTML fragment;</li>
<li>An <a href="https://github.com/cgrand/enlive">Enlive</a> encoded (X)HTML element;</li>
<li>A sequence of <a href="https://github.com/cgrand/enlive">Enlive</a> encoded (X)HTML elements.</li>
</ol></div></div><div class="src-link"><a href="https://github.com/simon-brooke/html-to-md/blob/master/src/html_to_md/transformer.clj#L69">view source</a></div></div></div></body></html>

3
docs/index.html Normal file

File diff suppressed because one or more lines are too long

79
docs/intro.html Normal file
View file

@ -0,0 +1,79 @@
<!DOCTYPE html PUBLIC ""
"">
<html><head><meta charset="UTF-8" /><title>Introduction to html-to-md</title><link rel="stylesheet" type="text/css" href="css/default.css" /><link rel="stylesheet" type="text/css" href="css/highlight.css" /><script type="text/javascript" src="js/highlight.min.js"></script><script type="text/javascript" src="js/jquery.min.js"></script><script type="text/javascript" src="js/page_effects.js"></script><script>hljs.initHighlightingOnLoad();</script></head><body><div id="header"><h2>Generated by <a href="https://github.com/weavejester/codox">Codox</a></h2><h1><a href="index.html"><span class="project-title"><span class="project-name">Html-to-md</span> <span class="project-version">0.2.0</span></span></a></h1></div><div class="sidebar primary"><h3 class="no-link"><span class="inner">Project</span></h3><ul class="index-link"><li class="depth-1 "><a href="index.html"><div class="inner">Index</div></a></li></ul><h3 class="no-link"><span class="inner">Topics</span></h3><ul><li class="depth-1 current"><a href="intro.html"><div class="inner"><span>Introduction to html-to-md</span></div></a></li></ul><h3 class="no-link"><span class="inner">Namespaces</span></h3><ul><li class="depth-1"><div class="no-link"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>html-to-md</span></div></div></li><li class="depth-2 branch"><a href="html-to-md.blogger-to-md.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>blogger-to-md</span></div></a></li><li class="depth-2 branch"><a href="html-to-md.core.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>core</span></div></a></li><li class="depth-2 branch"><a href="html-to-md.html-to-md.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>html-to-md</span></div></a></li><li class="depth-2"><a 
href="html-to-md.transformer.html"><div class="inner"><span class="tree"><span class="top"></span><span class="bottom"></span></span><span>transformer</span></div></a></li></ul></div><div class="document" id="content"><div class="doc"><div class="markdown"><h1><a href="#introduction-to-html-to-md" name="introduction-to-html-to-md"></a>Introduction to html-to-md</h1>
<p>The itch I'm trying to scratch at present is to transform <a href="http://www.blogger.com">Blogger.com</a>'s dreadful tag-soup markup into markdown; but my architecture for doing this is to build a completely general [HT|SG|X]ML transformation framework and then specialise it.</p>
<p><strong>WARNING:</strong> this is presently alpha-quality code, although it does have fair unit test coverage.</p>
<h2><a href="#usage" name="usage"></a>Usage</h2>
<p>To use this library in your project, add the following leiningen dependency:</p>
<pre><code>[org.clojars.simon_brooke/html-to-md "0.3.0"]
</code></pre>
<p>To use it in your namespace, require:</p>
<pre><code>[html-to-md.core :refer [html-to-md]]
</code></pre>
<p>For default usage, that's all you need. To play more sophisticated tricks, consider:</p>
<pre><code>[html-to-md.transformer :refer [transform process]]
[html-to-md.html-to-md :refer [markdown-dispatcher]]
</code></pre>
<p>The intended usage is as follows:</p>
<pre><code class="clojure">(require '[html-to-md.core :refer [html-to-md]])
(html-to-md url output-file)
</code></pre>
<p>This will read (X)HTML from <code>url</code> and write Markdown to <code>output-file</code>. If <code>output-file</code> is not supplied, it will return the markdown as a string:</p>
<pre><code class="clojure">(require '[html-to-md.core :refer [html-to-md]])
(def md (html-to-md url))
</code></pre>
<p>If you are specifically scraping <a href="https://www.blogger.com/">blogger.com</a> pages, you may <em>try</em> the following recipe:</p>
<pre><code class="clojure">(require '[html-to-md.core :refer [blogger-to-md]])
(blogger-to-md url output-file)
</code></pre>
<p>It works for my blogger pages. However, I'm not sure to what extent the skinning of blogger pages is pure CSS (in which case my recipe should work for yours) and to what extent it's HTML templating (in which case it probably won't). Results not guaranteed; if it doesn't work you get to keep all the pieces.</p>
<h2><a href="#extending-the-transformer" name="extending-the-transformer"></a>Extending the transformer</h2>
<p>In principle, the transformer can transform any [HT|SG|X]ML markup into any other, or into any textual form. To extend it to do something other than markdown, supply a <strong>dispatcher</strong>. A dispatcher is essentially a function of one argument, a [HT|SG|X]ML tag represented as a Clojure keyword, which returns a <strong>processor,</strong> which should be a function of two arguments, an element assumed to have that tag, and a dispatcher. The processor should return the value that you want elements of that tag transformed into.</p>
<p>Thus the <code>html-to-md.html-to-md</code> namespace comprises a number of <em>processor</em> functions, such as this one:</p>
<pre><code class="clojure">(defn markdown-a
"Process the anchor element `e` into markdown, using dispatcher `d`."
[e d]
(str
"["
(s/trim (apply str (process (:content e) d)))
"]("
(-&gt; e :attrs :href)
")"))
</code></pre>
<p>and a <em>dispatcher</em> map:</p>
<pre><code class="clojure">(def markdown-dispatcher
"A despatcher for transforming (X)HTML into Markdown."
{:a markdown-a
:b markdown-strong
:br markdown-br
:code markdown-code
:body markdown-default
:div markdown-div
:em markdown-em
:h1 markdown-h1
:h2 markdown-h2
:h3 markdown-h3
:h4 markdown-h4
:h5 markdown-h5
:h6 markdown-h6
:html markdown-html
:i markdown-em
:img markdown-img
:ol markdown-ol
:p markdown-div
:pre markdown-pre
:samp markdown-code
:script markdown-omit
:span markdown-default
:strong markdown-strong
:style markdown-omit
:ul markdown-ul
})
</code></pre>
<p>Obviously it is convenient to write dispatchers as maps, but it isn't required that you do so: anything which, given a keyword, will return a processor, will work.</p>
<h2><a href="#license" name="license"></a>License</h2>
<p>Copyright © 2019 Simon Brooke <a href="mailto:simon@journeyman.cc">simon@journeyman.cc</a></p>
<p>Distributed under the Eclipse Public License either version 1.0 or (at your option) any later version.</p></div></div></div></body></html>

2
docs/js/highlight.min.js vendored Normal file

File diff suppressed because one or more lines are too long

4
docs/js/jquery.min.js vendored Normal file

File diff suppressed because one or more lines are too long

112
docs/js/page_effects.js Normal file
View file

@ -0,0 +1,112 @@
/**
 * Report whether `element` lies inside the visible band of its offset parent.
 *
 * The band runs from 50px above the parent's top edge (exclusive) to 50px
 * above its bottom edge (exclusive), measured on the element's top position.
 */
function visibleInParent(element) {
    var offsetTop = $(element).position().top
    if (!(offsetTop > -50)) {
        return false
    }
    var lowerLimit = $(element).offsetParent().height() - 50
    return offsetTop < lowerLimit
}
/**
 * Report whether a link's href targets exactly the given fragment.
 *
 * @param link     anything jQuery can wrap; its "href" attribute is inspected
 * @param fragment the fragment identifier to look for, without the leading "#"
 * @returns true when the text after the first "#" in the href equals `fragment`
 */
function hasFragment(link, fragment) {
    var href = $(link).attr("href")
    // .attr() yields undefined when the attribute is absent; treat that as no match
    if (typeof href !== "string") {
        return false
    }
    var hashAt = href.indexOf("#")
    if (hashAt === -1) {
        return false
    }
    // Compare the whole fragment: a substring test would let "var-process"
    // match a link pointing at "#var-processor"
    return href.slice(hashAt + 1) === String(fragment)
}
/**
 * From the set selected by `elements`, return the first link for which
 * hasFragment(link, fragment) holds, as a jQuery object.
 */
function findLinkByFragment(elements, fragment) {
    var matching = $(elements).filter(function (_index, candidate) {
        return hasFragment(candidate, fragment)
    })
    return matching.first()
}
/**
 * Scroll the offset parent of `elements` so that the selected run of
 * elements is brought into view.
 *
 * If the first element starts above the parent's top, scroll up by that
 * overshoot; otherwise, if the last element ends below the parent's height,
 * scroll down by that overshoot. Nothing happens for an empty selection or
 * when the run already fits.
 */
function scrollToCurrentVarLink(elements) {
    var current = $(elements)
    var container = current.offsetParent()
    if (current.length == 0) return

    var firstTop = current.first().position().top
    var lastItem = current.last()
    var lastBottom = lastItem.position().top + lastItem.height()
    var viewHeight = container.height()

    if (firstTop < 0) {
        container.scrollTop(container.scrollTop() + firstTop)
    } else if (lastBottom > viewHeight) {
        container.scrollTop(container.scrollTop() + lastBottom - viewHeight)
    }
}
/**
 * Refresh the 'current' markers in the secondary sidebar.
 *
 * Clears 'current' from the parent of every '.secondary a', then for each
 * '.anchor' element that visibleInParent accepts, marks the parent of the
 * sidebar link whose href carries that element's id. Finally scrolls the
 * sidebar so the marked entries are in view.
 */
function setCurrentVarLink() {
    $('.secondary a').parent().removeClass('current')

    var visibleAnchors = $('.anchor').filter(function (_index) {
        return visibleInParent(this)
    })
    visibleAnchors.each(function (_index, anchor) {
        var link = findLinkByFragment(".secondary a", anchor.id)
        link.parent().addClass('current')
    })

    scrollToCurrentVarLink('.secondary .current')
}
// Truthy (the getItem function itself) when localStorage can be accessed;
// left undefined when touching localStorage throws.
var hasStorage
try {
    hasStorage = localStorage.getItem
} catch (e) {
    hasStorage = undefined
}
/**
 * Build the localStorage key prefix under which an element's scroll position
 * is kept.
 *
 * The key combines the element's id attribute with the directory part of the
 * current URL, so every *.html page in the same directory shares one key.
 *
 * @param element anything jQuery can wrap; its "id" attribute is read
 * @returns a string of the form 'scroll::<id>::<directory url>'
 */
function scrollPositionId(element) {
    // Drop any ?query / #fragment first; otherwise the ".html$" pattern below
    // cannot match and the key would differ for every anchor on a page
    var page = window.location.href.replace(/[?#].*$/, '')
    var directory = page.replace(/[^\/]+\.html$/, '')
    return 'scroll::' + $(element).attr('id') + '::' + directory
}
/**
 * Save the current horizontal and vertical scroll offsets of `element` in
 * localStorage, under the key from `scrollPositionId` suffixed with "::x"
 * and "::y".
 *
 * Does nothing when `hasStorage` is falsy. Persisting is best effort: this
 * function is used as a scroll handler, so a failing write (storage full,
 * or writes refused by the browser) is swallowed rather than thrown on
 * every scroll event.
 *
 * @param element anything jQuery can wrap.
 */
function storeScrollPosition(element) {
    if (!hasStorage) return;
    var key = scrollPositionId(element);
    var $el = $(element);
    try {
        localStorage.setItem(key + "::x", $el.scrollLeft());
        localStorage.setItem(key + "::y", $el.scrollTop());
    } catch (e) {
        // Deliberately ignored: losing the remembered position is harmless.
    }
}
/**
 * Restore the scroll offsets of `element` from localStorage, using the key
 * from `scrollPositionId` suffixed with "::x" and "::y".
 *
 * Does nothing when `hasStorage` is falsy. An axis with no stored value (or
 * a value that is not a number) is left untouched, instead of passing
 * `null` to jQuery and thereby forcing the offset to 0.
 *
 * @param element anything jQuery can wrap.
 */
function recallScrollPosition(element) {
    if (!hasStorage) return;
    var key = scrollPositionId(element);
    var $el = $(element);
    // Stored values are strings; parseFloat(null) is NaN, which marks
    // "nothing stored" below.
    var x = parseFloat(localStorage.getItem(key + "::x"));
    var y = parseFloat(localStorage.getItem(key + "::y"));
    if (!isNaN(x)) {
        $el.scrollLeft(x);
    }
    if (!isNaN(y)) {
        $el.scrollTop(y);
    }
}
/**
 * Make the scroll position of `element` survive page loads: restore the
 * stored position now, then store the position again on every scroll
 * event of the element.
 *
 * @param element anything jQuery can wrap.
 */
function persistScrollPosition(element) {
    function rememberPosition() {
        storeScrollPosition(element);
    }

    recallScrollPosition(element);
    $(element).scroll(rememberPosition);
}
/**
 * Measure the widest `.inner` descendant of `element`.
 *
 * @param element anything jQuery can wrap.
 * @returns {number} the largest `innerWidth()` among the `.inner`
 *                   descendants; -Infinity when there are none (the result
 *                   of `Math.max` with no arguments).
 */
function sidebarContentWidth(element) {
    var innerWidths = $(element)
        .find('.inner')
        .map(function () {
            return $(this).innerWidth();
        })
        .get();
    return Math.max.apply(null, innerWidths);
}
/**
 * Turn a measured content width into a sidebar width.
 *
 * A width of zero stays zero. Any other width is rounded up to the next
 * multiple of `snap`, padded with `margin` on both sides, and raised to
 * `minimum` if still smaller.
 *
 * @param width   measured content width.
 * @param snap    grid step the width is rounded up to.
 * @param margin  padding added once per side (so twice in total).
 * @param minimum smallest non-zero result.
 * @returns {number} the resulting width.
 */
function calculateSize(width, snap, margin, minimum) {
    if (width == 0) return 0;
    var snapped = Math.ceil(width / snap) * snap;
    return Math.max(minimum, snapped + (margin * 2));
}
/**
 * Size the primary and secondary sidebars to their content and move the
 * content pane to the right of them.
 *
 * Each sidebar's width is its widest `.inner` child passed through
 * `calculateSize` (grid step 32, margin 13, minimum 160). The secondary
 * sidebar is only measured when a `.secondary` element exists; otherwise
 * its width is 0. The secondary sidebar starts 1px right of the primary
 * one, and `#content` starts 1px right of the last sidebar that has width.
 */
function resizeSidebars() {
    var secondaryPresent = $('.secondary').length != 0;

    // snap to grid
    var primaryWidth = calculateSize(sidebarContentWidth('.primary'), 32, 13, 160);
    var secondaryWidth = calculateSize(
        secondaryPresent ? sidebarContentWidth('.secondary') : 0,
        32, 13, 160);

    $('.primary').css('width', primaryWidth);
    $('.secondary').css('width', secondaryWidth).css('left', primaryWidth + 1);

    var contentLeft = secondaryWidth > 0
        ? primaryWidth + secondaryWidth + 2
        : primaryWidth + 1;
    $('#content').css('left', contentLeft);
}
// Page start-up wiring. Each step is registered as its own ready callback,
// in this order: size the sidebars, mark the current sidebar entries,
// then restore and start tracking the primary sidebar's scroll position.
// NOTE(review): jQuery documents .ready() on $(document); registering on
// $(window) relies on .ready() ignoring its selection - confirm against the
// vendored jQuery version.
$(window).ready(resizeSidebars)
$(window).ready(setCurrentVarLink)
$(window).ready(function() { persistScrollPosition('.primary')})
// Once ready, keep the 'current' markers up to date whenever the content
// pane scrolls or the window is resized.
$(window).ready(function() {
$('#content').scroll(setCurrentVarLink)
$(window).resize(setCurrentVarLink)
})

View file

@ -1,8 +1,12 @@
(defproject html-to-md "0.2.0" (defproject html-to-md "0.3.0"
:description "Convert (Enlivened) HTML to markdown; but, more generally, a framework for [HT|SG|X]ML transformation." :description "Convert (Enlivened) HTML to markdown; but, more generally, a framework for [HT|SG|X]ML transformation."
:url "https://github.com/simon-brooke/html-to-md" :url "https://github.com/simon-brooke/html-to-md"
:license {:name "Eclipse Public License" :license {:name "Eclipse Public License"
:url "http://www.eclipse.org/legal/epl-v10.html"} :url "http://www.eclipse.org/legal/epl-v10.html"}
:codox {:metadata {:doc "**TODO**: write docs"
:doc/format :markdown}
:output-path "docs"
:source-uri "https://github.com/simon-brooke/html-to-md/blob/master/{filepath}#L{line}"}
:dependencies [[org.clojure/clojure "1.8.0"] :dependencies [[org.clojure/clojure "1.8.0"]
[enlive "1.1.6"]] [enlive "1.1.6"]]
:plugins [[lein-codox "0.10.3"] :plugins [[lein-codox "0.10.3"]

View file

@ -1,4 +1,6 @@
(ns html-to-md.blogger-to-md (ns html-to-md.blogger-to-md
"Convert blogger posts to Markdown format, omitting all the Blogger chrome
and navigation."
(:require [clojure.string :as s] (:require [clojure.string :as s]
[html-to-md.html-to-md :refer [markdown-dispatcher markdown-header]] [html-to-md.html-to-md :refer [markdown-dispatcher markdown-header]]
[html-to-md.transformer :refer [process]] [html-to-md.transformer :refer [process]]
@ -34,7 +36,7 @@
(def blogger-dispatcher (def blogger-dispatcher
"Adaptation of `markdown-dispatcher`, q.v., with the `:table`, `:h3` and "Adaptation of `markdown-dispatcher`, q.v., with the `:table` and
`:html` dispatches overridden." `:html` dispatches overridden."
(assoc markdown-dispatcher (assoc markdown-dispatcher
:html blogger-scraper :html blogger-scraper

View file

@ -1,4 +1,5 @@
(ns html-to-md.core (ns html-to-md.core
"Top level functions intended for very simple use."
(:require [html-to-md.transformer :refer [transform process]] (:require [html-to-md.transformer :refer [transform process]]
[html-to-md.html-to-md :refer [markdown-dispatcher]] [html-to-md.html-to-md :refer [markdown-dispatcher]]
[html-to-md.blogger-to-md :refer [blogger-dispatcher]])) [html-to-md.blogger-to-md :refer [blogger-dispatcher]]))

View file

@ -1,4 +1,7 @@
(ns html-to-md.html-to-md (ns html-to-md.html-to-md
"Transform general HTML to
[Markdown](https://daringfireball.net/projects/markdown/), as faithfully
as is reasonably possible."
(:require (:require
[clojure.string :as s] [clojure.string :as s]
[net.cgrand.enlive-html :as html] [net.cgrand.enlive-html :as html]
@ -165,7 +168,7 @@
(def markdown-dispatcher (def markdown-dispatcher
"A despatcher for transforming (X)HTML into Markdown." "A dispatcher for transforming (X)HTML into Markdown."
{:a markdown-a {:a markdown-a
:b markdown-strong :b markdown-strong
:br markdown-br :br markdown-br

View file

@ -1,14 +1,38 @@
(ns html-to-md.transformer (ns html-to-md.transformer
"The actual transformation engine, which is far more general
than just something to generate
[Markdown](https://daringfireball.net/projects/markdown/). It isn't as
general as [XSL-T](https://www.w3.org/standards/xml/transformation) but
can nevertheless do a great deal of transformation on [HT|SG|X]ML
documents.
## Terminology
In this documentation the following terminology is used:
* **dispatcher**: a `dispatcher` is a function (or more
probably a map) which takes one argument, the tag of the element as a
keyword, and returns a `processor`, q.v.
* **processor**: a `processor` is a function of two arguments, an
[Enlive](https://github.com/cgrand/enlive) encoded (X)HTML element and
a `dispatcher` as described above, which processes elements into the
desired format.
## Generality
**NOTE** that while `processors` within the `html-to-md` package generally
process elements into strings (since Markdown is a text format), when
processing into an XML format it will generally be preferable that
`processors` should return Enlive style elements."
(:require (:require
[net.cgrand.enlive-html :as html] [net.cgrand.enlive-html :as html]
[net.cgrand.tagsoup :as tagsoup])) [net.cgrand.tagsoup :as tagsoup]))
(defn process (defn process
"Process this `element`, assumed to be a [HT|SG|X]ML element in Enlive "Process this `element`, assumed to be a [HT|SG|X]ML element in
encoding, using this `dispatcher`, assumed to be a function (or more [Enlive](https://github.com/cgrand/enlive)
probably a map) which takes one argument, the tag of the element as encoding, using this `dispatcher`,
keyword, and returns a function which processes elements with that tag.
Such a function should take two arguments, the `element` itself and a Such a function should take two arguments, the `element` itself and a
dispatcher which will normally (but not necessarily) be the `dispatcher` dispatcher which will normally (but not necessarily) be the `dispatcher`
@ -17,8 +41,13 @@
If the dispatcher returns `nil`, the default behaviour is that `process` If the dispatcher returns `nil`, the default behaviour is that `process`
is mapped over the content of the element. is mapped over the content of the element.
If `element` is not an [HT|SG|X]ML element in Enlive encoding or else a If `element` is not an [HT|SG|X]ML element in Enlive encoding as described
string, returns `nil`. Strings are returned unaltered." above, then
1. if the `element` is a string, returns that string unaltered;
2. if the `element` is a sequence or vector, maps `process` across the
members of the sequence;
3. otherwise, returns `nil`."
[element dispatcher] [element dispatcher]
(cond (cond
(:tag element) (:tag element)
@ -32,12 +61,21 @@
(remove nil? (map #(process % dispatcher) element)))) (remove nil? (map #(process % dispatcher) element))))
(defn- transformer-dispatch (defn- transformer-dispatch
"Hack to get dispatch on just the first argument to the `transform`
multi-method."
[a _] [a _]
(class a)) (class a))
(defmulti transform (defmulti transform
"Transform the `obj` which is my first argument using the `dispatcher` "Transform the `obj` which is my first argument using the `dispatcher`
which is my second argument." which is my second argument. `obj` can be:
1. A URL or URI;
2. A string representation of a URL or URI;
3. A string representation of an (X)HTML fragment;
4. An [Enlive](https://github.com/cgrand/enlive) encoded (X)HTML element;
5. A sequence of [Enlive](https://github.com/cgrand/enlive) encoded
(X)HTML elements."
#'transformer-dispatch #'transformer-dispatch
:default :default) :default :default)