diff --git a/.gitignore b/.gitignore
index 1507fee..925dc48 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,5 @@
 profiles.clj
 \.rebel_readline_history
 [0-9a-f]*-init\.clj
+
+*.log
diff --git a/env/dev/resources/config.edn b/env/dev/resources/config.edn
index 0967ef4..cadfb9d 100644
--- a/env/dev/resources/config.edn
+++ b/env/dev/resources/config.edn
@@ -1 +1 @@
-{}
+{:tess-data "/usr/local/Cellar/tesseract/4.0.0_1/share/tessdata/"}
diff --git a/env/prod/resources/config.edn b/env/prod/resources/config.edn
index e24ec21..20d1b62 100644
--- a/env/prod/resources/config.edn
+++ b/env/prod/resources/config.edn
@@ -1,2 +1,3 @@
 {:prod true
- :port 3000}
+ :port 8889
+ :tess-data "/usr/share/tesseract-ocr/tessdata"}
diff --git a/project.clj b/project.clj
index e246c14..3cf7350 100644
--- a/project.clj
+++ b/project.clj
@@ -1,6 +1,6 @@
 (defproject ireadit "0.1.0-SNAPSHOT"
 
-  :description "FIXME: write description"
+  :description "a bot to automatically OCR memes and other text-as-graphics posted to social media"
   :url "http://example.com/FIXME"
 
   :dependencies [[baking-soda "0.2.0" :exclusions [cljsjs/react-bootstrap]]
@@ -18,6 +18,7 @@
                  [cprop "0.1.13"]
                  [day8.re-frame/http-fx "0.1.6"]
                  [funcool/struct "1.3.0"]
+                 [com.github.jai-imageio/jai-imageio-core "1.4.0"]
                  [luminus-immutant "0.2.5"]
                  [luminus-transit "0.1.1"]
                  [luminus/ring-ttl-session "0.3.2"]
@@ -40,7 +41,8 @@
                  [ring-webjars "0.2.0"]
                  [ring/ring-core "1.7.1"]
                  [ring/ring-defaults "0.3.2"]
-                 [selmer "1.12.6"]]
+                 [selmer "1.12.6"]
+                 [net.sourceforge.tess4j/tess4j "4.3.1"]]
 
   :min-lein-version "2.0.0"
 
diff --git a/src/clj/ireadit/nrepl.clj b/src/clj/ireadit/nrepl.clj
index 0ad85db..fdddd62 100644
--- a/src/clj/ireadit/nrepl.clj
+++ b/src/clj/ireadit/nrepl.clj
@@ -1,4 +1,6 @@
-(ns ireadit.nrepl
+(ns ^{:doc "Meme transcriber: command line support"
+      :author "Simon Brooke"}
+  ireadit.nrepl
   (:require [nrepl.server :as nrepl]
             [clojure.tools.logging :as log]))
 
diff --git a/src/clj/ireadit/routes/services.clj b/src/clj/ireadit/routes/services.clj
index 660d6c6..40b2e86 100644
--- a/src/clj/ireadit/routes/services.clj
+++ b/src/clj/ireadit/routes/services.clj
@@ -1,45 +1,31 @@
 (ns ireadit.routes.services
   (:require [ring.util.http-response :refer :all]
+            [cemerick.url :refer (url-decode)]
             [compojure.api.sweet :refer :all]
+            [ireadit.tesseractor :refer [ocr]]
             [schema.core :as s]))
 
 (def service-routes
+  "The `/api` routes of the OCR service, with the Swagger UI served at
+  `/swagger-ui` and the generated spec at `/swagger.json`."
   (api
-    {:swagger {:ui "/swagger-ui"
-               :spec "/swagger.json"
-               :data {:info {:version "1.0.0"
-                             :title "Sample API"
-                             :description "Sample Services"}}}}
-
-    (context "/api" []
-      :tags ["thingie"]
-
-      (GET "/plus" []
-        :return Long
-        :query-params [x :- Long, {y :- Long 1}]
-        :summary "x+y with query-parameters. y defaults to 1."
-        (ok (+ x y)))
+   {:swagger {:ui "/swagger-ui"
+              :spec "/swagger.json"
+              :data {:info {:version "1.0.0"
+                            :title "Sample API"
+                            :description "Sample Services"}}}}
 
-      (POST "/minus" []
-        :return Long
-        :body-params [x :- Long, y :- Long]
-        :summary "x-y with body-parameters."
-        (ok (- x y)))
+   (context "/api" []
+     :tags ["tesseractor"]
 
-      (GET "/times/:x/:y" []
-        :return Long
-        :path-params [x :- Long, y :- Long]
-        :summary "x*y with path-parameters"
-        (ok (* x y)))
-
-      (POST "/divide" []
-        :return Double
-        :form-params [x :- Long, y :- Long]
-        :summary "x/y with form-parameters"
-        (ok (/ x y)))
-
-      (GET "/power" []
-        :return Long
-        :header-params [x :- Long, y :- Long]
-        :summary "x^y with header-parameters"
-        (ok (long (Math/pow x y)))))))
+     ;; NOTE(review): the handler returns the bare string produced by `ocr`
+     ;; rather than a response built with `ok` - confirm that this is what
+     ;; the client expects.
+     ;; NOTE(review): the router may already percent-decode route params, in
+     ;; which case the explicit `url-decode` decodes twice - verify with an
+     ;; image URL that itself contains percent-escapes.
+     (POST "/ocr/:uri" []
+       :return String
+       :path-params [uri :- String]
+       :summary "Return the text found by OCR in the image at the URL-encoded address uri."
+       (ocr (url-decode uri))))))
diff --git a/src/clj/ireadit/tesseractor.clj b/src/clj/ireadit/tesseractor.clj
new file mode 100644
index 0000000..e64e1ad
--- /dev/null
+++ b/src/clj/ireadit/tesseractor.clj
@@ -0,0 +1,75 @@
+(ns ^{:doc "Meme transcriber: actual OCR interface"
+      :author "Simon Brooke"}
+  ireadit.tesseractor
+  (:require [clojure.java.io :as io]
+            [clojure.tools.logging :as log]
+            [ireadit.config :refer [env]])
+  (:import net.sourceforge.tess4j.Tesseract
+           java.io.File
+           java.net.URL
+           javax.imageio.ImageIO))
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;;;
+;;;; ireadit.tesseractor: actual OCR interface.
+;;;;
+;;;; This program is free software; you can redistribute it and/or
+;;;; modify it under the terms of the GNU General Public License
+;;;; as published by the Free Software Foundation; either version 2
+;;;; of the License, or (at your option) any later version.
+;;;;
+;;;; This program is distributed in the hope that it will be useful,
+;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;;; GNU General Public License for more details.
+;;;;
+;;;; You should have received a copy of the GNU General Public License
+;;;; along with this program; if not, write to the Free Software
+;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
+;;;; USA.
+;;;;
+;;;; Copyright (C) 2016 Simon Brooke for Radical Independence Campaign
+;;;;
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;;; Cribbed partly from https://github.com/hugoArregui/tesseract-clojure
+
+;; (def tesseract-data-dir "/usr/share/tessdata")
+;; (def language "eng")
+;; (def test-file "eurotext.png")
+
+(defn prepare-tesseract
+  "Create a new `Tesseract` engine whose data path is set to `data-path`,
+  and return it."
+  [data-path]
+  (doto (Tesseract.)
+    (.setDatapath data-path)))
+
+;; The engine used by the single-argument arity of `ocr`, configured from the
+;; `:tess-data` key of `env`.
+;; NOTE(review): this is built when the namespace loads - confirm `env` is
+;; already populated at that point, otherwise the data path will be nil.
+(def tesseractor (prepare-tesseract (:tess-data env)))
+
+(defn ocr
+  "Perform optical character recognition on `image` using the OCR engine
+  `t`, assuming the ISO 639-3 language `lang`, and return any text found as a
+  string. `image` may be supplied as a `File`, as `BufferedImage`, or as a
+  string, in which case it will be treated as a URL. `t` defaults to
+  `tesseractor` and `lang` to \"eng\". Note that `lang` is set on `t` itself.
+  Throws `ExceptionInfo` if no image could be obtained from `image`."
+  ([image]
+   (ocr image tesseractor))
+  ([image t]
+   (ocr image t "eng"))
+  ([image t lang]
+   (let [img (if (string? image)
+               (ImageIO/read (URL. image))
+               image)]
+     ;; `ImageIO/read` returns nil rather than throwing when no registered
+     ;; reader can decode the data, so fail here with a clear message.
+     (when (nil? img)
+       (throw (ex-info "Could not read an image to OCR" {:image image})))
+     (.setLanguage t lang)
+     (.doOCR t img))))
+
diff --git a/src/cljs/ireadit/events.cljs b/src/cljs/ireadit/events.cljs
index 95a123e..6b12de0 100644
--- a/src/cljs/ireadit/events.cljs
+++ b/src/cljs/ireadit/events.cljs
@@ -36,7 +36,7 @@
 (rf/reg-event-fx
   :fetch-transcription
   (fn [{db :db} _]
-    (let [uri (str "http://loriner.journeyman.cc:8888/v1/tesseract/" (url-encode (:url db)))]
+    (let [uri (str "/api/ocr/" (url-encode (:url db)))]
       (js/console.log (str "Fetching transcription data: " uri))