Now parses a very substantial amount of SQL.
This commit is contained in:
parent
d3cc5cb11f
commit
9199483f5e
6
.gitignore
vendored
6
.gitignore
vendored
|
@ -1,2 +1,8 @@
|
|||
Leiningen.gitignore
|
||||
\.lein-repl-history
|
||||
|
||||
\.nrepl-port
|
||||
|
||||
target/classes/META-INF/maven/squirrel-parse/squirrel-parse/pom\.properties
|
||||
|
||||
target/
|
||||
|
|
|
@ -1,8 +1,9 @@
|
|||
(ns ^{:doc "A very simple parser which parses production rules."
|
||||
(ns ^{:doc "A parser for SQL."
|
||||
:author "Simon Brooke"}
|
||||
squirrel-parse.parser
|
||||
(:require [instaparse.core :as insta]
|
||||
[clojure.string :refer [split trim triml]]))
|
||||
[clojure.string :refer [join split trim triml upper-case]]
|
||||
[squirrel-parse.utils :refer :all]))
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;
|
||||
|
@ -27,46 +28,143 @@
|
|||
;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
(def grammar
|
||||
(str
|
||||
"TABLE-SPEC := 'create' SPACE 'table' EXISTENCE SPACE NAME OPT-SPACE LPAR TABLE-SPEC-ELEMENTS RPAR ;
|
||||
TABLE-SPEC-ELEMENTS := TABLE-SPEC-ELEMENT | TABLE-SPEC-ELEMENT COMMA TABLE-SPEC-ELEMENTS ;
|
||||
TABLE-SPEC-ELEMENT := COLUMN-SPEC | TABLE-CONSTRAINT ;
|
||||
COLUMN-SPEC := NAME SPACE DATATYPE | COLUMN-SPEC SPACE COLUMN-CONSTRAINTS ;
|
||||
TABLE-CONSTRAINT := 'foo' ;
|
||||
COLUMN-CONSTRAINTS := COLUMN-CONSTRAINT | COLUMN-CONSTRAINT SPACE COLUMN-CONSTRAINT ;
|
||||
COLUMN-CONSTRAINT := 'constraint' SPACE NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;
|
||||
NOT-NULL-CC := 'not' SPACE 'null' ;
|
||||
NULL-CC := 'null' ;
|
||||
DEFAULT-CC := 'default' SPACE EXPR ;
|
||||
UNIQUE-CC := 'unique' SPACE INDEX-PARAMS ;
|
||||
PRIMARY-CC := 'primary' SPACE 'key' INDEX-PARAMS ;
|
||||
REFERENCES-CC := 'references' SPACE NAME SPACE LPAR NAMES RPAR REF-DIRECTIVES ;
|
||||
REF-DIRECTIVES := '' | REF-DIRECTIVE SPACE REF-DIRECTIVES ;
|
||||
REF-DIRECTIVE := REF-MATCH | REF-ON-UPDATE | REF-ON-DELETE ;
|
||||
REF-MATCH := 'match' SPACE MATCH-TYPE ;
|
||||
REF-ON-DELETE := 'on' SPACE 'delete' SPACE REF-ACTION ;
|
||||
REF-ON-UPDATE := 'on' SPACE 'update' SPACE REF-ACTION ;
|
||||
MATCH-TYPE := 'full' | 'partial' | 'simple' ;
|
||||
INDEX-PARAMS := '' ;
|
||||
REF-ACTION := 'no' SPACE 'action' | 'restrict' | 'cascade' | 'set' SPACE 'null' | 'set' SPACE 'default';
|
||||
EXPR := 'foo' ; "
|
||||
;; there are some things we ignore
|
||||
"EXISTENCE := SPACE 'if' SPACE 'not' SPACE 'exists' | '' ;
|
||||
COMMENT := #'--[~\\n\\r]*' ;"
|
||||
"DATATYPE := 'integer' | 'float' | 'char' LPAR INT-VAL RPAR | 'varchar' LPAR INT-VAL RPAR | 'text' ;
|
||||
INT-VAL := #'[0-9]+' ;"
|
||||
;; there don't seem to be any valid cases where a comma may not be either preceded or succeeded by white-space
|
||||
"COMMA := OPT-SPACE ',' OPT-SPACE ;
|
||||
LPAR := OPT-SPACE '(' OPT-SPACE ;
|
||||
RPAR := OPT-SPACE ')' OPT-SPACE ; "
|
||||
;; OPT-SPACE is optional space - it's acceptable as OPT-SPACE if there are no whitespace characters. Comments are
|
||||
;; also acceptable wherever OPT-SPACE is acceptable
|
||||
"OPT-SPACE := #'\\s*' | OPT-SPACE COMMENT OPT-SPACE;"
|
||||
;; SPACE is mandatory white-space; comments are acceptable here too but there must be a real space character
|
||||
"SPACE := #'\\s+' | OPT-SPACE SPACE | SPACE OPT-SPACE ;
|
||||
NAMES := NAME | NAME COMMA NAMES ;
|
||||
NAME := #'[a-zA-Z][a-zA-Z0-9_]*'"))
|
||||
(def keywords ["action" "alter" "as" "by" "cache" "cascade" "comment" "constraint" "create" "cycle" "day" "default" "delete" "exists"
|
||||
"extension" "false" "from" "full" "go" "hour" "if" "increment" "key" "match" "maxvalue" "minute" "minvalue"
|
||||
"month" "no" "none" "not" "null" "off" "on" "owned" "partial" "primary" "references" "restrict" "schema" "second"
|
||||
"select" "sequence" "set" "simple" "start" "table" "temp" "temporary" "time" "to" "true" "unique" "update" "where" "with" "without"
|
||||
"year" "zone"])
|
||||
|
||||
|
||||
(def keyword-rules
|
||||
(flatten
|
||||
(list
|
||||
(apply str
|
||||
(flatten
|
||||
(list
|
||||
"KEYWORD :="
|
||||
(join " | "
|
||||
(map #(str "KW-" (upper-case (join "-" (split % #"\s+"))))
|
||||
keywords)))))
|
||||
(map #(make-case-insensitive-match-rule % "KW") keywords))))
|
||||
|
||||
|
||||
(def simple-datatypes
|
||||
"datatypes which take no arguments or special syntax"
|
||||
["bigserial" "bit" "boolean" "bytea" "date" "double precision" "float" "integer" "money" "numeric" "real" "serial" "text" ])
|
||||
|
||||
(def with-integer-parameter-datatypes
|
||||
"datatypes which take a single integer parameter, typically the storage size"
|
||||
["char" "character" "character varying" "varchar"])
|
||||
|
||||
(def with-or-without-timezone-datatypes
|
||||
"datatypes which may optionally take 'with (or without) time zone"
|
||||
["time" "timestamp"])
|
||||
|
||||
(def special-datatypes
|
||||
"Datatypes with complex special grammar"
|
||||
["interval"])
|
||||
|
||||
(def interval-datatype-rules
|
||||
"interval has special complex grammar. TODO: this is not entirely correct; some
|
||||
combinations of 'x to y' are not allowed."
|
||||
(list "DT-INTERVAL := #'(?i)interval' | #'(?i)interval' SPACE INTERVAL-FIELDS;"
|
||||
"INTERVAL-FIELDS := INTERVAL-FIELD | INTERVAL-FIELD SPACE KW-TO SPACE INTERVAL-FIELD;"
|
||||
"INTERVAL-FIELD := KW-YEAR | KW-MONTH | KW-DAY | KW-HOUR | KW-MINUTE | KW-SECOND;"))
|
||||
|
||||
(def datatype-rules
|
||||
"All rules for all datatypes"
|
||||
(list
|
||||
(apply str
|
||||
(flatten
|
||||
(list "DATATYPE := "
|
||||
(join " | "
|
||||
(map #(str "DT-" (upper-case (join "-" (split % #"\s+"))))
|
||||
(flatten
|
||||
(list simple-datatypes
|
||||
with-integer-parameter-datatypes
|
||||
with-or-without-timezone-datatypes
|
||||
special-datatypes)))) " ;\n")))
|
||||
(map make-simple-datatype-rule simple-datatypes)
|
||||
(map make-with-integer-parameter-datatype-rule with-integer-parameter-datatypes)
|
||||
(map make-with-or-without-timezone-datatype-rule with-or-without-timezone-datatypes)
|
||||
interval-datatype-rules))
|
||||
|
||||
(def basic-rules
|
||||
(list
|
||||
"STATEMENTS := STATEMENT TERMINATOR | STATEMENT TERMINATOR STATEMENTS ;"
|
||||
"STATEMENT := TABLE-DECL | ALTER-STMT | SET-STMT | COMMENT-STMT | EXTENSION-DECL | SEQUENCE-DECL;"
|
||||
"ALTER-STMT := ALTER-TABLE ;"
|
||||
"ALTER-TABLE := KW-ALTER SPACE KW-TABLE SPACE QUAL-NAME SPACE NAME SPACE KW-TO SPACE NAME ;"
|
||||
"SET-STMT := KW-SET SPACE NAME OPT-SPACE '=' OPT-SPACE EXPRESSION ;"
|
||||
"COMMENT-STMT := KW-COMMENT #'[~;]*' ;"
|
||||
"EXTENSION-DECL := KW-CREATE SPACE KW-EXTENSION EXISTENCE SPACE NAME SPACE KW-WITH SPACE KW-SCHEMA SPACE NAME ;"
|
||||
|
||||
;; taken from https://www.postgresql.org/docs/10/static/sql-createsequence.html
|
||||
"SEQUENCE-DECL := KW-CREATE SPACE PERMANENCE KW-SEQUENCE EXISTENCE SPACE NAME SPACE SEQ-SPEC-ELEMENTS ;"
|
||||
"SEQ-SPEC-ELEMENTS := SEQ-SPEC-ELEMENT | SEQ-SPEC-ELEMENT SPACE SEQ-SPEC-ELEMENTS ;"
|
||||
"SEQ-SPEC-ELEMENT := SEQ-SPEC-AS | SEQ-SPEC-INCREMENT | SEQ-SPEC-MIN | SEQ-SPEC-MAX | SEQ-SPEC-START | SEQ-SPEC-CACHE | SEQ-SPEC-CYCLE | SEQ-SPEC-OWNER ;"
|
||||
"SEQ-SPEC-AS := KW-AS DATATYPE ;"
|
||||
"SEQ-SPEC-INCREMENT := INT-VAL | KW-INCREMENT SPACE INT-VAL | KW-INCREMENT SPACE KW-BY SPACE INT-VAL ;"
|
||||
;; I can't tell from the spec whether expressions are allowed as the value of minvalue or maxvalue.
|
||||
"SEQ-SPEC-MIN := KW-NO SPACE KW-MINVALUE | KW-MINVALUE SPACE INT-VAL ;"
|
||||
"SEQ-SPEC-MAX := KW-NO SPACE KW-MAXVALUE | KW-MAXVALUE SPACE INT-VAL ;"
|
||||
"SEQ-SPEC-START := KW-START SPACE KW-WITH SPACE INT-VAL | KW-START SPACE INT-VAL ;"
|
||||
"SEQ-SPEC-CACHE := KW-CACHE SPACE INT-VAL ;"
|
||||
"SEQ-SPEC-CYCLE := KW-CYCLE | KW-NO SPACE KW-CYCLE ;"
|
||||
"SEQ-SPEC-OWNER := KW-OWNED SPACE KW-BY SPACE QUAL-NAME | KW-OWNED SPACE KW-BY SPACE KW-NONE ;"
|
||||
|
||||
;; taken from https://www.postgresql.org/docs/10/static/sql-createtable.html
|
||||
;; but by no means all of that is implemented.
|
||||
"TABLE-DECL := KW-CREATE SPACE PERMANENCE KW-TABLE EXISTENCE SPACE NAME OPT-SPACE LPAR TABLE-SPEC-ELEMENTS RPAR ;"
|
||||
"TABLE-SPEC-ELEMENTS := TABLE-SPEC-ELEMENT | TABLE-SPEC-ELEMENT COMMA TABLE-SPEC-ELEMENTS ; "
|
||||
"TABLE-SPEC-ELEMENT := COLUMN-SPEC | TABLE-CONSTRAINT ;"
|
||||
"COLUMN-SPEC := NAME SPACE DATATYPE | COLUMN-SPEC OPT-SPACE COLUMN-CONSTRAINTS ;"
|
||||
"TABLE-CONSTRAINT := 'TODO' ;"
|
||||
"COLUMN-CONSTRAINTS := COLUMN-CONSTRAINT | COLUMN-CONSTRAINT SPACE COLUMN-CONSTRAINT ;"
|
||||
"COLUMN-CONSTRAINT := KW-CONSTRAINT SPACE NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;"
|
||||
"NOT-NULL-CC := KW-NOT SPACE KW-NULL ;"
|
||||
"NULL-CC := KW-NULL ;"
|
||||
"DEFAULT-CC := KW-DEFAULT SPACE EXPR ;"
|
||||
"UNIQUE-CC := KW-UNIQUE SPACE INDEX-PARAMS ;"
|
||||
"PRIMARY-CC := KW-PRIMARY SPACE KW-KEY INDEX-PARAMS ;"
|
||||
"REFERENCES-CC := KW-REFERENCES SPACE NAME SPACE LPAR NAMES RPAR REF-DIRECTIVES ;"
|
||||
"REF-DIRECTIVES := '' | REF-DIRECTIVE SPACE REF-DIRECTIVES ;"
|
||||
"REF-DIRECTIVE := REF-MATCH | REF-ON-UPDATE | REF-ON-DELETE ;"
|
||||
"REF-MATCH := KW-MATCH SPACE MATCH-TYPE ;"
|
||||
"REF-ON-DELETE := KW-ON SPACE KW-DELETE SPACE REF-ACTION ;"
|
||||
"REF-ON-UPDATE := KW-ON SPACE KW-UPDATE SPACE REF-ACTION ;"
|
||||
"MATCH-TYPE := KW-FULL | KW-PARTIAL | KW-SIMPLE ;"
|
||||
"INDEX-PARAMS := 'foo' ;"
|
||||
"REF-ACTION := KW-NO SPACE KW-ACTION | KW-RESTRICT | KW-CASCADE | KW-SET SPACE VALUE;"
|
||||
"EXPR := 'TODO' ; "
|
||||
"EXISTENCE := '' | SPACE KW-IF SPACE KW-NOT SPACE KW-EXISTS ;"
|
||||
"PERMANENCE := '' | KW-TEMP SPACE | KW-TEMPORARY SPACE ;"
|
||||
"COMMENT := #'--[~\\n\\r]*' ;"
|
||||
;; TODO: much more to do here!
|
||||
"EXPRESSION := VALUE | KEYWORD;"
|
||||
;; TODO: same for values.
|
||||
"VALUE := INT-VAL | CHAR-VAL | TRUTH-VAL | KW-NULL | NAMES;"
|
||||
"INT-VAL := #'[0-9]+' ;"
|
||||
"CHAR-VAL := #'\\047.*\\047' ;"
|
||||
"TRUTH-VAL := KW-TRUE | KW-FALSE ;"
|
||||
"DOT := '.' ;"
|
||||
;; there don't seem to be any valid cases where a comma may not be either preceded or succeeded by white-space
|
||||
"COMMA := OPT-SPACE ',' OPT-SPACE ;"
|
||||
"LPAR := OPT-SPACE '(' OPT-SPACE ;"
|
||||
"RPAR := OPT-SPACE ')' OPT-SPACE ; "
|
||||
;; OPT-SPACE is optional space - it's acceptable as OPT-SPACE if there are no whitespace characters. Comments are
|
||||
;; also acceptable wherever OPT-SPACE is acceptable
|
||||
"OPT-SPACE := #'\\s*' | OPT-SPACE COMMENT OPT-SPACE;"
|
||||
;; SPACE is mandatory white-space; comments are acceptable here too but there must be a real space character
|
||||
"SPACE := #'\\s+' | OPT-SPACE SPACE | SPACE OPT-SPACE ;"
|
||||
"QUAL-NAME := NAME | NAME DOT NAME ;"
|
||||
"NAMES := NAME | NAME COMMA NAMES ;"
|
||||
"NAME := #'[a-zA-Z][a-zA-Z0-9_]*'"
|
||||
"TERMINATOR := OPT-SPACE SEMI-COLON OPT-SPACE | OPT-SPACE KW-GO OPT-SPACE;"
|
||||
"SEMI-COLON := ';';"))
|
||||
|
||||
|
||||
(def grammar (join "\n" (flatten (list basic-rules datatype-rules keyword-rules))))
|
||||
|
||||
|
||||
(defn ignorable?
|
||||
|
|
95
src/squirrel_parse/utils.clj
Normal file
95
src/squirrel_parse/utils.clj
Normal file
|
@ -0,0 +1,95 @@
|
|||
(ns ^{:doc "A parser for SQL: utility functions."
|
||||
:author "Simon Brooke"}
|
||||
squirrel-parse.utils
|
||||
(:require [clojure.string :refer [join split trim triml upper-case]]))
|
||||
|
||||
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
;;;;
|
||||
;;;; squirrel-parse.utils: utility functions supporting the parser.
|
||||
;;;;
|
||||
;;;; This program is free software; you can redistribute it and/or
|
||||
;;;; modify it under the terms of the GNU General Public License
|
||||
;;;; as published by the Free Software Foundation; either version 2
|
||||
;;;; of the License, or (at your option) any later version.
|
||||
;;;;
|
||||
;;;; This program is distributed in the hope that it will be useful,
|
||||
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
;;;; GNU General Public License for more details.
|
||||
;;;;
|
||||
;;;; You should have received a copy of the GNU General Public License
|
||||
;;;; along with this program; if not, write to the Free Software
|
||||
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
|
||||
;;;; USA.
|
||||
;;;;
|
||||
;;;; Copyright (C) 2018 Simon Brooke
|
||||
;;;;
|
||||
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
|
||||
|
||||
|
||||
(defn- make-unterminated-case-insensitive-match-rule
|
||||
"Make a grammar rule which matches this `token` case-insensitively,
|
||||
without the terminal semi-colon"
|
||||
[token]
|
||||
(let [subtokens (split token #"\s+")
|
||||
name (join "-" subtokens)]
|
||||
(apply str
|
||||
(flatten
|
||||
(list
|
||||
(upper-case name)
|
||||
" := "
|
||||
(join " SPACE " (map #(str "#'(?i)" % "'") subtokens)))))))
|
||||
|
||||
|
||||
(defn make-case-insensitive-match-rule
|
||||
"Make a grammar rule which matches this `token` case-insensitively, optionally prefixing the
|
||||
name of the rule with this `prefix`"
|
||||
([token prefix]
|
||||
(str prefix "-" (make-case-insensitive-match-rule token)))
|
||||
([token]
|
||||
(str (make-unterminated-case-insensitive-match-rule token) ";")))
|
||||
|
||||
|
||||
(defn make-simple-datatype-rule
|
||||
"Make a rule which matches this `datatype`, for datatypes which take no parameters or special grammar."
|
||||
[token]
|
||||
(make-case-insensitive-match-rule token "DT"))
|
||||
|
||||
|
||||
(defn make-with-integer-parameter-datatype-rule
|
||||
"Make a rule which matches this `datatype`, for datatypes which take a single integer parameter."
|
||||
[token]
|
||||
(str "DT-" (make-unterminated-case-insensitive-match-rule token) " LPAR INT-VAL RPAR ;"))
|
||||
|
||||
|
||||
(defn- make-timezone-clause
|
||||
[match with-tz? with-precision?]
|
||||
(join " SPACE "
|
||||
(list (if with-precision? (str match " LPAR INT-VAL RPAR") match) (if with-tz? "KW-WITH" "KW-WITHOUT") "KW-TIME" "KW-ZONE")))
|
||||
|
||||
|
||||
(defn make-with-or-without-timezone-datatype-rule
|
||||
"Make a rule which matches this `datatype`, for datatypes which may optionally take
|
||||
'with (or without) time zone'."
|
||||
[token]
|
||||
(let [subtokens (split token #"\s+")
|
||||
name (join "-" subtokens)
|
||||
match (join " SPACE " (map #(str "#'(?i)" % "'") subtokens))]
|
||||
(apply str
|
||||
(flatten
|
||||
(list
|
||||
"DT-"
|
||||
(upper-case name)
|
||||
" := "
|
||||
(join
|
||||
" | "
|
||||
(list
|
||||
match
|
||||
(str match " LPAR INT-VAL RPAR")
|
||||
(make-timezone-clause match true false)
|
||||
(make-timezone-clause match true true)
|
||||
(make-timezone-clause match false false)
|
||||
(make-timezone-clause match false true))))))))
|
||||
|
||||
|
7
test/squirrel_parse/parser_test.clj
Normal file
7
test/squirrel_parse/parser_test.clj
Normal file
|
@ -0,0 +1,7 @@
|
|||
(ns squirrel-parse.parser-test
|
||||
(:require [clojure.test :refer :all]
|
||||
[squirrel-parse.parser :refer :all]))
|
||||
|
||||
(deftest a-test
|
||||
(testing "FIXME, I fail."
|
||||
(is (= 0 1))))
|
Loading…
Reference in a new issue