;;;; Clojure source, 295 lines, 14 KiB.
;; Namespace declaration. The required libraries are unchanged; only the stray
;; `|` separator lines that had been interleaved with the form are removed.
(ns ^{:doc "A parser for SQL."
      :author "Simon Brooke"}
  squirrel-parse.parser
  (:require [instaparse.core :as insta]
            [clojure.string :refer [join split trim triml upper-case]]
            [squirrel-parse.utils :refer :all]))

;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;
;;;; squirrel-parse.parser: parse SQL into Clojure structures.
;;;;
;;;; This program is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU General Public License
;;;; as published by the Free Software Foundation; either version 2
;;;; of the License, or (at your option) any later version.
;;;;
;;;; This program is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;;; GNU General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU General Public License
;;;; along with this program; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
;;;; USA.
;;;;
;;;; Copyright (C) 2018 Simon Brooke
;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(def keywords
  "Keywords from the subset of Postgresql I have so far needed to support.
  Each entry is a lower-case string; entries are grouped by initial letter.
  Feel free to either
  1. Add further Postgresql keywords, or
  2. Start a new def for non-postgresql keywords."
  ["action" "add" "admin" "all" "alter" "as" "asc"
   "by"
   "cache" "cascade" "collate" "collation" "column" "comment" "concurrently"
   "connection" "constraint" "create" "cycle"
   "data" "day" "default" "delete" "desc" "drop"
   "encrypted" "exists" "extension"
   "false" "first" "foreign" "from" "full"
   "go" "grant" "group"
   "hour"
   "if" "increment" "index" "insert" "in" "into"
   "key"
   "last" "limit"
   "match" "maxvalue" "method" "minute" "minvalue" "month"
   "no" "none" "not" "null" "nulls"
   "off" "on" "only" "owned" "owner"
   "partial" "password" "primary"
   "references" "rename" "restrict" "revoke" "role"
   "schema" "second" "select" "sequence" "set" "simple" "start" "sysid"
   "table" "temp" "temporary" "time" "to" "true" "type"
   "unique" "until" "update" "user" "using"
   "valid" "values"
   "where" "with" "without"
   "year"
   "zone"

   ;; the next group are all role options. I'm not sure whether or not they
   ;; really count as keywords. I'm also not sure whether 'NO' should be created
   ;; as a special case insensitive prefix, which would save half of these.
   "bypassrls" "createdb" "createrole" "inherit" "login" "nobypassrls" "nocreatedb"
   "nocreaterole" "noinherit" "nologin" "noreplication" "nosuperuser" "replication" "superuser"])

(def keyword-rules
  "Rules to match keywords case-insensitively. It is my practice to write SQL in
  lower case, but I know many prefer upper case.

  The first element is the `KEYWORD` rule, an alternation over one `KW-...`
  rule name per entry of `keywords` (upper-cased, internal whitespace replaced
  by hyphens). It is followed by whatever `make-case-insensitive-match-rule`
  returns for each keyword with the prefix \"KW\"; the whole is flattened into
  a single sequence."
  (flatten
    (list
      ;; Both arguments are plain strings, so `str` is all that is needed to
      ;; build the KEYWORD rule text.
      (str "KEYWORD :="
           (join " | "
                 (map #(str "KW-" (upper-case (join "-" (split % #"\s+"))))
                      keywords)))
      (map #(make-case-insensitive-match-rule % "KW") keywords))))

(def simple-datatypes
  "(Postgresql) datatypes which take no arguments or special syntax"
  ["bigint" "bigserial" "bit" "boolean" "bytea" "date" "double precision"
   "float" "int" "integer" "money" "numeric" "real" "serial" "text"])

(def with-integer-parameter-datatypes
  "(Postgresql) datatypes which take a single integer parameter, typically the
  storage size"
  ["char" "character" "character varying" "varchar"])

(def with-or-without-timezone-datatypes
  "(Postgresql) datatypes which may optionally take 'with (or without) time zone'"
  ["time" "timestamp"])

(def special-datatypes
  "(Postgresql) Datatypes with complex special grammar"
  ["interval"])

(def interval-datatype-rules
  "`interval` has special complex grammar. TODO: this is not entirely correct; some
  combinations of 'x to y' are not allowed.

  A list of three grammar rule strings: `DT-INTERVAL`, `INTERVAL-FIELDS` and
  `INTERVAL-FIELD`."
  (list
    "DT-INTERVAL := #'(?i)interval' | #'(?i)interval' SPACE INTERVAL-FIELDS;"
    "INTERVAL-FIELDS := INTERVAL-FIELD | INTERVAL-FIELD SPACE KW-TO SPACE INTERVAL-FIELD;"
    "INTERVAL-FIELD := KW-YEAR | KW-MONTH | KW-DAY | KW-HOUR | KW-MINUTE | KW-SECOND;"))

(def datatype-rules
  "All rules for all datatypes.

  A list of five elements: the `DATATYPE` alternation rule (one `DT-...` name
  per known datatype, upper-cased with internal whitespace replaced by
  hyphens), then the sequences of rules generated for the simple,
  integer-parameter and with/without-timezone datatypes, and finally the
  hand-written interval rules. The result is nested; `grammar` flattens it."
  (list
    ;; Every piece here is a plain string, so `str` builds the rule directly.
    (str "DATATYPE := "
         (join " | "
               (map #(str "DT-" (upper-case (join "-" (split % #"\s+"))))
                    (concat simple-datatypes
                            with-integer-parameter-datatypes
                            with-or-without-timezone-datatypes
                            special-datatypes)))
         " ;\n")
    (map make-simple-datatype-rule simple-datatypes)
    (map make-with-integer-parameter-datatype-rule with-integer-parameter-datatypes)
    (map make-with-or-without-timezone-datatype-rule with-or-without-timezone-datatypes)
    interval-datatype-rules))

(def basic-rules
  "Rough general syntax of the subset of Postgresql I have so far needed to support.
  There will be some differences from ANSI 92. Rules are generally expressed as single
  strings. Order is not very significant, but instaparse treats the first rule as special.

  Notes on rules whose text is easy to get wrong:
  * `VALUES` is a comma-separated list of one or more `VALUE`s, written
    right-recursively in the same way as `NAMES`.
  * `CHAR-VAL` matches a single-quoted literal up to its closing quote; a
    doubled quote ('') inside the literal is treated as an escaped quote.
    `\\047` is the octal escape for the single quote character, used because
    the grammar's regex literals are themselves delimited by single quotes."
  (list
    "STATEMENTS := OPT-SPACE STATEMENT + ;"
    "STATEMENT := CREATE-STMT |
                  ALTER-STMT |
                  SET-STMT |
                  COMMENT |
                  DROP-STMT |
                  EXTENSION-DECL |
                  SEQUENCE-DECL |
                  INSERT-STMT |
                  PERMISSIONS-STMT;"
    "ALTER-STMT := ALTER-TABLE | ALTER-SEQUENCE ;"

    "CREATE-STMT := CREATE-TABLE-STMT | CREATE-INDEX-STMT | CREATE-ROLE-STMT ;"

    "DROP-STMT := KW-DROP OBJ-TYPE EXISTENCE QUAL-NAME TERMINATOR ;"
    "OBJ-TYPE := KW-TABLE | KW-INDEX ;"

    "SET-STMT := KW-SET NAME EQUALS EXPRESSION TERMINATOR ;"
    "EXTENSION-DECL := KW-CREATE KW-EXTENSION EXISTENCE NAME KW-WITH KW-SCHEMA NAME TERMINATOR ;"

    ;; taken from https://www.postgresql.org/docs/10/static/sql-createsequence.html
    "SEQUENCE-DECL := KW-CREATE PERMANENCE KW-SEQUENCE EXISTENCE NAME SEQ-SPEC-ELEMENTS TERMINATOR ;"
    "SEQ-SPEC-ELEMENTS := SEQ-SPEC-ELEMENT + ;"
    "SEQ-SPEC-ELEMENT := SEQ-SPEC-AS | SEQ-SPEC-INCREMENT | SEQ-SPEC-MIN | SEQ-SPEC-MAX | SEQ-SPEC-START | SEQ-SPEC-CACHE | SEQ-SPEC-CYCLE | SEQ-SPEC-OWNER ;"
    "SEQ-SPEC-AS := KW-AS DATATYPE ;"
    "SEQ-SPEC-INCREMENT := INT-VAL | KW-INCREMENT INT-VAL | KW-INCREMENT KW-BY INT-VAL ;"
    ;; I can't tell from the spec whether expressions are allowed as the value of minvalue or maxvalue.
    "SEQ-SPEC-MIN := KW-NO KW-MINVALUE | KW-MINVALUE INT-VAL ;"
    "SEQ-SPEC-MAX := KW-NO KW-MAXVALUE | KW-MAXVALUE INT-VAL ;"
    "SEQ-SPEC-START := KW-START KW-WITH INT-VAL | KW-START INT-VAL ;"
    "SEQ-SPEC-CACHE := KW-CACHE INT-VAL ;"
    "SEQ-SPEC-CYCLE := KW-CYCLE | KW-NO KW-CYCLE ;"
    "SEQ-SPEC-OWNER := KW-OWNED KW-BY QUAL-NAME | KW-OWNED KW-BY KW-NONE ;"

    ;; from https://www.postgresql.org/docs/10/static/sql-altersequence.html
    "ALTER-SEQUENCE := KW-ALTER KW-SEQUENCE EXISTENCE NAME ALTER-SEQ-ELEMENTS TERMINATOR ;"
    "ALTER-SEQ-ELEMENTS := ALTER-SEQ-ELEMENT + ;"
    "ALTER-SEQ-ELEMENT := SEQ-SPEC-ELEMENT | OWNER-TO | SEQ-RENAME | SEQ-SET-SCHEMA ;"
    "OWNER-TO := KW-OWNER KW-TO NAME ;"
    "SEQ-RENAME := KW-RENAME KW-TO NAME ;"
    "SEQ-SET-SCHEMA := KW-SET KW-SCHEMA NAME ;"

    ;; TODO: there is a *lot* more grammar to do here, but I don't need it (yet)
    "INSERT-STMT := KW-INSERT KW-INTO QUAL-NAME KW-VALUES LPAR VALUES RPAR TERMINATOR ;"

    ;; taken from https://www.postgresql.org/docs/10/static/sql-createtable.html
    ;; but by no means all of that is implemented.
    "CREATE-TABLE-STMT := KW-CREATE PERMANENCE KW-TABLE EXISTENCE NAME LPAR TABLE-SPEC-ELEMENTS RPAR TERMINATOR ;"
    "TABLE-SPEC-ELEMENTS := TABLE-SPEC-ELT-COMMA * TABLE-SPEC-ELEMENT OPT-COMMA; "
    "TABLE-SPEC-ELT-COMMA := TABLE-SPEC-ELEMENT COMMA ;"
    "TABLE-SPEC-ELEMENT := COLUMN-SPEC | TABLE-CONSTRAINT ;"
    "COLUMN-SPEC := NAME SPACE DATATYPE COLUMN-CONSTRAINTS ;"
    "TABLE-CONSTRAINT := KW-CONSTRAINT NAME TC-ELEMENT ;"
    "TC-ELEMENT := TC-ELT-FK ;" ;; OR OTHERS...
    "TC-ELT-FK := REFERENCES-CC ;"
    "COLUMN-CONSTRAINTS := COLUMN-CONSTRAINT * ;"
    "COLUMN-CONSTRAINT := KW-CONSTRAINT NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;"
    "NOT-NULL-CC := KW-NOT KW-NULL ;"
    "NULL-CC := KW-NULL ;"
    "DEFAULT-CC := KW-DEFAULT EXPRESSION ;"
    "UNIQUE-CC := KW-UNIQUE INDEX-PARAMS ;"
    "PRIMARY-CC := KW-PRIMARY KW-KEY INDEX-PARAMS ;"
    "REFERENCES-CC := KW-REFERENCES NAME LPAR NAMES RPAR REF-DIRECTIVES | KW-REFERENCES NAME REF-DIRECTIVES | KW-FOREIGN KW-KEY LPAR NAMES RPAR REFERENCES-CC;"
    "REF-DIRECTIVES := REF-DIRECTIVE * ;"
    "REF-DIRECTIVE := REF-MATCH | REF-ON-UPDATE | REF-ON-DELETE ;"
    "REF-MATCH := KW-MATCH MATCH-TYPE ;"
    "REF-ON-DELETE := KW-ON KW-DELETE REF-ACTION ;"
    "REF-ON-UPDATE := KW-ON KW-UPDATE REF-ACTION ;"
    "MATCH-TYPE := KW-FULL | KW-PARTIAL | KW-SIMPLE ;"
    "INDEX-PARAMS := EXPRESSION | '' ;"
    "REF-ACTION := KW-NO KW-ACTION | KW-RESTRICT | KW-CASCADE | KW-SET VALUE;"
    "EXISTENCE := '' | KW-IF KW-NOT KW-EXISTS | KW-IF KW-EXISTS ;"
    "PERMANENCE := '' | KW-TEMP | KW-TEMPORARY ;"

    "ALTER-TABLE := KW-ALTER KW-TABLE EXISTENCE KW-ONLY ? QUAL-NAME OPT-SPACE ALTER-TABLE-ELEMENTS TERMINATOR ;"
    "ALTER-TABLE-ELEMENTS := ALTER-TABLE-ELEMENT + ;"
    "ALTER-TABLE-ELEMENT := OWNER-TO | ALTER-COLUMN | ADD-CONSTRAINT;"
    "ALTER-COLUMN := KW-ALTER KW-COLUMN NAME ALTER-COL-SPEC ;"
    "ALTER-COL-SPEC := ALTER-COL-TYPE | ALTER-COL-DFLT;"
    "ALTER-COL-TYPE := KW-SET OPT-KW-DATA KW-TYPE DATATYPE | ALTER-COL-TYPE KW-COLLATE NAME | ALTER-COL-TYPE KW-USING EXPRESSION ;"
    "ALTER-COL-DFLT := KW-SET KW-DEFAULT EXPRESSION ;"
    "ADD-CONSTRAINT := KW-ADD COLUMN-CONSTRAINT ;"

    "CREATE-INDEX-STMT := KW-CREATE OPT-KW-UNIQUE KW-INDEX OPT-KW-CONCURRENTLY NAME ID-METHOD ? KW-ON QUAL-NAME LPAR NAMES RPAR INDEX-DIRECTIVES TERMINATOR;"
    "INDEX-DIRECTIVES := INDEX-DIRECTIVE *;"
    "INDEX-DIRECTIVE := ID-COLLATION ;"

    ;; Terminated with ';' like every other rule in this list.
    "ID-COLLATION := KW-COLLATION NAME ;"
    "ID-METHOD := KW-USING KW-METHOD NAME ;"
    "ID-NULLS := KW-NULLS KW-FIRST | KW-NULLS KW-LAST;"
    "ID-SORT-ORDER := KW-ASC | KW-DESC ;"

    ;; from https://www.postgresql.org/docs/current/static/sql-createrole.html
    ;; https://www.postgresql.org/docs/10/static/sql-createuser.html
    "CREATE-ROLE-STMT := KW-CREATE ROLE NAME ROLE-OPTIONS TERMINATOR;"
    "ROLE := KW-GROUP | KW-ROLE | KW-USER ;"
    "ROLE-OPTIONS := KW-WITH ROLE-OPTIONS | OPT-SPACE ROLE-OPTION *;"
    "ROLE-OPTION := RO-SUPERUSER | RO-CREATEDB | RO-CREATEROLE | RO-INHERIT | RO-REPLIC | RO-BYPASSRLS | RO-LOGIN| RO-CONN-LIMIT | RO-PASSWORD | RO-TIMEOUT | RO-IN-ROLE | RO-ROLE | RO-ADMIN | RO-USER | RO-SYSID;"
    "RO-SUPERUSER := KW-SUPERUSER | KW-NOSUPERUSER ;"
    "RO-CREATEDB := KW-CREATEDB | KW-NOCREATEDB ;"
    "RO-CREATEROLE := KW-CREATEROLE | KW-NOCREATEROLE ;"
    "RO-INHERIT := KW-INHERIT | KW-NOINHERIT ;"
    "RO-REPLIC := KW-REPLICATION | KW-NOREPLICATION ;"
    "RO-BYPASSRLS := KW-BYPASSRLS | KW-NOBYPASSRLS ;"
    "RO-LOGIN := KW-LOGIN | KW-NOLOGIN ;"
    "RO-CONN-LIMIT := KW-CONNECTION KW-LIMIT INT-VAL ;"
    "RO-PASSWORD := KW-PASSWORD CHAR-VAL | KW-ENCRYPTED KW-PASSWORD CHAR-VAL ;"
    ;; The value here is actually a date/time value, but that's a level of detail we don't need.
    "RO-TIMEOUT := KW-VALID KW-UNTIL CHAR-VAL ;"
    "RO-IN-ROLE := KW-IN KW-ROLE NAMES | KW-IN KW-GROUP NAMES ;"
    "RO-ROLE := KW-ROLE NAMES ;"
    "RO-ADMIN := KW-ADMIN NAMES ;"
    "RO-USER := KW-USER NAMES ;"
    "RO-SYSID := KW-SYSID CHAR-VAL ;"

    "PERMISSIONS-STMT := REVOKE-STMT | GRANT-STMT;"
    "REVOKE-STMT := KW-REVOKE PERMISSIONS KW-ON OPT-KW-SCHEMA QUAL-NAME KW-FROM NAMES TERMINATOR;"
    "GRANT-STMT := KW-GRANT PERMISSIONS KW-ON OPT-KW-SCHEMA QUAL-NAME KW-TO NAMES TERMINATOR;"

    "PERMISSIONS := PERMISSION-COMMA * PERMISSION ;"
    "PERMISSION-COMMA := PERMISSION COMMA ;"
    "PERMISSION := KW-ALL | KW-SELECT | KW-INSERT | KW-UPDATE | KW-DELETE ;"

    "OPT-KW-CONCURRENTLY := KW-CONCURRENTLY | '' ;"
    "OPT-KW-DATA := KW-DATA | '' ;"
    "OPT-KW-SCHEMA := KW-SCHEMA | '' ;"
    "OPT-KW-UNIQUE := KW-UNIQUE | '' ;"

    "COMMENT := KW-COMMENT #'[^;]*' TERMINATOR |#'--[^\\n\\r]*' ;"
    ;; TODO: much more to do here!
    "EXPRESSION := VALUE | KEYWORD | FUNCTION | LPAR EXPRESSION RPAR;"
    "FUNCTION := NAME LPAR VALUES RPAR | NAME LPAR RPAR ;"
    ;; TODO: same for values.
    ;; One or more VALUEs separated by commas (previously a single VALUE
    ;; followed by any number of commas, which could not match `1, 2`).
    "VALUES := VALUE | VALUE COMMA VALUES ;"
    "VALUE := INT-VAL | CHAR-VAL | TRUTH-VAL | KW-NULL | NAMES | VALUE '::' NAME;"
    "INT-VAL := #'[0-9]+' ;"
    ;; Stop at the closing quote rather than the last quote on the line, so
    ;; that `'a', 'b'` is two literals; '' inside a literal is an escaped quote.
    "CHAR-VAL := #'\\047[^\\047]*(?:\\047\\047[^\\047]*)*\\047' ;"
    "TRUTH-VAL := KW-TRUE | KW-FALSE ;"
    "DOT := '.' ;"
    ;; there don't seem to be any valid cases where a comma may not be either preceded or succeeded by white-space
    "COMMA := OPT-SPACE ',' OPT-SPACE ;"
    "LPAR := OPT-SPACE '(' OPT-SPACE ;"
    "RPAR := OPT-SPACE ')' OPT-SPACE ; "
    "EQUALS := OPT-SPACE '=' OPT-SPACE ;"
    "OPT-COMMA := COMMA | OPT-SPACE ;"
    ;; OPT-SPACE is optional space - it's acceptable as OPT-SPACE if there are no whitespace characters. Comments are
    ;; also acceptable wherever OPT-SPACE is acceptable
    "OPT-SPACE := #'\\s*' | OPT-SPACE COMMENT OPT-SPACE;"
    ;; SPACE is mandatory white-space; comments are acceptable here too but there must be a real space character
    "SPACE := #'\\s+' | OPT-SPACE SPACE | SPACE OPT-SPACE ;"
    "QUAL-NAME := NAME | NAME DOT NAME ;"
    "NAMES := NAME | NAME COMMA NAMES ;"
    "NAME := #'[a-zA-Z][a-zA-Z0-9_]*' | QUOTE-MK NAME QUOTE-MK ;"
    "QUOTE-MK := '\"' ;"
    "TERMINATOR := SEMI-COLON | KW-GO | OPT-SPACE TERMINATOR OPT-SPACE;"
    "SEMI-COLON := ';';"))

(def grammar
  "The complete grammar source text: every rule from `basic-rules`,
  `datatype-rules` and `keyword-rules`, in that order, flattened into a single
  sequence and joined with newlines. `basic-rules` comes first so that its
  first rule is the first rule of the grammar."
  (join "\n" (flatten (list basic-rules datatype-rules keyword-rules))))

(def parse
  "Parse the argument, assumed to be a string in the correct syntax, and return a parse tree.
  This is the instaparse parser built from `grammar`; it is constructed once,
  when this namespace is loaded."
  (insta/parser grammar))