Now parses a very substantial amount of SQL.

This commit is contained in:
Simon Brooke 2018-02-13 19:01:34 +00:00
parent d3cc5cb11f
commit 9199483f5e
4 changed files with 248 additions and 42 deletions

6
.gitignore vendored
View file

@ -1,2 +1,8 @@
Leiningen.gitignore
\.lein-repl-history
\.nrepl-port
target/classes/META-INF/maven/squirrel-parse/squirrel-parse/pom\.properties
target/

View file

@ -1,8 +1,9 @@
(ns ^{:doc "A very simple parser which parses production rules."
(ns ^{:doc "A parser for SQL."
:author "Simon Brooke"}
squirrel-parse.parser
(:require [instaparse.core :as insta]
[clojure.string :refer [split trim triml]]))
[clojure.string :refer [join split trim triml upper-case]]
[squirrel-parse.utils :refer :all]))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;
@ -27,46 +28,143 @@
;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(def grammar
(str
"TABLE-SPEC := 'create' SPACE 'table' EXISTENCE SPACE NAME OPT-SPACE LPAR TABLE-SPEC-ELEMENTS RPAR ;
TABLE-SPEC-ELEMENTS := TABLE-SPEC-ELEMENT | TABLE-SPEC-ELEMENT COMMA TABLE-SPEC-ELEMENTS ;
TABLE-SPEC-ELEMENT := COLUMN-SPEC | TABLE-CONSTRAINT ;
COLUMN-SPEC := NAME SPACE DATATYPE | COLUMN-SPEC SPACE COLUMN-CONSTRAINTS ;
TABLE-CONSTRAINT := 'foo' ;
COLUMN-CONSTRAINTS := COLUMN-CONSTRAINT | COLUMN-CONSTRAINT SPACE COLUMN-CONSTRAINT ;
COLUMN-CONSTRAINT := 'constraint' SPACE NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;
NOT-NULL-CC := 'not' SPACE 'null' ;
NULL-CC := 'null' ;
DEFAULT-CC := 'default' SPACE EXPR ;
UNIQUE-CC := 'unique' SPACE INDEX-PARAMS ;
PRIMARY-CC := 'primary' SPACE 'key' INDEX-PARAMS ;
REFERENCES-CC := 'references' SPACE NAME SPACE LPAR NAMES RPAR REF-DIRECTIVES ;
REF-DIRECTIVES := '' | REF-DIRECTIVE SPACE REF-DIRECTIVES ;
REF-DIRECTIVE := REF-MATCH | REF-ON-UPDATE | REF-ON-DELETE ;
REF-MATCH := 'match' SPACE MATCH-TYPE ;
REF-ON-DELETE := 'on' SPACE 'delete' SPACE REF-ACTION ;
REF-ON-UPDATE := 'on' SPACE 'update' SPACE REF-ACTION ;
MATCH-TYPE := 'full' | 'partial' | 'simple' ;
INDEX-PARAMS := '' ;
REF-ACTION := 'no' SPACE 'action' | 'restrict' | 'cascade' | 'set' SPACE 'null' | 'set' SPACE 'default';
EXPR := 'foo' ; "
;; there are some things we ignore
"EXISTENCE := SPACE 'if' SPACE 'not' SPACE 'exists' | '' ;
COMMENT := #'--[~\\n\\r]*' ;"
"DATATYPE := 'integer' | 'float' | 'char' LPAR INT-VAL RPAR | 'varchar' LPAR INT-VAL RPAR | 'text' ;
INT-VAL := #'[0-9]+' ;"
;; there don't seem to be any valid cases where a comma may not be either preceded or succeeded by white-space
"COMMA := OPT-SPACE ',' OPT-SPACE ;
LPAR := OPT-SPACE '(' OPT-SPACE ;
RPAR := OPT-SPACE ')' OPT-SPACE ; "
;; OPT-SPACE is optional space - it's acceptable as OPT-SPACE if there are no whitespace characters. Comments are
;; also acceptable wherever OPT-SPACE is acceptable
"OPT-SPACE := #'\\s*' | OPT-SPACE COMMENT OPT-SPACE;"
;; SPACE is mandatory white-space; comments are acceptable here too but there must be a real space character
"SPACE := #'\\s+' | OPT-SPACE SPACE | SPACE OPT-SPACE ;
NAMES := NAME | NAME COMMA NAMES ;
NAME := #'[a-zA-Z][a-zA-Z0-9_]*'"))
(def keywords ["action" "alter" "as" "by" "cache" "cascade" "comment" "constraint" "create" "cycle" "day" "default" "delete" "exists"
"extension" "false" "from" "full" "go" "hour" "if" "increment" "key" "match" "maxvalue" "minute" "minvalue"
"month" "no" "none" "not" "null" "off" "on" "owned" "partial" "primary" "references" "restrict" "schema" "second"
"select" "sequence" "set" "simple" "start" "table" "temp" "temporary" "time" "to" "true" "unique" "update" "where" "with" "without"
"year" "zone"])
(def keyword-rules
(flatten
(list
(apply str
(flatten
(list
"KEYWORD :="
(join " | "
(map #(str "KW-" (upper-case (join "-" (split % #"\s+"))))
keywords)))))
(map #(make-case-insensitive-match-rule % "KW") keywords))))
(def simple-datatypes
"datatypes which take no arguments or special syntax"
["bigserial" "bit" "boolean" "bytea" "date" "double precision" "float" "integer" "money" "numeric" "real" "serial" "text" ])
(def with-integer-parameter-datatypes
"datatypes which take a single integer parameter, typically the storage size"
["char" "character" "character varying" "varchar"])
(def with-or-without-timezone-datatypes
"datatypes which may optionally take 'with (or without) time zone"
["time" "timestamp"])
(def special-datatypes
"Datatypes with complex special grammar"
["interval"])
(def interval-datatype-rules
"interval has special complex grammar. TODO: this is not entirely correct; some
combinations of 'x to y' are not allowed."
(list "DT-INTERVAL := #'(?i)interval' | #'(?i)interval' SPACE INTERVAL-FIELDS;"
"INTERVAL-FIELDS := INTERVAL-FIELD | INTERVAL-FIELD SPACE KW-TO SPACE INTERVAL-FIELD;"
"INTERVAL-FIELD := KW-YEAR | KW-MONTH | KW-DAY | KW-HOUR | KW-MINUTE | KW-SECOND;"))
(def datatype-rules
"All rules for all datatypes"
(list
(apply str
(flatten
(list "DATATYPE := "
(join " | "
(map #(str "DT-" (upper-case (join "-" (split % #"\s+"))))
(flatten
(list simple-datatypes
with-integer-parameter-datatypes
with-or-without-timezone-datatypes
special-datatypes)))) " ;\n")))
(map make-simple-datatype-rule simple-datatypes)
(map make-with-integer-parameter-datatype-rule with-integer-parameter-datatypes)
(map make-with-or-without-timezone-datatype-rule with-or-without-timezone-datatypes)
interval-datatype-rules))
(def basic-rules
(list
"STATEMENTS := STATEMENT TERMINATOR | STATEMENT TERMINATOR STATEMENTS ;"
"STATEMENT := TABLE-DECL | ALTER-STMT | SET-STMT | COMMENT-STMT | EXTENSION-DECL | SEQUENCE-DECL;"
"ALTER-STMT := ALTER-TABLE ;"
"ALTER-TABLE := KW-ALTER SPACE KW-TABLE SPACE QUAL-NAME SPACE NAME SPACE KW-TO SPACE NAME ;"
"SET-STMT := KW-SET SPACE NAME OPT-SPACE '=' OPT-SPACE EXPRESSION ;"
"COMMENT-STMT := KW-COMMENT #'[~;]*' ;"
"EXTENSION-DECL := KW-CREATE SPACE KW-EXTENSION EXISTENCE SPACE NAME SPACE KW-WITH SPACE KW-SCHEMA SPACE NAME ;"
;; taken from https://www.postgresql.org/docs/10/static/sql-createsequence.html
"SEQUENCE-DECL := KW-CREATE SPACE PERMANENCE KW-SEQUENCE EXISTENCE SPACE NAME SPACE SEQ-SPEC-ELEMENTS ;"
"SEQ-SPEC-ELEMENTS := SEQ-SPEC-ELEMENT | SEQ-SPEC-ELEMENT SPACE SEQ-SPEC-ELEMENTS ;"
"SEQ-SPEC-ELEMENT := SEQ-SPEC-AS | SEQ-SPEC-INCREMENT | SEQ-SPEC-MIN | SEQ-SPEC-MAX | SEQ-SPEC-START | SEQ-SPEC-CACHE | SEQ-SPEC-CYCLE | SEQ-SPEC-OWNER ;"
"SEQ-SPEC-AS := KW-AS DATATYPE ;"
"SEQ-SPEC-INCREMENT := INT-VAL | KW-INCREMENT SPACE INT-VAL | KW-INCREMENT SPACE KW-BY SPACE INT-VAL ;"
;; I can't tell from the spec whether expressions are allowed as the value of minvalue or maxvalue.
"SEQ-SPEC-MIN := KW-NO SPACE KW-MINVALUE | KW-MINVALUE SPACE INT-VAL ;"
"SEQ-SPEC-MAX := KW-NO SPACE KW-MAXVALUE | KW-MAXVALUE SPACE INT-VAL ;"
"SEQ-SPEC-START := KW-START SPACE KW-WITH SPACE INT-VAL | KW-START SPACE INT-VAL ;"
"SEQ-SPEC-CACHE := KW-CACHE SPACE INT-VAL ;"
"SEQ-SPEC-CYCLE := KW-CYCLE | KW-NO SPACE KW-CYCLE ;"
"SEQ-SPEC-OWNER := KW-OWNED SPACE KW-BY SPACE QUAL-NAME | KW-OWNED SPACE KW-BY SPACE KW-NONE ;"
;; taken from https://www.postgresql.org/docs/10/static/sql-createtable.html
;; but by no means all of that is implemented.
"TABLE-DECL := KW-CREATE SPACE PERMANENCE KW-TABLE EXISTENCE SPACE NAME OPT-SPACE LPAR TABLE-SPEC-ELEMENTS RPAR ;"
"TABLE-SPEC-ELEMENTS := TABLE-SPEC-ELEMENT | TABLE-SPEC-ELEMENT COMMA TABLE-SPEC-ELEMENTS ; "
"TABLE-SPEC-ELEMENT := COLUMN-SPEC | TABLE-CONSTRAINT ;"
"COLUMN-SPEC := NAME SPACE DATATYPE | COLUMN-SPEC OPT-SPACE COLUMN-CONSTRAINTS ;"
"TABLE-CONSTRAINT := 'TODO' ;"
"COLUMN-CONSTRAINTS := COLUMN-CONSTRAINT | COLUMN-CONSTRAINT SPACE COLUMN-CONSTRAINT ;"
"COLUMN-CONSTRAINT := KW-CONSTRAINT SPACE NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;"
"NOT-NULL-CC := KW-NOT SPACE KW-NULL ;"
"NULL-CC := KW-NULL ;"
"DEFAULT-CC := KW-DEFAULT SPACE EXPR ;"
"UNIQUE-CC := KW-UNIQUE SPACE INDEX-PARAMS ;"
"PRIMARY-CC := KW-PRIMARY SPACE KW-KEY INDEX-PARAMS ;"
"REFERENCES-CC := KW-REFERENCES SPACE NAME SPACE LPAR NAMES RPAR REF-DIRECTIVES ;"
"REF-DIRECTIVES := '' | REF-DIRECTIVE SPACE REF-DIRECTIVES ;"
"REF-DIRECTIVE := REF-MATCH | REF-ON-UPDATE | REF-ON-DELETE ;"
"REF-MATCH := KW-MATCH SPACE MATCH-TYPE ;"
"REF-ON-DELETE := KW-ON SPACE KW-DELETE SPACE REF-ACTION ;"
"REF-ON-UPDATE := KW-ON SPACE KW-UPDATE SPACE REF-ACTION ;"
"MATCH-TYPE := KW-FULL | KW-PARTIAL | KW-SIMPLE ;"
"INDEX-PARAMS := 'foo' ;"
"REF-ACTION := KW-NO SPACE KW-ACTION | KW-RESTRICT | KW-CASCADE | KW-SET SPACE VALUE;"
"EXPR := 'TODO' ; "
"EXISTENCE := '' | SPACE KW-IF SPACE KW-NOT SPACE KW-EXISTS ;"
"PERMANENCE := '' | KW-TEMP SPACE | KW-TEMPORARY SPACE ;"
"COMMENT := #'--[~\\n\\r]*' ;"
;; TODO: much more to do here!
"EXPRESSION := VALUE | KEYWORD;"
;; TODO: same for values.
"VALUE := INT-VAL | CHAR-VAL | TRUTH-VAL | KW-NULL | NAMES;"
"INT-VAL := #'[0-9]+' ;"
"CHAR-VAL := #'\\047.*\\047' ;"
"TRUTH-VAL := KW-TRUE | KW-FALSE ;"
"DOT := '.' ;"
;; there don't seem to be any valid cases where a comma may not be either preceded or succeeded by white-space
"COMMA := OPT-SPACE ',' OPT-SPACE ;"
"LPAR := OPT-SPACE '(' OPT-SPACE ;"
"RPAR := OPT-SPACE ')' OPT-SPACE ; "
;; OPT-SPACE is optional space - it's acceptable as OPT-SPACE if there are no whitespace characters. Comments are
;; also acceptable wherever OPT-SPACE is acceptable
"OPT-SPACE := #'\\s*' | OPT-SPACE COMMENT OPT-SPACE;"
;; SPACE is mandatory white-space; comments are acceptable here too but there must be a real space character
"SPACE := #'\\s+' | OPT-SPACE SPACE | SPACE OPT-SPACE ;"
"QUAL-NAME := NAME | NAME DOT NAME ;"
"NAMES := NAME | NAME COMMA NAMES ;"
"NAME := #'[a-zA-Z][a-zA-Z0-9_]*'"
"TERMINATOR := OPT-SPACE SEMI-COLON OPT-SPACE | OPT-SPACE KW-GO OPT-SPACE;"
"SEMI-COLON := ';';"))
(def grammar (join "\n" (flatten (list basic-rules datatype-rules keyword-rules))))
(defn ignorable?

View file

@ -0,0 +1,95 @@
(ns ^{:doc "A parser for SQL: utility functions."
:author "Simon Brooke"}
squirrel-parse.utils
(:require [clojure.string :refer [join split trim triml upper-case]]))
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;;;
;;;; squirrel-parse.utils: utility functions supporting the parser.
;;;;
;;;; This program is free software; you can redistribute it and/or
;;;; modify it under the terms of the GNU General Public License
;;;; as published by the Free Software Foundation; either version 2
;;;; of the License, or (at your option) any later version.
;;;;
;;;; This program is distributed in the hope that it will be useful,
;;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;;;; GNU General Public License for more details.
;;;;
;;;; You should have received a copy of the GNU General Public License
;;;; along with this program; if not, write to the Free Software
;;;; Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
;;;; USA.
;;;;
;;;; Copyright (C) 2018 Simon Brooke
;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(defn- make-unterminated-case-insensitive-match-rule
"Make a grammar rule which matches this `token` case-insensitively,
without the terminal semi-colon"
[token]
(let [subtokens (split token #"\s+")
name (join "-" subtokens)]
(apply str
(flatten
(list
(upper-case name)
" := "
(join " SPACE " (map #(str "#'(?i)" % "'") subtokens)))))))
(defn make-case-insensitive-match-rule
"Make a grammar rule which matches this `token` case-insensitively, optionally prefixing the
name of the rule with this `prefix`"
([token prefix]
(str prefix "-" (make-case-insensitive-match-rule token)))
([token]
(str (make-unterminated-case-insensitive-match-rule token) ";")))
(defn make-simple-datatype-rule
"Make a rule which matches this `datatype`, for datatypes which take no parameters or special grammar."
[token]
(make-case-insensitive-match-rule token "DT"))
(defn make-with-integer-parameter-datatype-rule
"Make a rule which matches this `datatype`, for datatypes which take a single integer parameter."
[token]
(str "DT-" (make-unterminated-case-insensitive-match-rule token) " LPAR INT-VAL RPAR ;"))
(defn- make-timezone-clause
[match with-tz? with-precision?]
(join " SPACE "
(list (if with-precision? (str match " LPAR INT-VAL RPAR") match) (if with-tz? "KW-WITH" "KW-WITHOUT") "KW-TIME" "KW-ZONE")))
(defn make-with-or-without-timezone-datatype-rule
"Make a rule which matches this `datatype`, for datatypes which may optionally take
'with (or without) time zone'."
[token]
(let [subtokens (split token #"\s+")
name (join "-" subtokens)
match (join " SPACE " (map #(str "#'(?i)" % "'") subtokens))]
(apply str
(flatten
(list
"DT-"
(upper-case name)
" := "
(join
" | "
(list
match
(str match " LPAR INT-VAL RPAR")
(make-timezone-clause match true false)
(make-timezone-clause match true true)
(make-timezone-clause match false false)
(make-timezone-clause match false true))))))))

View file

@ -0,0 +1,7 @@
(ns squirrel-parse.parser-test
(:require [clojure.test :refer :all]
[squirrel-parse.parser :refer :all]))
(deftest a-test
(testing "FIXME, I fail."
(is (= 0 1))))