Now parses the sample SQL file I most want to parse!

This commit is contained in:
Simon Brooke 2018-02-14 13:19:29 +00:00
parent 9199483f5e
commit 6d49ad3774
2 changed files with 107 additions and 53 deletions

View file

@ -28,14 +28,23 @@
;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(def keywords ["action" "alter" "as" "by" "cache" "cascade" "comment" "constraint" "create" "cycle" "day" "default" "delete" "exists"
"extension" "false" "from" "full" "go" "hour" "if" "increment" "key" "match" "maxvalue" "minute" "minvalue"
"month" "no" "none" "not" "null" "off" "on" "owned" "partial" "primary" "references" "restrict" "schema" "second"
"select" "sequence" "set" "simple" "start" "table" "temp" "temporary" "time" "to" "true" "unique" "update" "where" "with" "without"
"year" "zone"])
(def keywords
"Keywords from the subset of Postgresql I have so far needed to support. Feel free to either
1. Add further Postrgresql keywords, or
2. Start a new def for non-postgresql keywords."
["action" "add" "all" "alter" "as" "by" "cache" "cascade" "collate" "column" "comment" "constraint" "create"
"cycle" "data" "day" "default" "delete" "exists" "extension" "false" "foreign" "from" "full"
"go" "grant" "hour" "if" "increment" "insert" "into" "key" "match" "maxvalue" "minute" "minvalue"
"month" "no" "none" "not" "null" "off" "on" "only" "owned" "owner" "partial" "primary"
"references" "rename" "restrict" "revoke" "schema" "second" "select" "sequence" "set"
"simple" "start" "table" "temp" "temporary" "time" "to" "true" "type" "unique"
"update" "using" "values" "where" "with" "without" "year" "zone"])
(def keyword-rules
"Rules to match keywords case-insensitively. It is my practice to write SQL in
lower case, but I know many prefer upper case."
(flatten
(list
(apply str
@ -49,23 +58,23 @@
(def simple-datatypes
"datatypes which take no arguments or special syntax"
["bigserial" "bit" "boolean" "bytea" "date" "double precision" "float" "integer" "money" "numeric" "real" "serial" "text" ])
"(Postgresql) datatypes which take no arguments or special syntax"
["bigint" "bigserial" "bit" "boolean" "bytea" "date" "double precision" "float" "integer" "money" "numeric" "real" "serial" "text" ])
(def with-integer-parameter-datatypes
"datatypes which take a single integer parameter, typically the storage size"
"(Postgresql) datatypes which take a single integer parameter, typically the storage size"
["char" "character" "character varying" "varchar"])
(def with-or-without-timezone-datatypes
"datatypes which may optionally take 'with (or without) time zone"
"(Postgresql) datatypes which may optionally take 'with (or without) time zone"
["time" "timestamp"])
(def special-datatypes
"Datatypes with complex special grammar"
"(Postgresql) Datatypes with complex special grammar"
["interval"])
(def interval-datatype-rules
"interval has special complex grammar. TODO: this is not entirely correct; some
"`interval` has special complex grammar. TODO: this is not entirely correct; some
combinations of 'x to y' are not allowed."
(list "DT-INTERVAL := #'(?i)interval' | #'(?i)interval' SPACE INTERVAL-FIELDS;"
"INTERVAL-FIELDS := INTERVAL-FIELD | INTERVAL-FIELD SPACE KW-TO SPACE INTERVAL-FIELD;"
@ -90,60 +99,96 @@
interval-datatype-rules))
(def basic-rules
"Rough general syntax of the subset of Postgresql I have so far needed to support.
There will be some differences from ANSI 92. Rules are generally expressed as single
strings. Order is not very significant, but instaparse treats the first rule as special."
(list
"STATEMENTS := STATEMENT TERMINATOR | STATEMENT TERMINATOR STATEMENTS ;"
"STATEMENT := TABLE-DECL | ALTER-STMT | SET-STMT | COMMENT-STMT | EXTENSION-DECL | SEQUENCE-DECL;"
"ALTER-STMT := ALTER-TABLE ;"
"ALTER-TABLE := KW-ALTER SPACE KW-TABLE SPACE QUAL-NAME SPACE NAME SPACE KW-TO SPACE NAME ;"
"SET-STMT := KW-SET SPACE NAME OPT-SPACE '=' OPT-SPACE EXPRESSION ;"
"COMMENT-STMT := KW-COMMENT #'[~;]*' ;"
"EXTENSION-DECL := KW-CREATE SPACE KW-EXTENSION EXISTENCE SPACE NAME SPACE KW-WITH SPACE KW-SCHEMA SPACE NAME ;"
"STATEMENTS := OPT-SPACE STATEMENT | OPT-SPACE STATEMENT OPT-SPACE STATEMENTS ;"
"STATEMENT := TABLE-DECL | ALTER-STMT | SET-STMT | COMMENT | EXTENSION-DECL | SEQUENCE-DECL | INSERT-STMT | PERMISSIONS-STMT;"
"ALTER-STMT := ALTER-TABLE | ALTER-SEQUENCE ;"
"SET-STMT := KW-SET NAME EQUALS EXPRESSION TERMINATOR ;"
"EXTENSION-DECL := KW-CREATE KW-EXTENSION EXISTENCE NAME KW-WITH KW-SCHEMA NAME TERMINATOR ;"
;; taken from https://www.postgresql.org/docs/10/static/sql-createsequence.html
"SEQUENCE-DECL := KW-CREATE SPACE PERMANENCE KW-SEQUENCE EXISTENCE SPACE NAME SPACE SEQ-SPEC-ELEMENTS ;"
"SEQ-SPEC-ELEMENTS := SEQ-SPEC-ELEMENT | SEQ-SPEC-ELEMENT SPACE SEQ-SPEC-ELEMENTS ;"
"SEQUENCE-DECL := KW-CREATE PERMANENCE KW-SEQUENCE EXISTENCE NAME SEQ-SPEC-ELEMENTS TERMINATOR ;"
"SEQ-SPEC-ELEMENTS := SEQ-SPEC-ELEMENT | SEQ-SPEC-ELEMENT SEQ-SPEC-ELEMENTS ;"
"SEQ-SPEC-ELEMENT := SEQ-SPEC-AS | SEQ-SPEC-INCREMENT | SEQ-SPEC-MIN | SEQ-SPEC-MAX | SEQ-SPEC-START | SEQ-SPEC-CACHE | SEQ-SPEC-CYCLE | SEQ-SPEC-OWNER ;"
"SEQ-SPEC-AS := KW-AS DATATYPE ;"
"SEQ-SPEC-INCREMENT := INT-VAL | KW-INCREMENT SPACE INT-VAL | KW-INCREMENT SPACE KW-BY SPACE INT-VAL ;"
"SEQ-SPEC-INCREMENT := INT-VAL | KW-INCREMENT INT-VAL | KW-INCREMENT KW-BY INT-VAL ;"
;; I can't tell from the spec whether expressions are allowed as the value of minvalue or maxvalue.
"SEQ-SPEC-MIN := KW-NO SPACE KW-MINVALUE | KW-MINVALUE SPACE INT-VAL ;"
"SEQ-SPEC-MAX := KW-NO SPACE KW-MAXVALUE | KW-MAXVALUE SPACE INT-VAL ;"
"SEQ-SPEC-START := KW-START SPACE KW-WITH SPACE INT-VAL | KW-START SPACE INT-VAL ;"
"SEQ-SPEC-CACHE := KW-CACHE SPACE INT-VAL ;"
"SEQ-SPEC-CYCLE := KW-CYCLE | KW-NO SPACE KW-CYCLE ;"
"SEQ-SPEC-OWNER := KW-OWNED SPACE KW-BY SPACE QUAL-NAME | KW-OWNED SPACE KW-BY SPACE KW-NONE ;"
"SEQ-SPEC-MIN := KW-NO KW-MINVALUE | KW-MINVALUE INT-VAL ;"
"SEQ-SPEC-MAX := KW-NO KW-MAXVALUE | KW-MAXVALUE INT-VAL ;"
"SEQ-SPEC-START := KW-START KW-WITH INT-VAL | KW-START INT-VAL ;"
"SEQ-SPEC-CACHE := KW-CACHE INT-VAL ;"
"SEQ-SPEC-CYCLE := KW-CYCLE | KW-NO KW-CYCLE ;"
"SEQ-SPEC-OWNER := KW-OWNED KW-BY QUAL-NAME | KW-OWNED KW-BY KW-NONE ;"
;; from https://www.postgresql.org/docs/10/static/sql-altersequence.html
"ALTER-SEQUENCE := KW-ALTER KW-SEQUENCE EXISTENCE NAME ALTER-SEQ-ELEMENTS TERMINATOR ;"
"ALTER-SEQ-ELEMENTS := ALTER-SEQ-ELEMENT | ALTER-SEQ-ELEMENT ALTER-SEQ-ELEMENTS ;"
"ALTER-SEQ-ELEMENT := SEQ-SPEC-ELEMENT | OWNER-TO | SEQ-RENAME | SEQ-SET-SCHEMA ;"
"OWNER-TO := KW-OWNER KW-TO NAME ;"
"SEQ-RENAME := KW-RENAME KW-TO NAME ;"
"SEQ-SET-SCHEMA := KW-SET KW-SCHEMA NAME ;"
;; TODO: there is a *lot* more gramar to do here, but I don't need it (yet)
"INSERT-STMT := KW-INSERT KW-INTO QUAL-NAME KW-VALUES LPAR VALUES RPAR TERMINATOR ;"
;; taken from https://www.postgresql.org/docs/10/static/sql-createtable.html
;; but by no means all of that is implemented.
"TABLE-DECL := KW-CREATE SPACE PERMANENCE KW-TABLE EXISTENCE SPACE NAME OPT-SPACE LPAR TABLE-SPEC-ELEMENTS RPAR ;"
"TABLE-DECL := KW-CREATE PERMANENCE KW-TABLE EXISTENCE NAME LPAR TABLE-SPEC-ELEMENTS RPAR TERMINATOR ;"
"TABLE-SPEC-ELEMENTS := TABLE-SPEC-ELEMENT | TABLE-SPEC-ELEMENT COMMA TABLE-SPEC-ELEMENTS ; "
"TABLE-SPEC-ELEMENT := COLUMN-SPEC | TABLE-CONSTRAINT ;"
"COLUMN-SPEC := NAME SPACE DATATYPE | COLUMN-SPEC OPT-SPACE COLUMN-CONSTRAINTS ;"
"TABLE-CONSTRAINT := 'TODO' ;"
"COLUMN-CONSTRAINTS := COLUMN-CONSTRAINT | COLUMN-CONSTRAINT SPACE COLUMN-CONSTRAINT ;"
"COLUMN-CONSTRAINT := KW-CONSTRAINT SPACE NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;"
"NOT-NULL-CC := KW-NOT SPACE KW-NULL ;"
"COLUMN-CONSTRAINT := KW-CONSTRAINT NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;"
"NOT-NULL-CC := KW-NOT KW-NULL ;"
"NULL-CC := KW-NULL ;"
"DEFAULT-CC := KW-DEFAULT SPACE EXPR ;"
"UNIQUE-CC := KW-UNIQUE SPACE INDEX-PARAMS ;"
"PRIMARY-CC := KW-PRIMARY SPACE KW-KEY INDEX-PARAMS ;"
"REFERENCES-CC := KW-REFERENCES SPACE NAME SPACE LPAR NAMES RPAR REF-DIRECTIVES ;"
"REF-DIRECTIVES := '' | REF-DIRECTIVE SPACE REF-DIRECTIVES ;"
"DEFAULT-CC := KW-DEFAULT EXPRESSION ;"
"UNIQUE-CC := KW-UNIQUE INDEX-PARAMS ;"
"PRIMARY-CC := KW-PRIMARY KW-KEY INDEX-PARAMS ;"
"REFERENCES-CC := KW-REFERENCES NAME LPAR NAMES RPAR REF-DIRECTIVES | KW-FOREIGN KW-KEY LPAR NAMES RPAR REFERENCES-CC;"
"REF-DIRECTIVES := '' | REF-DIRECTIVE REF-DIRECTIVES ;"
"REF-DIRECTIVE := REF-MATCH | REF-ON-UPDATE | REF-ON-DELETE ;"
"REF-MATCH := KW-MATCH SPACE MATCH-TYPE ;"
"REF-ON-DELETE := KW-ON SPACE KW-DELETE SPACE REF-ACTION ;"
"REF-ON-UPDATE := KW-ON SPACE KW-UPDATE SPACE REF-ACTION ;"
"REF-MATCH := KW-MATCH MATCH-TYPE ;"
"REF-ON-DELETE := KW-ON KW-DELETE REF-ACTION ;"
"REF-ON-UPDATE := KW-ON KW-UPDATE REF-ACTION ;"
"MATCH-TYPE := KW-FULL | KW-PARTIAL | KW-SIMPLE ;"
"INDEX-PARAMS := 'foo' ;"
"REF-ACTION := KW-NO SPACE KW-ACTION | KW-RESTRICT | KW-CASCADE | KW-SET SPACE VALUE;"
"EXPR := 'TODO' ; "
"EXISTENCE := '' | SPACE KW-IF SPACE KW-NOT SPACE KW-EXISTS ;"
"PERMANENCE := '' | KW-TEMP SPACE | KW-TEMPORARY SPACE ;"
"COMMENT := #'--[~\\n\\r]*' ;"
"INDEX-PARAMS := EXPRESSION ;"
"REF-ACTION := KW-NO KW-ACTION | KW-RESTRICT | KW-CASCADE | KW-SET VALUE;"
"EXISTENCE := '' | KW-IF KW-NOT KW-EXISTS | KW-IF KW-EXISTS ;"
"PERMANENCE := '' | KW-TEMP | KW-TEMPORARY ;"
"ONLY := KW-ONLY | '' ;"
"ALTER-TABLE := KW-ALTER KW-TABLE EXISTENCE ONLY QUAL-NAME OPT-SPACE ALTER-TABLE-ELEMENTS TERMINATOR ;"
"ALTER-TABLE-ELEMENTS := ALTER-TABLE-ELEMENT | ALTER-TABLE-ELEMENT OPT-SPACE ALTER-TABLE-ELEMENTS ;"
"ALTER-TABLE-ELEMENT := OWNER-TO | ALTER-COLUMN | ADD-CONSTRAINT;"
"ALTER-COLUMN := KW-ALTER KW-COLUMN NAME ALTER-COL-SPEC ;"
"ALTER-COL-SPEC := ALTER-COL-TYPE | ALTER-COL-DFLT;"
"ALTER-COL-TYPE := KW-SET OPT-KW-DATA KW-TYPE DATATYPE | ALTER-COL-TYPE KW-COLLATE NAME | ALTER-COL-TYPE KW-USING EXPRESSION ;"
"ALTER-COL-DFLT := KW-SET KW-DEFAULT EXPRESSION ;"
"ADD-CONSTRAINT := KW-ADD COLUMN-CONSTRAINT ;"
"OPT-KW-DATA := KW-DATA | '' ;"
;; "TABLE-ADD-COL :=
"PERMISSIONS-STMT := REVOKE-STMT | GRANT-STMT;"
"REVOKE-STMT := KW-REVOKE PERMISSIONS KW-ON OPT-KW-SCHEMA QUAL-NAME KW-FROM NAMES TERMINATOR;"
"GRANT-STMT := KW-GRANT PERMISSIONS KW-ON OPT-KW-SCHEMA QUAL-NAME KW-TO NAMES TERMINATOR;"
"PERMISSIONS := PERMISSION | PERMISSION COMMA PERMISSIONS ;"
"PERMISSION := KW-ALL | KW-SELECT | KW-INSERT | KW-UPDATE | KW-DELETE ;"
"OPT-KW-SCHEMA := KW-SCHEMA | '' ;"
"COMMENT := KW-COMMENT #'[^;]*' TERMINATOR |#'--[^\\n\\r]*' ;"
;; TODO: much more to do here!
"EXPRESSION := VALUE | KEYWORD;"
"EXPRESSION := VALUE | KEYWORD | FUNCTION | LPAR EXPRESSION RPAR;"
"FUNCTION := NAME LPAR VALUES RPAR | NAME LPAR RPAR ;"
;; TODO: same for values.
"VALUE := INT-VAL | CHAR-VAL | TRUTH-VAL | KW-NULL | NAMES;"
"VALUES := VALUE | VALUE COMMA VALUES ;"
"VALUE := INT-VAL | CHAR-VAL | TRUTH-VAL | KW-NULL | NAMES | VALUE '::' NAME;"
"INT-VAL := #'[0-9]+' ;"
"CHAR-VAL := #'\\047.*\\047' ;"
"TRUTH-VAL := KW-TRUE | KW-FALSE ;"
@ -152,6 +197,7 @@
"COMMA := OPT-SPACE ',' OPT-SPACE ;"
"LPAR := OPT-SPACE '(' OPT-SPACE ;"
"RPAR := OPT-SPACE ')' OPT-SPACE ; "
"EQUALS := OPT-SPACE '=' OPT-SPACE ;"
;; OPT-SPACE is optional space - it's acceptable as OPT-SPACE if there are no whitespace characters. Comments are
;; also acceptable wherever OPT-SPACE is acceptable
"OPT-SPACE := #'\\s*' | OPT-SPACE COMMENT OPT-SPACE;"
@ -160,7 +206,7 @@
"QUAL-NAME := NAME | NAME DOT NAME ;"
"NAMES := NAME | NAME COMMA NAMES ;"
"NAME := #'[a-zA-Z][a-zA-Z0-9_]*'"
"TERMINATOR := OPT-SPACE SEMI-COLON OPT-SPACE | OPT-SPACE KW-GO OPT-SPACE;"
"TERMINATOR := SEMI-COLON | KW-GO | OPT-SPACE TERMINATOR OPT-SPACE;"
"SEMI-COLON := ';';"))
@ -171,7 +217,7 @@
"True if `x` is something which just clutters up the parse tree."
[x]
(and
(coll? x)(contains? #{:SPACE :OPT-SPACE :COMMENT} (first x))))
(coll? x)(contains? #{:SPACE :OPT-SPACE :COMMENT :OPT-KW-DATA} (first x))))
(defn remove-recursive
@ -190,3 +236,4 @@
"Parse the argument, assumed to be a string in the correct syntax, and return a parse tree."
(insta/parser grammar))
(def parse-comment (insta/parser "COMMENT := #'--[^\\n\\r]*' ;"))

View file

@ -30,7 +30,8 @@
(defn- make-unterminated-case-insensitive-match-rule
"Make a grammar rule which matches this `token` case-insensitively,
without the terminal semi-colon"
without the terminal semi-colon. Keywords may always optionally be preceded
by whitespace and are usually succeeded by whitespace."
[token]
(let [subtokens (split token #"\s+")
name (join "-" subtokens)]
@ -38,8 +39,9 @@
(flatten
(list
(upper-case name)
" := "
(join " SPACE " (map #(str "#'(?i)" % "'") subtokens)))))))
" := OPT-SPACE "
(join " SPACE " (map #(str "#'(?i)" % "'") subtokens))
" OPT-SPACE ")))))
(defn make-case-insensitive-match-rule
@ -65,8 +67,13 @@
(defn- make-timezone-clause
[match with-tz? with-precision?]
(join " SPACE "
(list (if with-precision? (str match " LPAR INT-VAL RPAR") match) (if with-tz? "KW-WITH" "KW-WITHOUT") "KW-TIME" "KW-ZONE")))
(join
" "
(list
(if with-precision? (str match " LPAR INT-VAL RPAR") match)
(if with-tz? "KW-WITH" "KW-WITHOUT")
"KW-TIME"
"KW-ZONE")))
(defn make-with-or-without-timezone-datatype-rule