Now parses the sample SQL file I most want to parse!

This commit is contained in:
Simon Brooke 2018-02-14 13:19:29 +00:00
parent 9199483f5e
commit 6d49ad3774
2 changed files with 107 additions and 53 deletions

View file

@ -28,14 +28,23 @@
;;;; ;;;;
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
(def keywords ["action" "alter" "as" "by" "cache" "cascade" "comment" "constraint" "create" "cycle" "day" "default" "delete" "exists"
"extension" "false" "from" "full" "go" "hour" "if" "increment" "key" "match" "maxvalue" "minute" "minvalue" (def keywords
"month" "no" "none" "not" "null" "off" "on" "owned" "partial" "primary" "references" "restrict" "schema" "second" "Keywords from the subset of Postgresql I have so far needed to support. Feel free to either
"select" "sequence" "set" "simple" "start" "table" "temp" "temporary" "time" "to" "true" "unique" "update" "where" "with" "without" 1. Add further Postrgresql keywords, or
"year" "zone"]) 2. Start a new def for non-postgresql keywords."
["action" "add" "all" "alter" "as" "by" "cache" "cascade" "collate" "column" "comment" "constraint" "create"
"cycle" "data" "day" "default" "delete" "exists" "extension" "false" "foreign" "from" "full"
"go" "grant" "hour" "if" "increment" "insert" "into" "key" "match" "maxvalue" "minute" "minvalue"
"month" "no" "none" "not" "null" "off" "on" "only" "owned" "owner" "partial" "primary"
"references" "rename" "restrict" "revoke" "schema" "second" "select" "sequence" "set"
"simple" "start" "table" "temp" "temporary" "time" "to" "true" "type" "unique"
"update" "using" "values" "where" "with" "without" "year" "zone"])
(def keyword-rules (def keyword-rules
"Rules to match keywords case-insensitively. It is my practice to write SQL in
lower case, but I know many prefer upper case."
(flatten (flatten
(list (list
(apply str (apply str
@ -49,23 +58,23 @@
(def simple-datatypes (def simple-datatypes
"datatypes which take no arguments or special syntax" "(Postgresql) datatypes which take no arguments or special syntax"
["bigserial" "bit" "boolean" "bytea" "date" "double precision" "float" "integer" "money" "numeric" "real" "serial" "text" ]) ["bigint" "bigserial" "bit" "boolean" "bytea" "date" "double precision" "float" "integer" "money" "numeric" "real" "serial" "text" ])
(def with-integer-parameter-datatypes (def with-integer-parameter-datatypes
"datatypes which take a single integer parameter, typically the storage size" "(Postgresql) datatypes which take a single integer parameter, typically the storage size"
["char" "character" "character varying" "varchar"]) ["char" "character" "character varying" "varchar"])
(def with-or-without-timezone-datatypes (def with-or-without-timezone-datatypes
"datatypes which may optionally take 'with (or without) time zone" "(Postgresql) datatypes which may optionally take 'with (or without) time zone"
["time" "timestamp"]) ["time" "timestamp"])
(def special-datatypes (def special-datatypes
"Datatypes with complex special grammar" "(Postgresql) Datatypes with complex special grammar"
["interval"]) ["interval"])
(def interval-datatype-rules (def interval-datatype-rules
"interval has special complex grammar. TODO: this is not entirely correct; some "`interval` has special complex grammar. TODO: this is not entirely correct; some
combinations of 'x to y' are not allowed." combinations of 'x to y' are not allowed."
(list "DT-INTERVAL := #'(?i)interval' | #'(?i)interval' SPACE INTERVAL-FIELDS;" (list "DT-INTERVAL := #'(?i)interval' | #'(?i)interval' SPACE INTERVAL-FIELDS;"
"INTERVAL-FIELDS := INTERVAL-FIELD | INTERVAL-FIELD SPACE KW-TO SPACE INTERVAL-FIELD;" "INTERVAL-FIELDS := INTERVAL-FIELD | INTERVAL-FIELD SPACE KW-TO SPACE INTERVAL-FIELD;"
@ -90,60 +99,96 @@
interval-datatype-rules)) interval-datatype-rules))
(def basic-rules (def basic-rules
"Rough general syntax of the subset of Postgresql I have so far needed to support.
There will be some differences from ANSI 92. Rules are generally expressed as single
strings. Order is not very significant, but instaparse treats the first rule as special."
(list (list
"STATEMENTS := STATEMENT TERMINATOR | STATEMENT TERMINATOR STATEMENTS ;" "STATEMENTS := OPT-SPACE STATEMENT | OPT-SPACE STATEMENT OPT-SPACE STATEMENTS ;"
"STATEMENT := TABLE-DECL | ALTER-STMT | SET-STMT | COMMENT-STMT | EXTENSION-DECL | SEQUENCE-DECL;" "STATEMENT := TABLE-DECL | ALTER-STMT | SET-STMT | COMMENT | EXTENSION-DECL | SEQUENCE-DECL | INSERT-STMT | PERMISSIONS-STMT;"
"ALTER-STMT := ALTER-TABLE ;" "ALTER-STMT := ALTER-TABLE | ALTER-SEQUENCE ;"
"ALTER-TABLE := KW-ALTER SPACE KW-TABLE SPACE QUAL-NAME SPACE NAME SPACE KW-TO SPACE NAME ;"
"SET-STMT := KW-SET SPACE NAME OPT-SPACE '=' OPT-SPACE EXPRESSION ;" "SET-STMT := KW-SET NAME EQUALS EXPRESSION TERMINATOR ;"
"COMMENT-STMT := KW-COMMENT #'[~;]*' ;" "EXTENSION-DECL := KW-CREATE KW-EXTENSION EXISTENCE NAME KW-WITH KW-SCHEMA NAME TERMINATOR ;"
"EXTENSION-DECL := KW-CREATE SPACE KW-EXTENSION EXISTENCE SPACE NAME SPACE KW-WITH SPACE KW-SCHEMA SPACE NAME ;"
;; taken from https://www.postgresql.org/docs/10/static/sql-createsequence.html ;; taken from https://www.postgresql.org/docs/10/static/sql-createsequence.html
"SEQUENCE-DECL := KW-CREATE SPACE PERMANENCE KW-SEQUENCE EXISTENCE SPACE NAME SPACE SEQ-SPEC-ELEMENTS ;" "SEQUENCE-DECL := KW-CREATE PERMANENCE KW-SEQUENCE EXISTENCE NAME SEQ-SPEC-ELEMENTS TERMINATOR ;"
"SEQ-SPEC-ELEMENTS := SEQ-SPEC-ELEMENT | SEQ-SPEC-ELEMENT SPACE SEQ-SPEC-ELEMENTS ;" "SEQ-SPEC-ELEMENTS := SEQ-SPEC-ELEMENT | SEQ-SPEC-ELEMENT SEQ-SPEC-ELEMENTS ;"
"SEQ-SPEC-ELEMENT := SEQ-SPEC-AS | SEQ-SPEC-INCREMENT | SEQ-SPEC-MIN | SEQ-SPEC-MAX | SEQ-SPEC-START | SEQ-SPEC-CACHE | SEQ-SPEC-CYCLE | SEQ-SPEC-OWNER ;" "SEQ-SPEC-ELEMENT := SEQ-SPEC-AS | SEQ-SPEC-INCREMENT | SEQ-SPEC-MIN | SEQ-SPEC-MAX | SEQ-SPEC-START | SEQ-SPEC-CACHE | SEQ-SPEC-CYCLE | SEQ-SPEC-OWNER ;"
"SEQ-SPEC-AS := KW-AS DATATYPE ;" "SEQ-SPEC-AS := KW-AS DATATYPE ;"
"SEQ-SPEC-INCREMENT := INT-VAL | KW-INCREMENT SPACE INT-VAL | KW-INCREMENT SPACE KW-BY SPACE INT-VAL ;" "SEQ-SPEC-INCREMENT := INT-VAL | KW-INCREMENT INT-VAL | KW-INCREMENT KW-BY INT-VAL ;"
;; I can't tell from the spec whether expressions are allowed as the value of minvalue or maxvalue. ;; I can't tell from the spec whether expressions are allowed as the value of minvalue or maxvalue.
"SEQ-SPEC-MIN := KW-NO SPACE KW-MINVALUE | KW-MINVALUE SPACE INT-VAL ;" "SEQ-SPEC-MIN := KW-NO KW-MINVALUE | KW-MINVALUE INT-VAL ;"
"SEQ-SPEC-MAX := KW-NO SPACE KW-MAXVALUE | KW-MAXVALUE SPACE INT-VAL ;" "SEQ-SPEC-MAX := KW-NO KW-MAXVALUE | KW-MAXVALUE INT-VAL ;"
"SEQ-SPEC-START := KW-START SPACE KW-WITH SPACE INT-VAL | KW-START SPACE INT-VAL ;" "SEQ-SPEC-START := KW-START KW-WITH INT-VAL | KW-START INT-VAL ;"
"SEQ-SPEC-CACHE := KW-CACHE SPACE INT-VAL ;" "SEQ-SPEC-CACHE := KW-CACHE INT-VAL ;"
"SEQ-SPEC-CYCLE := KW-CYCLE | KW-NO SPACE KW-CYCLE ;" "SEQ-SPEC-CYCLE := KW-CYCLE | KW-NO KW-CYCLE ;"
"SEQ-SPEC-OWNER := KW-OWNED SPACE KW-BY SPACE QUAL-NAME | KW-OWNED SPACE KW-BY SPACE KW-NONE ;" "SEQ-SPEC-OWNER := KW-OWNED KW-BY QUAL-NAME | KW-OWNED KW-BY KW-NONE ;"
;; from https://www.postgresql.org/docs/10/static/sql-altersequence.html
"ALTER-SEQUENCE := KW-ALTER KW-SEQUENCE EXISTENCE NAME ALTER-SEQ-ELEMENTS TERMINATOR ;"
"ALTER-SEQ-ELEMENTS := ALTER-SEQ-ELEMENT | ALTER-SEQ-ELEMENT ALTER-SEQ-ELEMENTS ;"
"ALTER-SEQ-ELEMENT := SEQ-SPEC-ELEMENT | OWNER-TO | SEQ-RENAME | SEQ-SET-SCHEMA ;"
"OWNER-TO := KW-OWNER KW-TO NAME ;"
"SEQ-RENAME := KW-RENAME KW-TO NAME ;"
"SEQ-SET-SCHEMA := KW-SET KW-SCHEMA NAME ;"
;; TODO: there is a *lot* more gramar to do here, but I don't need it (yet)
"INSERT-STMT := KW-INSERT KW-INTO QUAL-NAME KW-VALUES LPAR VALUES RPAR TERMINATOR ;"
;; taken from https://www.postgresql.org/docs/10/static/sql-createtable.html ;; taken from https://www.postgresql.org/docs/10/static/sql-createtable.html
;; but by no means all of that is implemented. ;; but by no means all of that is implemented.
"TABLE-DECL := KW-CREATE SPACE PERMANENCE KW-TABLE EXISTENCE SPACE NAME OPT-SPACE LPAR TABLE-SPEC-ELEMENTS RPAR ;" "TABLE-DECL := KW-CREATE PERMANENCE KW-TABLE EXISTENCE NAME LPAR TABLE-SPEC-ELEMENTS RPAR TERMINATOR ;"
"TABLE-SPEC-ELEMENTS := TABLE-SPEC-ELEMENT | TABLE-SPEC-ELEMENT COMMA TABLE-SPEC-ELEMENTS ; " "TABLE-SPEC-ELEMENTS := TABLE-SPEC-ELEMENT | TABLE-SPEC-ELEMENT COMMA TABLE-SPEC-ELEMENTS ; "
"TABLE-SPEC-ELEMENT := COLUMN-SPEC | TABLE-CONSTRAINT ;" "TABLE-SPEC-ELEMENT := COLUMN-SPEC | TABLE-CONSTRAINT ;"
"COLUMN-SPEC := NAME SPACE DATATYPE | COLUMN-SPEC OPT-SPACE COLUMN-CONSTRAINTS ;" "COLUMN-SPEC := NAME SPACE DATATYPE | COLUMN-SPEC OPT-SPACE COLUMN-CONSTRAINTS ;"
"TABLE-CONSTRAINT := 'TODO' ;" "TABLE-CONSTRAINT := 'TODO' ;"
"COLUMN-CONSTRAINTS := COLUMN-CONSTRAINT | COLUMN-CONSTRAINT SPACE COLUMN-CONSTRAINT ;" "COLUMN-CONSTRAINTS := COLUMN-CONSTRAINT | COLUMN-CONSTRAINT SPACE COLUMN-CONSTRAINT ;"
"COLUMN-CONSTRAINT := KW-CONSTRAINT SPACE NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;" "COLUMN-CONSTRAINT := KW-CONSTRAINT NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;"
"NOT-NULL-CC := KW-NOT SPACE KW-NULL ;" "NOT-NULL-CC := KW-NOT KW-NULL ;"
"NULL-CC := KW-NULL ;" "NULL-CC := KW-NULL ;"
"DEFAULT-CC := KW-DEFAULT SPACE EXPR ;" "DEFAULT-CC := KW-DEFAULT EXPRESSION ;"
"UNIQUE-CC := KW-UNIQUE SPACE INDEX-PARAMS ;" "UNIQUE-CC := KW-UNIQUE INDEX-PARAMS ;"
"PRIMARY-CC := KW-PRIMARY SPACE KW-KEY INDEX-PARAMS ;" "PRIMARY-CC := KW-PRIMARY KW-KEY INDEX-PARAMS ;"
"REFERENCES-CC := KW-REFERENCES SPACE NAME SPACE LPAR NAMES RPAR REF-DIRECTIVES ;" "REFERENCES-CC := KW-REFERENCES NAME LPAR NAMES RPAR REF-DIRECTIVES | KW-FOREIGN KW-KEY LPAR NAMES RPAR REFERENCES-CC;"
"REF-DIRECTIVES := '' | REF-DIRECTIVE SPACE REF-DIRECTIVES ;" "REF-DIRECTIVES := '' | REF-DIRECTIVE REF-DIRECTIVES ;"
"REF-DIRECTIVE := REF-MATCH | REF-ON-UPDATE | REF-ON-DELETE ;" "REF-DIRECTIVE := REF-MATCH | REF-ON-UPDATE | REF-ON-DELETE ;"
"REF-MATCH := KW-MATCH SPACE MATCH-TYPE ;" "REF-MATCH := KW-MATCH MATCH-TYPE ;"
"REF-ON-DELETE := KW-ON SPACE KW-DELETE SPACE REF-ACTION ;" "REF-ON-DELETE := KW-ON KW-DELETE REF-ACTION ;"
"REF-ON-UPDATE := KW-ON SPACE KW-UPDATE SPACE REF-ACTION ;" "REF-ON-UPDATE := KW-ON KW-UPDATE REF-ACTION ;"
"MATCH-TYPE := KW-FULL | KW-PARTIAL | KW-SIMPLE ;" "MATCH-TYPE := KW-FULL | KW-PARTIAL | KW-SIMPLE ;"
"INDEX-PARAMS := 'foo' ;" "INDEX-PARAMS := EXPRESSION ;"
"REF-ACTION := KW-NO SPACE KW-ACTION | KW-RESTRICT | KW-CASCADE | KW-SET SPACE VALUE;" "REF-ACTION := KW-NO KW-ACTION | KW-RESTRICT | KW-CASCADE | KW-SET VALUE;"
"EXPR := 'TODO' ; " "EXISTENCE := '' | KW-IF KW-NOT KW-EXISTS | KW-IF KW-EXISTS ;"
"EXISTENCE := '' | SPACE KW-IF SPACE KW-NOT SPACE KW-EXISTS ;" "PERMANENCE := '' | KW-TEMP | KW-TEMPORARY ;"
"PERMANENCE := '' | KW-TEMP SPACE | KW-TEMPORARY SPACE ;" "ONLY := KW-ONLY | '' ;"
"COMMENT := #'--[~\\n\\r]*' ;"
"ALTER-TABLE := KW-ALTER KW-TABLE EXISTENCE ONLY QUAL-NAME OPT-SPACE ALTER-TABLE-ELEMENTS TERMINATOR ;"
"ALTER-TABLE-ELEMENTS := ALTER-TABLE-ELEMENT | ALTER-TABLE-ELEMENT OPT-SPACE ALTER-TABLE-ELEMENTS ;"
"ALTER-TABLE-ELEMENT := OWNER-TO | ALTER-COLUMN | ADD-CONSTRAINT;"
"ALTER-COLUMN := KW-ALTER KW-COLUMN NAME ALTER-COL-SPEC ;"
"ALTER-COL-SPEC := ALTER-COL-TYPE | ALTER-COL-DFLT;"
"ALTER-COL-TYPE := KW-SET OPT-KW-DATA KW-TYPE DATATYPE | ALTER-COL-TYPE KW-COLLATE NAME | ALTER-COL-TYPE KW-USING EXPRESSION ;"
"ALTER-COL-DFLT := KW-SET KW-DEFAULT EXPRESSION ;"
"ADD-CONSTRAINT := KW-ADD COLUMN-CONSTRAINT ;"
"OPT-KW-DATA := KW-DATA | '' ;"
;; "TABLE-ADD-COL :=
"PERMISSIONS-STMT := REVOKE-STMT | GRANT-STMT;"
"REVOKE-STMT := KW-REVOKE PERMISSIONS KW-ON OPT-KW-SCHEMA QUAL-NAME KW-FROM NAMES TERMINATOR;"
"GRANT-STMT := KW-GRANT PERMISSIONS KW-ON OPT-KW-SCHEMA QUAL-NAME KW-TO NAMES TERMINATOR;"
"PERMISSIONS := PERMISSION | PERMISSION COMMA PERMISSIONS ;"
"PERMISSION := KW-ALL | KW-SELECT | KW-INSERT | KW-UPDATE | KW-DELETE ;"
"OPT-KW-SCHEMA := KW-SCHEMA | '' ;"
"COMMENT := KW-COMMENT #'[^;]*' TERMINATOR |#'--[^\\n\\r]*' ;"
;; TODO: much more to do here! ;; TODO: much more to do here!
"EXPRESSION := VALUE | KEYWORD;" "EXPRESSION := VALUE | KEYWORD | FUNCTION | LPAR EXPRESSION RPAR;"
"FUNCTION := NAME LPAR VALUES RPAR | NAME LPAR RPAR ;"
;; TODO: same for values. ;; TODO: same for values.
"VALUE := INT-VAL | CHAR-VAL | TRUTH-VAL | KW-NULL | NAMES;" "VALUES := VALUE | VALUE COMMA VALUES ;"
"VALUE := INT-VAL | CHAR-VAL | TRUTH-VAL | KW-NULL | NAMES | VALUE '::' NAME;"
"INT-VAL := #'[0-9]+' ;" "INT-VAL := #'[0-9]+' ;"
"CHAR-VAL := #'\\047.*\\047' ;" "CHAR-VAL := #'\\047.*\\047' ;"
"TRUTH-VAL := KW-TRUE | KW-FALSE ;" "TRUTH-VAL := KW-TRUE | KW-FALSE ;"
@ -152,6 +197,7 @@
"COMMA := OPT-SPACE ',' OPT-SPACE ;" "COMMA := OPT-SPACE ',' OPT-SPACE ;"
"LPAR := OPT-SPACE '(' OPT-SPACE ;" "LPAR := OPT-SPACE '(' OPT-SPACE ;"
"RPAR := OPT-SPACE ')' OPT-SPACE ; " "RPAR := OPT-SPACE ')' OPT-SPACE ; "
"EQUALS := OPT-SPACE '=' OPT-SPACE ;"
;; OPT-SPACE is optional space - it's acceptable as OPT-SPACE if there are no whitespace characters. Comments are ;; OPT-SPACE is optional space - it's acceptable as OPT-SPACE if there are no whitespace characters. Comments are
;; also acceptable wherever OPT-SPACE is acceptable ;; also acceptable wherever OPT-SPACE is acceptable
"OPT-SPACE := #'\\s*' | OPT-SPACE COMMENT OPT-SPACE;" "OPT-SPACE := #'\\s*' | OPT-SPACE COMMENT OPT-SPACE;"
@ -160,7 +206,7 @@
"QUAL-NAME := NAME | NAME DOT NAME ;" "QUAL-NAME := NAME | NAME DOT NAME ;"
"NAMES := NAME | NAME COMMA NAMES ;" "NAMES := NAME | NAME COMMA NAMES ;"
"NAME := #'[a-zA-Z][a-zA-Z0-9_]*'" "NAME := #'[a-zA-Z][a-zA-Z0-9_]*'"
"TERMINATOR := OPT-SPACE SEMI-COLON OPT-SPACE | OPT-SPACE KW-GO OPT-SPACE;" "TERMINATOR := SEMI-COLON | KW-GO | OPT-SPACE TERMINATOR OPT-SPACE;"
"SEMI-COLON := ';';")) "SEMI-COLON := ';';"))
@ -171,7 +217,7 @@
"True if `x` is something which just clutters up the parse tree." "True if `x` is something which just clutters up the parse tree."
[x] [x]
(and (and
(coll? x)(contains? #{:SPACE :OPT-SPACE :COMMENT} (first x)))) (coll? x)(contains? #{:SPACE :OPT-SPACE :COMMENT :OPT-KW-DATA} (first x))))
(defn remove-recursive (defn remove-recursive
@ -190,3 +236,4 @@
"Parse the argument, assumed to be a string in the correct syntax, and return a parse tree." "Parse the argument, assumed to be a string in the correct syntax, and return a parse tree."
(insta/parser grammar)) (insta/parser grammar))
(def parse-comment (insta/parser "COMMENT := #'--[^\\n\\r]*' ;"))

View file

@ -30,7 +30,8 @@
(defn- make-unterminated-case-insensitive-match-rule (defn- make-unterminated-case-insensitive-match-rule
"Make a grammar rule which matches this `token` case-insensitively, "Make a grammar rule which matches this `token` case-insensitively,
without the terminal semi-colon" without the terminal semi-colon. Keywords may always optionally be preceded
by whitespace and are usually succeeded by whitespace."
[token] [token]
(let [subtokens (split token #"\s+") (let [subtokens (split token #"\s+")
name (join "-" subtokens)] name (join "-" subtokens)]
@ -38,8 +39,9 @@
(flatten (flatten
(list (list
(upper-case name) (upper-case name)
" := " " := OPT-SPACE "
(join " SPACE " (map #(str "#'(?i)" % "'") subtokens))))))) (join " SPACE " (map #(str "#'(?i)" % "'") subtokens))
" OPT-SPACE ")))))
(defn make-case-insensitive-match-rule (defn make-case-insensitive-match-rule
@ -65,8 +67,13 @@
(defn- make-timezone-clause (defn- make-timezone-clause
[match with-tz? with-precision?] [match with-tz? with-precision?]
(join " SPACE " (join
(list (if with-precision? (str match " LPAR INT-VAL RPAR") match) (if with-tz? "KW-WITH" "KW-WITHOUT") "KW-TIME" "KW-ZONE"))) " "
(list
(if with-precision? (str match " LPAR INT-VAL RPAR") match)
(if with-tz? "KW-WITH" "KW-WITHOUT")
"KW-TIME"
"KW-ZONE")))
(defn make-with-or-without-timezone-datatype-rule (defn make-with-or-without-timezone-datatype-rule