diff --git a/src/squirrel_parse/parser.clj b/src/squirrel_parse/parser.clj index 489c41e..ed7e5e7 100644 --- a/src/squirrel_parse/parser.clj +++ b/src/squirrel_parse/parser.clj @@ -28,14 +28,23 @@ ;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(def keywords ["action" "alter" "as" "by" "cache" "cascade" "comment" "constraint" "create" "cycle" "day" "default" "delete" "exists" - "extension" "false" "from" "full" "go" "hour" "if" "increment" "key" "match" "maxvalue" "minute" "minvalue" - "month" "no" "none" "not" "null" "off" "on" "owned" "partial" "primary" "references" "restrict" "schema" "second" - "select" "sequence" "set" "simple" "start" "table" "temp" "temporary" "time" "to" "true" "unique" "update" "where" "with" "without" - "year" "zone"]) + +(def keywords + "Keywords from the subset of Postgresql I have so far needed to support. Feel free to either + 1. Add further Postrgresql keywords, or + 2. Start a new def for non-postgresql keywords." + ["action" "add" "all" "alter" "as" "by" "cache" "cascade" "collate" "column" "comment" "constraint" "create" + "cycle" "data" "day" "default" "delete" "exists" "extension" "false" "foreign" "from" "full" + "go" "grant" "hour" "if" "increment" "insert" "into" "key" "match" "maxvalue" "minute" "minvalue" + "month" "no" "none" "not" "null" "off" "on" "only" "owned" "owner" "partial" "primary" + "references" "rename" "restrict" "revoke" "schema" "second" "select" "sequence" "set" + "simple" "start" "table" "temp" "temporary" "time" "to" "true" "type" "unique" + "update" "using" "values" "where" "with" "without" "year" "zone"]) (def keyword-rules + "Rules to match keywords case-insensitively. It is my practice to write SQL in + lower case, but I know many prefer upper case." (flatten (list (apply str @@ -49,23 +58,23 @@ (def simple-datatypes - "datatypes which take no arguments or special syntax" - ["bigserial" "bit" "boolean" "bytea" "date" "double precision" "float" "integer" "money" "numeric" "real" "serial" "text" ]) + "(Postgresql) datatypes which take no arguments or special syntax" + ["bigint" "bigserial" "bit" "boolean" "bytea" "date" "double precision" "float" "integer" "money" "numeric" "real" "serial" "text" ]) (def with-integer-parameter-datatypes - "datatypes which take a single integer parameter, typically the storage size" + "(Postgresql) datatypes which take a single integer parameter, typically the storage size" ["char" "character" "character varying" "varchar"]) (def with-or-without-timezone-datatypes - "datatypes which may optionally take 'with (or without) time zone" + "(Postgresql) datatypes which may optionally take 'with (or without) time zone" ["time" "timestamp"]) (def special-datatypes - "Datatypes with complex special grammar" + "(Postgresql) Datatypes with complex special grammar" ["interval"]) (def interval-datatype-rules - "interval has special complex grammar. TODO: this is not entirely correct; some + "`interval` has special complex grammar. TODO: this is not entirely correct; some combinations of 'x to y' are not allowed." (list "DT-INTERVAL := #'(?i)interval' | #'(?i)interval' SPACE INTERVAL-FIELDS;" "INTERVAL-FIELDS := INTERVAL-FIELD | INTERVAL-FIELD SPACE KW-TO SPACE INTERVAL-FIELD;" @@ -90,60 +99,96 @@ interval-datatype-rules)) (def basic-rules + "Rough general syntax of the subset of Postgresql I have so far needed to support. + There will be some differences from ANSI 92. Rules are generally expressed as single + strings. Order is not very significant, but instaparse treats the first rule as special." (list - "STATEMENTS := STATEMENT TERMINATOR | STATEMENT TERMINATOR STATEMENTS ;" - "STATEMENT := TABLE-DECL | ALTER-STMT | SET-STMT | COMMENT-STMT | EXTENSION-DECL | SEQUENCE-DECL;" - "ALTER-STMT := ALTER-TABLE ;" - "ALTER-TABLE := KW-ALTER SPACE KW-TABLE SPACE QUAL-NAME SPACE NAME SPACE KW-TO SPACE NAME ;" - "SET-STMT := KW-SET SPACE NAME OPT-SPACE '=' OPT-SPACE EXPRESSION ;" - "COMMENT-STMT := KW-COMMENT #'[~;]*' ;" - "EXTENSION-DECL := KW-CREATE SPACE KW-EXTENSION EXISTENCE SPACE NAME SPACE KW-WITH SPACE KW-SCHEMA SPACE NAME ;" + "STATEMENTS := OPT-SPACE STATEMENT | OPT-SPACE STATEMENT OPT-SPACE STATEMENTS ;" + "STATEMENT := TABLE-DECL | ALTER-STMT | SET-STMT | COMMENT | EXTENSION-DECL | SEQUENCE-DECL | INSERT-STMT | PERMISSIONS-STMT;" + "ALTER-STMT := ALTER-TABLE | ALTER-SEQUENCE ;" + + "SET-STMT := KW-SET NAME EQUALS EXPRESSION TERMINATOR ;" + "EXTENSION-DECL := KW-CREATE KW-EXTENSION EXISTENCE NAME KW-WITH KW-SCHEMA NAME TERMINATOR ;" ;; taken from https://www.postgresql.org/docs/10/static/sql-createsequence.html - "SEQUENCE-DECL := KW-CREATE SPACE PERMANENCE KW-SEQUENCE EXISTENCE SPACE NAME SPACE SEQ-SPEC-ELEMENTS ;" - "SEQ-SPEC-ELEMENTS := SEQ-SPEC-ELEMENT | SEQ-SPEC-ELEMENT SPACE SEQ-SPEC-ELEMENTS ;" + "SEQUENCE-DECL := KW-CREATE PERMANENCE KW-SEQUENCE EXISTENCE NAME SEQ-SPEC-ELEMENTS TERMINATOR ;" + "SEQ-SPEC-ELEMENTS := SEQ-SPEC-ELEMENT | SEQ-SPEC-ELEMENT SEQ-SPEC-ELEMENTS ;" "SEQ-SPEC-ELEMENT := SEQ-SPEC-AS | SEQ-SPEC-INCREMENT | SEQ-SPEC-MIN | SEQ-SPEC-MAX | SEQ-SPEC-START | SEQ-SPEC-CACHE | SEQ-SPEC-CYCLE | SEQ-SPEC-OWNER ;" "SEQ-SPEC-AS := KW-AS DATATYPE ;" - "SEQ-SPEC-INCREMENT := INT-VAL | KW-INCREMENT SPACE INT-VAL | KW-INCREMENT SPACE KW-BY SPACE INT-VAL ;" + "SEQ-SPEC-INCREMENT := INT-VAL | KW-INCREMENT INT-VAL | KW-INCREMENT KW-BY INT-VAL ;" ;; I can't tell from the spec whether expressions are allowed as the value of minvalue or maxvalue. - "SEQ-SPEC-MIN := KW-NO SPACE KW-MINVALUE | KW-MINVALUE SPACE INT-VAL ;" - "SEQ-SPEC-MAX := KW-NO SPACE KW-MAXVALUE | KW-MAXVALUE SPACE INT-VAL ;" - "SEQ-SPEC-START := KW-START SPACE KW-WITH SPACE INT-VAL | KW-START SPACE INT-VAL ;" - "SEQ-SPEC-CACHE := KW-CACHE SPACE INT-VAL ;" - "SEQ-SPEC-CYCLE := KW-CYCLE | KW-NO SPACE KW-CYCLE ;" - "SEQ-SPEC-OWNER := KW-OWNED SPACE KW-BY SPACE QUAL-NAME | KW-OWNED SPACE KW-BY SPACE KW-NONE ;" + "SEQ-SPEC-MIN := KW-NO KW-MINVALUE | KW-MINVALUE INT-VAL ;" + "SEQ-SPEC-MAX := KW-NO KW-MAXVALUE | KW-MAXVALUE INT-VAL ;" + "SEQ-SPEC-START := KW-START KW-WITH INT-VAL | KW-START INT-VAL ;" + "SEQ-SPEC-CACHE := KW-CACHE INT-VAL ;" + "SEQ-SPEC-CYCLE := KW-CYCLE | KW-NO KW-CYCLE ;" + "SEQ-SPEC-OWNER := KW-OWNED KW-BY QUAL-NAME | KW-OWNED KW-BY KW-NONE ;" + + ;; from https://www.postgresql.org/docs/10/static/sql-altersequence.html + "ALTER-SEQUENCE := KW-ALTER KW-SEQUENCE EXISTENCE NAME ALTER-SEQ-ELEMENTS TERMINATOR ;" + "ALTER-SEQ-ELEMENTS := ALTER-SEQ-ELEMENT | ALTER-SEQ-ELEMENT ALTER-SEQ-ELEMENTS ;" + "ALTER-SEQ-ELEMENT := SEQ-SPEC-ELEMENT | OWNER-TO | SEQ-RENAME | SEQ-SET-SCHEMA ;" + "OWNER-TO := KW-OWNER KW-TO NAME ;" + "SEQ-RENAME := KW-RENAME KW-TO NAME ;" + "SEQ-SET-SCHEMA := KW-SET KW-SCHEMA NAME ;" + + ;; TODO: there is a *lot* more gramar to do here, but I don't need it (yet) + "INSERT-STMT := KW-INSERT KW-INTO QUAL-NAME KW-VALUES LPAR VALUES RPAR TERMINATOR ;" ;; taken from https://www.postgresql.org/docs/10/static/sql-createtable.html ;; but by no means all of that is implemented. - "TABLE-DECL := KW-CREATE SPACE PERMANENCE KW-TABLE EXISTENCE SPACE NAME OPT-SPACE LPAR TABLE-SPEC-ELEMENTS RPAR ;" + "TABLE-DECL := KW-CREATE PERMANENCE KW-TABLE EXISTENCE NAME LPAR TABLE-SPEC-ELEMENTS RPAR TERMINATOR ;" "TABLE-SPEC-ELEMENTS := TABLE-SPEC-ELEMENT | TABLE-SPEC-ELEMENT COMMA TABLE-SPEC-ELEMENTS ; " "TABLE-SPEC-ELEMENT := COLUMN-SPEC | TABLE-CONSTRAINT ;" "COLUMN-SPEC := NAME SPACE DATATYPE | COLUMN-SPEC OPT-SPACE COLUMN-CONSTRAINTS ;" "TABLE-CONSTRAINT := 'TODO' ;" "COLUMN-CONSTRAINTS := COLUMN-CONSTRAINT | COLUMN-CONSTRAINT SPACE COLUMN-CONSTRAINT ;" - "COLUMN-CONSTRAINT := KW-CONSTRAINT SPACE NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;" - "NOT-NULL-CC := KW-NOT SPACE KW-NULL ;" + "COLUMN-CONSTRAINT := KW-CONSTRAINT NAME COLUMN-CONSTRAINT | NOT-NULL-CC | NULL-CC | DEFAULT-CC | UNIQUE-CC | PRIMARY-CC | REFERENCES-CC ;" + "NOT-NULL-CC := KW-NOT KW-NULL ;" "NULL-CC := KW-NULL ;" - "DEFAULT-CC := KW-DEFAULT SPACE EXPR ;" - "UNIQUE-CC := KW-UNIQUE SPACE INDEX-PARAMS ;" - "PRIMARY-CC := KW-PRIMARY SPACE KW-KEY INDEX-PARAMS ;" - "REFERENCES-CC := KW-REFERENCES SPACE NAME SPACE LPAR NAMES RPAR REF-DIRECTIVES ;" - "REF-DIRECTIVES := '' | REF-DIRECTIVE SPACE REF-DIRECTIVES ;" + "DEFAULT-CC := KW-DEFAULT EXPRESSION ;" + "UNIQUE-CC := KW-UNIQUE INDEX-PARAMS ;" + "PRIMARY-CC := KW-PRIMARY KW-KEY INDEX-PARAMS ;" + "REFERENCES-CC := KW-REFERENCES NAME LPAR NAMES RPAR REF-DIRECTIVES | KW-FOREIGN KW-KEY LPAR NAMES RPAR REFERENCES-CC;" + "REF-DIRECTIVES := '' | REF-DIRECTIVE REF-DIRECTIVES ;" "REF-DIRECTIVE := REF-MATCH | REF-ON-UPDATE | REF-ON-DELETE ;" - "REF-MATCH := KW-MATCH SPACE MATCH-TYPE ;" - "REF-ON-DELETE := KW-ON SPACE KW-DELETE SPACE REF-ACTION ;" - "REF-ON-UPDATE := KW-ON SPACE KW-UPDATE SPACE REF-ACTION ;" + "REF-MATCH := KW-MATCH MATCH-TYPE ;" + "REF-ON-DELETE := KW-ON KW-DELETE REF-ACTION ;" + "REF-ON-UPDATE := KW-ON KW-UPDATE REF-ACTION ;" "MATCH-TYPE := KW-FULL | KW-PARTIAL | KW-SIMPLE ;" - "INDEX-PARAMS := 'foo' ;" - "REF-ACTION := KW-NO SPACE KW-ACTION | KW-RESTRICT | KW-CASCADE | KW-SET SPACE VALUE;" - "EXPR := 'TODO' ; " - "EXISTENCE := '' | SPACE KW-IF SPACE KW-NOT SPACE KW-EXISTS ;" - "PERMANENCE := '' | KW-TEMP SPACE | KW-TEMPORARY SPACE ;" - "COMMENT := #'--[~\\n\\r]*' ;" + "INDEX-PARAMS := EXPRESSION ;" + "REF-ACTION := KW-NO KW-ACTION | KW-RESTRICT | KW-CASCADE | KW-SET VALUE;" + "EXISTENCE := '' | KW-IF KW-NOT KW-EXISTS | KW-IF KW-EXISTS ;" + "PERMANENCE := '' | KW-TEMP | KW-TEMPORARY ;" + "ONLY := KW-ONLY | '' ;" + + "ALTER-TABLE := KW-ALTER KW-TABLE EXISTENCE ONLY QUAL-NAME OPT-SPACE ALTER-TABLE-ELEMENTS TERMINATOR ;" + "ALTER-TABLE-ELEMENTS := ALTER-TABLE-ELEMENT | ALTER-TABLE-ELEMENT OPT-SPACE ALTER-TABLE-ELEMENTS ;" + "ALTER-TABLE-ELEMENT := OWNER-TO | ALTER-COLUMN | ADD-CONSTRAINT;" + "ALTER-COLUMN := KW-ALTER KW-COLUMN NAME ALTER-COL-SPEC ;" + "ALTER-COL-SPEC := ALTER-COL-TYPE | ALTER-COL-DFLT;" + "ALTER-COL-TYPE := KW-SET OPT-KW-DATA KW-TYPE DATATYPE | ALTER-COL-TYPE KW-COLLATE NAME | ALTER-COL-TYPE KW-USING EXPRESSION ;" + "ALTER-COL-DFLT := KW-SET KW-DEFAULT EXPRESSION ;" + "ADD-CONSTRAINT := KW-ADD COLUMN-CONSTRAINT ;" + "OPT-KW-DATA := KW-DATA | '' ;" +;; "TABLE-ADD-COL := + + "PERMISSIONS-STMT := REVOKE-STMT | GRANT-STMT;" + "REVOKE-STMT := KW-REVOKE PERMISSIONS KW-ON OPT-KW-SCHEMA QUAL-NAME KW-FROM NAMES TERMINATOR;" + "GRANT-STMT := KW-GRANT PERMISSIONS KW-ON OPT-KW-SCHEMA QUAL-NAME KW-TO NAMES TERMINATOR;" + + "PERMISSIONS := PERMISSION | PERMISSION COMMA PERMISSIONS ;" + "PERMISSION := KW-ALL | KW-SELECT | KW-INSERT | KW-UPDATE | KW-DELETE ;" + + "OPT-KW-SCHEMA := KW-SCHEMA | '' ;" + + "COMMENT := KW-COMMENT #'[^;]*' TERMINATOR |#'--[^\\n\\r]*' ;" ;; TODO: much more to do here! - "EXPRESSION := VALUE | KEYWORD;" + "EXPRESSION := VALUE | KEYWORD | FUNCTION | LPAR EXPRESSION RPAR;" + "FUNCTION := NAME LPAR VALUES RPAR | NAME LPAR RPAR ;" ;; TODO: same for values. - "VALUE := INT-VAL | CHAR-VAL | TRUTH-VAL | KW-NULL | NAMES;" + "VALUES := VALUE | VALUE COMMA VALUES ;" + "VALUE := INT-VAL | CHAR-VAL | TRUTH-VAL | KW-NULL | NAMES | VALUE '::' NAME;" "INT-VAL := #'[0-9]+' ;" "CHAR-VAL := #'\\047.*\\047' ;" "TRUTH-VAL := KW-TRUE | KW-FALSE ;" @@ -152,6 +197,7 @@ "COMMA := OPT-SPACE ',' OPT-SPACE ;" "LPAR := OPT-SPACE '(' OPT-SPACE ;" "RPAR := OPT-SPACE ')' OPT-SPACE ; " + "EQUALS := OPT-SPACE '=' OPT-SPACE ;" ;; OPT-SPACE is optional space - it's acceptable as OPT-SPACE if there are no whitespace characters. Comments are ;; also acceptable wherever OPT-SPACE is acceptable "OPT-SPACE := #'\\s*' | OPT-SPACE COMMENT OPT-SPACE;" @@ -160,7 +206,7 @@ "QUAL-NAME := NAME | NAME DOT NAME ;" "NAMES := NAME | NAME COMMA NAMES ;" "NAME := #'[a-zA-Z][a-zA-Z0-9_]*'" - "TERMINATOR := OPT-SPACE SEMI-COLON OPT-SPACE | OPT-SPACE KW-GO OPT-SPACE;" + "TERMINATOR := SEMI-COLON | KW-GO | OPT-SPACE TERMINATOR OPT-SPACE;" "SEMI-COLON := ';';")) @@ -171,7 +217,7 @@ "True if `x` is something which just clutters up the parse tree." [x] (and - (coll? x)(contains? #{:SPACE :OPT-SPACE :COMMENT} (first x)))) + (coll? x)(contains? #{:SPACE :OPT-SPACE :COMMENT :OPT-KW-DATA} (first x)))) (defn remove-recursive @@ -190,3 +236,4 @@ "Parse the argument, assumed to be a string in the correct syntax, and return a parse tree." (insta/parser grammar)) +(def parse-comment (insta/parser "COMMENT := #'--[^\\n\\r]*' ;")) diff --git a/src/squirrel_parse/utils.clj b/src/squirrel_parse/utils.clj index 6a97978..c1e2324 100644 --- a/src/squirrel_parse/utils.clj +++ b/src/squirrel_parse/utils.clj @@ -30,7 +30,8 @@ (defn- make-unterminated-case-insensitive-match-rule "Make a grammar rule which matches this `token` case-insensitively, - without the terminal semi-colon" + without the terminal semi-colon. Keywords may always optionally be preceded + by whitespace and are usually succeeded by whitespace." [token] (let [subtokens (split token #"\s+") name (join "-" subtokens)] @@ -38,8 +39,9 @@ (flatten (list (upper-case name) - " := " - (join " SPACE " (map #(str "#'(?i)" % "'") subtokens))))))) + " := OPT-SPACE " + (join " SPACE " (map #(str "#'(?i)" % "'") subtokens)) + " OPT-SPACE "))))) (defn make-case-insensitive-match-rule @@ -65,8 +67,13 @@ (defn- make-timezone-clause [match with-tz? with-precision?] - (join " SPACE " - (list (if with-precision? (str match " LPAR INT-VAL RPAR") match) (if with-tz? "KW-WITH" "KW-WITHOUT") "KW-TIME" "KW-ZONE"))) + (join + " " + (list + (if with-precision? (str match " LPAR INT-VAL RPAR") match) + (if with-tz? "KW-WITH" "KW-WITHOUT") + "KW-TIME" + "KW-ZONE"))) (defn make-with-or-without-timezone-datatype-rule