PEG for GraphViz DOT

"Abstract Grammar" (year 2020) from
http://www.graphviz.org/doc/info/lang.html

Notation converted to PEG.

Debugged.

Simplification.
Recursion converted to simple repetition
operators.

Completion.
Spacing.

file <-

_ graph+

graph:

[ strict ] (graph | digraph) [ ID ] '{' stmt_list '}'

graph <-

'strict'? ('graph' / 'digraph') ID? '{' stmt_list '}'

graph <-

'strict'? ('graph' / 'digraph') ID? '{' stmt_list '}'

graph <-

'strict'? ('graph' / 'digraph') ID? '{' stmt_list '}'

graph <-

('strict' _)? ('graph' / 'digraph') _ ID? '{' _ stmt_list '}' _

stmt_list:

[ stmt [ ';' ] stmt_list ]

stmt_list <-

(stmt ';'? stmt_list)?

stmt_list <-

(stmt ';'? stmt_list)?

stmt_list <-

(stmt ';'?)*

stmt_list <-

(stmt (';' _)?)*

stmt:

node_stmt

stmt <-

node_stmt

stmt <-

attr_stmt

stmt <-

attr_stmt

stmt <-

attr_stmt

| edge_stmt

/ edge_stmt

/ edge_stmt

/ edge_stmt

/ edge_stmt

| attr_stmt

/ attr_stmt

/ subgraph

/ subgraph

/ subgraph

| ID '=' ID

/ ID '=' ID

/ ID '=' ID

/ ID '=' ID

/ ID '=' _ ID

| subgraph

/ subgraph

/ node_stmt

/ node_stmt

/ node_stmt

attr_stmt:

(graph | node | edge) attr_list

attr_stmt <-

('graph' / 'node' / 'edge') attr_list

attr_stmt <-

('graph' / 'node' / 'edge') attr_list

attr_stmt <-

('graph' / 'node' / 'edge') attr_list

attr_stmt <-

('graph' / 'node' / 'edge') _ attr_list

attr_list:

'[' [ a_list ] ']' [ attr_list ]

attr_list <-

'[' a_list? ']' attr_list?

attr_list <-

'[' a_list? ']' attr_list?

attr_list <-

('[' attr* ']')+

attr_list <-

('[' _ attr* ']' _)+

a_list:

ID '=' ID [ (';' | ',') ] [ a_list ]

a_list <-

ID '=' ID [;,]? a_list?

a_list <-

ID '=' ID [;,]? a_list?

a_list <-

(ID '=' ID [;,]?)+

attr <-

ID '=' ID [;,]?

attr <-

ID '=' _ ID ([;,] _)?

edge_stmt:

(node_id | subgraph) edgeRHS [ attr_list ]

edge_stmt <-

(node_id / subgraph) edgeRHS attr_list?

edge_stmt <-

(subgraph / node_id) edgeRHS attr_list?

edge_stmt <-

(subgraph / node_id) edgeRHS attr_list?

edge_stmt <-

(subgraph / node_id) edgeRHS attr_list?

edgeRHS:

edgeop (node_id | subgraph) [ edgeRHS ]

edgeRHS <-

edgeop (node_id / subgraph) edgeRHS?

edgeRHS <-

edgeop (subgraph / node_id) edgeRHS?

edgeRHS <-

(edgeop (subgraph / node_id))+

edgeRHS <-

(edgeop (subgraph / node_id))+

node_stmt:

node_id [ attr_list ]

node_stmt <-

node_id attr_list?

node_stmt <-

node_id attr_list?

node_stmt <-

node_id attr_list?

node_stmt <-

node_id attr_list?

node_id:

ID [ port ]

node_id <-

ID port?

node_id <-

ID port?

node_id <-

ID port? port?

node_id <-

ID port? port?

port:

':' ID [ ':' compass_pt ]

port <-

':' ID (':' compass_pt)?

port <-

':' ID (':' ID)?

port <-

':' ID

port <-

':' _ ID

| ':' compass_pt

/ ':' compass_pt

subgraph:

[ subgraph [ ID ] ] '{' stmt_list '}'

subgraph <-

('subgraph' ID?)? '{' stmt_list '}'

subgraph <-

('subgraph' ID?)? '{' stmt_list '}'

subgraph <-

('subgraph' ID?)? '{' stmt_list '}'

subgraph <-

('subgraph' _ ID?)? '{' _ stmt_list '}' _

compass_pt:

(n | ne | e | se | s | sw | w | nw | c | _)

compass_pt <-

'n' / 'ne' / 'e' / 'se' / 's' / 'sw' / 'w' / 'nw' / 'c' / '_'

compass_pt <-

'ne' / 'nw' / 'se' / 'sw'/ 's' / 'w' / 'n' / 'e' / 'c' / '_'

edgeop <-

edgeop <-

edgeop <-

('--' / '->') _

ID <-

ID <-

ID <-

HTML

/ QUOTED

/ NUMERAL

/ STRING

HTML <-

'<' (![<>] . / HTML)* '>' _

QUOTED <-

'"' ESCAPED* '"' _ ('+' _ '"' ESCAPED* '"' _)*

ESCAPED <-

ESC_QUOTE

/ ESC_EOL

/ '\\' .

/ !'"' .

ESC_QUOTE <-

'\\"'

ESC_EOL <-

'\\' EOL

NUMERAL <-

'-'? ([0-9]+ ('.' [0-9]*)? / '.' [0-9]+) _

STRING <-

[A-Z_a-z\u0080-\uFFFF] [0-9A-Z_a-z\u0080-\uFFFF]* _

_ <-

([ \t] / EOL / COMMENT)*

COMMENT <-

COMMENT_HASH

/ COMMENT_SLASH

/ COMMENT_BLOCK

COMMENT_HASH <-

'#' (!EOL .)*

COMMENT_SLASH <-

'//' (!EOL .)*

COMMENT_BLOCK <-

'/*' (!'*/' .)* '*/'

EOL <-

'\r\n' / '\n' / '\r'

Notes

Main DOT specification:

According to the specification, the keywords node, edge, graph, digraph, subgraph, and strict are case-independent. In practice, however, at least in GraphViz 2.38 and 2.46, keywords are case-sensitive! So an attribute statement written with the keyword "Node" will be interpreted as a node statement and will create a new node!

As another aid for readability, dot allows double-quoted strings to span multiple physical lines using the standard C convention of a backslash immediately preceding a newline character. (In GraphViz 2.38 this is true, but in 2.46 newline characters are also ignored even when they are not escaped! This seems to be a bug, because it breaks the equivalence between quoted strings and HTML strings, which preserve newlines.)

In quoted strings in DOT, the only escaped character is double-quote ("). That is, in quoted strings, the dyad \" is converted to "; all other characters are left unchanged. (As stated above, a backslash immediately before a newline character removes them both.) In particular, \\ remains \\. Layout engines may apply additional escape sequences.
http://www.graphviz.org/doc/info/lang.html
https://gitlab.com/graphviz/graphviz/-/blob/master/doc/info/lang.html
https://gitlab.com/graphviz/graphviz/-/blob/master/doc/infosrc/grammar


Note that it is legal to have a portname the same as one of the compass points. In this case, this reference will be resolved to the port.

Contrary to the specification, the type of a compass point is ID!
http://www.graphviz.org/doc/info/attrs.html#k:portPos
graphviz-2.46.0/lib/common/shapes.c compassPort()


Additional escape sequences are different for quoted strings and HTML strings:
http://www.graphviz.org/doc/info/attrs.html#k:escString
In HTML strings, substitution is performed after interpretation (so, e.g., &#x5C;N will be substituted just as \N (node name) would be, but the substituted content will not be interpreted further). In quoted strings, substitution is the first thing that is done (so a node name containing &gt; will be displayed with >). Record-type node shapes add more complications:
http://www.graphviz.org/doc/info/shapes.html#record


GraphViz may not understand a Unicode BOM (byte order mark).


Note how, in PEG, explicit spacing declarations tend to occur after literals (terminals).