Add PostgreSQL Explain lexer (#2398)

This lexer adds support for PostgreSQL EXPLAIN plans: https://www.postgresql.org/docs/current/sql-explain.html This was heavily inspired by Maxence Ahlouche's work, thanks to him: https://github.com/maahl/pg_explain_lexer Co-authored-by: Jean Abou Samra <jean@abou-samra.fr>
pygments · Apr 4, 2023 · ef0abba · ef0abba
1 parent 3c6e2af
commit ef0abba
Show file tree

Hide file tree

Showing 6 changed files with 3,125 additions and 4 deletions.
diff --git a/AUTHORS b/AUTHORS
@@ -11,6 +11,7 @@ Other contributors, listed alphabetically, are:
 * Ali Afshar -- image formatter
 * Thomas Aglassinger -- Easytrieve, JCL, Rexx, Transact-SQL and VBScript
   lexers
+* Maxence Ahlouche -- PostgreSQL Explain lexer
 * Muthiah Annamalai -- Ezhil lexer
 * Kumar Appaiah -- Debian control lexer
 * Andreas Amann -- AppleScript lexer
@@ -162,6 +163,7 @@ Other contributors, listed alphabetically, are:
 * Paulo Moura -- Logtalk lexer
 * Mher Movsisyan -- DTD lexer
 * Dejan Muhamedagic -- Crmsh lexer
+* Adrien Nayrat -- PostgreSQL Explain lexer
 * Ana Nelson -- Ragel, ANTLR, R console lexers
 * David Neto, Google LLC -- WebGPU Shading Language lexer
 * Kurt Neufeld -- Markdown lexer
@@ -191,7 +193,7 @@ Other contributors, listed alphabetically, are:
 * Justin Reidy -- MXML lexer
 * Norman Richards -- JSON lexer
 * Corey Richardson -- Rust lexer updates
-* Fabrizio Riguzzi -- cplint leder 
+* Fabrizio Riguzzi -- cplint leder
 * Lubomir Rintel -- GoodData MAQL and CL lexers
 * Andre Roberge -- Tango style
 * Georg Rollinger -- HSAIL lexer

diff --git a/pygments/lexers/_mapping.py b/pygments/lexers/_mapping.py
@@ -370,6 +370,7 @@
     'PortugolLexer': ('pygments.lexers.pascal', 'Portugol', ('portugol',), ('*.alg', '*.portugol'), ()),
     'PostScriptLexer': ('pygments.lexers.graphics', 'PostScript', ('postscript', 'postscr'), ('*.ps', '*.eps'), ('application/postscript',)),
     'PostgresConsoleLexer': ('pygments.lexers.sql', 'PostgreSQL console (psql)', ('psql', 'postgresql-console', 'postgres-console'), (), ('text/x-postgresql-psql',)),
+    'PostgresExplainLexer': ('pygments.lexers.sql', 'PostgreSQL EXPLAIN dialect', ('postgres-explain',), ('*.explain',), ('text/x-postgresql-explain',)),
     'PostgresLexer': ('pygments.lexers.sql', 'PostgreSQL SQL dialect', ('postgresql', 'postgres'), (), ('text/x-postgresql',)),
     'PovrayLexer': ('pygments.lexers.graphics', 'POVRay', ('pov',), ('*.pov', '*.inc'), ('text/x-povray',)),
     'PowerShellLexer': ('pygments.lexers.shell', 'PowerShell', ('powershell', 'pwsh', 'posh', 'ps1', 'psm1'), ('*.ps1', '*.psm1'), ('text/x-powershell',)),

diff --git a/pygments/lexers/_postgres_builtins.py b/pygments/lexers/_postgres_builtins.py
@@ -571,6 +571,61 @@
     'RETURN', 'REVERSE', 'SQLSTATE', 'WHILE',
 )
 
+# Keywords highlighted by PostgresExplainLexer (pygments.lexers.sql).
+# Most of them come from the ExplainNode function in PostgreSQL's
+# src/backend/commands/explain.c.
+EXPLAIN_KEYWORDS = (
+    'Aggregate',
+    'Append',
+    'Bitmap Heap Scan',
+    'Bitmap Index Scan',
+    'BitmapAnd',
+    'BitmapOr',
+    'CTE Scan',
+    'Custom Scan',
+    'Delete',
+    'Foreign Scan',
+    'Function Scan',
+    'Gather Merge',
+    'Gather',
+    'Group',
+    'GroupAggregate',
+    'Hash Join',
+    'Hash',
+    'HashAggregate',
+    'Incremental Sort',
+    'Index Only Scan',
+    'Index Scan',
+    'Insert',
+    'Limit',
+    'LockRows',
+    'Materialize',
+    'Memoize',
+    'Merge Append',
+    'Merge Join',
+    'Merge',
+    'MixedAggregate',
+    'Named Tuplestore Scan',
+    'Nested Loop',
+    'ProjectSet',
+    'Recursive Union',
+    'Result',
+    'Sample Scan',
+    'Seq Scan',
+    'SetOp',
+    'Sort',
+    'SubPlan',
+    'Subquery Scan',
+    'Table Function Scan',
+    'Tid Range Scan',
+    'Tid Scan',
+    'Unique',
+    'Update',
+    'Values Scan',
+    'WindowAgg',
+    'WorkTable Scan',
+)
+
 
 if __name__ == '__main__':  # pragma: no cover
     import re

diff --git a/pygments/lexers/sql.py b/pygments/lexers/sql.py
@@ -30,6 +30,9 @@
         - highlights errors in the output and notification levels;
         - handles psql backslash commands.
 
+    `PostgresExplainLexer`
+        A lexer to highlight PostgreSQL execution plans.
+
     The ``tests/examplefiles`` contains a few test files with data to be
     parsed by these lexers.
 
@@ -45,7 +48,7 @@
 from pygments.lexers import get_lexer_by_name, ClassNotFound
 
 from pygments.lexers._postgres_builtins import KEYWORDS, DATATYPES, \
-    PSEUDO_TYPES, PLPGSQL_KEYWORDS
+    PSEUDO_TYPES, PLPGSQL_KEYWORDS, EXPLAIN_KEYWORDS
 from pygments.lexers._mysql_builtins import \
     MYSQL_CONSTANTS, \
     MYSQL_DATATYPES, \
@@ -57,8 +60,8 @@
 
 
 __all__ = ['PostgresLexer', 'PlPgsqlLexer', 'PostgresConsoleLexer',
-           'SqlLexer', 'TransactSqlLexer', 'MySqlLexer',
-           'SqliteConsoleLexer', 'RqlLexer']
+           'PostgresExplainLexer', 'SqlLexer', 'TransactSqlLexer',
+           'MySqlLexer', 'SqliteConsoleLexer', 'RqlLexer']
 
 line_re  = re.compile('.*?\n')
 sqlite_prompt_re = re.compile(r'^(?:sqlite|   ...)>(?= )')
@@ -368,6 +371,191 @@ def get_tokens_unprocessed(self, data):
                 return
 
 
+class PostgresExplainLexer(RegexLexer):
+    """
+    Lexer for the text output of PostgreSQL ``EXPLAIN`` (execution plans),
+    including the instrumentation counters added by ``EXPLAIN ANALYZE``.
+    """
+
+    name = 'PostgreSQL EXPLAIN dialect'
+    aliases = ['postgres-explain']
+    filenames = ['*.explain']
+    mimetypes = ['text/x-postgresql-explain']
+
+    tokens = {
+        'root': [
+            (r'(:|\(|\)|ms|kB|->|\.\.|\,)', Punctuation),
+            (r'(\s+)', Whitespace),
+
+            # These match the estimated cost and the counters measured by ANALYZE;
+            # both switch to the 'instrumentation' state.
+            (r'(cost)(=?)', bygroups(Name.Class, Punctuation), 'instrumentation'),
+            (r'(actual)( )(=?)', bygroups(Name.Class, Whitespace, Punctuation), 'instrumentation'),
+
+            # Misc keywords
+            (words(('actual', 'Memory Usage', 'Memory', 'Buckets', 'Batches',
+                    'originally', 'row', 'rows', 'Hits', 'Misses',
+                    'Evictions', 'Overflows'), suffix=r'\b'),
+             Comment.Single),
+
+            (r'(hit|read|dirtied|written|write|time|calls)(=)', bygroups(Comment.Single, Operator)),
+            (r'(shared|temp|local)', Keyword.Pseudo),
+
+            # Switch to the 'sort' state to emphasize specific keywords (especially disk access)
+            (r'(Sort Method)(: )', bygroups(Comment.Preproc, Punctuation), 'sort'),
+
+            # These keywords can be followed by an object, like a table
+            (r'(Sort Key|Group Key|Presorted Key|Hash Key)(:)( )',
+             bygroups(Comment.Preproc, Punctuation, Whitespace), 'object_name'),
+            (r'(Cache Key|Cache Mode)(:)( )', bygroups(Comment, Punctuation, Whitespace), 'object_name'),
+
+            # These keywords can be followed by a predicate
+            (words(('Join Filter', 'Subplans Removed', 'Filter', 'Merge Cond',
+                    'Hash Cond', 'Index Cond', 'Recheck Cond', 'Heap Blocks',
+                    'TID Cond', 'Run Condition', 'Order By', 'Function Call',
+                    'Table Function Call', 'Inner Unique', 'Params Evaluated',
+                    'Single Copy', 'Sampling', 'One-Time Filter', 'Output',
+                    'Relations', 'Remote SQL'), suffix=r'\b'),
+             Comment.Preproc, 'predicate'),
+
+            # Special keyword to handle ON CONFLICT
+            (r'Conflict ', Comment.Preproc, 'conflict'),
+
+            # Special keyword for InitPlan or SubPlan
+            (r'(InitPlan|SubPlan)( )(\d+)( )',
+             bygroups(Keyword, Whitespace, Number.Integer, Whitespace),
+             'init_plan'),
+
+            (words(('Sort Method', 'Join Filter', 'Planning time',
+                    'Planning Time', 'Execution time', 'Execution Time',
+                    'Workers Planned', 'Workers Launched', 'Buffers',
+                    'Planning', 'Worker', 'Query Identifier', 'Time',
+                    'Full-sort Groups'), suffix=r'\b'), Comment.Preproc),
+
+            # Emphasize these keywords
+
+            (words(('Rows Removed by Join Filter', 'Rows Removed by Filter',
+                    'Rows Removed by Index Recheck',
+                    'Heap Fetches', 'never executed'),
+                   suffix=r'\b'), Name.Exception),
+            (r'(I/O Timings)(:)( )', bygroups(Name.Exception, Punctuation, Whitespace)),
+
+            (words(EXPLAIN_KEYWORDS, suffix=r'\b'), Keyword),
+
+            # join keywords
+            (r'((Right|Left|Full|Semi|Anti) Join)', Keyword.Type),
+            (r'(Parallel |Async |Finalize |Partial )', Comment.Preproc),
+            (r'Backward', Comment.Preproc),
+            (r'(Intersect|Except|Hash)', Comment.Preproc),
+
+            (r'(CTE)( )(\w*)?', bygroups(Comment, Whitespace, Name.Variable)),
+
+
+            # Treat "on" and "using" as punctuation
+            (r'(on|using)', Punctuation, 'object_name'),
+
+
+            # strings
+            (r"'(''|[^'])*'", String.Single),
+            # numbers
+            (r'\d+\.\d+', Number.Float),
+            (r'(\d+)', Number.Integer),
+
+            # boolean
+            (r'(true|false)', Name.Constant),
+            # explain header
+            (r'\s*QUERY PLAN\s*\n\s*-+', Comment.Single),
+            # Settings
+            (r'(Settings)(:)( )', bygroups(Comment.Preproc, Punctuation, Whitespace), 'setting'),
+
+            # Handle JIT counters
+            (r'(JIT|Functions|Options|Timing)(:)', bygroups(Comment.Preproc, Punctuation)),
+            (r'(Inlining|Optimization|Expressions|Deforming|Generation|Emission|Total)', Keyword.Pseudo),
+
+            # Handle Triggers counters
+            (r'(Trigger)( )(\S*)(:)( )',
+             bygroups(Comment.Preproc, Whitespace, Name.Variable, Punctuation, Whitespace)),
+
+        ],
+        'expression': [
+            # matches any kind of parenthesized expression
+            # the first opening paren is matched by the 'caller'
+            (r'\(', Punctuation, '#push'),
+            (r'\)', Punctuation, '#pop'),
+            (r'(never executed)', Name.Exception),
+            (r'[^)(]+', Comment),
+        ],
+        'object_name': [
+
+            # This is a cost or analyze measure
+            (r'(\(cost)(=?)', bygroups(Name.Class, Punctuation), 'instrumentation'),
+            (r'(\(actual)( )(=?)', bygroups(Name.Class, Whitespace, Punctuation), 'instrumentation'),
+
+            # if object_name is parenthesized, mark the opening paren as
+            # punctuation and enter the 'expression' state
+            (r'\(', Punctuation, 'expression'),
+            (r'(on)', Punctuation),
+            # matches possibly schema-qualified table and column names
+            (r'\w+(\.\w+)*( USING \S+| \w+ USING \S+)', Name.Variable),
+            (r'\"?\w+\"?(?:\.\"?\w+\"?)?', Name.Variable),
+            (r'\'\S*\'', Name.Variable),
+
+            # if we encounter a comma, another object is listed
+            (r',\n', Punctuation, 'object_name'),
+            (r',', Punctuation, 'object_name'),
+
+            # special case: "*SELECT*" (optionally followed by ".column")
+            (r'"\*SELECT\*( \d+)?"(\.\w+)?', Name.Variable),
+            (r'"\*VALUES\*(_\d+)?"(\.\w+)?', Name.Variable),
+            (r'"ANY_subquery"', Name.Variable),
+
+            # Variable $1 ...
+            (r'\$\d+', Name.Variable),
+            # cast
+            (r'::\w+', Name.Variable),
+            (r' +', Whitespace),
+            (r'"', Punctuation),
+            (r'\[\.\.\.\]', Punctuation),
+            (r'\)', Punctuation, '#pop'),
+        ],
+        'predicate': [
+            # if predicate is parenthesized, mark paren as punctuation
+            (r'(\()([^\n]*)(\))', bygroups(Punctuation, Name.Variable, Punctuation), '#pop'),
+            # otherwise color until newline
+            (r'[^\n]*', Name.Variable, '#pop'),
+        ],
+        'instrumentation': [
+            (r'=|\.\.', Punctuation),
+            (r' +', Whitespace),
+            (r'(rows|width|time|loops)', Name.Class),
+            (r'\d+\.\d+', Number.Float),
+            (r'(\d+)', Number.Integer),
+            (r'\)', Punctuation, '#pop'),
+        ],
+        'conflict': [
+            (r'(Resolution: )(\w+)', bygroups(Comment.Preproc, Name.Variable)),
+            (r'(Arbiter \w+:)', Comment.Preproc, 'object_name'),
+            (r'(Filter: )', Comment.Preproc, 'predicate'),
+        ],
+        'setting': [
+            (r'([a-z_]*?)(\s*)(=)(\s*)(\'.*?\')', bygroups(Name.Attribute, Whitespace, Operator, Whitespace, String)),
+            (r'\, ', Punctuation),
+        ],
+        'init_plan': [
+            (r'\(', Punctuation),
+            (r'returns \$\d+(,\$\d+)?', Name.Variable),
+            (r'\)', Punctuation, '#pop'),
+        ],
+        'sort': [
+            (r':|kB', Punctuation),
+            (r'(quicksort|top-N|heapsort|Average|Memory|Peak)', Comment.Preproc),
+            (r'(external|merge|Disk|sort)', Name.Exception),
+            (r'(\d+)', Number.Integer),
+            (r' +', Whitespace),
+        ],
+    }
+
+
 class SqlLexer(RegexLexer):
     """
     Lexer for Structured Query Language. Currently, this lexer does