Skip to content

Commit

Permalink
Match lexers and formatters by extension separatly
Browse files Browse the repository at this point in the history
d0487e3 "Remove filename pattern caches" (#2153)
introduced a huge performance regession. While it is true
that fnmatch already uses functools.lru_cache, that cache
is limited to 127 entries and we have over 1000 matching patterns,
which means the cache is evicted entirely on every iteration.

We can be more clever without reverting that patch by avoiding
fnmatch calls alltogether, which provides an even better speedup.
The bulk (> 99%) of all filename matches are on the filename extension,
so in gen_mapfiles.py we split these into a separate tuple list that
we match without calling fnmatch().

This restores previous performance speedup without the overhead
of an extra regexp cache.
  • Loading branch information
dirkmueller committed Jan 30, 2023
1 parent d4403f0 commit 8872d85
Show file tree
Hide file tree
Showing 6 changed files with 608 additions and 580 deletions.
15 changes: 11 additions & 4 deletions pygments/formatters/__init__.py
Expand Up @@ -46,7 +46,7 @@ def find_formatter_class(alias):
Returns None if not found.
"""
for module_name, name, aliases, _, _ in FORMATTERS.values():
for module_name, name, aliases, _, _, _ in FORMATTERS.values():
if alias in aliases:
if name not in _formatter_cache:
_load_formatters(module_name)
Expand Down Expand Up @@ -109,15 +109,22 @@ def get_formatter_for_filename(fn, **options):
Raises ClassNotFound if not found.
"""
fn = basename(fn)
for modname, name, _, filenames, _ in FORMATTERS.values():
fn_ext = None
if '.' in fn:
fn_ext = fn.rpartition('.')[2]
for modname, name, _, filenames, fn_exts, _ in FORMATTERS.values():
if fn_ext in fn_exts:
if name not in _formatter_cache:
_load_formatters(modname)
return _formatter_cache[name](**options)
for filename in filenames:
if fnmatch(fn, filename):
if fn == filename or fnmatch(fn, filename):
if name not in _formatter_cache:
_load_formatters(modname)
return _formatter_cache[name](**options)
for cls in find_plugin_formatters():
for filename in cls.filenames:
if fnmatch(fn, filename):
if fn == filename or fnmatch(fn, filename):
return cls(**options)
raise ClassNotFound("no formatter found for file name %r" % fn)

Expand Down
36 changes: 18 additions & 18 deletions pygments/formatters/_mapping.py
Expand Up @@ -2,22 +2,22 @@
# DO NOT EDIT BY HAND; run `make mapfiles` instead.

FORMATTERS = {
'BBCodeFormatter': ('pygments.formatters.bbcode', 'BBCode', ('bbcode', 'bb'), (), 'Format tokens with BBcodes. These formatting codes are used by many bulletin boards, so you can highlight your sourcecode with pygments before posting it there.'),
'BmpImageFormatter': ('pygments.formatters.img', 'img_bmp', ('bmp', 'bitmap'), ('*.bmp',), 'Create a bitmap image from source code. This uses the Python Imaging Library to generate a pixmap from the source code.'),
'GifImageFormatter': ('pygments.formatters.img', 'img_gif', ('gif',), ('*.gif',), 'Create a GIF image from source code. This uses the Python Imaging Library to generate a pixmap from the source code.'),
'GroffFormatter': ('pygments.formatters.groff', 'groff', ('groff', 'troff', 'roff'), (), 'Format tokens with groff escapes to change their color and font style.'),
'HtmlFormatter': ('pygments.formatters.html', 'HTML', ('html',), ('*.html', '*.htm'), "Format tokens as HTML 4 ``<span>`` tags within a ``<pre>`` tag, wrapped in a ``<div>`` tag. The ``<div>``'s CSS class can be set by the `cssclass` option."),
'IRCFormatter': ('pygments.formatters.irc', 'IRC', ('irc', 'IRC'), (), 'Format tokens with IRC color sequences'),
'ImageFormatter': ('pygments.formatters.img', 'img', ('img', 'IMG', 'png'), ('*.png',), 'Create a PNG image from source code. This uses the Python Imaging Library to generate a pixmap from the source code.'),
'JpgImageFormatter': ('pygments.formatters.img', 'img_jpg', ('jpg', 'jpeg'), ('*.jpg',), 'Create a JPEG image from source code. This uses the Python Imaging Library to generate a pixmap from the source code.'),
'LatexFormatter': ('pygments.formatters.latex', 'LaTeX', ('latex', 'tex'), ('*.tex',), 'Format tokens as LaTeX code. This needs the `fancyvrb` and `color` standard packages.'),
'NullFormatter': ('pygments.formatters.other', 'Text only', ('text', 'null'), ('*.txt',), 'Output the text unchanged without any formatting.'),
'PangoMarkupFormatter': ('pygments.formatters.pangomarkup', 'Pango Markup', ('pango', 'pangomarkup'), (), 'Format tokens as Pango Markup code. It can then be rendered to an SVG.'),
'RawTokenFormatter': ('pygments.formatters.other', 'Raw tokens', ('raw', 'tokens'), ('*.raw',), 'Format tokens as a raw representation for storing token streams.'),
'RtfFormatter': ('pygments.formatters.rtf', 'RTF', ('rtf',), ('*.rtf',), 'Format tokens as RTF markup. This formatter automatically outputs full RTF documents with color information and other useful stuff. Perfect for Copy and Paste into Microsoft(R) Word(R) documents.'),
'SvgFormatter': ('pygments.formatters.svg', 'SVG', ('svg',), ('*.svg',), 'Format tokens as an SVG graphics file. This formatter is still experimental. Each line of code is a ``<text>`` element with explicit ``x`` and ``y`` coordinates containing ``<tspan>`` elements with the individual token styles.'),
'Terminal256Formatter': ('pygments.formatters.terminal256', 'Terminal256', ('terminal256', 'console256', '256'), (), 'Format tokens with ANSI color sequences, for output in a 256-color terminal or console. Like in `TerminalFormatter` color sequences are terminated at newlines, so that paging the output works correctly.'),
'TerminalFormatter': ('pygments.formatters.terminal', 'Terminal', ('terminal', 'console'), (), 'Format tokens with ANSI color sequences, for output in a text console. Color sequences are terminated at newlines, so that paging the output works correctly.'),
'TerminalTrueColorFormatter': ('pygments.formatters.terminal256', 'TerminalTrueColor', ('terminal16m', 'console16m', '16m'), (), 'Format tokens with ANSI color sequences, for output in a true-color terminal or console. Like in `TerminalFormatter` color sequences are terminated at newlines, so that paging the output works correctly.'),
'TestcaseFormatter': ('pygments.formatters.other', 'Testcase', ('testcase',), (), 'Format tokens as appropriate for a new testcase.'),
'BBCodeFormatter': ('pygments.formatters.bbcode', 'BBCode', ('bbcode', 'bb'), (), (), 'Format tokens with BBcodes. These formatting codes are used by many bulletin boards, so you can highlight your sourcecode with pygments before posting it there.'),
'BmpImageFormatter': ('pygments.formatters.img', 'img_bmp', ('bmp', 'bitmap'), (), ('bmp',), 'Create a bitmap image from source code. This uses the Python Imaging Library to generate a pixmap from the source code.'),
'GifImageFormatter': ('pygments.formatters.img', 'img_gif', ('gif',), (), ('gif',), 'Create a GIF image from source code. This uses the Python Imaging Library to generate a pixmap from the source code.'),
'GroffFormatter': ('pygments.formatters.groff', 'groff', ('groff', 'troff', 'roff'), (), (), 'Format tokens with groff escapes to change their color and font style.'),
'HtmlFormatter': ('pygments.formatters.html', 'HTML', ('html',), (), ('html', 'htm'), "Format tokens as HTML 4 ``<span>`` tags within a ``<pre>`` tag, wrapped in a ``<div>`` tag. The ``<div>``'s CSS class can be set by the `cssclass` option."),
'IRCFormatter': ('pygments.formatters.irc', 'IRC', ('irc', 'IRC'), (), (), 'Format tokens with IRC color sequences'),
'ImageFormatter': ('pygments.formatters.img', 'img', ('img', 'IMG', 'png'), (), ('png',), 'Create a PNG image from source code. This uses the Python Imaging Library to generate a pixmap from the source code.'),
'JpgImageFormatter': ('pygments.formatters.img', 'img_jpg', ('jpg', 'jpeg'), (), ('jpg',), 'Create a JPEG image from source code. This uses the Python Imaging Library to generate a pixmap from the source code.'),
'LatexFormatter': ('pygments.formatters.latex', 'LaTeX', ('latex', 'tex'), (), ('tex',), 'Format tokens as LaTeX code. This needs the `fancyvrb` and `color` standard packages.'),
'NullFormatter': ('pygments.formatters.other', 'Text only', ('text', 'null'), (), ('txt',), 'Output the text unchanged without any formatting.'),
'PangoMarkupFormatter': ('pygments.formatters.pangomarkup', 'Pango Markup', ('pango', 'pangomarkup'), (), (), 'Format tokens as Pango Markup code. It can then be rendered to an SVG.'),
'RawTokenFormatter': ('pygments.formatters.other', 'Raw tokens', ('raw', 'tokens'), (), ('raw',), 'Format tokens as a raw representation for storing token streams.'),
'RtfFormatter': ('pygments.formatters.rtf', 'RTF', ('rtf',), (), ('rtf',), 'Format tokens as RTF markup. This formatter automatically outputs full RTF documents with color information and other useful stuff. Perfect for Copy and Paste into Microsoft(R) Word(R) documents.'),
'SvgFormatter': ('pygments.formatters.svg', 'SVG', ('svg',), (), ('svg',), 'Format tokens as an SVG graphics file. This formatter is still experimental. Each line of code is a ``<text>`` element with explicit ``x`` and ``y`` coordinates containing ``<tspan>`` elements with the individual token styles.'),
'Terminal256Formatter': ('pygments.formatters.terminal256', 'Terminal256', ('terminal256', 'console256', '256'), (), (), 'Format tokens with ANSI color sequences, for output in a 256-color terminal or console. Like in `TerminalFormatter` color sequences are terminated at newlines, so that paging the output works correctly.'),
'TerminalFormatter': ('pygments.formatters.terminal', 'Terminal', ('terminal', 'console'), (), (), 'Format tokens with ANSI color sequences, for output in a text console. Color sequences are terminated at newlines, so that paging the output works correctly.'),
'TerminalTrueColorFormatter': ('pygments.formatters.terminal256', 'TerminalTrueColor', ('terminal16m', 'console16m', '16m'), (), (), 'Format tokens with ANSI color sequences, for output in a true-color terminal or console. Like in `TerminalFormatter` color sequences are terminated at newlines, so that paging the output works correctly.'),
'TestcaseFormatter': ('pygments.formatters.other', 'Testcase', ('testcase',), (), (), 'Format tokens as appropriate for a new testcase.'),
}
23 changes: 17 additions & 6 deletions pygments/lexers/__init__.py
Expand Up @@ -44,7 +44,11 @@ def get_all_lexers(plugins=True):
are also returned. Otherwise, only builtin ones are considered.
"""
for item in LEXERS.values():
yield item[1:]
_, name, aliases, fn_matches, file_extensions, mimetypes = item
filenames = list(file_extensions)
for e in file_extensions:
filenames.append(f"*.{e}")
yield name, aliases, filenames, mimetypes
if plugins:
for lexer in find_plugin_lexers():
yield lexer.name, lexer.aliases, lexer.filenames, lexer.mimetypes
Expand Down Expand Up @@ -99,7 +103,7 @@ def get_lexer_by_name(_alias, **options):
raise ClassNotFound('no lexer for alias %r found' % _alias)

# lookup builtin lexers
for module_name, name, aliases, _, _ in LEXERS.values():
for module_name, name, aliases, _, _, _ in LEXERS.values():
if _alias.lower() in aliases:
if name not in _lexer_cache:
_load_lexers(module_name)
Expand Down Expand Up @@ -156,15 +160,22 @@ def find_lexer_class_for_filename(_fn, code=None):
"""
matches = []
fn = basename(_fn)
for modname, name, _, filenames, _ in LEXERS.values():
fn_ext = None
if '.' in fn:
fn_ext = fn.rpartition('.')[2]
for modname, name, _, filenames, fn_exts, _ in LEXERS.values():
if fn_ext in fn_exts:
if name not in _lexer_cache:
_load_lexers(modname)
matches.append((_lexer_cache[name], filename))
for filename in filenames:
if fnmatch(fn, filename):
if fn == filename or fnmatch(fn, filename):
if name not in _lexer_cache:
_load_lexers(modname)
matches.append((_lexer_cache[name], filename))
for cls in find_plugin_lexers():
for filename in cls.filenames:
if fnmatch(fn, filename):
if fn == filename or fnmatch(fn, filename):
matches.append((cls, filename))

if isinstance(code, bytes):
Expand Down Expand Up @@ -208,7 +219,7 @@ def get_lexer_for_mimetype(_mime, **options):
Raises ClassNotFound if not found.
"""
for modname, name, _, _, mimetypes in LEXERS.values():
for modname, name, _, _, _, mimetypes in LEXERS.values():
if _mime in mimetypes:
if name not in _lexer_cache:
_load_lexers(modname)
Expand Down

0 comments on commit 8872d85

Please sign in to comment.