"Unterminated character set" exception when using extra extension #1444

noamgat · 2024-02-22T07:32:21Z

Hi,

When using the "extra" extensions, some invalid markdowns (i think?) are causing exceptions rather than returning plaintext line.

Example:

txt = """
*[^1^]: This is going to crash if extra extension is enabled
"""

import markdown
ok = markdown.markdown(txt, extensions=[])
not_ok = markdown.markdown(txt, extensions=['extra'])

Is this expected behavior?

Full exception stack trace:

Cell In[29], line 7
      5 import markdown
      6 ok = markdown.markdown(txt, extensions=[])
----> 7 not_ok = markdown.markdown(txt, extensions=['extra'])

File lib/python3.10/site-packages/markdown/core.py:482, in markdown(text, **kwargs)
    464 """
    465 Convert a markdown string to HTML and return HTML as a Unicode string.
    466 
   (...)
    479 
    480 """
    481 md = Markdown(**kwargs)
--> 482 return md.convert(text)

File lib/python3.10/site-packages/markdown/core.py:357, in Markdown.convert(self, source)
    354     self.lines = prep.run(self.lines)
    356 # Parse the high-level elements.
--> 357 root = self.parser.parseDocument(self.lines).getroot()
    359 # Run the tree-processors
    360 for treeprocessor in self.treeprocessors:

File lib/python3.10/site-packages/markdown/blockparser.py:117, in BlockParser.parseDocument(self, lines)
    115 # Create an `ElementTree` from the lines
    116 self.root = etree.Element(self.md.doc_tag)
--> 117 self.parseChunk(self.root, '\n'.join(lines))
    118 return etree.ElementTree(self.root)

File lib/python3.10/site-packages/markdown/blockparser.py:136, in BlockParser.parseChunk(self, parent, text)
    120 def parseChunk(self, parent: etree.Element, text: str) -> None:
    121     """ Parse a chunk of Markdown text and attach to given `etree` node.
    122 
    123     While the `text` argument is generally assumed to contain multiple
   (...)
    134 
    135     """
--> 136     self.parseBlocks(parent, text.split('\n\n'))

File lib/python3.10/site-packages/markdown/blockparser.py:158, in BlockParser.parseBlocks(self, parent, blocks)
    156 for processor in self.blockprocessors:
    157     if processor.test(parent, blocks[0]):
--> 158         if processor.run(parent, blocks) is not False:
    159             # run returns True or None
    160             break

File lib/python3.10/site-packages/markdown/extensions/abbr.py:61, in AbbrPreprocessor.run(self, parent, blocks)
     58 abbr = m.group('abbr').strip()
     59 title = m.group('title').strip()
     60 self.parser.md.inlinePatterns.register(
---> 61     AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
     62 )
     63 if block[m.end():].strip():
     64     # Add any content after match back to blocks as separate block
     65     blocks.insert(0, block[m.end():].lstrip('\n'))

File lib/python3.10/site-packages/markdown/extensions/abbr.py:94, in AbbrInlineProcessor.__init__(self, pattern, title)
     93 def __init__(self, pattern: str, title: str):
---> 94     super().__init__(pattern)
     95     self.title = title

File lib/python3.10/site-packages/markdown/inlinepatterns.py:297, in InlineProcessor.__init__(self, pattern, md)
    287 """
    288 Create an instant of an inline processor.
    289 
   (...)
    294 
    295 """
    296 self.pattern = pattern
--> 297 self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)
    299 # API for Markdown to pass `safe_mode` into instance
    300 self.safe_mode = False

File lib/python3.10/re.py:251, in compile(pattern, flags)
    249 def compile(pattern, flags=0):
    250     "Compile a regular expression pattern, returning a Pattern object."
--> 251     return _compile(pattern, flags)

File lib/python3.10/re.py:303, in _compile(pattern, flags)
    301 if not sre_compile.isstring(pattern):
    302     raise TypeError("first argument must be string or compiled pattern")
--> 303 p = sre_compile.compile(pattern, flags)
    304 if not (flags & DEBUG):
    305     if len(_cache) >= _MAXCACHE:
    306         # Drop the oldest item

File lib/python3.10/sre_compile.py:788, in compile(p, flags)
    786 if isstring(p):
    787     pattern = p
--> 788     p = sre_parse.parse(p, flags)
    789 else:
    790     pattern = None

File lib/python3.10/sre_parse.py:955, in parse(str, flags, state)
    952 state.str = str
    954 try:
--> 955     p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
    956 except Verbose:
    957     # the VERBOSE flag was switched on inside the pattern.  to be
    958     # on the safe side, we'll parse the whole thing again...
    959     state = State()

File lib/python3.10/sre_parse.py:444, in _parse_sub(source, state, verbose, nested)
    442 start = source.tell()
    443 while True:
--> 444     itemsappend(_parse(source, state, verbose, nested + 1,
    445                        not nested and not items))
    446     if not sourcematch("|"):
    447         break

File lib/python3.10/sre_parse.py:841, in _parse(source, state, verbose, nested, first)
    838         raise source.error(err.msg, len(name) + 1) from None
    839 sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
    840                not (del_flags & SRE_FLAG_VERBOSE))
--> 841 p = _parse_sub(source, state, sub_verbose, nested + 1)
    842 if not source.match(")"):
    843     raise source.error("missing ), unterminated subpattern",
    844                        source.tell() - start)

File lib/python3.10/sre_parse.py:444, in _parse_sub(source, state, verbose, nested)
    442 start = source.tell()
    443 while True:
--> 444     itemsappend(_parse(source, state, verbose, nested + 1,
    445                        not nested and not items))
    446     if not sourcematch("|"):
    447         break

File lib/python3.10/sre_parse.py:550, in _parse(source, state, verbose, nested, first)
    548 this = sourceget()
    549 if this is None:
--> 550     raise source.error("unterminated character set",
    551                        source.tell() - here)
    552 if this == "]" and set:
    553     break

error: unterminated character set at position 17

The text was updated successfully, but these errors were encountered:

waylan · 2024-03-06T19:16:42Z

Thanks for the report. This appears to be an issue with the abbr extension. In fact, the behavior can be replicated using that extension only, Specifically, the extension correctly identifies the line of input as an abbreviation definition and tries to build a regex to match instances of ^1^ in the document. The regex it builds then fails to compile and raises an error.

Obviously, the carrot (^) has special meaning in regex, so it would need to be escaped to match the actual character. In fact, the code accounts for the fact that a user could include anything in an abbreviation and wraps each character in a character set ([]). In other words, the regex for the abbreviation HTML would be [H][T][M][L]. As it turns out, there are only 4 characters which have special meaning in character sets (^, \, -, and ]) and it appears we do not do anything to account for them.

We have a few options to address this:

Retain the current method of constructing regex but backslash escape the 4 special characters in character sets.
Utilize a different method of constructing regex. For example, we could abandon character sets and do literal characters; either backslash escaping every character (/H/T/M/L) or only backslash escaping special characters listed here. Or maybe some other completely different approach.
Restrict the characters allowed in abbreviations to exclude most punctuation.

Option 3 would be simple in that it would mostly eliminate the possibility of special characters being used in generated regex . But it could break users' existing documents as the current permissive approach has been in-place for over a decade. And I say it would only "mostly eliminate" the issue because we would likely want to allow - at a minimum; so we would still need to do some escaping. Option 2 would be more dramatic that option 1 and is listed for completeness. We might want to go that way of it was more performant or provided some other significant benefit. However, if we are just fixing the immediate bug, then option 1 seems like the best choice.

>>> import markdown
>>> txt = "*[^1^]: This is going to crash if extra extension is enabled"
>>> markdown.markdown(txt, extensions=['abbr'])
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "C:\code\md\markdown\core.py", line 482, in markdown
    return md.convert(text)
  File "C:\code\md\markdown\core.py", line 357, in convert
    root = self.parser.parseDocument(self.lines).getroot()
  File "C:\code\md\markdown\blockparser.py", line 117, in parseDocument
    self.parseChunk(self.root, '\n'.join(lines))
  File "C:\code\md\markdown\blockparser.py", line 136, in parseChunk
    self.parseBlocks(parent, text.split('\n\n'))
  File "C:\code\md\markdown\blockparser.py", line 158, in parseBlocks
    if processor.run(parent, blocks) is not False:
  File "C:\code\md\markdown\extensions\abbr.py", line 61, in run
    AbbrInlineProcessor(self._generate_pattern(abbr), title), 'abbr-%s' % abbr, 2
  File "C:\code\md\markdown\extensions\abbr.py", line 94, in __init__
    super().__init__(pattern)
  File "C:\code\md\markdown\inlinepatterns.py", line 297, in __init__
    self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)
  File "C:\Users\wlimberg\AppData\Local\Programs\Python\Python38\lib\re.py", line 250, in compile
    return _compile(pattern, flags)
  File "C:\Users\wlimberg\AppData\Local\Programs\Python\Python38\lib\re.py", line 302, in _compile
    p = sre_compile.compile(pattern, flags)
  File "C:\Users\wlimberg\AppData\Local\Programs\Python\Python38\lib\sre_compile.py", line 764, in compile
    p = sre_parse.parse(p, flags)
  File "C:\Users\wlimberg\AppData\Local\Programs\Python\Python38\lib\sre_parse.py", line 948, in parse
    p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
  File "C:\Users\wlimberg\AppData\Local\Programs\Python\Python38\lib\sre_parse.py", line 443, in _parse_sub
    itemsappend(_parse(source, state, verbose, nested + 1,
  File "C:\Users\wlimberg\AppData\Local\Programs\Python\Python38\lib\sre_parse.py", line 834, in _parse
    p = _parse_sub(source, state, sub_verbose, nested + 1)
  File "C:\Users\wlimberg\AppData\Local\Programs\Python\Python38\lib\sre_parse.py", line 443, in _parse_sub
    itemsappend(_parse(source, state, verbose, nested + 1,
  File "C:\Users\wlimberg\AppData\Local\Programs\Python\Python38\lib\sre_parse.py", line 549, in _parse
    raise source.error("unterminated character set",
re.error: unterminated character set at position 17

waylan · 2024-03-06T19:38:48Z

On second thought, we could go with option 3 and restrict abbreviations to not allow them to include the 3 characters ^, \, and ]. As it is, those 3 characters would have never worked, so we aren't breaking anyone's existing documents. And then we don't need to worry about escaping them. While the backslash is a special character in Markdown, it does not make sense as part of an abbreviation, so being not permitted there should be fine. In fact, we have never supported escaping characters in abbreviations, which also means that a ] would never end up in one (actually, it already is restricted)

Finally, I will note that while the - does have special meaning in character sets, it is treated as a regular character if it is the first or last character of a character set. As we only every include a single character in a character set, it has worked fine without escaping and should continue to work without modification. Therefore, it does not need to be addressed at all.

Fixes Python-Markdown#1444.

Fixes #1444.

oprypin · 2024-03-06T21:57:54Z

I am rather opposed to the chosen solution, on two fronts:

It is strange to let the regex implementation details dictate the behavior.
It is really easy to implement the correct solution: replace the _generate_pattern function entirely with just re.escape

waylan · 2024-03-07T18:41:05Z

It is strange to let the regex implementation details dictate the behavior.

I will note that there is no change in behavior, which is why I took this route. I simply pushed a bug fix which maintains the existing behavior. Specifically, the change prevents an error from being raised. So the change stands.

It is really easy to implement the correct solution: replace the _generate_pattern function entirely with just re.escape

Apparently, way back when I first wrote this extension, I didn't realize that re.escape existed. Because, if I did, I would have used it. In any event, a change in behavior would be considered in a separate PR.

A alternate fix to Python-Markdown#1444. This does not omit the use of carrots in abbreviations. It still omits backslashs, however. I played around with backslashes and it just doesn't make sense to support them as they have special meaning in the Markdown, not because of their use in regular expressions.

waylan · 2024-03-07T19:49:39Z

I just pushed #1449. It still doesn't allow backslashes, but because of their meaning in Markdown, not due to the regex implementation.

oprypin · 2024-03-07T20:03:40Z

Nice, thanks!

A alternate fix to #1444. This does not exclude the use of carrots or square brackets in abbreviations. It still excludes backslashse, however. I played with backslashes and it just doesn't make sense to support them as they have special meaning in the Markdown, not because of their use in regular expressions.

waylan added bug Bug report. extension Related to one or more of the included extensions. confirmed Confirmed bug report or approved feature request. labels Mar 6, 2024

waylan added a commit to waylan/markdown that referenced this issue Mar 6, 2024

Explicitly omit carot and backslash from abbr

16ccc52

Fixes Python-Markdown#1444.

waylan mentioned this issue Mar 6, 2024

Explicitly omit carot and backslash from abbr #1448

Merged

waylan closed this as completed in #1448 Mar 6, 2024

waylan added a commit that referenced this issue Mar 6, 2024

Explicitly omit carot and backslash from abbr

a18765c

Fixes #1444.

waylan mentioned this issue Mar 7, 2024

Refactor abbr escaping #1449

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

"Unterminated character set" exception when using extra extension #1444

"Unterminated character set" exception when using extra extension #1444

noamgat commented Feb 22, 2024 •

edited

waylan commented Mar 6, 2024

waylan commented Mar 6, 2024

oprypin commented Mar 6, 2024

waylan commented Mar 7, 2024

waylan commented Mar 7, 2024

oprypin commented Mar 7, 2024

"Unterminated character set" exception when using extra extension #1444

"Unterminated character set" exception when using extra extension #1444

Comments

noamgat commented Feb 22, 2024 • edited

waylan commented Mar 6, 2024

waylan commented Mar 6, 2024

oprypin commented Mar 6, 2024

waylan commented Mar 7, 2024

waylan commented Mar 7, 2024

oprypin commented Mar 7, 2024

noamgat commented Feb 22, 2024 •

edited