Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow attr_list quoted values to contain curly braces #1414

Merged
merged 9 commits into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions .spell-dict
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ Treeprocessor
Treeprocessors
tuple
tuples
unparsable
unclosed
unescape
unescaping
Expand Down
4 changes: 3 additions & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
* Include `scripts/*.py` in the generated source tarballs (#1430).
* Ensure lines after heading in loose list are properly detabbed (#1443).
* Give smarty tree processor higher priority than toc (#1440).
* Permit carrots (`^`) and square brackets (`]`) but explicitly exclude
* Permit carets (`^`) and square brackets (`]`) but explicitly exclude
backslashes (`\`) from abbreviations (#1444).
* In attribute lists (`attr_list`, `fenced_code`), quoted attribute values are
now allowed to contain closing curly braces (`}`) (#1414).

## [3.5.2] -- 2024-01-10

Expand Down
86 changes: 55 additions & 31 deletions markdown/extensions/attr_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,30 @@ def _handle_word(s, t):


_scanner = re.Scanner([
(r'[^ =]+=".*?"', _handle_double_quote),
(r"[^ =]+='.*?'", _handle_single_quote),
(r'[^ =]+=[^ =]+', _handle_key_value),
(r'[^ =]+', _handle_word),
(r'[^ =}]+=".*?"', _handle_double_quote),
(r"[^ =}]+='.*?'", _handle_single_quote),
(r'[^ =}]+=[^ =}]+', _handle_key_value),
(r'[^ =}]+', _handle_word),
(r' ', None)
])


def get_attrs(str: str) -> list[tuple[str, str]]:
""" Parse attribute list and return a list of attribute tuples. """
return _scanner.scan(str)[0]
def get_attrs_and_remainder(attrs_string: str) -> tuple[list[tuple[str, str]], str]:
    """ Parse an attribute list into attribute tuples plus any leftover text.

    Returns a pair `(attrs, remainder)`. `attrs` is the list of tuples produced by the
    scanner. `remainder` is the unscanned text starting at its first `}`, or an empty
    string when the unscanned text contains no `}`. In typical cases a non-empty
    remainder means the input does not match the intended attribute list syntax.
    """
    parsed, unscanned = _scanner.scan(attrs_string)
    # To keep historic behavior, unparsable text before the first '}' is discarded.
    # `partition` gives an empty separator and tail when there is no '}', so the
    # concatenation below is '' in that case.
    _, brace, tail = unscanned.partition('}')
    return parsed, brace + tail


def get_attrs(str: str) -> list[tuple[str, str]]:  # pragma: no cover
    """ Soft-deprecated. Prefer `get_attrs_and_remainder`.

    Returns only the attribute tuples from `get_attrs_and_remainder`; the remainder
    text is dropped.
    """
    attrs, _ = get_attrs_and_remainder(str)
    return attrs


def isheader(elem: Element) -> bool:
Expand All @@ -76,7 +89,7 @@ def isheader(elem: Element) -> bool:

class AttrListTreeprocessor(Treeprocessor):

BASE_RE = r'\{\:?[ ]*([^\}\n ][^\}\n]*)[ ]*\}'
BASE_RE = r'\{\:?[ ]*([^\}\n ][^\n]*)[ ]*\}'
HEADER_RE = re.compile(r'[ ]+{}[ ]*$'.format(BASE_RE))
BLOCK_RE = re.compile(r'\n[ ]*{}[ ]*$'.format(BASE_RE))
INLINE_RE = re.compile(r'^{}'.format(BASE_RE))
Expand Down Expand Up @@ -106,49 +119,58 @@ def run(self, doc: Element) -> None:
# use tail of last child. no `ul` or `ol`.
m = RE.search(elem[-1].tail)
if m:
self.assign_attrs(elem, m.group(1))
elem[-1].tail = elem[-1].tail[:m.start()]
if not self.assign_attrs(elem, m.group(1), strict=True):
elem[-1].tail = elem[-1].tail[:m.start()]
elif pos is not None and pos > 0 and elem[pos-1].tail:
# use tail of last child before `ul` or `ol`
m = RE.search(elem[pos-1].tail)
if m:
self.assign_attrs(elem, m.group(1))
elem[pos-1].tail = elem[pos-1].tail[:m.start()]
if not self.assign_attrs(elem, m.group(1), strict=True):
elem[pos-1].tail = elem[pos-1].tail[:m.start()]
elif elem.text:
# use text. `ul` is first child.
m = RE.search(elem.text)
if m:
self.assign_attrs(elem, m.group(1))
elem.text = elem.text[:m.start()]
if not self.assign_attrs(elem, m.group(1), strict=True):
elem.text = elem.text[:m.start()]
elif len(elem) and elem[-1].tail:
# has children. Get from tail of last child
m = RE.search(elem[-1].tail)
if m:
self.assign_attrs(elem, m.group(1))
elem[-1].tail = elem[-1].tail[:m.start()]
if isheader(elem):
# clean up trailing #s
elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
if not self.assign_attrs(elem, m.group(1), strict=True):
elem[-1].tail = elem[-1].tail[:m.start()]
if isheader(elem):
# clean up trailing #s
elem[-1].tail = elem[-1].tail.rstrip('#').rstrip()
elif elem.text:
# no children. Get from text.
m = RE.search(elem.text)
if m:
self.assign_attrs(elem, m.group(1))
elem.text = elem.text[:m.start()]
if isheader(elem):
# clean up trailing #s
elem.text = elem.text.rstrip('#').rstrip()
if not self.assign_attrs(elem, m.group(1), strict=True):
elem.text = elem.text[:m.start()]
if isheader(elem):
# clean up trailing #s
elem.text = elem.text.rstrip('#').rstrip()
else:
# inline: check for `attrs` at start of tail
if elem.tail:
m = self.INLINE_RE.match(elem.tail)
if m:
self.assign_attrs(elem, m.group(1))
elem.tail = elem.tail[m.end():]
remainder = self.assign_attrs(elem, m.group(1))
elem.tail = elem.tail[m.end():] + remainder

def assign_attrs(self, elem: Element, attrs_string: str, *, strict: bool = False) -> str:
""" Assign `attrs` to element.
If the `attrs_string` has an extra closing curly brace, the remaining text is returned.
The `strict` argument controls whether to still assign `attrs` if there is a remaining `}`.
"""
attrs, remainder = get_attrs_and_remainder(attrs_string)
if strict and remainder:
return remainder

def assign_attrs(self, elem: Element, attrs: str) -> None:
""" Assign `attrs` to element. """
for k, v in get_attrs(attrs):
for k, v in attrs:
if k == '.':
# add to class
cls = elem.get('class')
Expand All @@ -159,11 +181,13 @@ def assign_attrs(self, elem: Element, attrs: str) -> None:
else:
# assign attribute `k` with `v`
elem.set(self.sanitize_name(k), v)
# The text that we initially over-matched will be put back.
return remainder

def sanitize_name(self, name: str) -> str:
"""
Sanitize name as 'an XML Name, minus the ":"'.
See https://www.w3.org/TR/REC-xml-names/#NT-NCName
Sanitize name as 'an XML Name, minus the `:`.'
See <https://www.w3.org/TR/REC-xml-names/#NT-NCName>.
"""
return self.NAME_RE.sub('_', name)

Expand Down
15 changes: 11 additions & 4 deletions markdown/extensions/fenced_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from . import Extension
from ..preprocessors import Preprocessor
from .codehilite import CodeHilite, CodeHiliteExtension, parse_hl_lines
from .attr_list import get_attrs, AttrListExtension
from .attr_list import get_attrs_and_remainder, AttrListExtension
from ..util import parseBoolValue
from ..serializers import _escape_attrib_html
import re
Expand Down Expand Up @@ -56,7 +56,7 @@ class FencedBlockPreprocessor(Preprocessor):
FENCED_BLOCK_RE = re.compile(
dedent(r'''
(?P<fence>^(?:~{3,}|`{3,}))[ ]* # opening fence
((\{(?P<attrs>[^\}\n]*)\})| # (optional {attrs} or
((\{(?P<attrs>[^\n]*)\})| # (optional {attrs} or
(\.?(?P<lang>[\w#.+-]*)[ ]*)? # optional (.)lang
(hl_lines=(?P<quot>"|')(?P<hl_lines>.*?)(?P=quot)[ ]*)?) # optional hl_lines)
\n # newline (end of opening fence)
Expand Down Expand Up @@ -94,12 +94,17 @@ def run(self, lines: list[str]) -> list[str]:
self.checked_for_deps = True

text = "\n".join(lines)
index = 0
while 1:
m = self.FENCED_BLOCK_RE.search(text)
m = self.FENCED_BLOCK_RE.search(text, index)
if m:
lang, id, classes, config = None, '', [], {}
if m.group('attrs'):
id, classes, config = self.handle_attrs(get_attrs(m.group('attrs')))
attrs, remainder = get_attrs_and_remainder(m.group('attrs'))
if remainder: # Does not have correctly matching curly braces, so the syntax is invalid.
index = m.end('attrs') # Explicitly skip over this, to prevent an infinite loop.
continue
id, classes, config = self.handle_attrs(attrs)
if len(classes):
lang = classes.pop(0)
else:
Expand Down Expand Up @@ -151,6 +156,8 @@ def run(self, lines: list[str]) -> list[str]:

placeholder = self.md.htmlStash.store(code)
text = f'{text[:m.start()]}\n{placeholder}\n{text[m.end():]}'
# Continue from after the replaced text in the next iteration.
index = m.start() + 1 + len(placeholder)
else:
break
return text.split("\n")
Expand Down
45 changes: 41 additions & 4 deletions tests/test_syntax/extensions/test_attr_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,53 @@


class TestAttrList(TestCase):

maxDiff = None
default_kwargs = {'extensions': ['attr_list']}

# TODO: Move the rest of the `attr_list` tests here.

def test_empty_list(self):
def test_empty_attr_list(self):
self.assertMarkdownRenders(
'*foo*{ }',
'<p><em>foo</em>{ }</p>',
extensions=['attr_list']
'<p><em>foo</em>{ }</p>'
)

def test_curly_after_inline(self):
self.assertMarkdownRenders(
'*inline*{.a} } *text*{.a }}',
'<p><em class="a">inline</em> } <em class="a">text</em>}</p>'
)

def test_extra_eq_gets_ignored_inside_curly_inline(self):
# Undesired behavior but kept for historic compatibility.
self.assertMarkdownRenders(
'*inline*{data-test="x" =a} *text*',
'<p><em data-test="x">inline</em> <em>text</em></p>'
)

def test_curly_after_block(self):
self.assertMarkdownRenders(
'# Heading {.a} }',
'<h1>Heading {.a} }</h1>'
)

def test_curly_in_single_quote(self):
self.assertMarkdownRenders(
"# Heading {data-test='{}'}",
'<h1 data-test="{}">Heading</h1>'
)

def test_curly_in_double_quote(self):
self.assertMarkdownRenders(
'# Heading {data-test="{}"}',
'<h1 data-test="{}">Heading</h1>'
)

def test_unclosed_quote_ignored(self):
# Undesired behavior but kept for historic compatibility.
self.assertMarkdownRenders(
'# Heading {foo="bar}',
'<h1 foo="&quot;bar">Heading</h1>'
)

def test_table_td(self):
Expand Down
42 changes: 42 additions & 0 deletions tests/test_syntax/extensions/test_fenced_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -394,6 +394,48 @@ def testFencedCodeEscapedAttrs(self):
extensions=['fenced_code', 'attr_list']
)

def testFencedCodeCurlyInAttrs(self):
self.assertMarkdownRenders(
self.dedent(
'''
``` { data-test="{}" }
# Some python code
```
'''
),
self.dedent(
'''
<pre><code data-test="{}"># Some python code
</code></pre>
'''
),
extensions=['fenced_code', 'attr_list']
)

def testFencedCodeMismatchedCurlyInAttrs(self):
self.assertMarkdownRenders(
self.dedent(
'''
``` { data-test="{}" } }
# Some python code
```
```
test
```
'''
),
self.dedent(
'''
<p>``` { data-test="{}" } }</p>
<h1>Some python code</h1>
<pre><code></code></pre>
<p>test
```</p>
'''
),
extensions=['fenced_code', 'attr_list']
)


class TestFencedCodeWithCodehilite(TestCase):

Expand Down