From da300b46155fa370f205cb380113198b3609ec39 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Tue, 30 Jan 2024 09:37:50 -0500 Subject: [PATCH] fix: apply upstream patch for in-context parsing Fixes #3112 --- CHANGELOG.md | 1 + ...h-in-xmlParseInNodeContext-with-HTML.patch | 33 +++++++++++++++++++ test/html5/test_api.rb | 16 +++++++++ 3 files changed, 50 insertions(+) create mode 100644 patches/libxml2/0012-parser-Fix-crash-in-xmlParseInNodeContext-with-HTML.patch diff --git a/CHANGELOG.md b/CHANGELOG.md index 8cbacd72a0..d89509fd2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA * [CRuby] `XML::Reader` defaults the encoding to UTF-8 if it's not specified in either the document or as a method parameter. Previously non-ASCII characters were serialized as NCRs in this case. [#2891] (@flavorjones) * [CRuby] Restored support for compilation by GCC versions earlier than 4.6, which was broken in v1.15.0 (540e9aee). [#3090] (@adfoster-r7) +* [CRuby] Patched upstream libxml2 to allow parsing HTML5 in the context of a namespaced node (e.g., foreign content like MathML). [#3112, #3116] (@flavorjones) ## v1.16.0 / 2023-12-27 diff --git a/patches/libxml2/0012-parser-Fix-crash-in-xmlParseInNodeContext-with-HTML.patch b/patches/libxml2/0012-parser-Fix-crash-in-xmlParseInNodeContext-with-HTML.patch new file mode 100644 index 0000000000..56e8851784 --- /dev/null +++ b/patches/libxml2/0012-parser-Fix-crash-in-xmlParseInNodeContext-with-HTML.patch @@ -0,0 +1,33 @@ +From 95f2a17440568694a6df6a326c5b411e77597be2 Mon Sep 17 00:00:00 2001 +From: Nick Wellnhofer +Date: Tue, 30 Jan 2024 13:25:17 +0100 +Subject: [PATCH] parser: Fix crash in xmlParseInNodeContext with HTML + documents + +Ignore namespaces if we have an HTML document with namespaces added +manually. + +Fixes #672. +--- + parser.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/parser.c b/parser.c +index 1038d71b..f7842ed1 100644 +--- a/parser.c ++++ b/parser.c +@@ -12415,8 +12415,10 @@ xmlParseInNodeContext(xmlNodePtr node, const char *data, int datalen, + } + xmlAddChild(node, fake); + +- if (node->type == XML_ELEMENT_NODE) { ++ if (node->type == XML_ELEMENT_NODE) + nodePush(ctxt, node); ++ ++ if ((ctxt->html == 0) && (node->type == XML_ELEMENT_NODE)) { + /* + * initialize the SAX2 namespaces stack + */ +-- +2.42.0 + diff --git a/test/html5/test_api.rb b/test/html5/test_api.rb index 403287c7ee..1263aefa36 100644 --- a/test/html5/test_api.rb +++ b/test/html5/test_api.rb @@ -238,6 +238,22 @@ def test_node_wrap_uses_parent_node_as_parsing_context_node assert_equal("select", el.parent.parent.name) end + def test_parse_in_context_of_foreign_namespace + if Nokogiri.uses_libxml?("~> 2.12.0") + skip_unless_libxml2_patch("0012-parser-Fix-crash-in-xmlParseInNodeContext-with-HTML.patch") + end + + # https://github.com/sparklemotion/nokogiri/issues/3112 + # https://gitlab.gnome.org/GNOME/libxml2/-/issues/672 + doc = Nokogiri::HTML5::Document.parse("") + math = doc.at_css("math") + + nodes = math.parse("mrow") # segfaults in libxml 2.12 before 95f2a174 + + assert_kind_of(Nokogiri::XML::NodeSet, nodes) + assert_equal(1, nodes.length) + end + describe Nokogiri::HTML5::Document do describe "#fragment" do it "parses text nodes in a `body` context" do