From de6bcd20206a3b5394aebb85117e10c299259af5 Mon Sep 17 00:00:00 2001 From: Stephen Checkoway Date: Mon, 15 Jan 2024 20:35:28 -0500 Subject: [PATCH 1/2] Delay breaking out of the parse loop when max_tree_depth is hit When a token causes a node to be added to the DOM which increases the depth of the DOM to exceed the `max_tree_depth` _and_ the token needs to be reprocessed, memory is leaked. By delaying breaking out of the loop until after the token has been completely handled, this appears to fix the leak. Fixes #3098 --- gumbo-parser/src/parser.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/gumbo-parser/src/parser.c b/gumbo-parser/src/parser.c index 67812b23fe..9fe9c623e4 100644 --- a/gumbo-parser/src/parser.c +++ b/gumbo-parser/src/parser.c @@ -4826,14 +4826,17 @@ GumboOutput* gumbo_parse_with_options ( // to a token. if (token.type == GUMBO_TOKEN_END_TAG && token.v.end_tag.tag == GUMBO_TAG_UNKNOWN) + { gumbo_free(token.v.end_tag.name); + token.v.end_tag.name = NULL; + } + if (unlikely(state->_open_elements.length > max_tree_depth)) { + parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP; + gumbo_debug("Tree depth limit exceeded.\n"); + break; + } } - if (unlikely(state->_open_elements.length > max_tree_depth)) { - parser._output->status = GUMBO_STATUS_TREE_TOO_DEEP; - gumbo_debug("Tree depth limit exceeded.\n"); - break; - } ++loop_count; assert(loop_count < 1000000000UL); From 2e26a7236e15931e937cd0fdd7c2145be6ca7c58 Mon Sep 17 00:00:00 2001 From: Stephen Checkoway Date: Tue, 16 Jan 2024 11:00:03 -0500 Subject: [PATCH 2/2] Add test --- test/test_memory_usage.rb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/test/test_memory_usage.rb b/test/test_memory_usage.rb index 2b90eed7e3..4842724cd6 100644 --- a/test/test_memory_usage.rb +++ b/test/test_memory_usage.rb @@ -301,5 +301,15 @@ def start_element(name, attrs = []) Nokogiri::HTML5::Document.parse(html) end end + + it "libgumbo max depth exceeded" do + html = "" + + memwatch(__method__) do + Nokogiri::HTML5.parse(html, max_tree_depth: 1) + rescue ArgumentError + # Expected error. This comment makes rubocop happy. + end + end end if ENV["NOKOGIRI_MEMORY_SUITE"] && Nokogiri.uses_libxml? end