From 2c97b17da4ebe8438474ca61672f3635ebf0e09f Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Mon, 29 May 2023 13:59:02 -0400 Subject: [PATCH] fix(libxml): default Reader node encoding to UTF-8 when it's not specified either as a method param or in the document Fixes #2891 --- CHANGELOG.md | 7 +++ ext/nokogiri/xml_reader.c | 28 ++++++++-- test/xml/test_reader_encoding.rb | 95 +++++++++++++++++++++++++++++--- 3 files changed, 119 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a487bc80d4..a9bfe0d379 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA --- +## v1.16.next / unreleased + +### Fixed + +* [CRuby] `XML::Reader` defaults the encoding to UTF-8 if it's not specified in either the document or as a method parameter. Previously non-ASCII characters were serialized as NCRs in this case. [#2891] (@flavorjones) + + ## v1.16.0 / 2023-12-27 ### Notable Changes diff --git a/ext/nokogiri/xml_reader.c b/ext/nokogiri/xml_reader.c index 7ea5976488..0520f8f7d5 100644 --- a/ext/nokogiri/xml_reader.c +++ b/ext/nokogiri/xml_reader.c @@ -5,8 +5,14 @@ VALUE cNokogiriXmlReader; static void xml_reader_deallocate(void *data) { + // free the document separately because we _may_ have triggered preservation by calling + // xmlTextReaderCurrentDoc during a read_more. xmlTextReaderPtr reader = data; + xmlDocPtr doc = xmlTextReaderCurrentDoc(reader); xmlFreeTextReader(reader); + if (doc) { + xmlFreeDoc(doc); + } } static const rb_data_type_t xml_reader_type = { @@ -515,6 +521,7 @@ read_more(VALUE self) xmlErrorConstPtr error; VALUE error_list; int ret; + xmlDocPtr c_document; TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, reader); @@ -524,6 +531,16 @@ read_more(VALUE self) ret = xmlTextReaderRead(reader); xmlSetStructuredErrorFunc(NULL, NULL); + c_document = xmlTextReaderCurrentDoc(reader); + if (c_document && c_document->encoding == NULL) { + VALUE constructor_encoding = rb_iv_get(self, "@encoding"); + if (RTEST(constructor_encoding)) { + c_document->encoding = xmlStrdup(BAD_CAST StringValueCStr(constructor_encoding)); + } else { + c_document->encoding = xmlStrdup(BAD_CAST "UTF-8"); + } + } + if (ret == 1) { return self; } if (ret == 0) { return Qnil; } @@ -707,15 +724,18 @@ rb_xml_reader_encoding(VALUE rb_reader) const char *parser_encoding; VALUE constructor_encoding; + TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader); + parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader); + if (parser_encoding) { + return NOKOGIRI_STR_NEW2(parser_encoding); + } + constructor_encoding = rb_iv_get(rb_reader, "@encoding"); if (RTEST(constructor_encoding)) { return constructor_encoding; } - TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader); - parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader); - if (parser_encoding == NULL) { return Qnil; } - return NOKOGIRI_STR_NEW2(parser_encoding); + return Qnil; } void diff --git a/test/xml/test_reader_encoding.rb b/test/xml/test_reader_encoding.rb index 7e4c1ea183..fd1d2d447b 100644 --- a/test/xml/test_reader_encoding.rb +++ b/test/xml/test_reader_encoding.rb @@ -15,32 +15,113 @@ def setup ) end - def test_libxml2_detects_internal_encoding_correctly - skip_unless_libxml2("This feature wasn't implemented for JRuby") + def test_detects_internal_encoding_correctly + skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby") reader = Nokogiri::XML::Reader(<<~XML) - + inspiraci\xF3n XML assert_nil(reader.encoding) + reader.each do assert_equal("ISO-8859-1", reader.encoding) end end - def test_libxml2_overrides_internal_encoding_when_specified - reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8") - + def test_reader_defaults_internal_encoding_to_utf8 + skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby") + + reader = Nokogiri::XML::Reader(<<~XML) + XML - assert_equal("UTF-8", reader.encoding) + assert_nil(reader.encoding) + reader.each do assert_equal("UTF-8", reader.encoding) end end + def test_override_internal_encoding_when_specified + # UTF-8 is the correct encoding for this doc, ISO-8859-1 is wrong + reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8") + + inspiración + XML + + assert_equal("UTF-8", reader.encoding) + + reader.read + + assert_equal("UTF-8", reader.encoding) + + # ISO-8859-1 is the correct encoding for this doc, UTF-8 is wrong + reader = Nokogiri::XML::Reader(<<~XML, nil, "ISO-8859-1") + + inspiraci\xF3n + XML + + assert_equal("ISO-8859-1", reader.encoding) + + reader.read + + assert_equal("ISO-8859-1", reader.encoding) + end + + def test_attribute_encoding_issue_2891_no_encoding_specified + # https://github.com/sparklemotion/nokogiri/issues/2891 + reader = Nokogiri::XML::Reader(<<~XML) + + INSPIRACIÓN + XML + + assert_nil(reader.encoding) + + reader.read + + assert_equal("UTF-8", reader.encoding) unless Nokogiri.jruby? # JRuby doesn't support encoding detection + assert_equal( + "INSPIRACIÓN", + reader.outer_xml, + ) + end + + def test_attribute_encoding_issue_2891_correct_encoding_specified + # https://github.com/sparklemotion/nokogiri/issues/2891 + reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8") + + INSPIRACIÓN + XML + + assert_equal("UTF-8", reader.encoding) + + reader.read + + assert_equal("UTF-8", reader.encoding) + assert_equal( + "INSPIRACIÓN", + reader.outer_xml, + ) + end + + def test_attribute_encoding_issue_2891_correct_encoding_specified_non_utf8 + xml = <<~XML + + \u{82B1}\u{82F1} + XML + reader = Nokogiri::XML::Reader(xml, nil, "Shift_JIS") + + assert_equal("Shift_JIS", reader.encoding) + + reader.read + + assert_equal("Shift_JIS", reader.encoding) + assert_equal("闃ア闍ア", reader.outer_xml) + end + def test_attribute_at @reader.each do |node| next unless (attribute = node.attribute_at(0))