diff --git a/CHANGELOG.md b/CHANGELOG.md
index a487bc80d4..a9bfe0d379 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,13 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA
---
+## v1.16.next / unreleased
+
+### Fixed
+
+* [CRuby] `XML::Reader` defaults the encoding to UTF-8 if it's not specified in either the document or as a method parameter. Previously non-ASCII characters were serialized as NCRs in this case. [#2891] (@flavorjones)
+
+
## v1.16.0 / 2023-12-27
### Notable Changes
diff --git a/ext/nokogiri/xml_reader.c b/ext/nokogiri/xml_reader.c
index 7ea5976488..0520f8f7d5 100644
--- a/ext/nokogiri/xml_reader.c
+++ b/ext/nokogiri/xml_reader.c
@@ -5,8 +5,14 @@ VALUE cNokogiriXmlReader;
static void
xml_reader_deallocate(void *data)
{
+ // free the document separately because we _may_ have triggered preservation by calling
+ // xmlTextReaderCurrentDoc during a read_more.
xmlTextReaderPtr reader = data;
+ xmlDocPtr doc = xmlTextReaderCurrentDoc(reader);
xmlFreeTextReader(reader);
+ if (doc) {
+ xmlFreeDoc(doc);
+ }
}
static const rb_data_type_t xml_reader_type = {
@@ -515,6 +521,7 @@ read_more(VALUE self)
xmlErrorConstPtr error;
VALUE error_list;
int ret;
+ xmlDocPtr c_document;
TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, reader);
@@ -524,6 +531,16 @@ read_more(VALUE self)
ret = xmlTextReaderRead(reader);
xmlSetStructuredErrorFunc(NULL, NULL);
+ c_document = xmlTextReaderCurrentDoc(reader);
+ if (c_document && c_document->encoding == NULL) {
+ VALUE constructor_encoding = rb_iv_get(self, "@encoding");
+ if (RTEST(constructor_encoding)) {
+ c_document->encoding = xmlStrdup(BAD_CAST StringValueCStr(constructor_encoding));
+ } else {
+ c_document->encoding = xmlStrdup(BAD_CAST "UTF-8");
+ }
+ }
+
if (ret == 1) { return self; }
if (ret == 0) { return Qnil; }
@@ -707,15 +724,18 @@ rb_xml_reader_encoding(VALUE rb_reader)
const char *parser_encoding;
VALUE constructor_encoding;
+ TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);
+ parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
+ if (parser_encoding) {
+ return NOKOGIRI_STR_NEW2(parser_encoding);
+ }
+
constructor_encoding = rb_iv_get(rb_reader, "@encoding");
if (RTEST(constructor_encoding)) {
return constructor_encoding;
}
- TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);
- parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
- if (parser_encoding == NULL) { return Qnil; }
- return NOKOGIRI_STR_NEW2(parser_encoding);
+ return Qnil;
}
void
diff --git a/test/xml/test_reader_encoding.rb b/test/xml/test_reader_encoding.rb
index 7e4c1ea183..652bfe7294 100644
--- a/test/xml/test_reader_encoding.rb
+++ b/test/xml/test_reader_encoding.rb
@@ -15,32 +15,127 @@ def setup
)
end
- def test_libxml2_detects_internal_encoding_correctly
- skip_unless_libxml2("This feature wasn't implemented for JRuby")
+ def test_detects_internal_encoding_correctly
+ skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby")
reader = Nokogiri::XML::Reader(<<~XML)
-
+ inspiraci\xF3n
XML
assert_nil(reader.encoding)
+
reader.each do
assert_equal("ISO-8859-1", reader.encoding)
end
end
- def test_libxml2_overrides_internal_encoding_when_specified
+ def test_reader_defaults_internal_encoding_to_utf8
+ skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby")
+
+ reader = Nokogiri::XML::Reader(<<~XML)
+
+
+ XML
+
+ assert_nil(reader.encoding)
+
+ reader.each do
+ assert_equal("UTF-8", reader.encoding)
+ end
+ end
+
+ def test_override_internal_encoding_when_specified
+ if Nokogiri.uses_libxml? && !Nokogiri::VersionInfo.instance.libxml2_has_iconv?
+ skip("iconv is not compiled into libxml2")
+ end
+
+ #
+ # note that libxml2 behavior around document encoding changed at least twice between 2.9
+ # and 2.12, so the testing here is superficial -- asserting on the reported encoding, but
+ # not asserting on the bytes in the document or the serialized nodes.
+ #
reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
-
+ asdf
XML
assert_equal("UTF-8", reader.encoding)
- reader.each do
+
+ reader.read
+
+ if Nokogiri.jruby? || Nokogiri.uses_libxml?(">= 2.12.0")
+ assert_equal("UTF-8", reader.encoding)
+ else
+ assert_equal("ISO-8859-1", reader.encoding)
+ end
+
+ reader = Nokogiri::XML::Reader(<<~XML, nil, "ISO-8859-1")
+
+ asdf
+ XML
+
+ assert_equal("ISO-8859-1", reader.encoding)
+
+ reader.read
+
+ if Nokogiri.jruby? || Nokogiri.uses_libxml?(">= 2.12.0")
+ assert_equal("ISO-8859-1", reader.encoding)
+ else
assert_equal("UTF-8", reader.encoding)
end
end
+ def test_attribute_encoding_issue_2891_no_encoding_specified
+ # https://github.com/sparklemotion/nokogiri/issues/2891
+ reader = Nokogiri::XML::Reader(<<~XML)
+
+ INSPIRACIÓN
+ XML
+
+ assert_nil(reader.encoding)
+
+ reader.read
+
+ assert_equal("UTF-8", reader.encoding) unless Nokogiri.jruby? # JRuby doesn't support encoding detection
+ assert_equal(
+ "INSPIRACIÓN",
+ reader.outer_xml,
+ )
+ end
+
+ def test_attribute_encoding_issue_2891_correct_encoding_specified
+ # https://github.com/sparklemotion/nokogiri/issues/2891
+ reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
+
+ INSPIRACIÓN
+ XML
+
+ assert_equal("UTF-8", reader.encoding)
+
+ reader.read
+
+ assert_equal("UTF-8", reader.encoding)
+ assert_equal(
+ "INSPIRACIÓN",
+ reader.outer_xml,
+ )
+ end
+
+ def test_attribute_encoding_issue_2891_correct_encoding_specified_non_utf8
+ xml = <<~XML
+
+ \u{82B1}\u{82F1}
+ XML
+ reader = Nokogiri::XML::Reader(xml, nil, "Shift_JIS")
+
+ assert_equal("Shift_JIS", reader.encoding)
+
+ reader.read
+
+ assert_equal("Shift_JIS", reader.encoding)
+ end
+
def test_attribute_at
@reader.each do |node|
next unless (attribute = node.attribute_at(0))