diff --git a/ext/nokogiri/xml_reader.c b/ext/nokogiri/xml_reader.c
index 7ea5976488..0520f8f7d5 100644
--- a/ext/nokogiri/xml_reader.c
+++ b/ext/nokogiri/xml_reader.c
@@ -5,8 +5,14 @@ VALUE cNokogiriXmlReader;
static void
xml_reader_deallocate(void *data)
{
+ // free the document separately because we _may_ have triggered preservation by calling
+ // xmlTextReaderCurrentDoc during a read_more.
xmlTextReaderPtr reader = data;
+ xmlDocPtr doc = xmlTextReaderCurrentDoc(reader);
xmlFreeTextReader(reader);
+ if (doc) {
+ xmlFreeDoc(doc);
+ }
}
static const rb_data_type_t xml_reader_type = {
@@ -515,6 +521,7 @@ read_more(VALUE self)
xmlErrorConstPtr error;
VALUE error_list;
int ret;
+ xmlDocPtr c_document;
TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, reader);
@@ -524,6 +531,16 @@ read_more(VALUE self)
ret = xmlTextReaderRead(reader);
xmlSetStructuredErrorFunc(NULL, NULL);
+ c_document = xmlTextReaderCurrentDoc(reader);
+ if (c_document && c_document->encoding == NULL) {
+ VALUE constructor_encoding = rb_iv_get(self, "@encoding");
+ if (RTEST(constructor_encoding)) {
+ c_document->encoding = xmlStrdup(BAD_CAST StringValueCStr(constructor_encoding));
+ } else {
+ c_document->encoding = xmlStrdup(BAD_CAST "UTF-8");
+ }
+ }
+
if (ret == 1) { return self; }
if (ret == 0) { return Qnil; }
@@ -707,15 +724,18 @@ rb_xml_reader_encoding(VALUE rb_reader)
const char *parser_encoding;
VALUE constructor_encoding;
+ TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);
+ parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
+ if (parser_encoding) {
+ return NOKOGIRI_STR_NEW2(parser_encoding);
+ }
+
constructor_encoding = rb_iv_get(rb_reader, "@encoding");
if (RTEST(constructor_encoding)) {
return constructor_encoding;
}
- TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);
- parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
- if (parser_encoding == NULL) { return Qnil; }
- return NOKOGIRI_STR_NEW2(parser_encoding);
+ return Qnil;
}
void
diff --git a/test/xml/test_reader_encoding.rb b/test/xml/test_reader_encoding.rb
index 7e4c1ea183..fd1d2d447b 100644
--- a/test/xml/test_reader_encoding.rb
+++ b/test/xml/test_reader_encoding.rb
@@ -15,32 +15,113 @@ def setup
)
end
- def test_libxml2_detects_internal_encoding_correctly
- skip_unless_libxml2("This feature wasn't implemented for JRuby")
+ def test_detects_internal_encoding_correctly
+ skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby")
reader = Nokogiri::XML::Reader(<<~XML)
-
+ inspiraci\xF3n
XML
assert_nil(reader.encoding)
+
reader.each do
assert_equal("ISO-8859-1", reader.encoding)
end
end
- def test_libxml2_overrides_internal_encoding_when_specified
- reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
-
+ def test_reader_defaults_internal_encoding_to_utf8
+ skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby")
+
+ reader = Nokogiri::XML::Reader(<<~XML)
+
XML
- assert_equal("UTF-8", reader.encoding)
+ assert_nil(reader.encoding)
+
reader.each do
assert_equal("UTF-8", reader.encoding)
end
end
+ def test_override_internal_encoding_when_specified
+ # UTF-8 is the correct encoding for this doc, ISO-8859-1 is wrong
+ reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
+
+ inspiración
+ XML
+
+ assert_equal("UTF-8", reader.encoding)
+
+ reader.read
+
+ assert_equal("UTF-8", reader.encoding)
+
+ # ISO-8859-1 is the correct encoding for this doc, UTF-8 is wrong
+ reader = Nokogiri::XML::Reader(<<~XML, nil, "ISO-8859-1")
+
+ inspiraci\xF3n
+ XML
+
+ assert_equal("ISO-8859-1", reader.encoding)
+
+ reader.read
+
+ assert_equal("ISO-8859-1", reader.encoding)
+ end
+
+ def test_attribute_encoding_issue_2891_no_encoding_specified
+ # https://github.com/sparklemotion/nokogiri/issues/2891
+ reader = Nokogiri::XML::Reader(<<~XML)
+
+ INSPIRACIÓN
+ XML
+
+ assert_nil(reader.encoding)
+
+ reader.read
+
+ assert_equal("UTF-8", reader.encoding) unless Nokogiri.jruby? # JRuby doesn't support encoding detection
+ assert_equal(
+ "INSPIRACIÓN",
+ reader.outer_xml,
+ )
+ end
+
+ def test_attribute_encoding_issue_2891_correct_encoding_specified
+ # https://github.com/sparklemotion/nokogiri/issues/2891
+ reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
+
+ INSPIRACIÓN
+ XML
+
+ assert_equal("UTF-8", reader.encoding)
+
+ reader.read
+
+ assert_equal("UTF-8", reader.encoding)
+ assert_equal(
+ "INSPIRACIÓN",
+ reader.outer_xml,
+ )
+ end
+
+ def test_attribute_encoding_issue_2891_correct_encoding_specified_non_utf8
+ xml = <<~XML
+
+ \u{82B1}\u{82F1}
+ XML
+ reader = Nokogiri::XML::Reader(xml, nil, "Shift_JIS")
+
+ assert_equal("Shift_JIS", reader.encoding)
+
+ reader.read
+
+ assert_equal("Shift_JIS", reader.encoding)
+ assert_equal("闃ア闍ア", reader.outer_xml)
+ end
+
def test_attribute_at
@reader.each do |node|
next unless (attribute = node.attribute_at(0))