Skip to content

Commit

Permalink
fix(libxml): default Reader node encoding to UTF-8
Browse files Browse the repository at this point in the history
when it's not specified either as a method param or in the document

Fixes #2891
  • Loading branch information
flavorjones committed Dec 28, 2023
1 parent 94cb2dc commit d4a30b8
Show file tree
Hide file tree
Showing 2 changed files with 112 additions and 11 deletions.
28 changes: 24 additions & 4 deletions ext/nokogiri/xml_reader.c
Expand Up @@ -5,8 +5,14 @@ VALUE cNokogiriXmlReader;
static void
xml_reader_deallocate(void *data)
{
// free the document separately because we _may_ have triggered preservation by calling
// xmlTextReaderCurrentDoc during a read_more.
xmlTextReaderPtr reader = data;
xmlDocPtr doc = xmlTextReaderCurrentDoc(reader);
xmlFreeTextReader(reader);
if (doc) {
xmlFreeDoc(doc);
}
}

static const rb_data_type_t xml_reader_type = {
Expand Down Expand Up @@ -515,6 +521,7 @@ read_more(VALUE self)
xmlErrorConstPtr error;
VALUE error_list;
int ret;
xmlDocPtr c_document;

TypedData_Get_Struct(self, xmlTextReader, &xml_reader_type, reader);

Expand All @@ -524,6 +531,16 @@ read_more(VALUE self)
ret = xmlTextReaderRead(reader);
xmlSetStructuredErrorFunc(NULL, NULL);

c_document = xmlTextReaderCurrentDoc(reader);
if (c_document && c_document->encoding == NULL) {
VALUE constructor_encoding = rb_iv_get(self, "@encoding");
if (RTEST(constructor_encoding)) {
c_document->encoding = xmlStrdup(BAD_CAST StringValueCStr(constructor_encoding));
} else {
c_document->encoding = xmlStrdup(BAD_CAST "UTF-8");
}
}

if (ret == 1) { return self; }
if (ret == 0) { return Qnil; }

Expand Down Expand Up @@ -707,15 +724,18 @@ rb_xml_reader_encoding(VALUE rb_reader)
const char *parser_encoding;
VALUE constructor_encoding;

TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);
parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
if (parser_encoding) {
return NOKOGIRI_STR_NEW2(parser_encoding);
}

constructor_encoding = rb_iv_get(rb_reader, "@encoding");
if (RTEST(constructor_encoding)) {
return constructor_encoding;
}

TypedData_Get_Struct(rb_reader, xmlTextReader, &xml_reader_type, c_reader);
parser_encoding = (const char *)xmlTextReaderConstEncoding(c_reader);
if (parser_encoding == NULL) { return Qnil; }
return NOKOGIRI_STR_NEW2(parser_encoding);
return Qnil;
}

void
Expand Down
95 changes: 88 additions & 7 deletions test/xml/test_reader_encoding.rb
Expand Up @@ -15,32 +15,113 @@ def setup
)
end

def test_libxml2_detects_internal_encoding_correctly
skip_unless_libxml2("This feature wasn't implemented for JRuby")
def test_detects_internal_encoding_correctly
skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby")

reader = Nokogiri::XML::Reader(<<~XML)
<?xml version="1.0" encoding="ISO-8859-1"?>
<root attr="foo"><employee /></root>
<anotaci\xF3n>inspiraci\xF3n</anotaci\xF3n>
XML

assert_nil(reader.encoding)

reader.each do
assert_equal("ISO-8859-1", reader.encoding)
end
end

def test_libxml2_overrides_internal_encoding_when_specified
reader = Nokogiri::XML::Reader(<<~XML, nil, "UTF-8")
<?xml version="1.0" encoding="ISO-8859-1"?>
def test_reader_defaults_internal_encoding_to_utf8
skip_unless_libxml2("Internal encoding detection isn't implemented yet for JRuby")

reader = Nokogiri::XML::Reader(<<~XML)
<?xml version="1.0"?>
<root attr="foo"><employee /></root>
XML

assert_equal("UTF-8", reader.encoding)
assert_nil(reader.encoding)

reader.each do
assert_equal("UTF-8", reader.encoding)
end
end

# An encoding passed to the Reader constructor is the one reported by Reader#encoding, both
# before and after the first read, even when the XML declaration names a different encoding.
def test_override_internal_encoding_when_specified
  # Case 1: the content really is UTF-8; the ISO-8859-1 declaration is wrong.
  utf8_source = <<~XML
    <?xml version="1.0" encoding="ISO-8859-1"?>
    <anotación>inspiración</anotación>
  XML
  utf8_reader = Nokogiri::XML::Reader(utf8_source, nil, "UTF-8")

  assert_equal("UTF-8", utf8_reader.encoding)

  utf8_reader.read

  assert_equal("UTF-8", utf8_reader.encoding)

  # Case 2: the content really is ISO-8859-1; the UTF-8 declaration is wrong.
  latin1_source = <<~XML
    <?xml version="1.0" encoding="UTF-8"?>
    <anotaci\xF3n>inspiraci\xF3n</anotaci\xF3n>
  XML
  latin1_reader = Nokogiri::XML::Reader(latin1_source, nil, "ISO-8859-1")

  assert_equal("ISO-8859-1", latin1_reader.encoding)

  latin1_reader.read

  assert_equal("ISO-8859-1", latin1_reader.encoding)
end

# Regression test for https://github.com/sparklemotion/nokogiri/issues/2891
#
# With no encoding in the constructor or the XML declaration, the reader reports no encoding
# until it has read a node; afterwards it reports UTF-8 (libxml2 only) and #outer_xml returns
# the non-ASCCI element name, attribute value and text unchanged.
def test_attribute_encoding_issue_2891_no_encoding_specified
  source = <<~XML
    <?xml version="1.0"?>
    <anotación tipo="inspiración">INSPIRACIÓN</anotación>
  XML
  expected_markup = "<anotación tipo=\"inspiración\">INSPIRACIÓN</anotación>"

  reader = Nokogiri::XML::Reader(source)

  assert_nil(reader.encoding)

  reader.read

  # JRuby doesn't support encoding detection
  assert_equal("UTF-8", reader.encoding) unless Nokogiri.jruby?
  assert_equal(expected_markup, reader.outer_xml)
end

# Regression test for https://github.com/sparklemotion/nokogiri/issues/2891
#
# With "UTF-8" passed to the constructor (and no encoding in the XML declaration), the reader
# reports UTF-8 before and after the first read, and #outer_xml returns the non-ASCII element
# name, attribute value and text unchanged.
def test_attribute_encoding_issue_2891_correct_encoding_specified
  source = <<~XML
    <?xml version="1.0"?>
    <anotación tipo="inspiración">INSPIRACIÓN</anotación>
  XML
  expected_markup = "<anotación tipo=\"inspiración\">INSPIRACIÓN</anotación>"

  reader = Nokogiri::XML::Reader(source, nil, "UTF-8")

  assert_equal("UTF-8", reader.encoding)

  reader.read

  assert_equal("UTF-8", reader.encoding)
  assert_equal(expected_markup, reader.outer_xml)
end

# Builds a Reader over a document whose XML declaration names no encoding, passing "Shift_JIS"
# as the constructor encoding, and checks that Reader#encoding reports "Shift_JIS" both before
# and after the first read, then checks the serialized first node.
# The method name ties this to nokogiri issue 2891.
def test_attribute_encoding_issue_2891_correct_encoding_specified_non_utf8
xml = <<~XML
<?xml version="1.0"?>
<test>\u{82B1}\u{82F1}</test>
XML
reader = Nokogiri::XML::Reader(xml, nil, "Shift_JIS")

assert_equal("Shift_JIS", reader.encoding)

reader.read

assert_equal("Shift_JIS", reader.encoding)
# NOTE(review): the expected text deliberately differs from the \u{82B1}\u{82F1} characters in
# the input. Presumably it is what the input's UTF-8 bytes look like when decoded as Shift_JIS;
# confirm before "correcting" this literal.
assert_equal("<test>闃ア闍ア</test>", reader.outer_xml)
end

def test_attribute_at
@reader.each do |node|
next unless (attribute = node.attribute_at(0))
Expand Down

0 comments on commit d4a30b8

Please sign in to comment.