ruby · Sep 4, 2024 · Sep 24, 2024 · Sep 29, 2024 · Sep 29, 2024 · Sep 29, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,47 @@
 # News
 
+## 3.3.9 - 2024-10-24 {#version-3-3-9}
+
+### Improvements
+
+  * Improved performance.
+    * GH-210
+    * Patch by NAITOH Jun.
+
+### Fixes
+
+  * Fixed a parse bug for text-only invalid XML.
+    * GH-215
+    * Patch by NAITOH Jun.
+
+  * Fixed a parse bug where `&#0x...;` was accepted as a character
+    reference.
+
+### Thanks
+
+  * NAITOH Jun
+
+## 3.3.8 - 2024-09-29 {#version-3-3-8}
+
+### Improvements
+
+  * SAX2: Improved parse performance.
+    * GH-207
+    * Patch by NAITOH Jun.
+
+### Fixes
+
+  * Fixed a bug where an unexpected attribute namespace conflict error
+    was reported for the predefined "xml" namespace.
+    * GH-208
+    * Patch by KITAITI Makoto.
+
+### Thanks
+
+  * NAITOH Jun
+
+  * KITAITI Makoto
+
 ## 3.3.7 - 2024-09-04 {#version-3-3-7}
 
 ### Improvements

diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
@@ -150,12 +150,13 @@ module Private
         PEDECL_PATTERN = "\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
         ENTITYDECL_PATTERN = /(?:#{GEDECL_PATTERN})|(?:#{PEDECL_PATTERN})/um
         CARRIAGE_RETURN_NEWLINE_PATTERN = /\r\n?/
-        CHARACTER_REFERENCES = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
+        CHARACTER_REFERENCES = /&#((?:\d+)|(?:x[a-fA-F0-9]+));/
         DEFAULT_ENTITIES_PATTERNS = {}
         default_entities = ['gt', 'lt', 'quot', 'apos', 'amp']
         default_entities.each do |term|
           DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/
         end
+        XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
       end
       private_constant :Private
 
@@ -166,6 +167,7 @@ def initialize( source )
         @entity_expansion_count = 0
         @entity_expansion_limit = Security.entity_expansion_limit
         @entity_expansion_text_limit = Security.entity_expansion_text_limit
+        @source.ensure_buffer
       end
 
       def add_listener( listener )
@@ -185,7 +187,7 @@ def stream=( source )
         @tags = []
         @stack = []
         @entities = []
-        @namespaces = {}
+        @namespaces = {"xml" => Private::XML_PREFIXED_NAMESPACE}
         @namespaces_restore_stack = []
       end
 
@@ -568,8 +570,12 @@ def unnormalize( string, entities=nil, filter=nil )
         return rv if matches.size == 0
         rv.gsub!( Private::CHARACTER_REFERENCES ) {
           m=$1
-          m = "0#{m}" if m[0] == ?x
-          [Integer(m)].pack('U*')
+          if m.start_with?("x")
+            code_point = Integer(m[1..-1], 16)
+          else
+            code_point = Integer(m, 10)
+          end
+          [code_point].pack('U*')
         }
         matches.collect!{|x|x[0]}.compact!
         if filter
@@ -790,7 +796,7 @@ def parse_attributes(prefixes)
             @source.match(/\s*/um, true)
             if prefix == "xmlns"
               if local_part == "xml"
-                if value != "http://www.w3.org/XML/1998/namespace"
+                if value != Private::XML_PREFIXED_NAMESPACE
                   msg = "The 'xml' prefix must not be bound to any other namespace "+
                     "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
                   raise REXML::ParseException.new( msg, @source, self )

diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb
@@ -259,6 +259,8 @@ def add( pair )
       end
 
       def get_namespace( prefix )
+        return nil if @namespace_stack.empty?
+
         uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) ||
           (@namespace_stack.find { |ns| not ns[nil].nil? })
         uris[-1][prefix] unless uris.nil? or 0 == uris.size

diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
@@ -31,7 +31,7 @@
 module REXML
   COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
   DATE = "2008/019"
-  VERSION = "3.3.7"
+  VERSION = "3.3.9"
   REVISION = ""
 
   Copyright = COPYRIGHT

diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
@@ -77,6 +77,7 @@ def initialize(arg, encoding=nil)
         detect_encoding
       end
       @line = 0
+      @term_encord = {}
     end
 
     # The current buffer (what we're going to read next)
@@ -227,7 +228,7 @@ def read(term = nil, min_bytes = 1)
 
     def read_until(term)
       pattern = Private::PRE_DEFINED_TERM_PATTERNS[term] || /#{Regexp.escape(term)}/
-      term = encode(term)
+      term = @term_encord[term] ||= encode(term)
       until str = @scanner.scan_until(pattern)
         break if @source.nil?
         break if @source.eof?
@@ -294,14 +295,19 @@ def current_line
 
     private
     def readline(term = nil)
-      str = @source.readline(term || @line_break)
       if @pending_buffer
+        begin
+          str = @source.readline(term || @line_break)
+        rescue IOError
+        end
         if str.nil?
           str = @pending_buffer
         else
           str = @pending_buffer + str
         end
         @pending_buffer = nil
+      else
+        str = @source.readline(term || @line_break)
       end
       return nil if str.nil?
 

diff --git a/test/parse/test_character_reference.rb b/test/parse/test_character_reference.rb
@@ -13,5 +13,11 @@ def test_linear_performance_many_preceding_zeros
         REXML::Document.new('<test testing="&#' + "0" * n + '97;"/>')
       end
     end
+
+    def test_hex_precedding_zero
+      parser = REXML::Parsers::PullParser.new("<root>&#x61;&#0x61;</root>")
+      parser.pull # :start_element
+      assert_equal("a&#0x61;", parser.pull[1]) # :text
+    end
   end
 end
diff --git a/test/parse/test_text.rb b/test/parse/test_text.rb
@@ -4,6 +4,23 @@
 module REXMLTests
   class TestParseText < Test::Unit::TestCase
     class TestInvalid < self
+      def test_text_only
+        exception = assert_raise(REXML::ParseException) do
+          parser = REXML::Parsers::BaseParser.new('a')
+          while parser.has_next?
+            parser.pull
+          end
+        end
+
+        assert_equal(<<~DETAIL.chomp, exception.to_s)
+          Malformed XML: Content at the start of the document (got 'a')
+          Line: 1
+          Position: 1
+          Last 80 unconsumed characters:
+
+        DETAIL
+      end
+
       def test_before_root
         exception = assert_raise(REXML::ParseException) do
           parser = REXML::Parsers::BaseParser.new('b<a></a>')

diff --git a/test/parser/test_base_parser.rb b/test/parser/test_base_parser.rb
@@ -23,5 +23,40 @@ def test_large_xml
         parser.position < xml.bytesize
       end
     end
+
+    def test_attribute_prefixed_by_xml
+      xml = <<-XML
+        <?xml version="1.0" encoding="UTF-8"?>
+        <!DOCTYPE html>
+        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+          <head>
+            <title>XHTML Document</title>
+          </head>
+          <body>
+            <h1>XHTML Document</h1>
+            <p xml:lang="ja" lang="ja">For Japanese</p>
+          </body>
+        </html>
+      XML
+
+      parser = REXML::Parsers::BaseParser.new(xml)
+      5.times {parser.pull}
+
+      html = parser.pull
+      assert_equal([:start_element,
+                    "html",
+                    {"xmlns" => "http://www.w3.org/1999/xhtml",
+                     "xml:lang" => "en",
+                     "lang" => "en"}],
+                   html)
+
+      15.times {parser.pull}
+
+      p = parser.pull
+      assert_equal([:start_element,
+                    "p",
+                    {"xml:lang" => "ja", "lang" => "ja"}],
+                   p)
+    end
   end
 end
diff --git a/test/test_document.rb b/test/test_document.rb
@@ -403,6 +403,40 @@ def test_utf_16
           assert_equal(expected_xml, actual_xml)
         end
       end
+
+      class ReadUntilTest < Test::Unit::TestCase
+        def test_utf_8
+          xml = <<-EOX.force_encoding("ASCII-8BIT")
+<?xml version="1.0" encoding="UTF-8"?>
+<message testing=">">Hello world!</message>
+EOX
+          document = REXML::Document.new(xml)
+          assert_equal("UTF-8", document.encoding)
+          assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value)
+        end
+
+        def test_utf_16le
+          xml = <<-EOX.encode("UTF-16LE").force_encoding("ASCII-8BIT")
+<?xml version="1.0" encoding="UTF-16"?>
+<message testing=">">Hello world!</message>
+EOX
+          bom = "\ufeff".encode("UTF-16LE").force_encoding("ASCII-8BIT")
+          document = REXML::Document.new(bom + xml)
+          assert_equal("UTF-16", document.encoding)
+          assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value)
+        end
+
+        def test_utf_16be
+          xml = <<-EOX.encode("UTF-16BE").force_encoding("ASCII-8BIT")
+<?xml version="1.0" encoding="UTF-16"?>
+<message testing=">">Hello world!</message>
+EOX
+          bom = "\ufeff".encode("UTF-16BE").force_encoding("ASCII-8BIT")
+          document = REXML::Document.new(bom + xml)
+          assert_equal("UTF-16", document.encoding)
+          assert_equal(">", REXML::XPath.match(document, "/message")[0].attribute("testing").value)
+        end
+      end
     end
   end
 end
diff --git a/test/test_sax.rb b/test/test_sax.rb
@@ -99,6 +99,52 @@ def test_sax2
       end
     end
 
+    def test_without_namespace
+      xml = <<-XML
+<root >
+  <a att1='1' att2='2' att3='&lt;'>
+    <b />
+  </a>
+</root>
+      XML
+
+      parser = REXML::Parsers::SAX2Parser.new(xml)
+      elements = []
+      parser.listen(:start_element) do |uri, localname, qname, attrs|
+        elements << [uri, localname, qname, attrs]
+      end
+      parser.parse
+      assert_equal([
+        [nil, "root", "root", {}],
+        [nil, "a", "a", {"att1"=>"1", "att2"=>"2", "att3"=>"&lt;"}],
+        [nil, "b", "b", {}]
+      ], elements)
+    end
+
+    def test_with_namespace
+      xml = <<-XML
+<root xmlns="http://example.org/default"
+      xmlns:foo="http://example.org/foo"
+      xmlns:bar="http://example.org/bar">
+  <a foo:att='1' bar:att='2' att='&lt;'>
+    <bar:b />
+  </a>
+</root>
+      XML
+
+      parser = REXML::Parsers::SAX2Parser.new(xml)
+      elements = []
+      parser.listen(:start_element) do |uri, localname, qname, attrs|
+        elements << [uri, localname, qname, attrs]
+      end
+      parser.parse
+      assert_equal([
+        ["http://example.org/default", "root", "root",  {"xmlns"=>"http://example.org/default", "xmlns:bar"=>"http://example.org/bar", "xmlns:foo"=>"http://example.org/foo"}],
+        ["http://example.org/default", "a", "a", {"att"=>"&lt;", "bar:att"=>"2", "foo:att"=>"1"}],
+        ["http://example.org/bar", "b", "bar:b", {}]
+      ], elements)
+    end
+
     class EntityExpansionLimitTest < Test::Unit::TestCase
       class GeneralEntityTest < self
         def test_have_value