Use ruby unicode normalize to avoid libidn C problems and heavy legacy ruby code
sporkmonger · Feb 14, 2023 · 9499a18 · 9499a18
1 parent 1fdd676
commit 9499a18
Show file tree

Hide file tree

Showing 7 changed files with 71 additions and 204 deletions.
diff --git a/benchmark/unicode_normalize.rb b/benchmark/unicode_normalize.rb
@@ -0,0 +1,34 @@
+# /usr/bin/env ruby
+# frozen_string_literal: true.
+
+require "benchmark"
+require "addressable/idna/pure.rb"
+require "idn"
+
+value = "ﬁﾯリ宠퐱卄.com"
+expected = "fiᆵリ宠퐱卄.com"
+N = 100_000
+
+fail "ruby does not match" unless expected == value.unicode_normalize(:nfkc)
+fail "libidn does not match" unless expected == IDN::Stringprep.nfkc_normalize(value)
+fail "addressable does not match" unless expected == Addressable::IDNA.unicode_normalize_kc(value)
+
+Benchmark.bmbm do |x|
+  x.report("pure") { N.times { Addressable::IDNA.unicode_normalize_kc(value) } }
+  x.report("libidn") { N.times { IDN::Stringprep.nfkc_normalize(value) } }
+  x.report("ruby") { N.times { value.unicode_normalize(:nfkc) } }
+end
+
+# February 14th 2023, before replacing the legacy pure normalize code:
+
+# > ruby benchmark/unicode_normalize.rb
+# Rehearsal ------------------------------------------
+# pure     1.335230   0.000315   1.335545 (  1.335657)
+# libidn   0.058568   0.000000   0.058568 (  0.058570)
+# ruby     0.326008   0.000014   0.326022 (  0.326026)
+# --------------------------------- total: 1.720135sec
+
+#              user     system      total        real
+# pure     1.325948   0.000000   1.325948 (  1.326054)
+# libidn   0.058067   0.000000   0.058067 (  0.058069)
+# ruby     0.325062   0.000000   0.325062 (  0.325115)
diff --git a/lib/addressable/idna/native.rb b/lib/addressable/idna/native.rb
@@ -29,10 +29,6 @@ def self.punycode_decode(value)
        IDN::Punycode.decode(value.to_s)
      end
 
-    def self.unicode_normalize_kc(value)
-      IDN::Stringprep.nfkc_normalize(value.to_s)
-    end
-
     def self.to_ascii(value)
       value.to_s.split('.', -1).map do |segment|
         if segment.size > 0 && segment.size < 64

diff --git a/lib/addressable/idna/pure.rb b/lib/addressable/idna/pure.rb
@@ -66,7 +66,7 @@ module IDNA
     # domain name as described in RFC 3490.
     def self.to_ascii(input)
       input = input.to_s unless input.is_a?(String)
-      input = input.dup
+      input = input.dup.force_encoding(Encoding::UTF_8).unicode_normalize(:nfkc)
       if input.respond_to?(:force_encoding)
         input.force_encoding(Encoding::ASCII_8BIT)
       end
@@ -77,7 +77,7 @@ def self.to_ascii(input)
             part.force_encoding(Encoding::ASCII_8BIT)
           end
           if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE
-            ACE_PREFIX + punycode_encode(unicode_normalize_kc(part))
+            ACE_PREFIX + punycode_encode(part)
           else
             part
           end
@@ -112,15 +112,6 @@ def self.to_unicode(input)
       output
     end
 
-    # Unicode normalization form KC.
-    def self.unicode_normalize_kc(input)
-      input = input.to_s unless input.is_a?(String)
-      unpacked = input.unpack("U*")
-      unpacked =
-        unicode_compose(unicode_sort_canonical(unicode_decompose(unpacked)))
-      return unpacked.pack("U*")
-    end
-
     ##
     # Unicode aware downcase method.
     #
@@ -136,164 +127,6 @@ def self.unicode_downcase(input)
     end
     private_class_method :unicode_downcase
 
-    def self.unicode_compose(unpacked)
-      unpacked_result = []
-      length = unpacked.length
-
-      return unpacked if length == 0
-
-      starter = unpacked[0]
-      starter_cc = lookup_unicode_combining_class(starter)
-      starter_cc = 256 if starter_cc != 0
-      for i in 1...length
-        ch = unpacked[i]
-
-        if (starter_cc == 0 &&
-            (composite = unicode_compose_pair(starter, ch)) != nil)
-          starter = composite
-        else
-          unpacked_result << starter
-          starter = ch
-        end
-      end
-      unpacked_result << starter
-      return unpacked_result
-    end
-    private_class_method :unicode_compose
-
-    def self.unicode_compose_pair(ch_one, ch_two)
-      if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT &&
-          ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT
-        # Hangul L + V
-        return HANGUL_SBASE + (
-          (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE)
-        ) * HANGUL_TCOUNT
-      elsif ch_one >= HANGUL_SBASE &&
-          ch_one < HANGUL_SBASE + HANGUL_SCOUNT &&
-          (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 &&
-          ch_two >= HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT
-           # Hangul LV + T
-        return ch_one + (ch_two - HANGUL_TBASE)
-      end
-
-      p = []
-
-      ucs4_to_utf8(ch_one, p)
-      ucs4_to_utf8(ch_two, p)
-
-      return lookup_unicode_composition(p)
-    end
-    private_class_method :unicode_compose_pair
-
-    def self.ucs4_to_utf8(char, buffer)
-      if char < 128
-        buffer << char
-      elsif char < 2048
-        buffer << (char >> 6 | 192)
-        buffer << (char & 63 | 128)
-      elsif char < 0x10000
-        buffer << (char >> 12 | 224)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      elsif char < 0x200000
-        buffer << (char >> 18 | 240)
-        buffer << (char >> 12 & 63 | 128)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      elsif char < 0x4000000
-        buffer << (char >> 24 | 248)
-        buffer << (char >> 18 & 63 | 128)
-        buffer << (char >> 12 & 63 | 128)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      elsif char < 0x80000000
-        buffer << (char >> 30 | 252)
-        buffer << (char >> 24 & 63 | 128)
-        buffer << (char >> 18 & 63 | 128)
-        buffer << (char >> 12 & 63 | 128)
-        buffer << (char >> 6 & 63 | 128)
-        buffer << (char & 63 | 128)
-      end
-    end
-    private_class_method :ucs4_to_utf8
-
-    def self.unicode_sort_canonical(unpacked)
-      unpacked = unpacked.dup
-      i = 1
-      length = unpacked.length
-
-      return unpacked if length < 2
-
-      while i < length
-        last = unpacked[i-1]
-        ch = unpacked[i]
-        last_cc = lookup_unicode_combining_class(last)
-        cc = lookup_unicode_combining_class(ch)
-        if cc != 0 && last_cc != 0 && last_cc > cc
-          unpacked[i] = last
-          unpacked[i-1] = ch
-          i -= 1 if i > 1
-        else
-          i += 1
-        end
-      end
-      return unpacked
-    end
-    private_class_method :unicode_sort_canonical
-
-    def self.unicode_decompose(unpacked)
-      unpacked_result = []
-      for cp in unpacked
-        if cp >= HANGUL_SBASE && cp < HANGUL_SBASE + HANGUL_SCOUNT
-          l, v, t = unicode_decompose_hangul(cp)
-          unpacked_result << l
-          unpacked_result << v if v
-          unpacked_result << t if t
-        else
-          dc = lookup_unicode_compatibility(cp)
-          unless dc
-            unpacked_result << cp
-          else
-            unpacked_result.concat(unicode_decompose(dc.unpack("U*")))
-          end
-        end
-      end
-      return unpacked_result
-    end
-    private_class_method :unicode_decompose
-
-    def self.unicode_decompose_hangul(codepoint)
-      sindex = codepoint - HANGUL_SBASE;
-      if sindex < 0 || sindex >= HANGUL_SCOUNT
-        l = codepoint
-        v = t = nil
-        return l, v, t
-      end
-      l = HANGUL_LBASE + sindex / HANGUL_NCOUNT
-      v = HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
-      t = HANGUL_TBASE + sindex % HANGUL_TCOUNT
-      if t == HANGUL_TBASE
-        t = nil
-      end
-      return l, v, t
-    end
-    private_class_method :unicode_decompose_hangul
-
-    def self.lookup_unicode_combining_class(codepoint)
-      codepoint_data = UNICODE_DATA[codepoint]
-      (codepoint_data ?
-        (codepoint_data[UNICODE_DATA_COMBINING_CLASS] || 0) :
-        0)
-    end
-    private_class_method :lookup_unicode_combining_class
-
-    def self.lookup_unicode_compatibility(codepoint)
-      codepoint_data = UNICODE_DATA[codepoint]
-      (codepoint_data ?
-        codepoint_data[UNICODE_DATA_COMPATIBILITY] : nil)
-    end
-    private_class_method :lookup_unicode_compatibility
-
     def self.lookup_unicode_lowercase(codepoint)
       codepoint_data = UNICODE_DATA[codepoint]
       (codepoint_data ?
@@ -302,21 +135,6 @@ def self.lookup_unicode_lowercase(codepoint)
     end
     private_class_method :lookup_unicode_lowercase
 
-    def self.lookup_unicode_composition(unpacked)
-      return COMPOSITION_TABLE[unpacked]
-    end
-    private_class_method :lookup_unicode_composition
-
-    HANGUL_SBASE =  0xac00
-    HANGUL_LBASE =  0x1100
-    HANGUL_LCOUNT = 19
-    HANGUL_VBASE =  0x1161
-    HANGUL_VCOUNT = 21
-    HANGUL_TBASE =  0x11a7
-    HANGUL_TCOUNT = 28
-    HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT # 588
-    HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT # 11172
-
     UNICODE_DATA_COMBINING_CLASS = 0
     UNICODE_DATA_EXCLUSION = 1
     UNICODE_DATA_CANONICAL = 2

diff --git a/lib/addressable/template.rb b/lib/addressable/template.rb
@@ -892,7 +892,7 @@ def join_values(operator, return_value)
     # operator.
     #
     # @param [Hash, Array, String] value
-    #   Normalizes keys and values with IDNA#unicode_normalize_kc
+    #   Normalizes unicode keys and values with String#unicode_normalize (NFC)
     #
     # @return [Hash, Array, String] The normalized values
     def normalize_value(value)
@@ -902,15 +902,14 @@ def normalize_value(value)
 
       # Handle unicode normalization
       if value.kind_of?(Array)
-        value.map! { |val| Addressable::IDNA.unicode_normalize_kc(val) }
+        value.map! { |val| normalize_value(val) }
       elsif value.kind_of?(Hash)
         value = value.inject({}) { |acc, (k, v)|
-          acc[Addressable::IDNA.unicode_normalize_kc(k)] =
-            Addressable::IDNA.unicode_normalize_kc(v)
+          acc[normalize_value(k)] = normalize_value(v)
           acc
         }
-      else
-        value = Addressable::IDNA.unicode_normalize_kc(value)
+      elsif value.encoding == Encoding::UTF_8
+        value = value.unicode_normalize(:nfc)
       end
       value
     end

diff --git a/lib/addressable/uri.rb b/lib/addressable/uri.rb
@@ -53,7 +53,7 @@ module CharacterClasses
       PCHAR = (UNRESERVED + SUB_DELIMS + "\\:\\@").freeze
       SCHEME = (ALPHA + DIGIT + "\\-\\+\\.").freeze
       HOST = (UNRESERVED + SUB_DELIMS + "\\[\\:\\]").freeze
-      AUTHORITY = (PCHAR + "\\[\\:\\]").freeze
+      AUTHORITY = (PCHAR + "\\[\\]").freeze
       PATH = (PCHAR + "\\/").freeze
       QUERY = (PCHAR + "\\/\\?").freeze
       FRAGMENT = (PCHAR + "\\/\\?").freeze
@@ -481,7 +481,7 @@ def self.unencode(uri, return_type=String, leave_encoded='')
         leave_encoded.include?(c) ? sequence : c
       end
 
-      result.force_encoding("utf-8")
+      result.force_encoding(Encoding::UTF_8)
       if return_type == String
         return result
       elsif return_type == ::Addressable::URI
@@ -579,7 +579,7 @@ def self.normalize_component(component, character_class=
       unencoded = self.unencode_component(component, String, leave_encoded)
       begin
         encoded = self.encode_component(
-          Addressable::IDNA.unicode_normalize_kc(unencoded),
+          unencoded.unicode_normalize(:nfc),
           character_class,
           leave_encoded
         )
@@ -687,8 +687,7 @@ def self.normalized_encode(uri, return_type=String)
       components.each do |key, value|
         if value != nil
           begin
-            components[key] =
-              Addressable::IDNA.unicode_normalize_kc(value.to_str)
+            components[key] = value.to_str.unicode_normalize(:nfc)
           rescue ArgumentError
             # Likely a malformed UTF-8 character, skip unicode normalization
             components[key] = value.to_str

diff --git a/spec/addressable/idna_spec.rb b/spec/addressable/idna_spec.rb
@@ -38,6 +38,12 @@
     )).to eq("www.xn--8ws00zhy3a.com")
   end
 
+  it "also accepts unicode strings encoded as ascii-8bit" do
+    expect(Addressable::IDNA.to_ascii(
+      "www.詹姆斯.com".b
+    )).to eq("www.xn--8ws00zhy3a.com")
+  end
+
   it "should convert 'www.Iñtërnâtiônàlizætiøn.com' correctly" do
     "www.Iñtërnâtiônàlizætiøn.com"
     expect(Addressable::IDNA.to_ascii(
@@ -249,11 +255,6 @@
       "example..host"
     )).to eq("example..host")
   end
-
-  it "should normalize 'string' correctly" do
-    expect(Addressable::IDNA.unicode_normalize_kc(:'string')).to eq("string")
-    expect(Addressable::IDNA.unicode_normalize_kc("string")).to eq("string")
-  end
 end
 
 describe Addressable::IDNA, "when using the pure-Ruby implementation" do

diff --git a/spec/addressable/uri_spec.rb b/spec/addressable/uri_spec.rb
@@ -5953,6 +5953,26 @@ def to_str
   end
 end
 
+describe Addressable::URI, "when normalizing a path with special unicode" do
+  it "does not stop at or ignore null bytes" do
+    expect(Addressable::URI.parse("/path%00segment/").normalize.path).to eq(
+      "/path%00segment/"
+    )
+  end
+
+  it "does apply NFC unicode normalization" do
+    expect(Addressable::URI.parse("/%E2%84%A6").normalize.path).to eq(
+      "/%CE%A9"
+    )
+  end
+
+  it "does not apply NFKC unicode normalization" do
+    expect(Addressable::URI.parse("/%C2%AF%C2%A0").normalize.path).to eq(
+      "/%C2%AF%C2%A0"
+    )
+  end
+end
+
 describe Addressable::URI, "when normalizing a partially encoded string" do
   it "should result in correct percent encoded sequence" do
     expect(Addressable::URI.normalize_component(