Skip to content

Commit

Permalink
Use ruby unicode normalize to avoid libidn C problems and heavy legac…
Browse files Browse the repository at this point in the history
…y ruby code
  • Loading branch information
jarthod committed Feb 14, 2023
1 parent 1fdd676 commit 9499a18
Show file tree
Hide file tree
Showing 7 changed files with 71 additions and 204 deletions.
34 changes: 34 additions & 0 deletions benchmark/unicode_normalize.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

# Benchmarks three NFKC normalization implementations against each other:
# Addressable's pure-Ruby code, libidn's Stringprep binding and Ruby's
# built-in String#unicode_normalize.

require "benchmark"
require "addressable/idna/pure.rb"
require "idn"

# NOTE(review): `value` is meant to be the input and `expected` its NFKC form;
# the two literals look identical here -- confirm `value` still holds the
# un-normalized text, otherwise the sanity checks below are trivially true.
value = "fiᆵリ宠퐱卄.com"
expected = "fiᆵリ宠퐱卄.com"
# Number of normalizations performed per report.
N = 100_000

# Sanity checks: every implementation must agree on the expected output
# before its timing is worth comparing.
fail "ruby does not match" unless expected == value.unicode_normalize(:nfkc)
fail "libidn does not match" unless expected == IDN::Stringprep.nfkc_normalize(value)
fail "addressable does not match" unless expected == Addressable::IDNA.unicode_normalize_kc(value)

# bmbm runs a rehearsal pass before the measured pass.
Benchmark.bmbm do |x|
  x.report("pure") { N.times { Addressable::IDNA.unicode_normalize_kc(value) } }
  x.report("libidn") { N.times { IDN::Stringprep.nfkc_normalize(value) } }
  x.report("ruby") { N.times { value.unicode_normalize(:nfkc) } }
end

# February 14th 2023, before replacing the legacy pure normalize code:

# > ruby benchmark/unicode_normalize.rb
# Rehearsal ------------------------------------------
# pure 1.335230 0.000315 1.335545 ( 1.335657)
# libidn 0.058568 0.000000 0.058568 ( 0.058570)
# ruby 0.326008 0.000014 0.326022 ( 0.326026)
# --------------------------------- total: 1.720135sec

# user system total real
# pure 1.325948 0.000000 1.325948 ( 1.326054)
# libidn 0.058067 0.000000 0.058067 ( 0.058069)
# ruby 0.325062 0.000000 0.325062 ( 0.325115)
4 changes: 0 additions & 4 deletions lib/addressable/idna/native.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,6 @@ def self.punycode_decode(value)
IDN::Punycode.decode(value.to_s)
end

# Normalizes +value+ to Unicode form KC by delegating to libidn's Stringprep.
#
# @param value [#to_s] the text to normalize; converted with +to_s+ first
# @return the result of IDN::Stringprep.nfkc_normalize
def self.unicode_normalize_kc(value)
  text = value.to_s
  IDN::Stringprep.nfkc_normalize(text)
end

def self.to_ascii(value)
value.to_s.split('.', -1).map do |segment|
if segment.size > 0 && segment.size < 64
Expand Down
186 changes: 2 additions & 184 deletions lib/addressable/idna/pure.rb
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ module IDNA
# domain name as described in RFC 3490.
def self.to_ascii(input)
input = input.to_s unless input.is_a?(String)
input = input.dup
input = input.dup.force_encoding(Encoding::UTF_8).unicode_normalize(:nfkc)
if input.respond_to?(:force_encoding)
input.force_encoding(Encoding::ASCII_8BIT)
end
Expand All @@ -77,7 +77,7 @@ def self.to_ascii(input)
part.force_encoding(Encoding::ASCII_8BIT)
end
if part =~ UTF8_REGEX && part =~ UTF8_REGEX_MULTIBYTE
ACE_PREFIX + punycode_encode(unicode_normalize_kc(part))
ACE_PREFIX + punycode_encode(part)
else
part
end
Expand Down Expand Up @@ -112,15 +112,6 @@ def self.to_unicode(input)
output
end

# Unicode normalization form KC.
#
# Unpacks the input into code points, then runs them through the
# decompose -> canonical sort -> compose pipeline and packs the result back
# into a string.
#
# @param input [String, #to_s] text to normalize; non-strings go through +to_s+
# @return [String] the normalized text
def self.unicode_normalize_kc(input)
  text = input.is_a?(String) ? input : input.to_s
  codepoints = text.unpack("U*")
  decomposed = unicode_decompose(codepoints)
  ordered = unicode_sort_canonical(decomposed)
  unicode_compose(ordered).pack("U*")
end

##
# Unicode aware downcase method.
#
Expand All @@ -136,164 +127,6 @@ def self.unicode_downcase(input)
end
private_class_method :unicode_downcase

# Composes adjacent code points pairwise, left to right.
#
# Each code point is offered to unicode_compose_pair together with the
# code point currently being accumulated; when no composite exists, the
# accumulated code point is emitted and the new one takes its place.
#
# @param unpacked [Array<Integer>] code points to compose
# @return [Array<Integer>] the composed code points (the input itself when empty)
def self.unicode_compose(unpacked)
  return unpacked if unpacked.empty?

  composed = []
  current = unpacked.first
  # The combining class is looked up for the first code point only; that
  # single answer decides whether composition is attempted for the whole run.
  composable = lookup_unicode_combining_class(current) == 0

  unpacked.drop(1).each do |codepoint|
    merged = composable ? unicode_compose_pair(current, codepoint) : nil
    if merged.nil?
      composed << current
      current = codepoint
    else
      current = merged
    end
  end

  composed << current
end
private_class_method :unicode_compose

# Tries to compose two consecutive code points into one.
#
# Hangul pairs are composed arithmetically (leading consonant + vowel, or
# LV syllable + trailing consonant). Any other pair is encoded to UTF-8 bytes
# with ucs4_to_utf8 and looked up through lookup_unicode_composition.
#
# @param ch_one [Integer] the first code point
# @param ch_two [Integer] the code point that follows it
# @return the composite code point for a Hangul pair, otherwise whatever
#   lookup_unicode_composition returns for the pair's UTF-8 bytes
def self.unicode_compose_pair(ch_one, ch_two)
  if ch_one >= HANGUL_LBASE && ch_one < HANGUL_LBASE + HANGUL_LCOUNT &&
     ch_two >= HANGUL_VBASE && ch_two < HANGUL_VBASE + HANGUL_VCOUNT
    # Hangul L + V
    return HANGUL_SBASE + (
      (ch_one - HANGUL_LBASE) * HANGUL_VCOUNT + (ch_two - HANGUL_VBASE)
    ) * HANGUL_TCOUNT
  elsif ch_one >= HANGUL_SBASE &&
        ch_one < HANGUL_SBASE + HANGUL_SCOUNT &&
        (ch_one - HANGUL_SBASE) % HANGUL_TCOUNT == 0 &&
        ch_two > HANGUL_TBASE && ch_two < HANGUL_TBASE + HANGUL_TCOUNT
    # Hangul LV + T
    # HANGUL_TBASE itself stands for "no trailing consonant", so it is
    # excluded: composing with it would return ch_one unchanged and silently
    # drop ch_two.
    return ch_one + (ch_two - HANGUL_TBASE)
  end

  utf8_bytes = []
  ucs4_to_utf8(ch_one, utf8_bytes)
  ucs4_to_utf8(ch_two, utf8_bytes)

  lookup_unicode_composition(utf8_bytes)
end
private_class_method :unicode_compose_pair

# Appends the UTF-8 byte sequence for a code point to +buffer+.
#
# Besides the usual one- to four-byte forms this also emits the longer
# five- and six-byte forms for values below 0x80000000. Values at or above
# 0x80000000 append nothing.
#
# @param char [Integer] the code point to encode
# @param buffer [Array<Integer>] receives the encoded bytes
# @return [Array<Integer>, nil] +buffer+, or nil when nothing was appended
def self.ucs4_to_utf8(char, buffer)
  return buffer << char if char < 0x80

  # Lead-byte marker and number of continuation bytes for each size class.
  lead_marker, continuation_count =
    if char < 0x800 then [0xC0, 1]
    elsif char < 0x10000 then [0xE0, 2]
    elsif char < 0x200000 then [0xF0, 3]
    elsif char < 0x4000000 then [0xF8, 4]
    elsif char < 0x80000000 then [0xFC, 5]
    end
  return nil unless lead_marker

  buffer << (char >> (6 * continuation_count) | lead_marker)
  (continuation_count - 1).downto(0) do |position|
    # Each continuation byte carries six payload bits behind a 10xxxxxx tag.
    buffer << (char >> (6 * position) & 63 | 128)
  end
  buffer
end
private_class_method :ucs4_to_utf8

# Reorders runs of combining marks by ascending combining class.
#
# Works on a copy of the input. Two neighbours are swapped only when both
# have a non-zero combining class and the left one's class is greater; after
# a swap the scan steps back one position so the moved mark can keep sinking.
#
# @param unpacked [Array<Integer>] code points to reorder (left untouched)
# @return [Array<Integer>] a new array with the reordered code points
def self.unicode_sort_canonical(unpacked)
  sorted = unpacked.dup
  return sorted if sorted.length < 2

  index = 1
  while index < sorted.length
    previous_cc = lookup_unicode_combining_class(sorted[index - 1])
    current_cc = lookup_unicode_combining_class(sorted[index])
    out_of_order = current_cc != 0 && previous_cc != 0 && previous_cc > current_cc

    if out_of_order
      sorted[index - 1], sorted[index] = sorted[index], sorted[index - 1]
      index -= 1 if index > 1
    else
      index += 1
    end
  end
  sorted
end
private_class_method :unicode_sort_canonical

# Expands every code point into its compatibility decomposition.
#
# Code points in the precomposed Hangul range are split arithmetically via
# unicode_decompose_hangul. For everything else the mapping returned by
# lookup_unicode_compatibility is unpacked and decomposed recursively; code
# points without a mapping are copied through unchanged.
#
# @param unpacked [Array<Integer>] code points to decompose
# @return [Array<Integer>] the decomposed code points
def self.unicode_decompose(unpacked)
  decomposed = []
  unpacked.each do |codepoint|
    if codepoint >= HANGUL_SBASE && codepoint < HANGUL_SBASE + HANGUL_SCOUNT
      lead, vowel, trail = unicode_decompose_hangul(codepoint)
      decomposed << lead
      decomposed << vowel if vowel
      decomposed << trail if trail
      next
    end

    mapping = lookup_unicode_compatibility(codepoint)
    if mapping
      # A mapping may itself contain decomposable code points.
      decomposed.concat(unicode_decompose(mapping.unpack("U*")))
    else
      decomposed << codepoint
    end
  end
  decomposed
end
private_class_method :unicode_decompose

# Splits a precomposed Hangul syllable into its jamo code points.
#
# @param codepoint [Integer] the code point to split
# @return [Array] a three-element array +[l, v, t]+: leading consonant,
#   vowel and trailing consonant. +t+ is nil when the syllable has no
#   trailing consonant; for a code point outside the syllable range the
#   result is +[codepoint, nil, nil]+.
def self.unicode_decompose_hangul(codepoint)
  syllable_index = codepoint - HANGUL_SBASE
  unless syllable_index >= 0 && syllable_index < HANGUL_SCOUNT
    return [codepoint, nil, nil]
  end

  lead_index, within_lead = syllable_index.divmod(HANGUL_NCOUNT)
  vowel_index = within_lead / HANGUL_TCOUNT
  trail_index = syllable_index % HANGUL_TCOUNT

  # A trail index of zero means the syllable has no trailing consonant.
  trail = trail_index == 0 ? nil : HANGUL_TBASE + trail_index
  [HANGUL_LBASE + lead_index, HANGUL_VBASE + vowel_index, trail]
end
private_class_method :unicode_decompose_hangul

# Looks up the canonical combining class recorded for a code point.
#
# @param codepoint [Integer] the code point to look up in UNICODE_DATA
# @return the stored combining class, or 0 when the code point has no entry
#   or its entry stores no combining class
def self.lookup_unicode_combining_class(codepoint)
  entry = UNICODE_DATA[codepoint]
  return 0 unless entry

  entry[UNICODE_DATA_COMBINING_CLASS] || 0
end
private_class_method :lookup_unicode_combining_class

# Looks up the compatibility mapping recorded for a code point.
#
# @param codepoint [Integer] the code point to look up in UNICODE_DATA
# @return the value stored in the entry's compatibility slot, or nil when
#   the code point has no entry
def self.lookup_unicode_compatibility(codepoint)
  entry = UNICODE_DATA[codepoint]
  return nil unless entry

  entry[UNICODE_DATA_COMPATIBILITY]
end
private_class_method :lookup_unicode_compatibility

def self.lookup_unicode_lowercase(codepoint)
codepoint_data = UNICODE_DATA[codepoint]
(codepoint_data ?
Expand All @@ -302,21 +135,6 @@ def self.lookup_unicode_lowercase(codepoint)
end
private_class_method :lookup_unicode_lowercase

# Fetches the COMPOSITION_TABLE entry stored under the given key.
#
# @param unpacked [Array<Integer>] the lookup key
# @return whatever COMPOSITION_TABLE holds for that key
def self.lookup_unicode_composition(unpacked)
  COMPOSITION_TABLE[unpacked]
end
private_class_method :lookup_unicode_composition

# Parameters for the arithmetic composition and decomposition of Hangul.
# First code point of the precomposed syllable range.
HANGUL_SBASE = 0xac00
# First leading-consonant (L) code point and the number of them.
HANGUL_LBASE = 0x1100
HANGUL_LCOUNT = 19
# First vowel (V) code point and the number of them.
HANGUL_VBASE = 0x1161
HANGUL_VCOUNT = 21
# Base for trailing consonants (T). An offset of zero from this base means
# "no trailing consonant", so the count includes that empty slot.
HANGUL_TBASE = 0x11a7
HANGUL_TCOUNT = 28
# Number of syllables sharing one leading consonant.
HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT # 588
# Total number of precomposed syllables.
HANGUL_SCOUNT = HANGUL_LCOUNT * HANGUL_NCOUNT # 11172

UNICODE_DATA_COMBINING_CLASS = 0
UNICODE_DATA_EXCLUSION = 1
UNICODE_DATA_CANONICAL = 2
Expand Down
11 changes: 5 additions & 6 deletions lib/addressable/template.rb
Original file line number Diff line number Diff line change
Expand Up @@ -892,7 +892,7 @@ def join_values(operator, return_value)
# operator.
#
# @param [Hash, Array, String] value
# Normalizes keys and values with IDNA#unicode_normalize_kc
# Normalizes unicode keys and values with String#unicode_normalize (NFC)
#
# @return [Hash, Array, String] The normalized values
def normalize_value(value)
Expand All @@ -902,15 +902,14 @@ def normalize_value(value)

# Handle unicode normalization
if value.kind_of?(Array)
value.map! { |val| Addressable::IDNA.unicode_normalize_kc(val) }
value.map! { |val| normalize_value(val) }
elsif value.kind_of?(Hash)
value = value.inject({}) { |acc, (k, v)|
acc[Addressable::IDNA.unicode_normalize_kc(k)] =
Addressable::IDNA.unicode_normalize_kc(v)
acc[normalize_value(k)] = normalize_value(v)
acc
}
else
value = Addressable::IDNA.unicode_normalize_kc(value)
elsif value.encoding == Encoding::UTF_8
value = value.unicode_normalize(:nfc)
end
value
end
Expand Down
9 changes: 4 additions & 5 deletions lib/addressable/uri.rb
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ module CharacterClasses
PCHAR = (UNRESERVED + SUB_DELIMS + "\\:\\@").freeze
SCHEME = (ALPHA + DIGIT + "\\-\\+\\.").freeze
HOST = (UNRESERVED + SUB_DELIMS + "\\[\\:\\]").freeze
AUTHORITY = (PCHAR + "\\[\\:\\]").freeze
AUTHORITY = (PCHAR + "\\[\\]").freeze
PATH = (PCHAR + "\\/").freeze
QUERY = (PCHAR + "\\/\\?").freeze
FRAGMENT = (PCHAR + "\\/\\?").freeze
Expand Down Expand Up @@ -481,7 +481,7 @@ def self.unencode(uri, return_type=String, leave_encoded='')
leave_encoded.include?(c) ? sequence : c
end

result.force_encoding("utf-8")
result.force_encoding(Encoding::UTF_8)
if return_type == String
return result
elsif return_type == ::Addressable::URI
Expand Down Expand Up @@ -579,7 +579,7 @@ def self.normalize_component(component, character_class=
unencoded = self.unencode_component(component, String, leave_encoded)
begin
encoded = self.encode_component(
Addressable::IDNA.unicode_normalize_kc(unencoded),
unencoded.unicode_normalize(:nfc),
character_class,
leave_encoded
)
Expand Down Expand Up @@ -687,8 +687,7 @@ def self.normalized_encode(uri, return_type=String)
components.each do |key, value|
if value != nil
begin
components[key] =
Addressable::IDNA.unicode_normalize_kc(value.to_str)
components[key] = value.to_str.unicode_normalize(:nfc)
rescue ArgumentError
# Likely a malformed UTF-8 character, skip unicode normalization
components[key] = value.to_str
Expand Down
11 changes: 6 additions & 5 deletions spec/addressable/idna_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@
)).to eq("www.xn--8ws00zhy3a.com")
end

it "also accepts unicode strings encoded as ascii-8bit" do
  # String#b yields a copy of the same bytes tagged as ASCII-8BIT.
  binary_host = "www.詹姆斯.com".b
  ascii_host = Addressable::IDNA.to_ascii(binary_host)

  expect(ascii_host).to eq("www.xn--8ws00zhy3a.com")
end

it "should convert 'www.Iñtërnâtiônàlizætiøn.com' correctly" do
"www.Iñtërnâtiônàlizætiøn.com"
expect(Addressable::IDNA.to_ascii(
Expand Down Expand Up @@ -249,11 +255,6 @@
"example..host"
)).to eq("example..host")
end

it "should normalize 'string' correctly" do
  # Symbol and String inputs must both come back as the plain string.
  [:'string', "string"].each do |input|
    expect(Addressable::IDNA.unicode_normalize_kc(input)).to eq("string")
  end
end
end

describe Addressable::IDNA, "when using the pure-Ruby implementation" do
Expand Down
20 changes: 20 additions & 0 deletions spec/addressable/uri_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5953,6 +5953,26 @@ def to_str
end
end

describe Addressable::URI, "when normalizing a path with special unicode" do
  # Parses +uri+ and returns the path component of its normalized form.
  def normalized_path(uri)
    Addressable::URI.parse(uri).normalize.path
  end

  it "does not stop at or ignore null bytes" do
    expect(normalized_path("/path%00segment/")).to eq("/path%00segment/")
  end

  it "does apply NFC unicode normalization" do
    # %E2%84%A6 is U+2126 OHM SIGN; %CE%A9 is U+03A9 GREEK CAPITAL LETTER OMEGA.
    expect(normalized_path("/%E2%84%A6")).to eq("/%CE%A9")
  end

  it "does not apply NFKC unicode normalization" do
    # %C2%AF is U+00AF MACRON and %C2%A0 is U+00A0 NO-BREAK SPACE; the path
    # is expected to come back unchanged.
    expect(normalized_path("/%C2%AF%C2%A0")).to eq("/%C2%AF%C2%A0")
  end
end

describe Addressable::URI, "when normalizing a partially encoded string" do
it "should result in correct percent encoded sequence" do
expect(Addressable::URI.normalize_component(
Expand Down

0 comments on commit 9499a18

Please sign in to comment.