Skip to content

Commit

Permalink
Merge pull request #2100 from davidopic/master
Browse files Browse the repository at this point in the history
Fix UTF-8 failures in Watch
  • Loading branch information
k8s-ci-robot committed Dec 20, 2023
2 parents 72c830d + 934d026 commit 1193741
Show file tree
Hide file tree
Showing 2 changed files with 144 additions and 11 deletions.
35 changes: 24 additions & 11 deletions kubernetes/base/watch/watch.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,20 +52,33 @@ def _find_return_type(func):


def iter_resp_lines(resp):
prev = ""
for seg in resp.stream(amt=None, decode_content=False):
if isinstance(seg, bytes):
seg = seg.decode('utf8')
seg = prev + seg
lines = seg.split("\n")
if not seg.endswith("\n"):
prev = lines[-1]
lines = lines[:-1]
buffer = bytearray()
for segment in resp.stream(amt=None, decode_content=False):

# Append the segment (chunk) to the buffer
#
# Performance note: depending on contents of buffer and the type+value of segment,
# encoding segment into the buffer could be a wasteful step. The approach used here
# simplifies the logic farther down, but in the future it may be reasonable to
# sacrifice readability for performance.
if isinstance(segment, bytes):
buffer.extend(segment)
elif isinstance(segment, str):
buffer.extend(segment.encode("utf-8"))
else:
prev = ""
for line in lines:
raise TypeError(
f"Received invalid segment type, {type(segment)}, from stream. Accepts only 'str' or 'bytes'.")

# Split by newline (safe for utf-8 because multi-byte sequences cannot contain the newline byte)
next_newline = buffer.find(b'\n')
while next_newline != -1:
# Convert bytes to a valid utf-8 string, replacing any invalid utf-8 with the '�' character
line = buffer[:next_newline].decode(
"utf-8", errors="replace")
buffer = buffer[next_newline+1:]
if line:
yield line
next_newline = buffer.find(b'\n')


class Watch(object):
Expand Down
120 changes: 120 additions & 0 deletions kubernetes/base/watch/watch_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,133 @@ def test_watch_with_decode(self):
if count == 4:
w.stop()

# make sure that all three records were consumed by the stream
self.assertEqual(4, count)

fake_api.get_namespaces.assert_called_once_with(
_preload_content=False, watch=True)
fake_resp.stream.assert_called_once_with(
amt=None, decode_content=False)
fake_resp.close.assert_called_once()
fake_resp.release_conn.assert_called_once()

def test_watch_with_interspersed_newlines(self):
fake_resp = Mock()
fake_resp.close = Mock()
fake_resp.release_conn = Mock()
fake_resp.stream = Mock(
return_value=[
'\n',
'{"type": "ADDED", "object": {"metadata":',
'{"name": "test1","resourceVersion": "1"}}}\n{"type": "ADDED", ',
'"object": {"metadata": {"name": "test2", "resourceVersion": "2"}}}\n',
'\n',
'',
'{"type": "ADDED", "object": {"metadata": {"name": "test3", "resourceVersion": "3"}}}\n',
'\n\n\n',
'\n',
])

fake_api = Mock()
fake_api.get_namespaces = Mock(return_value=fake_resp)
fake_api.get_namespaces.__doc__ = ':return: V1NamespaceList'

w = Watch()
count = 0

# Consume all test events from the mock service, stopping when no more data is available.
# Note that "timeout_seconds" below is not a timeout; rather, it disables retries and is
# the only way to do so. Without that, the stream will re-read the test data forever.
for e in w.stream(fake_api.get_namespaces, timeout_seconds=1):
count += 1
self.assertEqual("test%d" % count, e['object'].metadata.name)
self.assertEqual(3, count)

def test_watch_with_multibyte_utf8(self):
fake_resp = Mock()
fake_resp.close = Mock()
fake_resp.release_conn = Mock()
fake_resp.stream = Mock(
return_value=[
# two-byte utf-8 character
'{"type":"MODIFIED","object":{"data":{"utf-8":"© 1"},"metadata":{"name":"test1","resourceVersion":"1"}}}\n',
# same copyright character expressed as bytes
b'{"type":"MODIFIED","object":{"data":{"utf-8":"\xC2\xA9 2"},"metadata":{"name":"test2","resourceVersion":"2"}}}\n'
# same copyright character with bytes split across two stream chunks
b'{"type":"MODIFIED","object":{"data":{"utf-8":"\xC2',
b'\xA9 3"},"metadata":{"n',
# more chunks of the same event, sent as a mix of bytes and strings
'ame":"test3","resourceVersion":"3"',
'}}}',
b'\n'
])

fake_api = Mock()
fake_api.get_configmaps = Mock(return_value=fake_resp)
fake_api.get_configmaps.__doc__ = ':return: V1ConfigMapList'

w = Watch()
count = 0

# Consume all test events from the mock service, stopping when no more data is available.
# Note that "timeout_seconds" below is not a timeout; rather, it disables retries and is
# the only way to do so. Without that, the stream will re-read the test data forever.
for event in w.stream(fake_api.get_configmaps, timeout_seconds=1):
count += 1
self.assertEqual("MODIFIED", event['type'])
self.assertEqual("test%d" % count, event['object'].metadata.name)
self.assertEqual("© %d" % count, event['object'].data["utf-8"])
self.assertEqual(
"%d" % count, event['object'].metadata.resource_version)
self.assertEqual("%d" % count, w.resource_version)
self.assertEqual(3, count)

def test_watch_with_invalid_utf8(self):
fake_resp = Mock()
fake_resp.close = Mock()
fake_resp.release_conn = Mock()
fake_resp.stream = Mock(
# test 1 uses 1 invalid utf-8 byte
# test 2 uses a sequence of 2 invalid utf-8 bytes
# test 3 uses a sequence of 3 invalid utf-8 bytes
return_value=[
# utf-8 sequence for 😄 is \xF0\x9F\x98\x84
# all other sequences below are invalid
# ref: https://www.w3.org/2001/06/utf-8-wrong/UTF-8-test.html
b'{"type":"MODIFIED","object":{"data":{"utf-8":"\xF0\x9F\x98\x84 1","invalid":"\x80 1"},"metadata":{"name":"test1"}}}\n',
b'{"type":"MODIFIED","object":{"data":{"utf-8":"\xF0\x9F\x98\x84 2","invalid":"\xC0\xAF 2"},"metadata":{"name":"test2"}}}\n',
# mix bytes/strings and split byte sequences across chunks
b'{"type":"MODIFIED","object":{"data":{"utf-8":"\xF0\x9F\x98',
b'\x84 ',
b'',
b'3","invalid":"\xE0\x80',
b'\xAF ',
'3"},"metadata":{"n',
'ame":"test3"',
'}}}',
b'\n'
])

fake_api = Mock()
fake_api.get_configmaps = Mock(return_value=fake_resp)
fake_api.get_configmaps.__doc__ = ':return: V1ConfigMapList'

w = Watch()
count = 0

# Consume all test events from the mock service, stopping when no more data is available.
# Note that "timeout_seconds" below is not a timeout; rather, it disables retries and is
# the only way to do so. Without that, the stream will re-read the test data forever.
for event in w.stream(fake_api.get_configmaps, timeout_seconds=1):
count += 1
self.assertEqual("MODIFIED", event['type'])
self.assertEqual("test%d" % count, event['object'].metadata.name)
self.assertEqual("😄 %d" % count, event['object'].data["utf-8"])
# expect N replacement characters in test N
self.assertEqual("� %d".replace('�', '�'*count) %
count, event['object'].data["invalid"])
self.assertEqual(3, count)

def test_watch_for_follow(self):
fake_resp = Mock()
fake_resp.close = Mock()
Expand Down

0 comments on commit 1193741

Please sign in to comment.