Insert replacement characters when decoding invalid UTF-8 sequences

RWKV · Jun 21, 2023 · 3767697 · 3767697
1 parent 53c19b6
commit 3767697
Showing 1 changed file with 3 additions and 1 deletion.
diff --git a/rwkv/rwkv_tokenizer.py b/rwkv/rwkv_tokenizer.py
@@ -112,7 +112,9 @@ def encode(self, src: str) -> List[int]:
         return self.encode_bytes(src.encode('utf-8'))
 
     def decode(self, tokens: List[int]) -> str:
-        return self.decode_bytes(tokens).decode('utf-8')
+        # 'replace' error handling mode will insert \uFFFD characters in place of malformed/partial UTF-8 sequences.
+        # Downstream code needs to detect \uFFFD and attempt to postpone decoding until more tokens arrive and UTF-8 sequences are complete.
+        return self.decode_bytes(tokens).decode('utf-8', errors='replace')
 
 def get_tokenizer(tokenizer: str = '20B') -> Tuple[
     Callable[[List[int]], str],