From 456f0ad4ee02096dd1f7f60d7bbb05dce7b449f6 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Sat, 17 Dec 2022 15:40:26 +0900 Subject: [PATCH] Let string splitters respect East Asian Width See also: https://www.unicode.org/reports/tr11/ --- CHANGES.md | 2 + src/black/lines.py | 3 +- src/black/strings.py | 35 ++++++++++++++ src/black/trans.py | 48 ++++++++++++------- .../preview/long_strings__east_asian_width.py | 25 ++++++++++ 5 files changed, 94 insertions(+), 19 deletions(-) create mode 100644 tests/data/preview/long_strings__east_asian_width.py diff --git a/CHANGES.md b/CHANGES.md index e1ad5e1f1cc..1b946b2d50f 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -22,6 +22,8 @@ - Long values in dict literals are now wrapped in parentheses; correspondingly unnecessary parentheses around short values in dict literals are now removed; long string lambda values are now wrapped in parentheses (#3440) +- Let string splitters respect [East Asian Width](https://www.unicode.org/reports/tr11/) + (#3445) ### Configuration diff --git a/src/black/lines.py b/src/black/lines.py index 08281bcf370..5e32b664cf0 100644 --- a/src/black/lines.py +++ b/src/black/lines.py @@ -30,6 +30,7 @@ syms, whitespace, ) +from black.strings import str_width from blib2to3.pgen2 import token from blib2to3.pytree import Leaf, Node @@ -718,7 +719,7 @@ def is_line_short_enough(line: Line, *, line_length: int, line_str: str = "") -> if not line_str: line_str = line_to_string(line) return ( - len(line_str) <= line_length + str_width(line_str) <= line_length and "\n" not in line_str # multiline strings and not line.contains_standalone_comments() ) diff --git a/src/black/strings.py b/src/black/strings.py index 9d0e2eb8430..bf41baf9fa3 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -4,6 +4,7 @@ import re import sys +import unicodedata from functools import lru_cache from typing import List, Pattern @@ -236,3 +237,37 @@ def normalize_string_quotes(s: str) -> str: return s # Prefer double quotes 
return f"{prefix}{new_quote}{new_body}{new_quote}" + + +def char_width(char: str) -> int: + """Return the width of a single character as it would be displayed in a + terminal or editor (which respects Unicode East Asian Width). + + Full width characters are counted as 2, while half width characters are + counted as 1. + """ + return 2 if unicodedata.east_asian_width(char) in ("F", "W") else 1 + + +def str_width(line_str: str) -> int: + """Return the width of `line_str` as it would be displayed in a terminal + or editor (which respects Unicode East Asian Width). + + You could utilize this function to determine, for example, if a string + is too wide to display in a terminal or editor. + """ + return sum(map(char_width, line_str)) + + +def count_chars_in_width(line_str: str, max_width: int) -> int: + """Count the number of characters in `line_str` that would fit in a + terminal or editor of `max_width` (which respects Unicode East Asian + Width). + """ + total_width = 0 + for i, char in enumerate(line_str): + width = char_width(char) + if width + total_width > max_width: + return i + total_width += width + return len(line_str) diff --git a/src/black/trans.py b/src/black/trans.py index b08a6d243d8..b78e139bd3c 100644 --- a/src/black/trans.py +++ b/src/black/trans.py @@ -48,9 +48,11 @@ from black.rusty import Err, Ok, Result from black.strings import ( assert_is_leaf_string, + count_chars_in_width, get_string_prefix, has_triple_quotes, normalize_string_quotes, + str_width, ) from blib2to3.pgen2 import token from blib2to3.pytree import Leaf, Node @@ -71,6 +73,8 @@ class CannotTransform(Exception): TResult = Result[T, CannotTransform] # (T)ransform Result TMatchResult = TResult[Index] +SPLIT_SAFE_CHARS = frozenset(["\u3001", "\u3002", "\uff0c"]) # East Asian stops + def TErr(err_msg: str) -> Err[CannotTransform]: """(T)ransform Err @@ -1040,7 +1044,7 @@ def _get_max_string_length(self, line: Line, string_idx: int) -> int: # WMA4 the length of the inline comment. 
offset += len(comment_leaf.value) - max_string_length = self.line_length - offset + max_string_length = count_chars_in_width(str(line), self.line_length - offset) return max_string_length @staticmethod @@ -1264,11 +1268,13 @@ def maybe_append_string_operators(new_line: Line) -> None: is_valid_index(string_idx + 1) and LL[string_idx + 1].type == token.COMMA ) - def max_last_string() -> int: + def max_last_string_column() -> int: """ Returns: - The max allowed length of the string value used for the last - line we will construct. + The max allowed width of the string value used for the last + line we will construct. Note that this value means the width + rather than the number of characters (e.g., many East Asian + characters expand to two columns). """ result = self.line_length result -= line.depth * 4 @@ -1276,14 +1282,14 @@ def max_last_string() -> int: result -= string_op_leaves_length return result - # --- Calculate Max Break Index (for string value) + # --- Calculate Max Break Width (for string value) # We start with the line length limit - max_break_idx = self.line_length + max_break_width = self.line_length # The last index of a string of length N is N-1. - max_break_idx -= 1 + max_break_width -= 1 # Leading whitespace is not present in the string value (e.g. Leaf.value). - max_break_idx -= line.depth * 4 - if max_break_idx < 0: + max_break_width -= line.depth * 4 + if max_break_width < 0: yield TErr( f"Unable to split {LL[string_idx].value} at such high of a line depth:" f" {line.depth}" @@ -1296,7 +1302,7 @@ def max_last_string() -> int: # line limit. 
use_custom_breakpoints = bool( custom_splits - and all(csplit.break_idx <= max_break_idx for csplit in custom_splits) + and all(csplit.break_idx <= max_break_width for csplit in custom_splits) ) # Temporary storage for the remaining chunk of the string line that @@ -1312,7 +1318,7 @@ def more_splits_should_be_made() -> bool: if use_custom_breakpoints: return len(custom_splits) > 1 else: - return len(rest_value) > max_last_string() + return str_width(rest_value) > max_last_string_column() string_line_results: List[Ok[Line]] = [] while more_splits_should_be_made(): @@ -1322,7 +1328,10 @@ def more_splits_should_be_made() -> bool: break_idx = csplit.break_idx else: # Algorithmic Split (automatic) - max_bidx = max_break_idx - string_op_leaves_length + max_bidx = ( + count_chars_in_width(rest_value, max_break_width) + - string_op_leaves_length + ) maybe_break_idx = self._get_break_idx(rest_value, max_bidx) if maybe_break_idx is None: # If we are unable to algorithmically determine a good split @@ -1414,7 +1423,7 @@ def more_splits_should_be_made() -> bool: # Try to fit them all on the same line with the last substring... if ( - len(temp_value) <= max_last_string() + str_width(temp_value) <= max_last_string_column() or LL[string_idx + 1].type == token.COMMA ): last_line.append(rest_leaf) @@ -1534,6 +1543,7 @@ def passes_all_checks(i: Index) -> bool: section of this classes' docstring would be be met by returning @i. 
""" is_space = string[i] == " " + is_split_safe = is_valid_index(i - 1) and string[i - 1] in SPLIT_SAFE_CHARS is_not_escaped = True j = i - 1 @@ -1546,7 +1556,7 @@ def passes_all_checks(i: Index) -> bool: and len(string[:i]) >= self.MIN_SUBSTR_SIZE ) return ( - is_space + (is_space or is_split_safe) and is_not_escaped and is_big_enough and not breaks_unsplittable_expression(i) @@ -1691,11 +1701,13 @@ def do_splitter_match(self, line: Line) -> TMatchResult: if string_idx is not None: string_value = line.leaves[string_idx].value - # If the string has no spaces... - if " " not in string_value: + # If the string has neither spaces nor East Asian stops... + if not any( + char == " " or char in SPLIT_SAFE_CHARS for char in string_value + ): # And will still violate the line length limit when split... - max_string_length = self.line_length - ((line.depth + 1) * 4) - if len(string_value) > max_string_length: + max_string_width = self.line_length - ((line.depth + 1) * 4) + if str_width(string_value) > max_string_width: # And has no associated custom splits... if not self.has_custom_splits(string_value): # Then we should NOT put this string on its own line. diff --git a/tests/data/preview/long_strings__east_asian_width.py b/tests/data/preview/long_strings__east_asian_width.py new file mode 100644 index 00000000000..ab0d9e5a916 --- /dev/null +++ b/tests/data/preview/long_strings__east_asian_width.py @@ -0,0 +1,25 @@ +# The following strings do not have not-so-many chars, but are long enough +# when these are rendered in a monospace font (if the renderer respects +# Unicode East Asian Width properties). 
+hangul = '코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이 필요한 문자열' +hanzi = '中文測試:代碼點數量少,但在真正的終端模擬器或編輯器中呈現時太長,因此需要換行的字符串。' +japanese = 'コードポイントの数は少ないが、実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、改行が要る文字列' + +# output + +# The following strings do not have not-so-many chars, but are long enough +# when these are rendered in a monospace font (if the renderer respects +# Unicode East Asian Width properties). +hangul = ( + "코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이" + " 필요한 문자열" +) +hanzi = ( + "中文測試:代碼點數量少,但在真正的終端模擬器或編輯器中呈現時太長," + "因此需要換行的字符串。" +) +japanese = ( + "コードポイントの数は少ないが、" + "実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、" + "改行が要る文字列" +)