From 1c06c4595f4a76b056fca44f0b7aafe5ec4e19cc Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Sat, 17 Dec 2022 15:40:26 +0900 Subject: [PATCH] Let string splitters respect East Asian Width See also: https://www.unicode.org/reports/tr11/ --- CHANGES.md | 5 ++ src/black/linegen.py | 36 ++++++++++---- src/black/lines.py | 8 +++- src/black/strings.py | 35 ++++++++++++++ src/black/trans.py | 48 ++++++++++++------- .../preview/long_strings__east_asian_width.py | 25 ++++++++++ 6 files changed, 128 insertions(+), 29 deletions(-) create mode 100644 tests/data/preview/long_strings__east_asian_width.py diff --git a/CHANGES.md b/CHANGES.md index 587ca8a2a0d..f01440ee208 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -33,6 +33,11 @@ string lambda values are now wrapped in parentheses (#3440) - Exclude string type annotations from improved string processing; fix crash when the return type annotation is stringified and spans across multiple lines (#3462) +- Let string splitters respect [East Asian Width](https://www.unicode.org/reports/tr11/) + (#3445) +- Now long string literals can be split after East Asian commas and periods (`、` U+3001 + IDEOGRAPHIC COMMA, `。` U+3002 IDEOGRAPHIC FULL STOP, & `,` U+FF0C FULLWIDTH COMMA) + besides before spaces (#3445) ### Configuration diff --git a/src/black/linegen.py b/src/black/linegen.py index 2e75bc94506..83ed8ffcbe2 100644 --- a/src/black/linegen.py +++ b/src/black/linegen.py @@ -492,7 +492,12 @@ def transform_line( and not line.should_split_rhs and not line.magic_trailing_comma and ( - is_line_short_enough(line, line_length=mode.line_length, line_str=line_str) + is_line_short_enough( + line, + line_length=mode.line_length, + line_str=line_str, + preview=mode.preview, + ) or line.contains_unsplittable_type_ignore() ) and not (line.inside_brackets and line.contains_standalone_comments()) @@ -523,7 +528,9 @@ def _rhs( # *current* transformation fits in the line length. This is true only # for simple cases. All others require running more transforms via # `transform_line()`. This check doesn't know if those would succeed. - if is_line_short_enough(lines[0], line_length=mode.line_length): + if is_line_short_enough( + lines[0], line_length=mode.line_length, preview=mode.preview + ): yield from lines return @@ -751,13 +758,17 @@ def _maybe_split_omitting_optional_parens( and any(leaf.type in BRACKETS for leaf in rhs.head.leaves[:-1]) # the left side of assignment is short enough (the -1 is for the ending # optional paren) - and is_line_short_enough(rhs.head, line_length=line_length - 1) + and is_line_short_enough( + rhs.head, line_length=line_length - 1, preview=line.mode.preview + ) # the left side of assignment won't explode further because of magic # trailing comma and rhs.head.magic_trailing_comma is None # the split by omitting optional parens isn't preferred by some other # reason - and not _prefer_split_rhs_oop(rhs_oop, line_length=line_length) + and not _prefer_split_rhs_oop( + rhs_oop, line_length=line_length, preview=line.mode.preview + ) ): yield from _maybe_split_omitting_optional_parens( rhs_oop, line, line_length, features=features, omit=omit @@ -767,7 +778,9 @@ def _maybe_split_omitting_optional_parens( except CannotSplit as e: if not ( can_be_split(rhs.body) - or is_line_short_enough(rhs.body, line_length=line_length) + or is_line_short_enough( + rhs.body, line_length=line_length, preview=line.mode.preview + ) ): raise CannotSplit( "Splitting failed, body is still too long and can't be split." @@ -791,7 +804,7 @@ def _maybe_split_omitting_optional_parens( yield result -def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int) -> bool: +def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int, preview: bool) -> bool: """ Returns whether we should prefer the result from a split omitting optional parens. """ @@ -811,7 +824,9 @@ def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int) -> bool: # the first line still contains the `=`) any(leaf.type == token.EQUAL for leaf in rhs_oop.head.leaves) # the first line is short enough - and is_line_short_enough(rhs_oop.head, line_length=line_length) + and is_line_short_enough( + rhs_oop.head, line_length=line_length, preview=preview + ) ) # contains unsplittable type ignore or rhs_oop.head.contains_unsplittable_type_ignore() @@ -1428,7 +1443,9 @@ def run_transformer( or line.contains_multiline_strings() or result[0].contains_uncollapsable_type_comments() or result[0].contains_unsplittable_type_ignore() - or is_line_short_enough(result[0], line_length=mode.line_length) + or is_line_short_enough( + result[0], line_length=mode.line_length, preview=mode.preview + ) # If any leaves have no parents (which _can_ occur since # `transform(line)` potentially destroys the line's underlying node # structure), then we can't proceed. Doing so would cause the below @@ -1444,7 +1461,8 @@ def run_transformer( line_copy, transform, mode, features_fop, line_str=line_str ) if all( - is_line_short_enough(ln, line_length=mode.line_length) for ln in second_opinion + is_line_short_enough(ln, line_length=mode.line_length, preview=mode.preview) + for ln in second_opinion ): result = second_opinion return result diff --git a/src/black/lines.py b/src/black/lines.py index 2aa675c3b31..55b35e3245e 100644 --- a/src/black/lines.py +++ b/src/black/lines.py @@ -30,6 +30,7 @@ syms, whitespace, ) +from black.strings import str_width from blib2to3.pgen2 import token from blib2to3.pytree import Leaf, Node @@ -711,15 +712,18 @@ def append_leaves( new_line.append(comment_leaf, preformatted=True) -def is_line_short_enough(line: Line, *, line_length: int, line_str: str = "") -> bool: +def is_line_short_enough( + line: Line, *, line_length: int, line_str: str = "", preview: bool +) -> bool: """Return True if `line` is no longer than `line_length`. Uses the provided `line_str` rendering, if any, otherwise computes a new one. """ if not line_str: line_str = line_to_string(line) + width = str_width(line_str) if preview else len(line_str) return ( - len(line_str) <= line_length + width <= line_length and "\n" not in line_str # multiline strings and not line.contains_standalone_comments() ) diff --git a/src/black/strings.py b/src/black/strings.py index 9d0e2eb8430..bf41baf9fa3 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -4,6 +4,7 @@ import re import sys +import unicodedata from functools import lru_cache from typing import List, Pattern @@ -236,3 +237,37 @@ def normalize_string_quotes(s: str) -> str: return s # Prefer double quotes return f"{prefix}{new_quote}{new_body}{new_quote}" + + +def char_width(char: str) -> int: + """Return the width of a single character as it would be displayed in a + terminal or editor (which respects Unicode East Asian Width). + + Full width characters are counted as 2, while half width characters are + counted as 1. + """ + return 2 if unicodedata.east_asian_width(char) in ("F", "W") else 1 + + +def str_width(line_str: str) -> int: + """Return the width of `line_str` as it would be displayed in a terminal + or editor (which respects Unicode East Asian Width). + + You could utilize this function to determine, for example, if a string + is too wide to display in a terminal or editor. + """ + return sum(map(char_width, line_str)) + + +def count_chars_in_width(line_str: str, max_width: int) -> int: + """Count the number of characters in `line_str` that would fit in a + terminal or editor of `max_width` (which respects Unicode East Asian + Width). + """ + total_width = 0 + for i, char in enumerate(line_str): + width = char_width(char) + if width + total_width > max_width: + return i + total_width += width + return len(line_str) diff --git a/src/black/trans.py b/src/black/trans.py index ec07f5eab74..3c7dfc74c02 100644 --- a/src/black/trans.py +++ b/src/black/trans.py @@ -48,9 +48,11 @@ from black.rusty import Err, Ok, Result from black.strings import ( assert_is_leaf_string, + count_chars_in_width, get_string_prefix, has_triple_quotes, normalize_string_quotes, + str_width, ) from blib2to3.pgen2 import token from blib2to3.pytree import Leaf, Node @@ -71,6 +73,8 @@ class CannotTransform(Exception): TResult = Result[T, CannotTransform] # (T)ransform Result TMatchResult = TResult[List[Index]] +SPLIT_SAFE_CHARS = frozenset(["\u3001", "\u3002", "\uff0c"]) # East Asian stops + def TErr(err_msg: str) -> Err[CannotTransform]: """(T)ransform Err @@ -1154,7 +1158,7 @@ def _get_max_string_length(self, line: Line, string_idx: int) -> int: # WMA4 the length of the inline comment. offset += len(comment_leaf.value) - max_string_length = self.line_length - offset + max_string_length = count_chars_in_width(str(line), self.line_length - offset) return max_string_length @staticmethod @@ -1385,11 +1389,13 @@ def maybe_append_string_operators(new_line: Line) -> None: is_valid_index(string_idx + 1) and LL[string_idx + 1].type == token.COMMA ) - def max_last_string() -> int: + def max_last_string_column() -> int: """ Returns: - The max allowed length of the string value used for the last - line we will construct. + The max allowed width of the string value used for the last + line we will construct. Note that this value means the width + rather than the number of characters (e.g., many East Asian + characters expand to two columns). """ result = self.line_length result -= line.depth * 4 @@ -1397,14 +1403,14 @@ def max_last_string() -> int: result -= string_op_leaves_length return result - # --- Calculate Max Break Index (for string value) + # --- Calculate Max Break Width (for string value) # We start with the line length limit - max_break_idx = self.line_length + max_break_width = self.line_length # The last index of a string of length N is N-1. - max_break_idx -= 1 + max_break_width -= 1 # Leading whitespace is not present in the string value (e.g. Leaf.value). - max_break_idx -= line.depth * 4 - if max_break_idx < 0: + max_break_width -= line.depth * 4 + if max_break_width < 0: yield TErr( f"Unable to split {LL[string_idx].value} at such high of a line depth:" f" {line.depth}" @@ -1417,7 +1423,7 @@ def max_last_string() -> int: # line limit. use_custom_breakpoints = bool( custom_splits - and all(csplit.break_idx <= max_break_idx for csplit in custom_splits) + and all(csplit.break_idx <= max_break_width for csplit in custom_splits) ) # Temporary storage for the remaining chunk of the string line that @@ -1433,7 +1439,7 @@ def more_splits_should_be_made() -> bool: if use_custom_breakpoints: return len(custom_splits) > 1 else: - return len(rest_value) > max_last_string() + return str_width(rest_value) > max_last_string_column() string_line_results: List[Ok[Line]] = [] while more_splits_should_be_made(): @@ -1443,7 +1449,10 @@ def more_splits_should_be_made() -> bool: break_idx = csplit.break_idx else: # Algorithmic Split (automatic) - max_bidx = max_break_idx - string_op_leaves_length + max_bidx = ( + count_chars_in_width(rest_value, max_break_width) + - string_op_leaves_length + ) maybe_break_idx = self._get_break_idx(rest_value, max_bidx) if maybe_break_idx is None: # If we are unable to algorithmically determine a good split @@ -1540,7 +1549,7 @@ def more_splits_should_be_made() -> bool: # Try to fit them all on the same line with the last substring... if ( - len(temp_value) <= max_last_string() + str_width(temp_value) <= max_last_string_column() or LL[string_idx + 1].type == token.COMMA ): last_line.append(rest_leaf) @@ -1660,6 +1669,7 @@ def passes_all_checks(i: Index) -> bool: section of this classes' docstring would be be met by returning @i. """ is_space = string[i] == " " + is_split_safe = is_valid_index(i - 1) and string[i - 1] in SPLIT_SAFE_CHARS is_not_escaped = True j = i - 1 @@ -1672,7 +1682,7 @@ def passes_all_checks(i: Index) -> bool: and len(string[:i]) >= self.MIN_SUBSTR_SIZE ) return ( - is_space + (is_space or is_split_safe) and is_not_escaped and is_big_enough and not breaks_unsplittable_expression(i) @@ -1817,11 +1827,13 @@ def do_splitter_match(self, line: Line) -> TMatchResult: if string_idx is not None: string_value = line.leaves[string_idx].value - # If the string has no spaces... - if " " not in string_value: + # If the string has neither spaces nor East Asian stops... + if not any( + char == " " or char in SPLIT_SAFE_CHARS for char in string_value + ): # And will still violate the line length limit when split... - max_string_length = self.line_length - ((line.depth + 1) * 4) - if len(string_value) > max_string_length: + max_string_width = self.line_length - ((line.depth + 1) * 4) + if str_width(string_value) > max_string_width: # And has no associated custom splits... if not self.has_custom_splits(string_value): # Then we should NOT put this string on its own line. diff --git a/tests/data/preview/long_strings__east_asian_width.py b/tests/data/preview/long_strings__east_asian_width.py new file mode 100644 index 00000000000..fb66a78ed8b --- /dev/null +++ b/tests/data/preview/long_strings__east_asian_width.py @@ -0,0 +1,25 @@ +# The following strings do not have not-so-many chars, but are long enough +# when these are rendered in a monospace font (if the renderer respects +# Unicode East Asian Width properties). +hangul = '코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이 필요한 문자열' +hanzi = '中文測試:代碼點數量少,但在真正的終端模擬器或編輯器中呈現時太長,因此需要換行的字符串。' +japanese = 'コードポイントの数は少ないが、実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、改行が要る文字列' + +# output + +# The following strings do not have not-so-many chars, but are long enough +# when these are rendered in a monospace font (if the renderer respects +# Unicode East Asian Width properties). +hangul = ( + "코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이" + " 필요한 문자열" +) +hanzi = ( + "中文測試:代碼點數量少,但在真正的終端模擬器或編輯器中呈現時太長," + "因此需要換行的字符串。" +) +japanese = ( + "コードポイントの数は少ないが、" + "実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、" + "改行が要る文字列" +)