Let string splitters respect East Asian Width

See also: https://www.unicode.org/reports/tr11/
psf · Dec 25, 2022 · 1c06c45 · 1c06c45
1 parent 3feff21
commit 1c06c45
Show file tree

Hide file tree

Showing 6 changed files with 128 additions and 29 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -33,6 +33,11 @@
   string lambda values are now wrapped in parentheses (#3440)
 - Exclude string type annotations from improved string processing; fix crash when the
   return type annotation is stringified and spans across multiple lines (#3462)
+- Let string splitters respect [East Asian Width](https://www.unicode.org/reports/tr11/)
+  (#3445)
+- Now long string literals can be split after East Asian commas and periods (`、` U+3001
+  IDEOGRAPHIC COMMA, `。` U+3002 IDEOGRAPHIC FULL STOP, & `，` U+FF0C FULLWIDTH COMMA)
+  besides before spaces (#3445)
 
 ### Configuration
 

diff --git a/src/black/linegen.py b/src/black/linegen.py
@@ -492,7 +492,12 @@ def transform_line(
         and not line.should_split_rhs
         and not line.magic_trailing_comma
         and (
-            is_line_short_enough(line, line_length=mode.line_length, line_str=line_str)
+            is_line_short_enough(
+                line,
+                line_length=mode.line_length,
+                line_str=line_str,
+                preview=mode.preview,
+            )
             or line.contains_unsplittable_type_ignore()
         )
         and not (line.inside_brackets and line.contains_standalone_comments())
@@ -523,7 +528,9 @@ def _rhs(
                 # *current* transformation fits in the line length.  This is true only
                 # for simple cases.  All others require running more transforms via
                 # `transform_line()`.  This check doesn't know if those would succeed.
-                if is_line_short_enough(lines[0], line_length=mode.line_length):
+                if is_line_short_enough(
+                    lines[0], line_length=mode.line_length, preview=mode.preview
+                ):
                     yield from lines
                     return
 
@@ -751,13 +758,17 @@ def _maybe_split_omitting_optional_parens(
                 and any(leaf.type in BRACKETS for leaf in rhs.head.leaves[:-1])
                 # the left side of assignment is short enough (the -1 is for the ending
                 # optional paren)
-                and is_line_short_enough(rhs.head, line_length=line_length - 1)
+                and is_line_short_enough(
+                    rhs.head, line_length=line_length - 1, preview=line.mode.preview
+                )
                 # the left side of assignment won't explode further because of magic
                 # trailing comma
                 and rhs.head.magic_trailing_comma is None
                 # the split by omitting optional parens isn't preferred by some other
                 # reason
-                and not _prefer_split_rhs_oop(rhs_oop, line_length=line_length)
+                and not _prefer_split_rhs_oop(
+                    rhs_oop, line_length=line_length, preview=line.mode.preview
+                )
             ):
                 yield from _maybe_split_omitting_optional_parens(
                     rhs_oop, line, line_length, features=features, omit=omit
@@ -767,7 +778,9 @@ def _maybe_split_omitting_optional_parens(
         except CannotSplit as e:
             if not (
                 can_be_split(rhs.body)
-                or is_line_short_enough(rhs.body, line_length=line_length)
+                or is_line_short_enough(
+                    rhs.body, line_length=line_length, preview=line.mode.preview
+                )
             ):
                 raise CannotSplit(
                     "Splitting failed, body is still too long and can't be split."
@@ -791,7 +804,7 @@ def _maybe_split_omitting_optional_parens(
             yield result
 
 
-def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int) -> bool:
+def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int, preview: bool) -> bool:
     """
     Returns whether we should prefer the result from a split omitting optional parens.
     """
@@ -811,7 +824,9 @@ def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int) -> bool:
             # the first line still contains the `=`)
             any(leaf.type == token.EQUAL for leaf in rhs_oop.head.leaves)
             # the first line is short enough
-            and is_line_short_enough(rhs_oop.head, line_length=line_length)
+            and is_line_short_enough(
+                rhs_oop.head, line_length=line_length, preview=preview
+            )
         )
         # contains unsplittable type ignore
         or rhs_oop.head.contains_unsplittable_type_ignore()
@@ -1428,7 +1443,9 @@ def run_transformer(
         or line.contains_multiline_strings()
         or result[0].contains_uncollapsable_type_comments()
         or result[0].contains_unsplittable_type_ignore()
-        or is_line_short_enough(result[0], line_length=mode.line_length)
+        or is_line_short_enough(
+            result[0], line_length=mode.line_length, preview=mode.preview
+        )
         # If any leaves have no parents (which _can_ occur since
         # `transform(line)` potentially destroys the line's underlying node
         # structure), then we can't proceed. Doing so would cause the below
@@ -1444,7 +1461,8 @@ def run_transformer(
         line_copy, transform, mode, features_fop, line_str=line_str
     )
     if all(
-        is_line_short_enough(ln, line_length=mode.line_length) for ln in second_opinion
+        is_line_short_enough(ln, line_length=mode.line_length, preview=mode.preview)
+        for ln in second_opinion
     ):
         result = second_opinion
     return result
diff --git a/src/black/lines.py b/src/black/lines.py
@@ -30,6 +30,7 @@
     syms,
     whitespace,
 )
+from black.strings import str_width
 from blib2to3.pgen2 import token
 from blib2to3.pytree import Leaf, Node
 
@@ -711,15 +712,18 @@ def append_leaves(
             new_line.append(comment_leaf, preformatted=True)
 
 
-def is_line_short_enough(line: Line, *, line_length: int, line_str: str = "") -> bool:
+def is_line_short_enough(
+    line: Line, *, line_length: int, line_str: str = "", preview: bool
+) -> bool:
     """Return True if `line` is no longer than `line_length`.
 
     Uses the provided `line_str` rendering, if any, otherwise computes a new one.
     """
     if not line_str:
         line_str = line_to_string(line)
+    width = str_width(line_str) if preview else len(line_str)
     return (
-        len(line_str) <= line_length
+        width <= line_length
         and "\n" not in line_str  # multiline strings
         and not line.contains_standalone_comments()
     )

diff --git a/src/black/strings.py b/src/black/strings.py
@@ -4,6 +4,7 @@
 
 import re
 import sys
+import unicodedata
 from functools import lru_cache
 from typing import List, Pattern
 
@@ -236,3 +237,37 @@ def normalize_string_quotes(s: str) -> str:
         return s  # Prefer double quotes
 
     return f"{prefix}{new_quote}{new_body}{new_quote}"
+
+
+def char_width(char: str) -> int:
+    """Return the width of a single character as it would be displayed in a
+    terminal or editor (which respects Unicode East Asian Width).
+
+    Full width characters are counted as 2, while half width characters are
+    counted as 1.
+    """
+    return 2 if unicodedata.east_asian_width(char) in ("F", "W") else 1
+
+
+def str_width(line_str: str) -> int:
+    """Return the width of `line_str` as it would be displayed in a terminal
+    or editor (which respects Unicode East Asian Width).
+
+    You could utilize this function to determine, for example, if a string
+    is too wide to display in a terminal or editor.
+    """
+    return sum(map(char_width, line_str))
+
+
+def count_chars_in_width(line_str: str, max_width: int) -> int:
+    """Count the number of characters in `line_str` that would fit in a
+    terminal or editor of `max_width` (which respects Unicode East Asian
+    Width).
+    """
+    total_width = 0
+    for i, char in enumerate(line_str):
+        width = char_width(char)
+        if width + total_width > max_width:
+            return i
+        total_width += width
+    return len(line_str)
diff --git a/src/black/trans.py b/src/black/trans.py
@@ -48,9 +48,11 @@
 from black.rusty import Err, Ok, Result
 from black.strings import (
     assert_is_leaf_string,
+    count_chars_in_width,
     get_string_prefix,
     has_triple_quotes,
     normalize_string_quotes,
+    str_width,
 )
 from blib2to3.pgen2 import token
 from blib2to3.pytree import Leaf, Node
@@ -71,6 +73,8 @@ class CannotTransform(Exception):
 TResult = Result[T, CannotTransform]  # (T)ransform Result
 TMatchResult = TResult[List[Index]]
 
+SPLIT_SAFE_CHARS = frozenset(["\u3001", "\u3002", "\uff0c"])  # East Asian stops
+
 
 def TErr(err_msg: str) -> Err[CannotTransform]:
     """(T)ransform Err
@@ -1154,7 +1158,7 @@ def _get_max_string_length(self, line: Line, string_idx: int) -> int:
             # WMA4 the length of the inline comment.
             offset += len(comment_leaf.value)
 
-        max_string_length = self.line_length - offset
+        max_string_length = count_chars_in_width(str(line), self.line_length - offset)
         return max_string_length
 
     @staticmethod
@@ -1385,26 +1389,28 @@ def maybe_append_string_operators(new_line: Line) -> None:
             is_valid_index(string_idx + 1) and LL[string_idx + 1].type == token.COMMA
         )
 
-        def max_last_string() -> int:
+        def max_last_string_column() -> int:
             """
             Returns:
-                The max allowed length of the string value used for the last
-                line we will construct.
+                The max allowed width of the string value used for the last
+                line we will construct.  Note that this value means the width
+                rather than the number of characters (e.g., many East Asian
+                characters expand to two columns).
             """
             result = self.line_length
             result -= line.depth * 4
             result -= 1 if ends_with_comma else 0
             result -= string_op_leaves_length
             return result
 
-        # --- Calculate Max Break Index (for string value)
+        # --- Calculate Max Break Width (for string value)
         # We start with the line length limit
-        max_break_idx = self.line_length
+        max_break_width = self.line_length
         # The last index of a string of length N is N-1.
-        max_break_idx -= 1
+        max_break_width -= 1
         # Leading whitespace is not present in the string value (e.g. Leaf.value).
-        max_break_idx -= line.depth * 4
-        if max_break_idx < 0:
+        max_break_width -= line.depth * 4
+        if max_break_width < 0:
             yield TErr(
                 f"Unable to split {LL[string_idx].value} at such high of a line depth:"
                 f" {line.depth}"
@@ -1417,7 +1423,7 @@ def max_last_string() -> int:
         # line limit.
         use_custom_breakpoints = bool(
             custom_splits
-            and all(csplit.break_idx <= max_break_idx for csplit in custom_splits)
+            and all(csplit.break_idx <= max_break_width for csplit in custom_splits)
         )
 
         # Temporary storage for the remaining chunk of the string line that
@@ -1433,7 +1439,7 @@ def more_splits_should_be_made() -> bool:
             if use_custom_breakpoints:
                 return len(custom_splits) > 1
             else:
-                return len(rest_value) > max_last_string()
+                return str_width(rest_value) > max_last_string_column()
 
         string_line_results: List[Ok[Line]] = []
         while more_splits_should_be_made():
@@ -1443,7 +1449,10 @@ def more_splits_should_be_made() -> bool:
                 break_idx = csplit.break_idx
             else:
                 # Algorithmic Split (automatic)
-                max_bidx = max_break_idx - string_op_leaves_length
+                max_bidx = (
+                    count_chars_in_width(rest_value, max_break_width)
+                    - string_op_leaves_length
+                )
                 maybe_break_idx = self._get_break_idx(rest_value, max_bidx)
                 if maybe_break_idx is None:
                     # If we are unable to algorithmically determine a good split
@@ -1540,7 +1549,7 @@ def more_splits_should_be_made() -> bool:
 
             # Try to fit them all on the same line with the last substring...
             if (
-                len(temp_value) <= max_last_string()
+                str_width(temp_value) <= max_last_string_column()
                 or LL[string_idx + 1].type == token.COMMA
             ):
                 last_line.append(rest_leaf)
@@ -1660,6 +1669,7 @@ def passes_all_checks(i: Index) -> bool:
                 section of this classes' docstring would be be met by returning @i.
             """
             is_space = string[i] == " "
+            is_split_safe = is_valid_index(i - 1) and string[i - 1] in SPLIT_SAFE_CHARS
 
             is_not_escaped = True
             j = i - 1
@@ -1672,7 +1682,7 @@ def passes_all_checks(i: Index) -> bool:
                 and len(string[:i]) >= self.MIN_SUBSTR_SIZE
             )
             return (
-                is_space
+                (is_space or is_split_safe)
                 and is_not_escaped
                 and is_big_enough
                 and not breaks_unsplittable_expression(i)
@@ -1817,11 +1827,13 @@ def do_splitter_match(self, line: Line) -> TMatchResult:
 
         if string_idx is not None:
             string_value = line.leaves[string_idx].value
-            # If the string has no spaces...
-            if " " not in string_value:
+            # If the string has neither spaces nor East Asian stops...
+            if not any(
+                char == " " or char in SPLIT_SAFE_CHARS for char in string_value
+            ):
                 # And will still violate the line length limit when split...
-                max_string_length = self.line_length - ((line.depth + 1) * 4)
-                if len(string_value) > max_string_length:
+                max_string_width = self.line_length - ((line.depth + 1) * 4)
+                if str_width(string_value) > max_string_width:
                     # And has no associated custom splits...
                     if not self.has_custom_splits(string_value):
                         # Then we should NOT put this string on its own line.

diff --git a/tests/data/preview/long_strings__east_asian_width.py b/tests/data/preview/long_strings__east_asian_width.py
@@ -0,0 +1,25 @@
+# The following strings do not have not-so-many chars, but are long enough
+# when these are rendered in a monospace font (if the renderer respects
+# Unicode East Asian Width properties).
+hangul = '코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이 필요한 문자열'
+hanzi = '中文測試：代碼點數量少，但在真正的終端模擬器或編輯器中呈現時太長，因此需要換行的字符串。'
+japanese = 'コードポイントの数は少ないが、実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、改行が要る文字列'
+
+# output
+
+# The following strings do not have not-so-many chars, but are long enough
+# when these are rendered in a monospace font (if the renderer respects
+# Unicode East Asian Width properties).
+hangul = (
+    "코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이"
+    " 필요한 문자열"
+)
+hanzi = (
+    "中文測試：代碼點數量少，但在真正的終端模擬器或編輯器中呈現時太長，"
+    "因此需要換行的字符串。"
+)
+japanese = (
+    "コードポイントの数は少ないが、"
+    "実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、"
+    "改行が要る文字列"
+)