Skip to content

Commit

Permalink
Let string splitters respect East Asian Width
Browse files Browse the repository at this point in the history
  • Loading branch information
dahlia committed Dec 25, 2022
1 parent 3feff21 commit 1c06c45
Show file tree
Hide file tree
Showing 6 changed files with 128 additions and 29 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Expand Up @@ -33,6 +33,11 @@
string lambda values are now wrapped in parentheses (#3440)
- Exclude string type annotations from improved string processing; fix crash when the
return type annotation is stringified and spans across multiple lines (#3462)
- Let string splitters respect [East Asian Width](https://www.unicode.org/reports/tr11/)
(#3445)
- Now long string literals can be split after East Asian commas and periods (`` U+3001
IDEOGRAPHIC COMMA, `` U+3002 IDEOGRAPHIC FULL STOP, & `` U+FF0C FULLWIDTH COMMA)
besides before spaces (#3445)

### Configuration

Expand Down
36 changes: 27 additions & 9 deletions src/black/linegen.py
Expand Up @@ -492,7 +492,12 @@ def transform_line(
and not line.should_split_rhs
and not line.magic_trailing_comma
and (
is_line_short_enough(line, line_length=mode.line_length, line_str=line_str)
is_line_short_enough(
line,
line_length=mode.line_length,
line_str=line_str,
preview=mode.preview,
)
or line.contains_unsplittable_type_ignore()
)
and not (line.inside_brackets and line.contains_standalone_comments())
Expand Down Expand Up @@ -523,7 +528,9 @@ def _rhs(
# *current* transformation fits in the line length. This is true only
# for simple cases. All others require running more transforms via
# `transform_line()`. This check doesn't know if those would succeed.
if is_line_short_enough(lines[0], line_length=mode.line_length):
if is_line_short_enough(
lines[0], line_length=mode.line_length, preview=mode.preview
):
yield from lines
return

Expand Down Expand Up @@ -751,13 +758,17 @@ def _maybe_split_omitting_optional_parens(
and any(leaf.type in BRACKETS for leaf in rhs.head.leaves[:-1])
# the left side of assignment is short enough (the -1 is for the ending
# optional paren)
and is_line_short_enough(rhs.head, line_length=line_length - 1)
and is_line_short_enough(
rhs.head, line_length=line_length - 1, preview=line.mode.preview
)
# the left side of assignment won't explode further because of magic
# trailing comma
and rhs.head.magic_trailing_comma is None
# the split by omitting optional parens isn't preferred by some other
# reason
and not _prefer_split_rhs_oop(rhs_oop, line_length=line_length)
and not _prefer_split_rhs_oop(
rhs_oop, line_length=line_length, preview=line.mode.preview
)
):
yield from _maybe_split_omitting_optional_parens(
rhs_oop, line, line_length, features=features, omit=omit
Expand All @@ -767,7 +778,9 @@ def _maybe_split_omitting_optional_parens(
except CannotSplit as e:
if not (
can_be_split(rhs.body)
or is_line_short_enough(rhs.body, line_length=line_length)
or is_line_short_enough(
rhs.body, line_length=line_length, preview=line.mode.preview
)
):
raise CannotSplit(
"Splitting failed, body is still too long and can't be split."
Expand All @@ -791,7 +804,7 @@ def _maybe_split_omitting_optional_parens(
yield result


def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int) -> bool:
def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int, preview: bool) -> bool:
"""
Returns whether we should prefer the result from a split omitting optional parens.
"""
Expand All @@ -811,7 +824,9 @@ def _prefer_split_rhs_oop(rhs_oop: _RHSResult, line_length: int) -> bool:
# the first line still contains the `=`)
any(leaf.type == token.EQUAL for leaf in rhs_oop.head.leaves)
# the first line is short enough
and is_line_short_enough(rhs_oop.head, line_length=line_length)
and is_line_short_enough(
rhs_oop.head, line_length=line_length, preview=preview
)
)
# contains unsplittable type ignore
or rhs_oop.head.contains_unsplittable_type_ignore()
Expand Down Expand Up @@ -1428,7 +1443,9 @@ def run_transformer(
or line.contains_multiline_strings()
or result[0].contains_uncollapsable_type_comments()
or result[0].contains_unsplittable_type_ignore()
or is_line_short_enough(result[0], line_length=mode.line_length)
or is_line_short_enough(
result[0], line_length=mode.line_length, preview=mode.preview
)
# If any leaves have no parents (which _can_ occur since
# `transform(line)` potentially destroys the line's underlying node
# structure), then we can't proceed. Doing so would cause the below
Expand All @@ -1444,7 +1461,8 @@ def run_transformer(
line_copy, transform, mode, features_fop, line_str=line_str
)
if all(
is_line_short_enough(ln, line_length=mode.line_length) for ln in second_opinion
is_line_short_enough(ln, line_length=mode.line_length, preview=mode.preview)
for ln in second_opinion
):
result = second_opinion
return result
8 changes: 6 additions & 2 deletions src/black/lines.py
Expand Up @@ -30,6 +30,7 @@
syms,
whitespace,
)
from black.strings import str_width
from blib2to3.pgen2 import token
from blib2to3.pytree import Leaf, Node

Expand Down Expand Up @@ -711,15 +712,18 @@ def append_leaves(
new_line.append(comment_leaf, preformatted=True)


def is_line_short_enough(line: Line, *, line_length: int, line_str: str = "") -> bool:
def is_line_short_enough(
line: Line, *, line_length: int, line_str: str = "", preview: bool
) -> bool:
"""Return True if `line` is no longer than `line_length`.
Uses the provided `line_str` rendering, if any, otherwise computes a new one.
"""
if not line_str:
line_str = line_to_string(line)
width = str_width(line_str) if preview else len(line_str)
return (
len(line_str) <= line_length
width <= line_length
and "\n" not in line_str # multiline strings
and not line.contains_standalone_comments()
)
Expand Down
35 changes: 35 additions & 0 deletions src/black/strings.py
Expand Up @@ -4,6 +4,7 @@

import re
import sys
import unicodedata
from functools import lru_cache
from typing import List, Pattern

Expand Down Expand Up @@ -236,3 +237,37 @@ def normalize_string_quotes(s: str) -> str:
return s # Prefer double quotes

return f"{prefix}{new_quote}{new_body}{new_quote}"


def char_width(char: str) -> int:
"""Return the width of a single character as it would be displayed in a
terminal or editor (which respects Unicode East Asian Width).
Full width characters are counted as 2, while half width characters are
counted as 1.
"""
return 2 if unicodedata.east_asian_width(char) in ("F", "W") else 1


def str_width(line_str: str) -> int:
"""Return the width of `line_str` as it would be displayed in a terminal
or editor (which respects Unicode East Asian Width).
You could utilize this function to determine, for example, if a string
is too wide to display in a terminal or editor.
"""
return sum(map(char_width, line_str))


def count_chars_in_width(line_str: str, max_width: int) -> int:
"""Count the number of characters in `line_str` that would fit in a
terminal or editor of `max_width` (which respects Unicode East Asian
Width).
"""
total_width = 0
for i, char in enumerate(line_str):
width = char_width(char)
if width + total_width > max_width:
return i
total_width += width
return len(line_str)
48 changes: 30 additions & 18 deletions src/black/trans.py
Expand Up @@ -48,9 +48,11 @@
from black.rusty import Err, Ok, Result
from black.strings import (
assert_is_leaf_string,
count_chars_in_width,
get_string_prefix,
has_triple_quotes,
normalize_string_quotes,
str_width,
)
from blib2to3.pgen2 import token
from blib2to3.pytree import Leaf, Node
Expand All @@ -71,6 +73,8 @@ class CannotTransform(Exception):
TResult = Result[T, CannotTransform] # (T)ransform Result
TMatchResult = TResult[List[Index]]

SPLIT_SAFE_CHARS = frozenset(["\u3001", "\u3002", "\uff0c"]) # East Asian stops


def TErr(err_msg: str) -> Err[CannotTransform]:
"""(T)ransform Err
Expand Down Expand Up @@ -1154,7 +1158,7 @@ def _get_max_string_length(self, line: Line, string_idx: int) -> int:
# WMA4 the length of the inline comment.
offset += len(comment_leaf.value)

max_string_length = self.line_length - offset
max_string_length = count_chars_in_width(str(line), self.line_length - offset)
return max_string_length

@staticmethod
Expand Down Expand Up @@ -1385,26 +1389,28 @@ def maybe_append_string_operators(new_line: Line) -> None:
is_valid_index(string_idx + 1) and LL[string_idx + 1].type == token.COMMA
)

def max_last_string() -> int:
def max_last_string_column() -> int:
"""
Returns:
The max allowed length of the string value used for the last
line we will construct.
The max allowed width of the string value used for the last
line we will construct. Note that this value means the width
rather than the number of characters (e.g., many East Asian
characters expand to two columns).
"""
result = self.line_length
result -= line.depth * 4
result -= 1 if ends_with_comma else 0
result -= string_op_leaves_length
return result

# --- Calculate Max Break Index (for string value)
# --- Calculate Max Break Width (for string value)
# We start with the line length limit
max_break_idx = self.line_length
max_break_width = self.line_length
# The last index of a string of length N is N-1.
max_break_idx -= 1
max_break_width -= 1
# Leading whitespace is not present in the string value (e.g. Leaf.value).
max_break_idx -= line.depth * 4
if max_break_idx < 0:
max_break_width -= line.depth * 4
if max_break_width < 0:
yield TErr(
f"Unable to split {LL[string_idx].value} at such high of a line depth:"
f" {line.depth}"
Expand All @@ -1417,7 +1423,7 @@ def max_last_string() -> int:
# line limit.
use_custom_breakpoints = bool(
custom_splits
and all(csplit.break_idx <= max_break_idx for csplit in custom_splits)
and all(csplit.break_idx <= max_break_width for csplit in custom_splits)
)

# Temporary storage for the remaining chunk of the string line that
Expand All @@ -1433,7 +1439,7 @@ def more_splits_should_be_made() -> bool:
if use_custom_breakpoints:
return len(custom_splits) > 1
else:
return len(rest_value) > max_last_string()
return str_width(rest_value) > max_last_string_column()

string_line_results: List[Ok[Line]] = []
while more_splits_should_be_made():
Expand All @@ -1443,7 +1449,10 @@ def more_splits_should_be_made() -> bool:
break_idx = csplit.break_idx
else:
# Algorithmic Split (automatic)
max_bidx = max_break_idx - string_op_leaves_length
max_bidx = (
count_chars_in_width(rest_value, max_break_width)
- string_op_leaves_length
)
maybe_break_idx = self._get_break_idx(rest_value, max_bidx)
if maybe_break_idx is None:
# If we are unable to algorithmically determine a good split
Expand Down Expand Up @@ -1540,7 +1549,7 @@ def more_splits_should_be_made() -> bool:

# Try to fit them all on the same line with the last substring...
if (
len(temp_value) <= max_last_string()
str_width(temp_value) <= max_last_string_column()
or LL[string_idx + 1].type == token.COMMA
):
last_line.append(rest_leaf)
Expand Down Expand Up @@ -1660,6 +1669,7 @@ def passes_all_checks(i: Index) -> bool:
section of this classes' docstring would be be met by returning @i.
"""
is_space = string[i] == " "
is_split_safe = is_valid_index(i - 1) and string[i - 1] in SPLIT_SAFE_CHARS

is_not_escaped = True
j = i - 1
Expand All @@ -1672,7 +1682,7 @@ def passes_all_checks(i: Index) -> bool:
and len(string[:i]) >= self.MIN_SUBSTR_SIZE
)
return (
is_space
(is_space or is_split_safe)
and is_not_escaped
and is_big_enough
and not breaks_unsplittable_expression(i)
Expand Down Expand Up @@ -1817,11 +1827,13 @@ def do_splitter_match(self, line: Line) -> TMatchResult:

if string_idx is not None:
string_value = line.leaves[string_idx].value
# If the string has no spaces...
if " " not in string_value:
# If the string has neither spaces nor East Asian stops...
if not any(
char == " " or char in SPLIT_SAFE_CHARS for char in string_value
):
# And will still violate the line length limit when split...
max_string_length = self.line_length - ((line.depth + 1) * 4)
if len(string_value) > max_string_length:
max_string_width = self.line_length - ((line.depth + 1) * 4)
if str_width(string_value) > max_string_width:
# And has no associated custom splits...
if not self.has_custom_splits(string_value):
# Then we should NOT put this string on its own line.
Expand Down
25 changes: 25 additions & 0 deletions tests/data/preview/long_strings__east_asian_width.py
@@ -0,0 +1,25 @@
# The following strings do not have not-so-many chars, but are long enough
# when these are rendered in a monospace font (if the renderer respects
# Unicode East Asian Width properties).
hangul = '코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이 필요한 문자열'
hanzi = '中文測試:代碼點數量少,但在真正的終端模擬器或編輯器中呈現時太長,因此需要換行的字符串。'
japanese = 'コードポイントの数は少ないが、実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、改行が要る文字列'

# output

# The following strings do not have not-so-many chars, but are long enough
# when these are rendered in a monospace font (if the renderer respects
# Unicode East Asian Width properties).
hangul = (
"코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이"
" 필요한 문자열"
)
hanzi = (
"中文測試:代碼點數量少,但在真正的終端模擬器或編輯器中呈現時太長,"
"因此需要換行的字符串。"
)
japanese = (
"コードポイントの数は少ないが、"
"実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、"
"改行が要る文字列"
)

0 comments on commit 1c06c45

Please sign in to comment.