From cd27157759eecb4342fe76f193ce5e1eab28f6f7 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Sat, 17 Dec 2022 15:40:26 +0900 Subject: [PATCH 1/4] Let string splitters respect East Asian Width See also: https://www.unicode.org/reports/tr11/ --- CHANGES.md | 5 ++ src/black/lines.py | 9 ++-- src/black/strings.py | 35 ++++++++++++++ src/black/trans.py | 48 ++++++++++++------- .../preview/long_strings__east_asian_width.py | 25 ++++++++++ 5 files changed, 101 insertions(+), 21 deletions(-) create mode 100644 tests/data/preview/long_strings__east_asian_width.py diff --git a/CHANGES.md b/CHANGES.md index 2fa0cb41b38..b23148d0f1a 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -18,6 +18,11 @@ entry (#3393) - `with` statements that contain two context managers will be consistently wrapped in parentheses (#3589) +- Let string splitters respect [East Asian Width](https://www.unicode.org/reports/tr11/) + (#3445) +- Now long string literals can be split after East Asian commas and periods (`、` U+3001 + IDEOGRAPHIC COMMA, `。` U+3002 IDEOGRAPHIC FULL STOP, & `,` U+FF0C FULLWIDTH COMMA) + besides before spaces (#3445) ### Configuration diff --git a/src/black/lines.py b/src/black/lines.py index 4b57d1f0ea8..979305a737d 100644 --- a/src/black/lines.py +++ b/src/black/lines.py @@ -33,6 +33,7 @@ syms, whitespace, ) +from black.strings import str_width from blib2to3.pgen2 import token from blib2to3.pytree import Leaf, Node @@ -732,9 +733,11 @@ def is_line_short_enough( # noqa: C901 if not line_str: line_str = line_to_string(line) + width = str_width if mode.preview else len + if Preview.multiline_string_handling not in mode: return ( - len(line_str) <= mode.line_length + width(line_str) <= mode.line_length and "\n" not in line_str # multiline strings and not line.contains_standalone_comments() ) @@ -743,10 +746,10 @@ def is_line_short_enough( # noqa: C901 return False if "\n" not in line_str: # No multiline strings (MLS) present - return len(line_str) <= mode.line_length + return width(line_str) <= mode.line_length first, *_, last = line_str.split("\n") - if len(first) > mode.line_length or len(last) > mode.line_length: + if width(first) > mode.line_length or width(last) > mode.line_length: return False # Traverse the AST to examine the context of the multiline string (MLS), diff --git a/src/black/strings.py b/src/black/strings.py index 3e3bc12fe72..7f30e283a4a 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -4,6 +4,7 @@ import re import sys +import unicodedata from functools import lru_cache from typing import List, Match, Pattern @@ -278,3 +279,37 @@ def replace(m: Match[str]) -> str: return back_slashes + "N{" + groups["N"].upper() + "}" leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text) + + +def char_width(char: str) -> int: + """Return the width of a single character as it would be displayed in a + terminal or editor (which respects Unicode East Asian Width). + + Full width characters are counted as 2, while half width characters are + counted as 1. + """ + return 2 if unicodedata.east_asian_width(char) in ("F", "W") else 1 + + +def str_width(line_str: str) -> int: + """Return the width of `line_str` as it would be displayed in a terminal + or editor (which respects Unicode East Asian Width). + + You could utilize this function to determine, for example, if a string + is too wide to display in a terminal or editor. + """ + return sum(map(char_width, line_str)) + + +def count_chars_in_width(line_str: str, max_width: int) -> int: + """Count the number of characters in `line_str` that would fit in a + terminal or editor of `max_width` (which respects Unicode East Asian + Width). + """ + total_width = 0 + for i, char in enumerate(line_str): + width = char_width(char) + if width + total_width > max_width: + return i + total_width += width + return len(line_str) diff --git a/src/black/trans.py b/src/black/trans.py index a6a416e71bc..95695f32b14 100644 --- a/src/black/trans.py +++ b/src/black/trans.py @@ -48,9 +48,11 @@ from black.rusty import Err, Ok, Result from black.strings import ( assert_is_leaf_string, + count_chars_in_width, get_string_prefix, has_triple_quotes, normalize_string_quotes, + str_width, ) from blib2to3.pgen2 import token from blib2to3.pytree import Leaf, Node @@ -71,6 +73,8 @@ class CannotTransform(Exception): TResult = Result[T, CannotTransform] # (T)ransform Result TMatchResult = TResult[List[Index]] +SPLIT_SAFE_CHARS = frozenset(["\u3001", "\u3002", "\uff0c"]) # East Asian stops + def TErr(err_msg: str) -> Err[CannotTransform]: """(T)ransform Err @@ -1164,7 +1168,7 @@ def _get_max_string_length(self, line: Line, string_idx: int) -> int: # WMA4 the length of the inline comment. offset += len(comment_leaf.value) - max_string_length = self.line_length - offset + max_string_length = count_chars_in_width(str(line), self.line_length - offset) return max_string_length @staticmethod @@ -1419,11 +1423,13 @@ def maybe_append_string_operators(new_line: Line) -> None: is_valid_index(string_idx + 1) and LL[string_idx + 1].type == token.COMMA ) - def max_last_string() -> int: + def max_last_string_column() -> int: """ Returns: - The max allowed length of the string value used for the last - line we will construct. + The max allowed width of the string value used for the last + line we will construct. Note that this value means the width + rather than the number of characters (e.g., many East Asian + characters expand to two columns). """ result = self.line_length result -= line.depth * 4 @@ -1431,14 +1437,14 @@ def max_last_string() -> int: result -= string_op_leaves_length return result - # --- Calculate Max Break Index (for string value) + # --- Calculate Max Break Width (for string value) # We start with the line length limit - max_break_idx = self.line_length + max_break_width = self.line_length # The last index of a string of length N is N-1. - max_break_idx -= 1 + max_break_width -= 1 # Leading whitespace is not present in the string value (e.g. Leaf.value). - max_break_idx -= line.depth * 4 - if max_break_idx < 0: + max_break_width -= line.depth * 4 + if max_break_width < 0: yield TErr( f"Unable to split {LL[string_idx].value} at such high of a line depth:" f" {line.depth}" @@ -1451,7 +1457,7 @@ def max_last_string() -> int: # line limit. use_custom_breakpoints = bool( custom_splits - and all(csplit.break_idx <= max_break_idx for csplit in custom_splits) + and all(csplit.break_idx <= max_break_width for csplit in custom_splits) ) # Temporary storage for the remaining chunk of the string line that @@ -1467,7 +1473,7 @@ def more_splits_should_be_made() -> bool: if use_custom_breakpoints: return len(custom_splits) > 1 else: - return len(rest_value) > max_last_string() + return str_width(rest_value) > max_last_string_column() string_line_results: List[Ok[Line]] = [] while more_splits_should_be_made(): @@ -1477,7 +1483,10 @@ def more_splits_should_be_made() -> bool: break_idx = csplit.break_idx else: # Algorithmic Split (automatic) - max_bidx = max_break_idx - string_op_leaves_length + max_bidx = ( + count_chars_in_width(rest_value, max_break_width) + - string_op_leaves_length + ) maybe_break_idx = self._get_break_idx(rest_value, max_bidx) if maybe_break_idx is None: # If we are unable to algorithmically determine a good split @@ -1574,7 +1583,7 @@ def more_splits_should_be_made() -> bool: # Try to fit them all on the same line with the last substring... if ( - len(temp_value) <= max_last_string() + str_width(temp_value) <= max_last_string_column() or LL[string_idx + 1].type == token.COMMA ): last_line.append(rest_leaf) @@ -1694,6 +1703,7 @@ def passes_all_checks(i: Index) -> bool: section of this classes' docstring would be be met by returning @i. """ is_space = string[i] == " " + is_split_safe = is_valid_index(i - 1) and string[i - 1] in SPLIT_SAFE_CHARS is_not_escaped = True j = i - 1 @@ -1706,7 +1716,7 @@ def passes_all_checks(i: Index) -> bool: and len(string[:i]) >= self.MIN_SUBSTR_SIZE ) return ( - is_space + (is_space or is_split_safe) and is_not_escaped and is_big_enough and not breaks_unsplittable_expression(i) @@ -1851,11 +1861,13 @@ def do_splitter_match(self, line: Line) -> TMatchResult: if string_idx is not None: string_value = line.leaves[string_idx].value - # If the string has no spaces... - if " " not in string_value: + # If the string has neither spaces nor East Asian stops... + if not any( + char == " " or char in SPLIT_SAFE_CHARS for char in string_value + ): # And will still violate the line length limit when split... - max_string_length = self.line_length - ((line.depth + 1) * 4) - if len(string_value) > max_string_length: + max_string_width = self.line_length - ((line.depth + 1) * 4) + if str_width(string_value) > max_string_width: # And has no associated custom splits... if not self.has_custom_splits(string_value): # Then we should NOT put this string on its own line. diff --git a/tests/data/preview/long_strings__east_asian_width.py b/tests/data/preview/long_strings__east_asian_width.py new file mode 100644 index 00000000000..fb66a78ed8b --- /dev/null +++ b/tests/data/preview/long_strings__east_asian_width.py @@ -0,0 +1,25 @@ +# The following strings do not have not-so-many chars, but are long enough +# when these are rendered in a monospace font (if the renderer respects +# Unicode East Asian Width properties). +hangul = '코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이 필요한 문자열' +hanzi = '中文測試:代碼點數量少,但在真正的終端模擬器或編輯器中呈現時太長,因此需要換行的字符串。' +japanese = 'コードポイントの数は少ないが、実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、改行が要る文字列' + +# output + +# The following strings do not have not-so-many chars, but are long enough +# when these are rendered in a monospace font (if the renderer respects +# Unicode East Asian Width properties). +hangul = ( + "코드포인트 수는 적으나 실제 터미널이나 에디터에서 렌더링될 땐 너무 길어서 줄바꿈이" + " 필요한 문자열" +) +hanzi = ( + "中文測試:代碼點數量少,但在真正的終端模擬器或編輯器中呈現時太長," + "因此需要換行的字符串。" +) +japanese = ( + "コードポイントの数は少ないが、" + "実際の端末エミュレータやエディタでレンダリングされる時は長すぎる為、" + "改行が要る文字列" +) From 0800aeecab6c5a738b55b6d86c194579e5b32fcb Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Mon, 26 Dec 2022 04:48:38 +0900 Subject: [PATCH 2/4] Fast path to measure width of ASCII strings See also: https://github.com/psf/black/pull/3445#issuecomment-1364459424 --- src/black/strings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/black/strings.py b/src/black/strings.py index 7f30e283a4a..6849e8927c4 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -298,6 +298,9 @@ def str_width(line_str: str) -> int: You could utilize this function to determine, for example, if a string is too wide to display in a terminal or editor. """ + if line_str.isascii(): + # Fast path for most of strings which contains only characters in ASCII: + return len(line_str) return sum(map(char_width, line_str)) From ecb5fdffb02216b8718f47eeb4912a9d7e4c5bc2 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 10 Jan 2023 01:18:10 +0900 Subject: [PATCH 3/4] Generate more precise width table using wcwidth --- scripts/make_width_table.py | 69 +++++ src/black/_width_table.py | 484 ++++++++++++++++++++++++++++++++++++ src/black/strings.py | 25 +- 3 files changed, 574 insertions(+), 4 deletions(-) create mode 100644 scripts/make_width_table.py create mode 100644 src/black/_width_table.py diff --git a/scripts/make_width_table.py b/scripts/make_width_table.py new file mode 100644 index 00000000000..f15631e2bd0 --- /dev/null +++ b/scripts/make_width_table.py @@ -0,0 +1,69 @@ +"""Generates a width table for Unicode characters. + +This script generates a width table for Unicode characters that are not +narrow (width 1). The table is written to src/black/_width_table.py (note +that although this file is generated, it is checked into Git) and is used +by the char_width() function in src/black/strings.py. + +You should run this script when you upgrade wcwidth, which is expected to +happen when a new Unicode version is released. The generated table contains +the version of wcwidth and Unicode that it was generated for. + +In order to run this script, you need to install the latest version of wcwidth. +You can do this by running: + + pip install -U wcwidth + +""" +import sys +from os.path import basename, dirname, join +from typing import Iterable, Tuple + +import wcwidth + + +def make_width_table() -> Iterable[Tuple[int, int, int]]: + start_codepoint = -1 + end_codepoint = -1 + range_width = -2 + for codepoint in range(0, sys.maxunicode + 1): + width = wcwidth.wcwidth(chr(codepoint)) + if width == 1: + continue + if start_codepoint < 0: + start_codepoint = codepoint + range_width = width + elif width != range_width or codepoint != end_codepoint + 1: + yield (start_codepoint, end_codepoint, range_width) + start_codepoint = codepoint + range_width = width + end_codepoint = codepoint + if start_codepoint >= 0: + yield (start_codepoint, end_codepoint, range_width) + + +def main() -> None: + table_path = join(dirname(__file__), "..", "src", "black", "_width_table.py") + with open(table_path, "w") as f: + f.write( + f"""# Generated by {basename(__file__)} +# wcwidth {wcwidth.__version__} +# Unicode {wcwidth.list_versions()[-1]} +import sys +from typing import List, Tuple + +if sys.version_info < (3, 8): + from typing_extensions import Final +else: + from typing import Final + +WIDTH_TABLE: Final[List[Tuple[int, int, int]]] = [ +""" + ) + for triple in make_width_table(): + f.write(f" {triple!r},\n") + f.write("]\n") + + +if __name__ == "__main__": + main() diff --git a/src/black/_width_table.py b/src/black/_width_table.py new file mode 100644 index 00000000000..6923f597687 --- /dev/null +++ b/src/black/_width_table.py @@ -0,0 +1,484 @@ +# Generated by make_width_table.py +# wcwidth 0.2.6 +# Unicode 15.0.0 +import sys +from typing import List, Tuple + +if sys.version_info < (3, 8): + from typing_extensions import Final +else: + from typing import Final + +WIDTH_TABLE: Final[List[Tuple[int, int, int]]] = [ + (0, 0, 0), + (1, 31, -1), + (127, 159, -1), + (768, 879, 0), + (1155, 1161, 0), + (1425, 1469, 0), + (1471, 1471, 0), + (1473, 1474, 0), + (1476, 1477, 0), + (1479, 1479, 0), + (1552, 1562, 0), + (1611, 1631, 0), + (1648, 1648, 0), + (1750, 1756, 0), + (1759, 1764, 0), + (1767, 1768, 0), + (1770, 1773, 0), + (1809, 1809, 0), + (1840, 1866, 0), + (1958, 1968, 0), + (2027, 2035, 0), + (2045, 2045, 0), + (2070, 2073, 0), + (2075, 2083, 0), + (2085, 2087, 0), + (2089, 2093, 0), + (2137, 2139, 0), + (2200, 2207, 0), + (2250, 2273, 0), + (2275, 2306, 0), + (2362, 2362, 0), + (2364, 2364, 0), + (2369, 2376, 0), + (2381, 2381, 0), + (2385, 2391, 0), + (2402, 2403, 0), + (2433, 2433, 0), + (2492, 2492, 0), + (2497, 2500, 0), + (2509, 2509, 0), + (2530, 2531, 0), + (2558, 2558, 0), + (2561, 2562, 0), + (2620, 2620, 0), + (2625, 2626, 0), + (2631, 2632, 0), + (2635, 2637, 0), + (2641, 2641, 0), + (2672, 2673, 0), + (2677, 2677, 0), + (2689, 2690, 0), + (2748, 2748, 0), + (2753, 2757, 0), + (2759, 2760, 0), + (2765, 2765, 0), + (2786, 2787, 0), + (2810, 2815, 0), + (2817, 2817, 0), + (2876, 2876, 0), + (2879, 2879, 0), + (2881, 2884, 0), + (2893, 2893, 0), + (2901, 2902, 0), + (2914, 2915, 0), + (2946, 2946, 0), + (3008, 3008, 0), + (3021, 3021, 0), + (3072, 3072, 0), + (3076, 3076, 0), + (3132, 3132, 0), + (3134, 3136, 0), + (3142, 3144, 0), + (3146, 3149, 0), + (3157, 3158, 0), + (3170, 3171, 0), + (3201, 3201, 0), + (3260, 3260, 0), + (3263, 3263, 0), + (3270, 3270, 0), + (3276, 3277, 0), + (3298, 3299, 0), + (3328, 3329, 0), + (3387, 3388, 0), + (3393, 3396, 0), + (3405, 3405, 0), + (3426, 3427, 0), + (3457, 3457, 0), + (3530, 3530, 0), + (3538, 3540, 0), + (3542, 3542, 0), + (3633, 3633, 0), + (3636, 3642, 0), + (3655, 3662, 0), + (3761, 3761, 0), + (3764, 3772, 0), + (3784, 3790, 0), + (3864, 3865, 0), + (3893, 3893, 0), + (3895, 3895, 0), + (3897, 3897, 0), + (3953, 3966, 0), + (3968, 3972, 0), + (3974, 3975, 0), + (3981, 3991, 0), + (3993, 4028, 0), + (4038, 4038, 0), + (4141, 4144, 0), + (4146, 4151, 0), + (4153, 4154, 0), + (4157, 4158, 0), + (4184, 4185, 0), + (4190, 4192, 0), + (4209, 4212, 0), + (4226, 4226, 0), + (4229, 4230, 0), + (4237, 4237, 0), + (4253, 4253, 0), + (4352, 4447, 2), + (4957, 4959, 0), + (5906, 5908, 0), + (5938, 5939, 0), + (5970, 5971, 0), + (6002, 6003, 0), + (6068, 6069, 0), + (6071, 6077, 0), + (6086, 6086, 0), + (6089, 6099, 0), + (6109, 6109, 0), + (6155, 6157, 0), + (6159, 6159, 0), + (6277, 6278, 0), + (6313, 6313, 0), + (6432, 6434, 0), + (6439, 6440, 0), + (6450, 6450, 0), + (6457, 6459, 0), + (6679, 6680, 0), + (6683, 6683, 0), + (6742, 6742, 0), + (6744, 6750, 0), + (6752, 6752, 0), + (6754, 6754, 0), + (6757, 6764, 0), + (6771, 6780, 0), + (6783, 6783, 0), + (6832, 6862, 0), + (6912, 6915, 0), + (6964, 6964, 0), + (6966, 6970, 0), + (6972, 6972, 0), + (6978, 6978, 0), + (7019, 7027, 0), + (7040, 7041, 0), + (7074, 7077, 0), + (7080, 7081, 0), + (7083, 7085, 0), + (7142, 7142, 0), + (7144, 7145, 0), + (7149, 7149, 0), + (7151, 7153, 0), + (7212, 7219, 0), + (7222, 7223, 0), + (7376, 7378, 0), + (7380, 7392, 0), + (7394, 7400, 0), + (7405, 7405, 0), + (7412, 7412, 0), + (7416, 7417, 0), + (7616, 7679, 0), + (8203, 8207, 0), + (8232, 8238, 0), + (8288, 8291, 0), + (8400, 8432, 0), + (8986, 8987, 2), + (9001, 9002, 2), + (9193, 9196, 2), + (9200, 9200, 2), + (9203, 9203, 2), + (9725, 9726, 2), + (9748, 9749, 2), + (9800, 9811, 2), + (9855, 9855, 2), + (9875, 9875, 2), + (9889, 9889, 2), + (9898, 9899, 2), + (9917, 9918, 2), + (9924, 9925, 2), + (9934, 9934, 2), + (9940, 9940, 2), + (9962, 9962, 2), + (9970, 9971, 2), + (9973, 9973, 2), + (9978, 9978, 2), + (9981, 9981, 2), + (9989, 9989, 2), + (9994, 9995, 2), + (10024, 10024, 2), + (10060, 10060, 2), + (10062, 10062, 2), + (10067, 10069, 2), + (10071, 10071, 2), + (10133, 10135, 2), + (10160, 10160, 2), + (10175, 10175, 2), + (11035, 11036, 2), + (11088, 11088, 2), + (11093, 11093, 2), + (11503, 11505, 0), + (11647, 11647, 0), + (11744, 11775, 0), + (11904, 11929, 2), + (11931, 12019, 2), + (12032, 12245, 2), + (12272, 12283, 2), + (12288, 12329, 2), + (12330, 12333, 0), + (12334, 12350, 2), + (12353, 12438, 2), + (12441, 12442, 0), + (12443, 12543, 2), + (12549, 12591, 2), + (12593, 12686, 2), + (12688, 12771, 2), + (12784, 12830, 2), + (12832, 12871, 2), + (12880, 19903, 2), + (19968, 42124, 2), + (42128, 42182, 2), + (42607, 42610, 0), + (42612, 42621, 0), + (42654, 42655, 0), + (42736, 42737, 0), + (43010, 43010, 0), + (43014, 43014, 0), + (43019, 43019, 0), + (43045, 43046, 0), + (43052, 43052, 0), + (43204, 43205, 0), + (43232, 43249, 0), + (43263, 43263, 0), + (43302, 43309, 0), + (43335, 43345, 0), + (43360, 43388, 2), + (43392, 43394, 0), + (43443, 43443, 0), + (43446, 43449, 0), + (43452, 43453, 0), + (43493, 43493, 0), + (43561, 43566, 0), + (43569, 43570, 0), + (43573, 43574, 0), + (43587, 43587, 0), + (43596, 43596, 0), + (43644, 43644, 0), + (43696, 43696, 0), + (43698, 43700, 0), + (43703, 43704, 0), + (43710, 43711, 0), + (43713, 43713, 0), + (43756, 43757, 0), + (43766, 43766, 0), + (44005, 44005, 0), + (44008, 44008, 0), + (44013, 44013, 0), + (44032, 55203, 2), + (63744, 64255, 2), + (64286, 64286, 0), + (65024, 65039, 0), + (65040, 65049, 2), + (65056, 65071, 0), + (65072, 65106, 2), + (65108, 65126, 2), + (65128, 65131, 2), + (65281, 65376, 2), + (65504, 65510, 2), + (66045, 66045, 0), + (66272, 66272, 0), + (66422, 66426, 0), + (68097, 68099, 0), + (68101, 68102, 0), + (68108, 68111, 0), + (68152, 68154, 0), + (68159, 68159, 0), + (68325, 68326, 0), + (68900, 68903, 0), + (69291, 69292, 0), + (69373, 69375, 0), + (69446, 69456, 0), + (69506, 69509, 0), + (69633, 69633, 0), + (69688, 69702, 0), + (69744, 69744, 0), + (69747, 69748, 0), + (69759, 69761, 0), + (69811, 69814, 0), + (69817, 69818, 0), + (69826, 69826, 0), + (69888, 69890, 0), + (69927, 69931, 0), + (69933, 69940, 0), + (70003, 70003, 0), + (70016, 70017, 0), + (70070, 70078, 0), + (70089, 70092, 0), + (70095, 70095, 0), + (70191, 70193, 0), + (70196, 70196, 0), + (70198, 70199, 0), + (70206, 70206, 0), + (70209, 70209, 0), + (70367, 70367, 0), + (70371, 70378, 0), + (70400, 70401, 0), + (70459, 70460, 0), + (70464, 70464, 0), + (70502, 70508, 0), + (70512, 70516, 0), + (70712, 70719, 0), + (70722, 70724, 0), + (70726, 70726, 0), + (70750, 70750, 0), + (70835, 70840, 0), + (70842, 70842, 0), + (70847, 70848, 0), + (70850, 70851, 0), + (71090, 71093, 0), + (71100, 71101, 0), + (71103, 71104, 0), + (71132, 71133, 0), + (71219, 71226, 0), + (71229, 71229, 0), + (71231, 71232, 0), + (71339, 71339, 0), + (71341, 71341, 0), + (71344, 71349, 0), + (71351, 71351, 0), + (71453, 71455, 0), + (71458, 71461, 0), + (71463, 71467, 0), + (71727, 71735, 0), + (71737, 71738, 0), + (71995, 71996, 0), + (71998, 71998, 0), + (72003, 72003, 0), + (72148, 72151, 0), + (72154, 72155, 0), + (72160, 72160, 0), + (72193, 72202, 0), + (72243, 72248, 0), + (72251, 72254, 0), + (72263, 72263, 0), + (72273, 72278, 0), + (72281, 72283, 0), + (72330, 72342, 0), + (72344, 72345, 0), + (72752, 72758, 0), + (72760, 72765, 0), + (72767, 72767, 0), + (72850, 72871, 0), + (72874, 72880, 0), + (72882, 72883, 0), + (72885, 72886, 0), + (73009, 73014, 0), + (73018, 73018, 0), + (73020, 73021, 0), + (73023, 73029, 0), + (73031, 73031, 0), + (73104, 73105, 0), + (73109, 73109, 0), + (73111, 73111, 0), + (73459, 73460, 0), + (73472, 73473, 0), + (73526, 73530, 0), + (73536, 73536, 0), + (73538, 73538, 0), + (78912, 78912, 0), + (78919, 78933, 0), + (92912, 92916, 0), + (92976, 92982, 0), + (94031, 94031, 0), + (94095, 94098, 0), + (94176, 94179, 2), + (94180, 94180, 0), + (94192, 94193, 2), + (94208, 100343, 2), + (100352, 101589, 2), + (101632, 101640, 2), + (110576, 110579, 2), + (110581, 110587, 2), + (110589, 110590, 2), + (110592, 110882, 2), + (110898, 110898, 2), + (110928, 110930, 2), + (110933, 110933, 2), + (110948, 110951, 2), + (110960, 111355, 2), + (113821, 113822, 0), + (118528, 118573, 0), + (118576, 118598, 0), + (119143, 119145, 0), + (119163, 119170, 0), + (119173, 119179, 0), + (119210, 119213, 0), + (119362, 119364, 0), + (121344, 121398, 0), + (121403, 121452, 0), + (121461, 121461, 0), + (121476, 121476, 0), + (121499, 121503, 0), + (121505, 121519, 0), + (122880, 122886, 0), + (122888, 122904, 0), + (122907, 122913, 0), + (122915, 122916, 0), + (122918, 122922, 0), + (123023, 123023, 0), + (123184, 123190, 0), + (123566, 123566, 0), + (123628, 123631, 0), + (124140, 124143, 0), + (125136, 125142, 0), + (125252, 125258, 0), + (126980, 126980, 2), + (127183, 127183, 2), + (127374, 127374, 2), + (127377, 127386, 2), + (127488, 127490, 2), + (127504, 127547, 2), + (127552, 127560, 2), + (127568, 127569, 2), + (127584, 127589, 2), + (127744, 127776, 2), + (127789, 127797, 2), + (127799, 127868, 2), + (127870, 127891, 2), + (127904, 127946, 2), + (127951, 127955, 2), + (127968, 127984, 2), + (127988, 127988, 2), + (127992, 128062, 2), + (128064, 128064, 2), + (128066, 128252, 2), + (128255, 128317, 2), + (128331, 128334, 2), + (128336, 128359, 2), + (128378, 128378, 2), + (128405, 128406, 2), + (128420, 128420, 2), + (128507, 128591, 2), + (128640, 128709, 2), + (128716, 128716, 2), + (128720, 128722, 2), + (128725, 128727, 2), + (128732, 128735, 2), + (128747, 128748, 2), + (128756, 128764, 2), + (128992, 129003, 2), + (129008, 129008, 2), + (129292, 129338, 2), + (129340, 129349, 2), + (129351, 129535, 2), + (129648, 129660, 2), + (129664, 129672, 2), + (129680, 129725, 2), + (129727, 129733, 2), + (129742, 129755, 2), + (129760, 129768, 2), + (129776, 129784, 2), + (131072, 196605, 2), + (196608, 262141, 2), + (917760, 917999, 0), +] diff --git a/src/black/strings.py b/src/black/strings.py index 6849e8927c4..ac18aef51ed 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -4,7 +4,6 @@ import re import sys -import unicodedata from functools import lru_cache from typing import List, Match, Pattern @@ -15,6 +14,7 @@ else: from typing import Final +from black._width_table import WIDTH_TABLE STRING_PREFIX_CHARS: Final = "furbFURB" # All possible string prefix characters. STRING_PREFIX_RE: Final = re.compile( @@ -281,14 +281,31 @@ def replace(m: Match[str]) -> str: leaf.value = re.sub(UNICODE_ESCAPE_RE, replace, text) +@lru_cache(maxsize=4096) def char_width(char: str) -> int: """Return the width of a single character as it would be displayed in a terminal or editor (which respects Unicode East Asian Width). Full width characters are counted as 2, while half width characters are - counted as 1. + counted as 1. Also control characters are counted as 0. """ - return 2 if unicodedata.east_asian_width(char) in ("F", "W") else 1 + table = WIDTH_TABLE + codepoint = ord(char) + highest = len(table) - 1 + lowest = 0 + idx = highest // 2 + while True: + start_codepoint, end_codepoint, width = table[idx] + if codepoint < start_codepoint: + highest = idx - 1 + elif codepoint > end_codepoint: + lowest = idx + 1 + else: + return 0 if width < 0 else width + if highest < lowest: + break + idx = (highest + lowest) // 2 + return 1 def str_width(line_str: str) -> int: @@ -299,7 +316,7 @@ def str_width(line_str: str) -> int: is too wide to display in a terminal or editor. """ if line_str.isascii(): - # Fast path for most of strings which contains only characters in ASCII: + # Fast path for a line consisting of only ASCII characters return len(line_str) return sum(map(char_width, line_str)) From 0ab121e74523b4f0fe88917e7153758ac5af9d79 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Fri, 10 Feb 2023 22:05:20 +0900 Subject: [PATCH 4/4] Let width table treat 0-width chars as 1-width --- scripts/make_width_table.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/scripts/make_width_table.py b/scripts/make_width_table.py index f15631e2bd0..09aca9c34b5 100644 --- a/scripts/make_width_table.py +++ b/scripts/make_width_table.py @@ -28,7 +28,11 @@ def make_width_table() -> Iterable[Tuple[int, int, int]]: range_width = -2 for codepoint in range(0, sys.maxunicode + 1): width = wcwidth.wcwidth(chr(codepoint)) - if width == 1: + if width <= 1: + # Ignore narrow characters along with zero-width characters so that + # they are treated as single-width. Note that treating zero-width + # characters as single-width is consistent with the heuristics built + # on top of str.isascii() in the str_width() function in strings.py. continue if start_codepoint < 0: start_codepoint = codepoint