From 71da60779766a46c92b6245a26e9e3f316f56083 Mon Sep 17 00:00:00 2001 From: Hong Minhee Date: Tue, 10 Jan 2023 01:18:10 +0900 Subject: [PATCH] Generate more precise width table using wcwidth --- pyproject.toml | 1 + scripts/make_width_table.py | 50 ++++ src/black/_width_table.py | 458 ++++++++++++++++++++++++++++++++++++ src/black/strings.py | 28 ++- 4 files changed, 532 insertions(+), 5 deletions(-) create mode 100644 scripts/make_width_table.py create mode 100644 src/black/_width_table.py diff --git a/pyproject.toml b/pyproject.toml index ab38908ba15..daafa30fafc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ colorama = ["colorama>=0.4.3"] uvloop = ["uvloop>=0.15.2"] d = [ "aiohttp>=3.7.4", + "wcwidth==0.2.5", ] jupyter = [ "ipython>=7.8.0", diff --git a/scripts/make_width_table.py b/scripts/make_width_table.py new file mode 100644 index 00000000000..bdc375aef9c --- /dev/null +++ b/scripts/make_width_table.py @@ -0,0 +1,50 @@ +import sys +from os.path import basename, dirname, join +from typing import Iterable, Tuple + +from wcwidth import wcwidth + + +def make_width_table() -> Iterable[Tuple[int, int, int]]: + start_codepoint = -1 + end_codepoint = -1 + range_width = -2 + for codepoint in range(0, sys.maxunicode + 1): + width = wcwidth(chr(codepoint)) + if width == 1: + continue + if start_codepoint < 0: + start_codepoint = codepoint + range_width = width + elif width != range_width or codepoint != end_codepoint + 1: + yield (start_codepoint, end_codepoint, range_width) + start_codepoint = codepoint + range_width = width + end_codepoint = codepoint + if start_codepoint >= 0: + yield (start_codepoint, end_codepoint, range_width) + + +def main() -> None: + table_path = join(dirname(__file__), "..", "src", "black", "_width_table.py") + with open(table_path, "w") as f: + f.write( + f"""# Generated by {basename(__file__)} +import sys +from typing import List, Tuple + +if sys.version_info < (3, 8): + from typing_extensions import Final +else: + from typing import Final + +WIDTH_TABLE: Final[List[Tuple[int, int, int]]] = [ +""" + ) + for triple in make_width_table(): + f.write(f" {triple!r},\n") + f.write("]\n") + + +if __name__ == "__main__": + main() diff --git a/src/black/_width_table.py b/src/black/_width_table.py new file mode 100644 index 00000000000..7fa91d51e44 --- /dev/null +++ b/src/black/_width_table.py @@ -0,0 +1,458 @@ +# Generated by make_width_table.py +import sys +from typing import List, Tuple + +if sys.version_info < (3, 8): + from typing_extensions import Final +else: + from typing import Final + +WIDTH_TABLE: Final[List[Tuple[int, int, int]]] = [ + (0, 0, 0), + (1, 31, -1), + (127, 159, -1), + (768, 879, 0), + (1155, 1161, 0), + (1425, 1469, 0), + (1471, 1471, 0), + (1473, 1474, 0), + (1476, 1477, 0), + (1479, 1479, 0), + (1552, 1562, 0), + (1611, 1631, 0), + (1648, 1648, 0), + (1750, 1756, 0), + (1759, 1764, 0), + (1767, 1768, 0), + (1770, 1773, 0), + (1809, 1809, 0), + (1840, 1866, 0), + (1958, 1968, 0), + (2027, 2035, 0), + (2045, 2045, 0), + (2070, 2073, 0), + (2075, 2083, 0), + (2085, 2087, 0), + (2089, 2093, 0), + (2137, 2139, 0), + (2259, 2273, 0), + (2275, 2306, 0), + (2362, 2362, 0), + (2364, 2364, 0), + (2369, 2376, 0), + (2381, 2381, 0), + (2385, 2391, 0), + (2402, 2403, 0), + (2433, 2433, 0), + (2492, 2492, 0), + (2497, 2500, 0), + (2509, 2509, 0), + (2530, 2531, 0), + (2558, 2558, 0), + (2561, 2562, 0), + (2620, 2620, 0), + (2625, 2626, 0), + (2631, 2632, 0), + (2635, 2637, 0), + (2641, 2641, 0), + (2672, 2673, 0), + (2677, 2677, 0), + (2689, 2690, 0), + (2748, 2748, 0), + (2753, 2757, 0), + (2759, 2760, 0), + (2765, 2765, 0), + (2786, 2787, 0), + (2810, 2815, 0), + (2817, 2817, 0), + (2876, 2876, 0), + (2879, 2879, 0), + (2881, 2884, 0), + (2893, 2893, 0), + (2901, 2902, 0), + (2914, 2915, 0), + (2946, 2946, 0), + (3008, 3008, 0), + (3021, 3021, 0), + (3072, 3072, 0), + (3076, 3076, 0), + (3134, 3136, 0), + (3142, 3144, 0), + (3146, 3149, 0), + (3157, 3158, 0), + (3170, 3171, 0), + (3201, 3201, 0), + (3260, 3260, 0), + (3263, 3263, 0), + (3270, 3270, 0), + (3276, 3277, 0), + (3298, 3299, 0), + (3328, 3329, 0), + (3387, 3388, 0), + (3393, 3396, 0), + (3405, 3405, 0), + (3426, 3427, 0), + (3457, 3457, 0), + (3530, 3530, 0), + (3538, 3540, 0), + (3542, 3542, 0), + (3633, 3633, 0), + (3636, 3642, 0), + (3655, 3662, 0), + (3761, 3761, 0), + (3764, 3772, 0), + (3784, 3789, 0), + (3864, 3865, 0), + (3893, 3893, 0), + (3895, 3895, 0), + (3897, 3897, 0), + (3953, 3966, 0), + (3968, 3972, 0), + (3974, 3975, 0), + (3981, 3991, 0), + (3993, 4028, 0), + (4038, 4038, 0), + (4141, 4144, 0), + (4146, 4151, 0), + (4153, 4154, 0), + (4157, 4158, 0), + (4184, 4185, 0), + (4190, 4192, 0), + (4209, 4212, 0), + (4226, 4226, 0), + (4229, 4230, 0), + (4237, 4237, 0), + (4253, 4253, 0), + (4352, 4447, 2), + (4957, 4959, 0), + (5906, 5908, 0), + (5938, 5940, 0), + (5970, 5971, 0), + (6002, 6003, 0), + (6068, 6069, 0), + (6071, 6077, 0), + (6086, 6086, 0), + (6089, 6099, 0), + (6109, 6109, 0), + (6155, 6157, 0), + (6277, 6278, 0), + (6313, 6313, 0), + (6432, 6434, 0), + (6439, 6440, 0), + (6450, 6450, 0), + (6457, 6459, 0), + (6679, 6680, 0), + (6683, 6683, 0), + (6742, 6742, 0), + (6744, 6750, 0), + (6752, 6752, 0), + (6754, 6754, 0), + (6757, 6764, 0), + (6771, 6780, 0), + (6783, 6783, 0), + (6832, 6848, 0), + (6912, 6915, 0), + (6964, 6964, 0), + (6966, 6970, 0), + (6972, 6972, 0), + (6978, 6978, 0), + (7019, 7027, 0), + (7040, 7041, 0), + (7074, 7077, 0), + (7080, 7081, 0), + (7083, 7085, 0), + (7142, 7142, 0), + (7144, 7145, 0), + (7149, 7149, 0), + (7151, 7153, 0), + (7212, 7219, 0), + (7222, 7223, 0), + (7376, 7378, 0), + (7380, 7392, 0), + (7394, 7400, 0), + (7405, 7405, 0), + (7412, 7412, 0), + (7416, 7417, 0), + (7616, 7673, 0), + (7675, 7679, 0), + (8203, 8207, 0), + (8232, 8238, 0), + (8288, 8291, 0), + (8400, 8432, 0), + (8986, 8987, 2), + (9001, 9002, 2), + (9193, 9196, 2), + (9200, 9200, 2), + (9203, 9203, 2), + (9725, 9726, 2), + (9748, 9749, 2), + (9800, 9811, 2), + (9855, 9855, 2), + (9875, 9875, 2), + (9889, 9889, 2), + (9898, 9899, 2), + (9917, 9918, 2), + (9924, 9925, 2), + (9934, 9934, 2), + (9940, 9940, 2), + (9962, 9962, 2), + (9970, 9971, 2), + (9973, 9973, 2), + (9978, 9978, 2), + (9981, 9981, 2), + (9989, 9989, 2), + (9994, 9995, 2), + (10024, 10024, 2), + (10060, 10060, 2), + (10062, 10062, 2), + (10067, 10069, 2), + (10071, 10071, 2), + (10133, 10135, 2), + (10160, 10160, 2), + (10175, 10175, 2), + (11035, 11036, 2), + (11088, 11088, 2), + (11093, 11093, 2), + (11503, 11505, 0), + (11647, 11647, 0), + (11744, 11775, 0), + (11904, 11929, 2), + (11931, 12019, 2), + (12032, 12245, 2), + (12272, 12283, 2), + (12288, 12329, 2), + (12330, 12333, 0), + (12334, 12350, 2), + (12353, 12438, 2), + (12441, 12442, 0), + (12443, 12543, 2), + (12549, 12591, 2), + (12593, 12686, 2), + (12688, 12771, 2), + (12784, 12830, 2), + (12832, 12871, 2), + (12880, 19903, 2), + (19968, 42124, 2), + (42128, 42182, 2), + (42607, 42610, 0), + (42612, 42621, 0), + (42654, 42655, 0), + (42736, 42737, 0), + (43010, 43010, 0), + (43014, 43014, 0), + (43019, 43019, 0), + (43045, 43046, 0), + (43052, 43052, 0), + (43204, 43205, 0), + (43232, 43249, 0), + (43263, 43263, 0), + (43302, 43309, 0), + (43335, 43345, 0), + (43360, 43388, 2), + (43392, 43394, 0), + (43443, 43443, 0), + (43446, 43449, 0), + (43452, 43453, 0), + (43493, 43493, 0), + (43561, 43566, 0), + (43569, 43570, 0), + (43573, 43574, 0), + (43587, 43587, 0), + (43596, 43596, 0), + (43644, 43644, 0), + (43696, 43696, 0), + (43698, 43700, 0), + (43703, 43704, 0), + (43710, 43711, 0), + (43713, 43713, 0), + (43756, 43757, 0), + (43766, 43766, 0), + (44005, 44005, 0), + (44008, 44008, 0), + (44013, 44013, 0), + (44032, 55203, 2), + (63744, 64255, 2), + (64286, 64286, 0), + (65024, 65039, 0), + (65040, 65049, 2), + (65056, 65071, 0), + (65072, 65106, 2), + (65108, 65126, 2), + (65128, 65131, 2), + (65281, 65376, 2), + (65504, 65510, 2), + (66045, 66045, 0), + (66272, 66272, 0), + (66422, 66426, 0), + (68097, 68099, 0), + (68101, 68102, 0), + (68108, 68111, 0), + (68152, 68154, 0), + (68159, 68159, 0), + (68325, 68326, 0), + (68900, 68903, 0), + (69291, 69292, 0), + (69446, 69456, 0), + (69633, 69633, 0), + (69688, 69702, 0), + (69759, 69761, 0), + (69811, 69814, 0), + (69817, 69818, 0), + (69888, 69890, 0), + (69927, 69931, 0), + (69933, 69940, 0), + (70003, 70003, 0), + (70016, 70017, 0), + (70070, 70078, 0), + (70089, 70092, 0), + (70095, 70095, 0), + (70191, 70193, 0), + (70196, 70196, 0), + (70198, 70199, 0), + (70206, 70206, 0), + (70367, 70367, 0), + (70371, 70378, 0), + (70400, 70401, 0), + (70459, 70460, 0), + (70464, 70464, 0), + (70502, 70508, 0), + (70512, 70516, 0), + (70712, 70719, 0), + (70722, 70724, 0), + (70726, 70726, 0), + (70750, 70750, 0), + (70835, 70840, 0), + (70842, 70842, 0), + (70847, 70848, 0), + (70850, 70851, 0), + (71090, 71093, 0), + (71100, 71101, 0), + (71103, 71104, 0), + (71132, 71133, 0), + (71219, 71226, 0), + (71229, 71229, 0), + (71231, 71232, 0), + (71339, 71339, 0), + (71341, 71341, 0), + (71344, 71349, 0), + (71351, 71351, 0), + (71453, 71455, 0), + (71458, 71461, 0), + (71463, 71467, 0), + (71727, 71735, 0), + (71737, 71738, 0), + (71995, 71996, 0), + (71998, 71998, 0), + (72003, 72003, 0), + (72148, 72151, 0), + (72154, 72155, 0), + (72160, 72160, 0), + (72193, 72202, 0), + (72243, 72248, 0), + (72251, 72254, 0), + (72263, 72263, 0), + (72273, 72278, 0), + (72281, 72283, 0), + (72330, 72342, 0), + (72344, 72345, 0), + (72752, 72758, 0), + (72760, 72765, 0), + (72767, 72767, 0), + (72850, 72871, 0), + (72874, 72880, 0), + (72882, 72883, 0), + (72885, 72886, 0), + (73009, 73014, 0), + (73018, 73018, 0), + (73020, 73021, 0), + (73023, 73029, 0), + (73031, 73031, 0), + (73104, 73105, 0), + (73109, 73109, 0), + (73111, 73111, 0), + (73459, 73460, 0), + (92912, 92916, 0), + (92976, 92982, 0), + (94031, 94031, 0), + (94095, 94098, 0), + (94176, 94179, 2), + (94180, 94180, 0), + (94192, 94193, 2), + (94208, 100343, 2), + (100352, 101589, 2), + (101632, 101640, 2), + (110592, 110878, 2), + (110928, 110930, 2), + (110948, 110951, 2), + (110960, 111355, 2), + (113821, 113822, 0), + (119143, 119145, 0), + (119163, 119170, 0), + (119173, 119179, 0), + (119210, 119213, 0), + (119362, 119364, 0), + (121344, 121398, 0), + (121403, 121452, 0), + (121461, 121461, 0), + (121476, 121476, 0), + (121499, 121503, 0), + (121505, 121519, 0), + (122880, 122886, 0), + (122888, 122904, 0), + (122907, 122913, 0), + (122915, 122916, 0), + (122918, 122922, 0), + (123184, 123190, 0), + (123628, 123631, 0), + (125136, 125142, 0), + (125252, 125258, 0), + (126980, 126980, 2), + (127183, 127183, 2), + (127374, 127374, 2), + (127377, 127386, 2), + (127488, 127490, 2), + (127504, 127547, 2), + (127552, 127560, 2), + (127568, 127569, 2), + (127584, 127589, 2), + (127744, 127776, 2), + (127789, 127797, 2), + (127799, 127868, 2), + (127870, 127891, 2), + (127904, 127946, 2), + (127951, 127955, 2), + (127968, 127984, 2), + (127988, 127988, 2), + (127992, 128062, 2), + (128064, 128064, 2), + (128066, 128252, 2), + (128255, 128317, 2), + (128331, 128334, 2), + (128336, 128359, 2), + (128378, 128378, 2), + (128405, 128406, 2), + (128420, 128420, 2), + (128507, 128591, 2), + (128640, 128709, 2), + (128716, 128716, 2), + (128720, 128722, 2), + (128725, 128727, 2), + (128747, 128748, 2), + (128756, 128764, 2), + (128992, 129003, 2), + (129292, 129338, 2), + (129340, 129349, 2), + (129351, 129400, 2), + (129402, 129483, 2), + (129485, 129535, 2), + (129648, 129652, 2), + (129656, 129658, 2), + (129664, 129670, 2), + (129680, 129704, 2), + (129712, 129718, 2), + (129728, 129730, 2), + (129744, 129750, 2), + (131072, 196605, 2), + (196608, 262141, 2), + (917760, 917999, 0), +] diff --git a/src/black/strings.py b/src/black/strings.py index b86145ee48e..263b9cf0c5e 100644 --- a/src/black/strings.py +++ b/src/black/strings.py @@ -4,7 +4,6 @@ import re import sys -import unicodedata from functools import lru_cache from typing import List, Pattern @@ -13,12 +12,14 @@ else: from typing import Final +from black._width_table import WIDTH_TABLE STRING_PREFIX_CHARS: Final = "furbFURB" # All possible string prefix characters. STRING_PREFIX_RE: Final = re.compile( r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL ) FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)") +COMMON_SINGLE_WIDTH_CHARS: Final = re.compile(r"^[\x20-\x7e\xa0-\u02ff]*$") def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str: @@ -239,14 +240,31 @@ def normalize_string_quotes(s: str) -> str: return f"{prefix}{new_quote}{new_body}{new_quote}" +@lru_cache(maxsize=4096) def char_width(char: str) -> int: """Return the width of a single character as it would be displayed in a terminal or editor (which respects Unicode East Asian Width). Full width characters are counted as 2, while half width characters are - counted as 1. + counted as 1. Also control characters are counted as 0. """ - return 2 if unicodedata.east_asian_width(char) in ("F", "W") else 1 + table = WIDTH_TABLE + codepoint = ord(char) + highest = len(table) - 1 + lowest = 0 + idx = highest // 2 + while True: + start_codepoint, end_codepoint, width = table[idx] + if codepoint < start_codepoint: + highest = idx - 1 + elif codepoint > end_codepoint: + lowest = idx + 1 + else: + return 0 if width < 0 else width + if highest < lowest: + break + idx = (highest + lowest) // 2 + return 1 def str_width(line_str: str) -> int: @@ -256,8 +274,8 @@ def str_width(line_str: str) -> int: You could utilize this function to determine, for example, if a string is too wide to display in a terminal or editor. """ - if line_str.isascii(): - # Fast path for most of strings which contains only characters in ASCII: + if COMMON_SINGLE_WIDTH_CHARS.match(line_str): + # Fast path for a line consisting of only common single width chars return len(line_str) return sum(map(char_width, line_str))