Skip to content

Commit

Permalink
Normalise Hex and unicode escape sequences in string
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaReiser committed Dec 26, 2023
1 parent 2951339 commit 1401dbd
Show file tree
Hide file tree
Showing 11 changed files with 248 additions and 123 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -118,3 +118,5 @@
b'c'
)
}

b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,8 @@

# https://github.com/astral-sh/ruff/issues/7460
trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']

a = f"""\x1F"""
a = """\x1F"""
a = """\\x1F"""
a = """\\\x1F"""
9 changes: 2 additions & 7 deletions crates/ruff_python_formatter/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,13 +211,8 @@ if True:
#[test]
fn quick_test() {
let source = r#"
def main() -> None:
if True:
some_very_long_variable_name_abcdefghijk = Foo()
some_very_long_variable_name_abcdefghijk = some_very_long_variable_name_abcdefghijk[
some_very_long_variable_name_abcdefghijk.some_very_long_attribute_name
== "This is a very long string abcdefghijk"
]
x = "\N{ox}\N{OX}"
"#;
let source_type = PySourceType::Python;
Expand Down
2 changes: 2 additions & 0 deletions crates/ruff_python_formatter/src/other/bytes_literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use ruff_python_ast::BytesLiteral;
use ruff_text_size::Ranged;

use crate::prelude::*;
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
use crate::string::{Quoting, StringPart};

#[derive(Default)]
Expand All @@ -17,6 +18,7 @@ impl FormatNodeRule<BytesLiteral> for FormatBytesLiteral {
&locator,
f.options().quote_style(),
f.context().docstring(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
)
.fmt(f)
}
Expand Down
2 changes: 2 additions & 0 deletions crates/ruff_python_formatter/src/other/f_string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use ruff_python_ast::FString;
use ruff_text_size::Ranged;

use crate::prelude::*;
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
use crate::string::{Quoting, StringPart};

/// Formats an f-string which is part of a larger f-string expression.
Expand Down Expand Up @@ -31,6 +32,7 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
&locator,
f.options().quote_style(),
f.context().docstring(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
)
.fmt(f);

Expand Down
2 changes: 2 additions & 0 deletions crates/ruff_python_formatter/src/other/string_literal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use ruff_python_ast::StringLiteral;
use ruff_text_size::Ranged;

use crate::prelude::*;
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
use crate::string::{docstring, Quoting, StringPart};
use crate::QuoteStyle;

Expand Down Expand Up @@ -61,6 +62,7 @@ impl Format<PyFormatContext<'_>> for FormatStringLiteral<'_> {
&locator,
quote_style,
f.context().docstring(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
);

if self.layout.is_docstring() {
Expand Down
5 changes: 5 additions & 0 deletions crates/ruff_python_formatter/src/preview.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,8 @@ pub(crate) const fn is_module_docstring_newlines_enabled(context: &PyFormatConte
pub(crate) const fn is_dummy_implementations_enabled(context: &PyFormatContext) -> bool {
context.is_preview()
}

/// Returns `true` if the [`hex_codes_in_unicode_sequences`](https://github.com/psf/black/pull/2916) preview style is enabled.
pub(crate) const fn is_hex_codes_in_unicode_sequences_enabled(context: &PyFormatContext) -> bool {
context.is_preview()
}
204 changes: 185 additions & 19 deletions crates/ruff_python_formatter/src/string/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,7 @@ impl StringPart {
locator: &'a Locator,
configured_style: QuoteStyle,
parent_docstring_quote_char: Option<QuoteChar>,
normalize_hex: bool,
) -> NormalizedString<'a> {
// Per PEP 8, always prefer double quotes for triple-quoted strings.
let preferred_style = if self.quotes.triple {
Expand Down Expand Up @@ -310,7 +311,7 @@ impl StringPart {
configured_style
};

let raw_content = locator.slice(self.content_range);
let raw_content = &locator.slice(self.content_range);

let quotes = match quoting {
Quoting::Preserve => self.quotes,
Expand All @@ -327,7 +328,7 @@ impl StringPart {
}
};

let normalized = normalize_string(locator.slice(self.content_range), quotes, self.prefix);
let normalized = normalize_string(raw_content, quotes, self.prefix, normalize_hex);

NormalizedString {
prefix: self.prefix,
Expand Down Expand Up @@ -423,6 +424,10 @@ impl StringPrefix {
pub(super) const fn is_fstring(self) -> bool {
self.contains(StringPrefix::F_STRING)
}

pub(super) const fn is_byte(self) -> bool {
self.contains(StringPrefix::BYTE)
}
}

impl Format<PyFormatContext<'_>> for StringPrefix {
Expand Down Expand Up @@ -722,7 +727,12 @@ impl TryFrom<char> for QuoteChar {
/// with the provided [`StringQuotes`] style.
///
/// Returns the normalized string and whether it contains new lines.
fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> Cow<str> {
fn normalize_string(
input: &str,
quotes: StringQuotes,
prefix: StringPrefix,
normalize_hex: bool,
) -> Cow<str> {
// The normalized string if `input` is not yet normalized.
// `output` must remain empty if `input` is already normalized.
let mut output = String::new();
Expand All @@ -734,15 +744,15 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
let preferred_quote = quote.as_char();
let opposite_quote = quote.invert().as_char();

let mut chars = input.char_indices().peekable();
let mut chars = input.char_indices();

let is_raw = prefix.is_raw_string();
let is_fstring = prefix.is_fstring();
let mut formatted_value_nesting = 0u32;

while let Some((index, c)) = chars.next() {
if is_fstring && matches!(c, '{' | '}') {
if chars.peek().copied().is_some_and(|(_, next)| next == c) {
if chars.as_str().starts_with(c) {
// Skip over the second character of the double braces
chars.next();
} else if c == '{' {
Expand All @@ -757,7 +767,7 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
output.push_str(&input[last_index..index]);

// Skip over the '\r' character, keep the `\n`
if chars.peek().copied().is_some_and(|(_, next)| next == '\n') {
if chars.as_str().starts_with('\n') {
chars.next();
}
// Replace the `\r` with a `\n`
Expand All @@ -766,24 +776,47 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
}

last_index = index + '\r'.len_utf8();
} else if !quotes.triple && !is_raw {
} else if !is_raw {
if c == '\\' {
if let Some((_, next)) = chars.peek().copied() {
#[allow(clippy::if_same_then_else)]
if next == opposite_quote && formatted_value_nesting == 0 {
// Remove the escape by ending before the backslash and starting again with the quote
chars.next();
output.push_str(&input[last_index..index]);
last_index = index + '\\'.len_utf8();
} else if next == preferred_quote {
// Quote is already escaped, skip over it.
chars.next();
} else if next == '\\' {
if let Some((_, next)) = chars.clone().next() {
if !quotes.triple {
#[allow(clippy::if_same_then_else)]
if next == opposite_quote && formatted_value_nesting == 0 {
// Remove the escape by ending before the backslash and starting again with the quote
chars.next();
output.push_str(&input[last_index..index]);
last_index = index + '\\'.len_utf8();
continue;
} else if next == preferred_quote {
// Quote is already escaped, skip over it.
chars.next();
continue;
}
}
if next == '\\' {
// Skip over escaped backslashes
chars.next();
} else if normalize_hex {
if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
.and_then(|escape| escape.normalize(&chars.as_str()[next.len_utf8()..]))
{
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
let escape_start_offset = index + escape_start_len;
if let Cow::Owned(normalised) = &normalised {
output.push_str(&input[last_index..escape_start_offset]);
output.push_str(normalised);
last_index = escape_start_offset + normalised.len();
};

// Skip over the escape sequence characters
for _ in 0..escape_start_len + normalised.len() {
chars.next();
}
}
}
}
} else if c == preferred_quote && formatted_value_nesting == 0 {
} else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 {
// Escape the quote
output.push_str(&input[last_index..index]);
output.push('\\');
Expand All @@ -802,3 +835,136 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->

normalized
}

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
/// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
Hex(usize),

/// An escaped unicode name (`\N{name}`)
CharacterName,
}

impl UnicodeEscape {
fn new(first: char, allow_unicode: bool) -> Option<UnicodeEscape> {
Some(match first {
'x' => UnicodeEscape::Hex(2),
'u' if allow_unicode => UnicodeEscape::Hex(4),
'U' if allow_unicode => UnicodeEscape::Hex(8),
'N' if allow_unicode => UnicodeEscape::CharacterName,
_ => return None,
})
}

/// Normalises `\u..`, `\U..`, `\x..` and `\N{..}` escape sequences to:
///
/// * `\u`, `\U'` and `\x`: To use lower case for the characters `a-f`.
/// * `\N`: To use uppercase letters
fn normalize(self, input: &str) -> Option<Cow<str>> {
let mut normalised = String::new();

let len = match self {
UnicodeEscape::Hex(len) => {
// It's not a valid escape sequence if the input string has fewer characters
// left than required by the escape sequence.
if input.len() < len {
return None;
}

for (index, c) in input.char_indices().take(len) {
match c {
'0'..='9' | 'a'..='f' => {
if !normalised.is_empty() {
normalised.push(c);
}
}
'A'..='F' => {
if normalised.is_empty() {
normalised.reserve(len);
normalised.push_str(&input[..index]);
normalised.push(c.to_ascii_lowercase());
} else {
normalised.push(c.to_ascii_lowercase());
}
}
_ => {
// not a valid escape sequence
return None;
}
}
}

len
}
UnicodeEscape::CharacterName => {
let mut char_indices = input.char_indices();

if !matches!(char_indices.next(), Some((_, '{'))) {
return None;
}

loop {
if let Some((index, c)) = char_indices.next() {
match c {
'}' => {
if !normalised.is_empty() {
normalised.push('}');
}

// Name must be at least two characters long.
if index < 3 {
return None;
}

break index + '}'.len_utf8();
}
'0'..='9' | 'A'..='Z' | ' ' | '-' => {
if !normalised.is_empty() {
normalised.push(c);
}
}
'a'..='z' => {
if normalised.is_empty() {
normalised.reserve(c.len_utf8() + '}'.len_utf8());
normalised.push_str(&input[..index]);
normalised.push(c.to_ascii_uppercase());
} else {
normalised.push(c.to_ascii_uppercase());
}
}
_ => {
// Seems like an invalid escape sequence, don't normalise it.
return None;
}
}
} else {
// Unterminated escape sequence, dont' normalise it.
return None;
}
}
}
};

Some(if normalised.is_empty() {
Cow::Borrowed(&input[..len])
} else {
Cow::Owned(normalised)
})
}
}

#[cfg(test)]
mod tests {
use crate::string::UnicodeEscape;
use std::borrow::Cow;

#[test]
fn normalize_32_escape() {
let escape_sequence = UnicodeEscape::new('U', true).unwrap();

assert_eq!(
Some(Cow::Owned("0001f60e".to_string())),
escape_sequence.normalize("0001F60E")
);
}
}

0 comments on commit 1401dbd

Please sign in to comment.