Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Normalise Hex and unicode escape sequences in string #9280

Merged
merged 1 commit into from Dec 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -118,3 +118,5 @@
b'c'
)
}

b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
Expand Up @@ -133,3 +133,8 @@

# https://github.com/astral-sh/ruff/issues/7460
trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']

a = f"""\x1F"""
a = """\x1F"""
a = """\\x1F"""
a = """\\\x1F"""
2 changes: 2 additions & 0 deletions crates/ruff_python_formatter/src/other/bytes_literal.rs
Expand Up @@ -2,6 +2,7 @@ use ruff_python_ast::BytesLiteral;
use ruff_text_size::Ranged;

use crate::prelude::*;
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
use crate::string::{Quoting, StringPart};

#[derive(Default)]
Expand All @@ -17,6 +18,7 @@ impl FormatNodeRule<BytesLiteral> for FormatBytesLiteral {
&locator,
f.options().quote_style(),
f.context().docstring(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
)
.fmt(f)
}
Expand Down
2 changes: 2 additions & 0 deletions crates/ruff_python_formatter/src/other/f_string.rs
Expand Up @@ -2,6 +2,7 @@ use ruff_python_ast::FString;
use ruff_text_size::Ranged;

use crate::prelude::*;
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
use crate::string::{Quoting, StringPart};

/// Formats an f-string which is part of a larger f-string expression.
Expand Down Expand Up @@ -31,6 +32,7 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
&locator,
f.options().quote_style(),
f.context().docstring(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
)
.fmt(f);

Expand Down
2 changes: 2 additions & 0 deletions crates/ruff_python_formatter/src/other/string_literal.rs
Expand Up @@ -2,6 +2,7 @@ use ruff_python_ast::StringLiteral;
use ruff_text_size::Ranged;

use crate::prelude::*;
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
use crate::string::{docstring, Quoting, StringPart};
use crate::QuoteStyle;

Expand Down Expand Up @@ -61,6 +62,7 @@ impl Format<PyFormatContext<'_>> for FormatStringLiteral<'_> {
&locator,
quote_style,
f.context().docstring(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
);

if self.layout.is_docstring() {
Expand Down
5 changes: 5 additions & 0 deletions crates/ruff_python_formatter/src/preview.rs
Expand Up @@ -57,3 +57,8 @@ pub(crate) const fn is_module_docstring_newlines_enabled(context: &PyFormatConte
pub(crate) const fn is_dummy_implementations_enabled(context: &PyFormatContext) -> bool {
context.is_preview()
}

/// Returns `true` if the [`hex_codes_in_unicode_sequences`](https://github.com/psf/black/pull/2916) preview style is enabled.
pub(crate) const fn is_hex_codes_in_unicode_sequences_enabled(context: &PyFormatContext) -> bool {
context.is_preview()
}
218 changes: 202 additions & 16 deletions crates/ruff_python_formatter/src/string/mod.rs
Expand Up @@ -253,6 +253,7 @@ impl StringPart {
locator: &'a Locator,
configured_style: QuoteStyle,
parent_docstring_quote_char: Option<QuoteChar>,
normalize_hex: bool,
) -> NormalizedString<'a> {
// Per PEP 8, always prefer double quotes for triple-quoted strings.
let preferred_style = if self.quotes.triple {
Expand Down Expand Up @@ -310,7 +311,7 @@ impl StringPart {
configured_style
};

let raw_content = locator.slice(self.content_range);
let raw_content = &locator.slice(self.content_range);

let quotes = match quoting {
Quoting::Preserve => self.quotes,
Expand All @@ -327,7 +328,7 @@ impl StringPart {
}
};

let normalized = normalize_string(locator.slice(self.content_range), quotes, self.prefix);
let normalized = normalize_string(raw_content, quotes, self.prefix, normalize_hex);

NormalizedString {
prefix: self.prefix,
Expand Down Expand Up @@ -423,6 +424,10 @@ impl StringPrefix {
pub(super) const fn is_fstring(self) -> bool {
self.contains(StringPrefix::F_STRING)
}

pub(super) const fn is_byte(self) -> bool {
self.contains(StringPrefix::BYTE)
}
}

impl Format<PyFormatContext<'_>> for StringPrefix {
Expand Down Expand Up @@ -722,7 +727,12 @@ impl TryFrom<char> for QuoteChar {
/// with the provided [`StringQuotes`] style.
///
/// Returns the normalized string and whether it contains new lines.
fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> Cow<str> {
fn normalize_string(
input: &str,
quotes: StringQuotes,
prefix: StringPrefix,
normalize_hex: bool,
) -> Cow<str> {
// The normalized string if `input` is not yet normalized.
// `output` must remain empty if `input` is already normalized.
let mut output = String::new();
Expand Down Expand Up @@ -766,24 +776,50 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
}

last_index = index + '\r'.len_utf8();
} else if !quotes.triple && !is_raw {
} else if !is_raw {
if c == '\\' {
if let Some((_, next)) = chars.peek().copied() {
#[allow(clippy::if_same_then_else)]
if next == opposite_quote && formatted_value_nesting == 0 {
// Remove the escape by ending before the backslash and starting again with the quote
chars.next();
output.push_str(&input[last_index..index]);
last_index = index + '\\'.len_utf8();
} else if next == preferred_quote {
// Quote is already escaped, skip over it.
chars.next();
} else if next == '\\' {
if let Some((_, next)) = chars.clone().next() {
if next == '\\' {
// Skip over escaped backslashes
chars.next();
} else if normalize_hex {
if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
.and_then(|escape| {
escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..])
})
{
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
let escape_start_offset = index + escape_start_len;
if let Cow::Owned(normalised) = &normalised {
output.push_str(&input[last_index..escape_start_offset]);
output.push_str(normalised);
last_index = escape_start_offset + normalised.len();
};

// Move the `chars` iterator passed the escape sequence.
// Simply reassigning `chars` doesn't work because the indices` would
// then be off.
for _ in 0..next.len_utf8() + normalised.len() {
chars.next();
}
}
}

if !quotes.triple {
#[allow(clippy::if_same_then_else)]
if next == opposite_quote && formatted_value_nesting == 0 {
// Remove the escape by ending before the backslash and starting again with the quote
chars.next();
output.push_str(&input[last_index..index]);
last_index = index + '\\'.len_utf8();
} else if next == preferred_quote {
// Quote is already escaped, skip over it.
chars.next();
}
}
}
} else if c == preferred_quote && formatted_value_nesting == 0 {
} else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 {
// Escape the quote
output.push_str(&input[last_index..index]);
output.push('\\');
Expand All @@ -802,3 +838,153 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->

normalized
}

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
/// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
Hex(usize),

/// An escaped unicode name (`\N{name}`)
CharacterName,
}

impl UnicodeEscape {
fn new(first: char, allow_unicode: bool) -> Option<UnicodeEscape> {
Some(match first {
'x' => UnicodeEscape::Hex(2),
'u' if allow_unicode => UnicodeEscape::Hex(4),
'U' if allow_unicode => UnicodeEscape::Hex(8),
'N' if allow_unicode => UnicodeEscape::CharacterName,
_ => return None,
})
}

/// Normalises `\u..`, `\U..`, `\x..` and `\N{..}` escape sequences to:
///
/// * `\u`, `\U'` and `\x`: To use lower case for the characters `a-f`.
/// * `\N`: To use uppercase letters
fn normalize(self, input: &str) -> Option<Cow<str>> {
let mut normalised = String::new();
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried to avoid the allocation here when the string needs to be normalized, but making it work with the outer normalize_string was somewhat complicated (relative string indices, different iterators etc). That's why I decided to accept the allocation here, considering that we only allocate for non-normalised (unformatted) escape sequences (rare).

There's also the case that we need to be able to recover in case the escape sequence is invalid, which means we can't perform the normalisation in place (and directly push to the outer output String)


let len = match self {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This turned out more complicated than I hoped it would. Maybe a candidate to use a Regex? But I also think it's not worth bothering much. It's relatively straightforward code and I don't expect many changes to it,

UnicodeEscape::Hex(len) => {
// It's not a valid escape sequence if the input string has fewer characters
// left than required by the escape sequence.
if input.len() < len {
return None;
}

for (index, c) in input.char_indices().take(len) {
match c {
'0'..='9' | 'a'..='f' => {
if !normalised.is_empty() {
normalised.push(c);
}
}
'A'..='F' => {
if normalised.is_empty() {
normalised.reserve(len);
normalised.push_str(&input[..index]);
normalised.push(c.to_ascii_lowercase());
} else {
normalised.push(c.to_ascii_lowercase());
}
}
_ => {
// not a valid escape sequence
return None;
}
Comment on lines +893 to +896
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this happen? I'm getting syntax errors with invalid input (both ruff and cpython)

}
}

len
}
UnicodeEscape::CharacterName => {
let mut char_indices = input.char_indices();

if !matches!(char_indices.next(), Some((_, '{'))) {
return None;
}

loop {
if let Some((index, c)) = char_indices.next() {
match c {
'}' => {
if !normalised.is_empty() {
normalised.push('}');
}

// Name must be at least two characters long.
if index < 3 {
return None;
}

break index + '}'.len_utf8();
}
'0'..='9' | 'A'..='Z' | ' ' | '-' => {
if !normalised.is_empty() {
normalised.push(c);
}
}
'a'..='z' => {
if normalised.is_empty() {
normalised.reserve(c.len_utf8() + '}'.len_utf8());
normalised.push_str(&input[..index]);
normalised.push(c.to_ascii_uppercase());
} else {
normalised.push(c.to_ascii_uppercase());
}
}
_ => {
// Seems like an invalid escape sequence, don't normalise it.
return None;
}
}
} else {
// Unterminated escape sequence, dont' normalise it.
return None;
}
}
}
};

Some(if normalised.is_empty() {
Cow::Borrowed(&input[..len])
} else {
Cow::Owned(normalised)
})
}
}

#[cfg(test)]
mod tests {
use crate::string::{normalize_string, QuoteChar, StringPrefix, StringQuotes, UnicodeEscape};
use std::borrow::Cow;

#[test]
fn normalize_32_escape() {
let escape_sequence = UnicodeEscape::new('U', true).unwrap();

assert_eq!(
Some(Cow::Owned("0001f60e".to_string())),
escape_sequence.normalize("0001F60E")
);
}

#[test]
fn normalize_hex_in_byte_string() {
let input = r"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A";

let normalized = normalize_string(
input,
StringQuotes {
triple: false,
quote_char: QuoteChar::Double,
},
StringPrefix::BYTE,
true,
);

assert_eq!(r"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", &normalized);
}
}