Skip to content

Commit

Permalink
Normalise Hex and unicode escape sequences in string
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaReiser committed Dec 27, 2023
1 parent 2951339 commit 4a49cdb
Show file tree
Hide file tree
Showing 11 changed files with 265 additions and 120 deletions.
Expand Up @@ -118,3 +118,5 @@
b'c'
)
}

b"Unicode Escape sequence don't apply to bytes: \N{0x} \u{ABCD} \U{ABCDEFGH}"
Expand Up @@ -133,3 +133,8 @@

# https://github.com/astral-sh/ruff/issues/7460
trailing_preferred_quote_texts = [''' "''', ''' ""''', ''' """''', ''' """"''']

a = f"""\x1F"""
a = """\x1F"""
a = """\\x1F"""
a = """\\\x1F"""
9 changes: 2 additions & 7 deletions crates/ruff_python_formatter/src/lib.rs
Expand Up @@ -211,13 +211,8 @@ if True:
#[test]
fn quick_test() {
let source = r#"
def main() -> None:
if True:
some_very_long_variable_name_abcdefghijk = Foo()
some_very_long_variable_name_abcdefghijk = some_very_long_variable_name_abcdefghijk[
some_very_long_variable_name_abcdefghijk.some_very_long_attribute_name
== "This is a very long string abcdefghijk"
]
x = "\N{ox}\N{OX}"
"#;
let source_type = PySourceType::Python;
Expand Down
2 changes: 2 additions & 0 deletions crates/ruff_python_formatter/src/other/bytes_literal.rs
Expand Up @@ -2,6 +2,7 @@ use ruff_python_ast::BytesLiteral;
use ruff_text_size::Ranged;

use crate::prelude::*;
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
use crate::string::{Quoting, StringPart};

#[derive(Default)]
Expand All @@ -17,6 +18,7 @@ impl FormatNodeRule<BytesLiteral> for FormatBytesLiteral {
&locator,
f.options().quote_style(),
f.context().docstring(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
)
.fmt(f)
}
Expand Down
2 changes: 2 additions & 0 deletions crates/ruff_python_formatter/src/other/f_string.rs
Expand Up @@ -2,6 +2,7 @@ use ruff_python_ast::FString;
use ruff_text_size::Ranged;

use crate::prelude::*;
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
use crate::string::{Quoting, StringPart};

/// Formats an f-string which is part of a larger f-string expression.
Expand Down Expand Up @@ -31,6 +32,7 @@ impl Format<PyFormatContext<'_>> for FormatFString<'_> {
&locator,
f.options().quote_style(),
f.context().docstring(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
)
.fmt(f);

Expand Down
2 changes: 2 additions & 0 deletions crates/ruff_python_formatter/src/other/string_literal.rs
Expand Up @@ -2,6 +2,7 @@ use ruff_python_ast::StringLiteral;
use ruff_text_size::Ranged;

use crate::prelude::*;
use crate::preview::is_hex_codes_in_unicode_sequences_enabled;
use crate::string::{docstring, Quoting, StringPart};
use crate::QuoteStyle;

Expand Down Expand Up @@ -61,6 +62,7 @@ impl Format<PyFormatContext<'_>> for FormatStringLiteral<'_> {
&locator,
quote_style,
f.context().docstring(),
is_hex_codes_in_unicode_sequences_enabled(f.context()),
);

if self.layout.is_docstring() {
Expand Down
5 changes: 5 additions & 0 deletions crates/ruff_python_formatter/src/preview.rs
Expand Up @@ -57,3 +57,8 @@ pub(crate) const fn is_module_docstring_newlines_enabled(context: &PyFormatConte
pub(crate) const fn is_dummy_implementations_enabled(context: &PyFormatContext) -> bool {
context.is_preview()
}

/// Returns `true` if the [`hex_codes_in_unicode_sequences`](https://github.com/psf/black/pull/2916) preview style is enabled.
pub(crate) const fn is_hex_codes_in_unicode_sequences_enabled(context: &PyFormatContext) -> bool {
context.is_preview()
}
218 changes: 202 additions & 16 deletions crates/ruff_python_formatter/src/string/mod.rs
Expand Up @@ -253,6 +253,7 @@ impl StringPart {
locator: &'a Locator,
configured_style: QuoteStyle,
parent_docstring_quote_char: Option<QuoteChar>,
normalize_hex: bool,
) -> NormalizedString<'a> {
// Per PEP 8, always prefer double quotes for triple-quoted strings.
let preferred_style = if self.quotes.triple {
Expand Down Expand Up @@ -310,7 +311,7 @@ impl StringPart {
configured_style
};

let raw_content = locator.slice(self.content_range);
let raw_content = &locator.slice(self.content_range);

let quotes = match quoting {
Quoting::Preserve => self.quotes,
Expand All @@ -327,7 +328,7 @@ impl StringPart {
}
};

let normalized = normalize_string(locator.slice(self.content_range), quotes, self.prefix);
let normalized = normalize_string(raw_content, quotes, self.prefix, normalize_hex);

NormalizedString {
prefix: self.prefix,
Expand Down Expand Up @@ -423,6 +424,10 @@ impl StringPrefix {
pub(super) const fn is_fstring(self) -> bool {
self.contains(StringPrefix::F_STRING)
}

pub(super) const fn is_byte(self) -> bool {
self.contains(StringPrefix::BYTE)
}
}

impl Format<PyFormatContext<'_>> for StringPrefix {
Expand Down Expand Up @@ -722,7 +727,12 @@ impl TryFrom<char> for QuoteChar {
/// with the provided [`StringQuotes`] style.
///
/// Returns the normalized string and whether it contains new lines.
fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) -> Cow<str> {
fn normalize_string(
input: &str,
quotes: StringQuotes,
prefix: StringPrefix,
normalize_hex: bool,
) -> Cow<str> {
// The normalized string if `input` is not yet normalized.
// `output` must remain empty if `input` is already normalized.
let mut output = String::new();
Expand Down Expand Up @@ -766,24 +776,50 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->
}

last_index = index + '\r'.len_utf8();
} else if !quotes.triple && !is_raw {
} else if !is_raw {
if c == '\\' {
if let Some((_, next)) = chars.peek().copied() {
#[allow(clippy::if_same_then_else)]
if next == opposite_quote && formatted_value_nesting == 0 {
// Remove the escape by ending before the backslash and starting again with the quote
chars.next();
output.push_str(&input[last_index..index]);
last_index = index + '\\'.len_utf8();
} else if next == preferred_quote {
// Quote is already escaped, skip over it.
chars.next();
} else if next == '\\' {
if let Some((_, next)) = chars.clone().next() {
if next == '\\' {
// Skip over escaped backslashes
chars.next();
} else if normalize_hex {
if let Some(normalised) = UnicodeEscape::new(next, !prefix.is_byte())
.and_then(|escape| {
escape.normalize(&input[index + c.len_utf8() + next.len_utf8()..])
})
{
// Length of the `\` plus the length of the escape sequence character (`u` | `U` | `x`)
let escape_start_len = '\\'.len_utf8() + next.len_utf8();
let escape_start_offset = index + escape_start_len;
if let Cow::Owned(normalised) = &normalised {
output.push_str(&input[last_index..escape_start_offset]);
output.push_str(normalised);
last_index = escape_start_offset + normalised.len();
};

// Move the `chars` iterator passed the escape sequence.
// Simply reassigning `chars` doesn't work because the indices` would
// then be off.
for _ in 0..next.len_utf8() + normalised.len() {
chars.next();
}
}
}

if !quotes.triple {
#[allow(clippy::if_same_then_else)]
if next == opposite_quote && formatted_value_nesting == 0 {
// Remove the escape by ending before the backslash and starting again with the quote
chars.next();
output.push_str(&input[last_index..index]);
last_index = index + '\\'.len_utf8();
} else if next == preferred_quote {
// Quote is already escaped, skip over it.
chars.next();
}
}
}
} else if c == preferred_quote && formatted_value_nesting == 0 {
} else if !quotes.triple && c == preferred_quote && formatted_value_nesting == 0 {
// Escape the quote
output.push_str(&input[last_index..index]);
output.push('\\');
Expand All @@ -802,3 +838,153 @@ fn normalize_string(input: &str, quotes: StringQuotes, prefix: StringPrefix) ->

normalized
}

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
enum UnicodeEscape {
/// A hex escape sequence of either 2 (`\x`), 4 (`\u`) or 8 (`\U`) hex characters.
Hex(usize),

/// An escaped unicode name (`\N{name}`)
CharacterName,
}

impl UnicodeEscape {
fn new(first: char, allow_unicode: bool) -> Option<UnicodeEscape> {
Some(match first {
'x' => UnicodeEscape::Hex(2),
'u' if allow_unicode => UnicodeEscape::Hex(4),
'U' if allow_unicode => UnicodeEscape::Hex(8),
'N' if allow_unicode => UnicodeEscape::CharacterName,
_ => return None,
})
}

/// Normalises `\u..`, `\U..`, `\x..` and `\N{..}` escape sequences to:
///
/// * `\u`, `\U'` and `\x`: To use lower case for the characters `a-f`.
/// * `\N`: To use uppercase letters
fn normalize(self, input: &str) -> Option<Cow<str>> {
let mut normalised = String::new();

let len = match self {
UnicodeEscape::Hex(len) => {
// It's not a valid escape sequence if the input string has fewer characters
// left than required by the escape sequence.
if input.len() < len {
return None;
}

for (index, c) in input.char_indices().take(len) {
match c {
'0'..='9' | 'a'..='f' => {
if !normalised.is_empty() {
normalised.push(c);
}
}
'A'..='F' => {
if normalised.is_empty() {
normalised.reserve(len);
normalised.push_str(&input[..index]);
normalised.push(c.to_ascii_lowercase());
} else {
normalised.push(c.to_ascii_lowercase());
}
}
_ => {
// not a valid escape sequence
return None;
}
}
}

len
}
UnicodeEscape::CharacterName => {
let mut char_indices = input.char_indices();

if !matches!(char_indices.next(), Some((_, '{'))) {
return None;
}

loop {
if let Some((index, c)) = char_indices.next() {
match c {
'}' => {
if !normalised.is_empty() {
normalised.push('}');
}

// Name must be at least two characters long.
if index < 3 {
return None;
}

break index + '}'.len_utf8();
}
'0'..='9' | 'A'..='Z' | ' ' | '-' => {
if !normalised.is_empty() {
normalised.push(c);
}
}
'a'..='z' => {
if normalised.is_empty() {
normalised.reserve(c.len_utf8() + '}'.len_utf8());
normalised.push_str(&input[..index]);
normalised.push(c.to_ascii_uppercase());
} else {
normalised.push(c.to_ascii_uppercase());
}
}
_ => {
// Seems like an invalid escape sequence, don't normalise it.
return None;
}
}
} else {
// Unterminated escape sequence, dont' normalise it.
return None;
}
}
}
};

Some(if normalised.is_empty() {
Cow::Borrowed(&input[..len])
} else {
Cow::Owned(normalised)
})
}
}

#[cfg(test)]
mod tests {
use crate::string::{normalize_string, QuoteChar, StringPrefix, StringQuotes, UnicodeEscape};
use std::borrow::Cow;

#[test]
fn normalize_32_escape() {
let escape_sequence = UnicodeEscape::new('U', true).unwrap();

assert_eq!(
Some(Cow::Owned("0001f60e".to_string())),
escape_sequence.normalize("0001F60E")
);
}

#[test]
fn normalize_hex_in_byte_string() {
let input = r"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A";

let normalized = normalize_string(
input,
StringQuotes {
triple: false,
quote_char: QuoteChar::Double,
},
StringPrefix::BYTE,
true,
);

assert_eq!(r"\x89\x50\x4e\x47\x0d\x0a\x1a\x0a", &normalized);
}
}

0 comments on commit 4a49cdb

Please sign in to comment.