Skip to content

Commit

Permalink
Update identifier Unicode character validation to match Python spec (#…
Browse files Browse the repository at this point in the history
…7209)

Co-authored-by: Micha Reiser <micha@reiser.io>
  • Loading branch information
LaBatata101 and MichaReiser committed Sep 7, 2023
1 parent fda48af commit 041cdb9
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 5 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ toml = { version = "0.7.2" }
tracing = "0.1.37"
tracing-indicatif = "0.3.4"
tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
unic-ucd-ident = "0.9.0"
unicode-width = "0.1.10"
uuid = { version = "1.4.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] }
wsl = { version = "0.1.0" }
Expand Down
2 changes: 1 addition & 1 deletion crates/ruff_python_parser/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ lalrpop-util = { version = "0.20.0", default-features = false }
num-bigint = { workspace = true }
num-traits = { workspace = true }
unic-emoji-char = "0.9.0"
unic-ucd-ident = "0.9.0"
unic-ucd-ident = { workspace = true }
unicode_names2 = { version = "0.6.0", git = "https://github.com/youknowone/unicode_names2.git", rev = "4ce16aa85cbcdd9cc830410f1a72ef9a235f2fde" }
rustc-hash = { workspace = true }
static_assertions = "1.1.0"
Expand Down
1 change: 1 addition & 0 deletions crates/ruff_python_stdlib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ license = { workspace = true }
[lib]

[dependencies]
unic-ucd-ident = { workspace = true }
41 changes: 38 additions & 3 deletions crates/ruff_python_stdlib/src/identifiers.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
use unic_ucd_ident::{is_xid_continue, is_xid_start};

use crate::keyword::is_keyword;

/// Returns `true` if a string is a valid Python identifier (e.g., variable
/// name).
pub fn is_identifier(name: &str) -> bool {
// Is the first character a letter or underscore?
let mut chars = name.chars();
if !chars.next().is_some_and(|c| c.is_alphabetic() || c == '_') {
if !chars.next().is_some_and(is_identifier_start) {
return false;
}

// Are the rest of the characters letters, digits, or underscores?
if !chars.all(|c| c.is_alphanumeric() || c == '_') {
if !chars.all(is_identifier_continuation) {
return false;
}

Expand All @@ -22,6 +24,21 @@ pub fn is_identifier(name: &str) -> bool {
true
}

/// Returns `true` if `c` is allowed as the first character of a Python
/// identifier, as described in
/// https://docs.python.org/3/reference/lexical_analysis.html#identifiers
///
/// ASCII letters and the underscore are accepted directly; every other
/// character is delegated to [`is_xid_start`].
fn is_identifier_start(c: char) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' | '_' => true,
        other => is_xid_start(other),
    }
}

/// Returns `true` if `c` is allowed after the first character of a Python
/// identifier, as described in
/// https://docs.python.org/3/reference/lexical_analysis.html#identifiers
///
/// ASCII letters, ASCII digits and the underscore are accepted directly;
/// every other character is delegated to [`is_xid_continue`].
fn is_identifier_continuation(c: char) -> bool {
    c == '_' || c.is_ascii_alphanumeric() || is_xid_continue(c)
}

/// Returns `true` if a string is a private identifier, such that, when the
/// identifier is defined in a class definition, it will be mangled prior to
/// code generation.
Expand Down Expand Up @@ -76,7 +93,25 @@ pub fn is_migration_name(name: &str) -> bool {

#[cfg(test)]
mod tests {
use crate::identifiers::{is_migration_name, is_module_name};
use crate::identifiers::{is_identifier, is_migration_name, is_module_name};

#[test]
fn valid_identifiers() {
    // Names that must be accepted: ASCII forms plus several non-ASCII scripts.
    let accepted = [
        "_abc",
        "abc",
        "_",
        "a_b_c",
        "abc123",
        "abc_123",
        "漢字",
        "ひらがな",
        "العربية",
        "кириллица",
        "πr",
    ];
    for name in accepted {
        assert!(is_identifier(name), "expected {name:?} to be accepted");
    }

    // Names that must be rejected: the empty string, a superscript digit,
    // and a trailing emoji.
    let rejected = ["", "percentile_co³t", "HelloWorld❤️"];
    for name in rejected {
        assert!(!is_identifier(name), "expected {name:?} to be rejected");
    }
}

#[test]
fn module_name() {
Expand Down
2 changes: 1 addition & 1 deletion crates/ruff_python_trivia/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ ruff_source_file = { path = "../ruff_source_file" }

memchr = { workspace = true }
smallvec = { workspace = true }
unic-ucd-ident = "0.9.0"
unic-ucd-ident = { workspace = true }

[dev-dependencies]
insta = { workspace = true }
Expand Down

0 comments on commit 041cdb9

Please sign in to comment.