From 647aa8ef6a2c463176377a61ae6c1dcc97d8408f Mon Sep 17 00:00:00 2001 From: Josh Megnauth Date: Tue, 14 Apr 2026 15:16:36 -0400 Subject: [PATCH 1/2] fix: Python-Rust combining char diff in isalnum Related to: #7518 Rust and Python differ on alphanumeric characters. Rust follows the Unicode standard more closely than Python does. This means that is_alphanumeric (char function in Rust) can give a different result from isalnum (Python). To fix the discrepancy, RustPython needs to mimic Python by rejecting certain characters. Some classes of combining characters count as alphanumeric in Rust but not in Python. Combining characters are accent marks that are combined with other characters to create a single grapheme. It's possible that this PR is not exhaustive. I fixed the combining character issue, but I don't know the full range of discrepancies. --- crates/vm/src/builtins/str.rs | 14 +++++++++++--- extra_tests/snippets/builtin_str.py | 9 +++++++++ extra_tests/snippets/stdlib_re.py | 3 +++ 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 870d3b72a74..92bd0124867 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -44,8 +44,12 @@ use rustpython_common::{ wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, }; -use icu_properties::props::{ - BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart, +use icu_properties::{ + CodePointMapData, + props::{ + BidiClass, BinaryProperty, CanonicalCombiningClass, EnumeratedProperty, GeneralCategory, + XidContinue, XidStart, + }, }; use unicode_casing::CharExt; @@ -946,7 +950,11 @@ impl PyStr { #[pymethod] fn isalnum(&self) -> bool { - !self.data.is_empty() && self.char_all(char::is_alphanumeric) + let map = CodePointMapData::::new(); + !self.data.is_empty() + && self.char_all(|c| { + c.is_alphanumeric() && map.get(c) == CanonicalCombiningClass::NotReordered + }) } #[pymethod] diff --git 
a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py index 3d54643b3ce..7e84d79f7fc 100644 --- a/extra_tests/snippets/builtin_str.py +++ b/extra_tests/snippets/builtin_str.py @@ -73,6 +73,15 @@ # assert "DZ".title() == "Dz" assert a.isalpha() +# Combining characters differ slightly between Rust and Python +assert "\u006e".isalnum() +assert not "\u0303".isalnum() +assert not "\u006e\u0303".isalnum() +assert "\u00f1".isalnum() +assert not "\u0345".isalnum() +for raw in range(0x0363, 0x036f): + assert not chr(raw).isalnum() + s = "1 2 3" assert s.split(" ", 1) == ["1", "2 3"] assert s.rsplit(" ", 1) == ["1 2", "3"] diff --git a/extra_tests/snippets/stdlib_re.py b/extra_tests/snippets/stdlib_re.py index 53f21f91734..0577d366eca 100644 --- a/extra_tests/snippets/stdlib_re.py +++ b/extra_tests/snippets/stdlib_re.py @@ -79,3 +79,6 @@ # Test of fix re.fullmatch POSSESSIVE_REPEAT, issue #7183 assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38") assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38").group(0) == "1.25.38" + +# Combining characters; issue #7518 +# assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)" From f79d6df5bb8fba3af09c1bf8023e6780c6079fbe Mon Sep 17 00:00:00 2001 From: Josh Megnauth Date: Tue, 14 Apr 2026 20:35:39 -0400 Subject: [PATCH 2/2] fix: Ignore combining characters in SRE Closes: #7518 --- Cargo.lock | 1 + crates/sre_engine/Cargo.toml | 1 + crates/sre_engine/src/string.rs | 6 +++++- crates/vm/src/builtins/str.rs | 13 +++++-------- extra_tests/snippets/builtin_str.py | 2 +- extra_tests/snippets/stdlib_re.py | 2 +- 6 files changed, 14 insertions(+), 11 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 20bfc4578ff..16941b8265a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3337,6 +3337,7 @@ version = "0.5.0" dependencies = [ "bitflags 2.11.0", "criterion", + "icu_properties", "num_enum", "optional", "rustpython-wtf8", diff --git a/crates/sre_engine/Cargo.toml 
b/crates/sre_engine/Cargo.toml index 4f899e6b3e9..8400a34b567 100644 --- a/crates/sre_engine/Cargo.toml +++ b/crates/sre_engine/Cargo.toml @@ -19,6 +19,7 @@ rustpython-wtf8 = { workspace = true } num_enum = { workspace = true } bitflags = { workspace = true } optional = { workspace = true } +icu_properties = { workspace = true } [dev-dependencies] criterion = { workspace = true } diff --git a/crates/sre_engine/src/string.rs b/crates/sre_engine/src/string.rs index 489819bfb3e..b4b3a6092d3 100644 --- a/crates/sre_engine/src/string.rs +++ b/crates/sre_engine/src/string.rs @@ -1,3 +1,4 @@ +use icu_properties::props::{CanonicalCombiningClass, EnumeratedProperty}; use rustpython_wtf8::Wtf8; #[derive(Debug, Clone, Copy)] @@ -443,7 +444,10 @@ pub(crate) const fn is_uni_linebreak(ch: u32) -> bool { pub(crate) fn is_uni_alnum(ch: u32) -> bool { // TODO: check with cpython char::try_from(ch) - .map(|x| x.is_alphanumeric()) + .map(|x| { + x.is_alphanumeric() + && CanonicalCombiningClass::for_char(x) == CanonicalCombiningClass::NotReordered + }) .unwrap_or(false) } #[inline] diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 92bd0124867..d74259b849c 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -44,12 +44,9 @@ use rustpython_common::{ wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, }; -use icu_properties::{ - CodePointMapData, - props::{ - BidiClass, BinaryProperty, CanonicalCombiningClass, EnumeratedProperty, GeneralCategory, - XidContinue, XidStart, - }, +use icu_properties::props::{ + BidiClass, BinaryProperty, CanonicalCombiningClass, EnumeratedProperty, GeneralCategory, + XidContinue, XidStart, }; use unicode_casing::CharExt; @@ -950,10 +947,10 @@ impl PyStr { #[pymethod] fn isalnum(&self) -> bool { - let map = CodePointMapData::::new(); !self.data.is_empty() && self.char_all(|c| { - c.is_alphanumeric() && map.get(c) == CanonicalCombiningClass::NotReordered + c.is_alphanumeric() + && 
CanonicalCombiningClass::for_char(c) == CanonicalCombiningClass::NotReordered }) } diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py index 7e84d79f7fc..61cbf63ea9a 100644 --- a/extra_tests/snippets/builtin_str.py +++ b/extra_tests/snippets/builtin_str.py @@ -79,7 +79,7 @@ assert not "\u006e\u0303".isalnum() assert "\u00f1".isalnum() assert not "\u0345".isalnum() -for raw in range(0x0363, 0x036f): +for raw in range(0x0363, 0x036F): assert not chr(raw).isalnum() s = "1 2 3" diff --git a/extra_tests/snippets/stdlib_re.py b/extra_tests/snippets/stdlib_re.py index 0577d366eca..8613ddd30fc 100644 --- a/extra_tests/snippets/stdlib_re.py +++ b/extra_tests/snippets/stdlib_re.py @@ -81,4 +81,4 @@ assert re.fullmatch(r"([0-9]++(?:\.[0-9]+)*+)", "1.25.38").group(0) == "1.25.38" # Combining characters; issue #7518 -# assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)" +assert not re.match(r"\w", "\u0345"), r"\w should not match U+0345 (category Mn)"