From 6abe80a431411b8d03f87c8dfa3a124fac2111f9 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 02:47:03 +0200 Subject: [PATCH 01/12] Use maintained crates --- Cargo.lock | 336 ++++++++++++++++++++++++-------------- Cargo.toml | 7 +- crates/literal/Cargo.toml | 2 +- crates/stdlib/Cargo.toml | 8 +- crates/vm/Cargo.toml | 5 +- 5 files changed, 222 insertions(+), 136 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1045146ee0d..5b34f510f2c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1503,6 +1503,90 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" + +[[package]] +name = "icu_properties" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" + +[[package]] +name = "icu_provider" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + [[package]] name = "indexmap" version = "2.13.0" @@ -1824,6 +1908,12 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" +[[package]] +name = "litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + [[package]] name = "lock_api" version = "0.4.14" @@ -1944,12 +2034,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" -[[package]] -name = "matches" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" - [[package]] name = "md-5" version = "0.10.6" @@ -2479,6 +2563,15 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "potential_utf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +dependencies = [ + "zerovec", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -3091,7 +3184,7 @@ dependencies = [ [[package]] name = "rustpython-compiler-source" -version = "0.5.0+deprecated" +version = "0.4.1+deprecated" dependencies = [ "rustpython-ruff_source_file", "rustpython-ruff_text_size", @@ -3149,12 +3242,12 @@ name = "rustpython-literal" version = "0.5.0" dependencies = [ "hexf-parse", + "icu_properties", "is-macro", "lexical-parse-float", "num-traits", "rand 0.9.2", "rustpython-wtf8", - "unic-ucd-category", ] [[package]] @@ -3275,6 +3368,8 @@ dependencies = [ "gethostname", "hex", "hmac", + "icu_normalizer", + "icu_properties", "indexmap", "itertools 0.14.0", "libc", @@ -3327,11 +3422,6 @@ dependencies = [ "termios", "tk-sys", "ucd", - "unic-char-property", - "unic-normal", - "unic-ucd-age", - "unic-ucd-bidi", - "unic-ucd-category", "unicode-bidi-mirroring", "unicode_names2 2.0.0", "uuid", @@ -3368,6 +3458,7 @@ dependencies = [ "glob", "half", "hex", + "icu_properties", "indexmap", "is-macro", "itertools 0.14.0", @@ -3411,9 +3502,6 @@ dependencies = [ "thiserror 2.0.18", "timsort", "uname", - "unic-ucd-bidi", - "unic-ucd-category", - "unic-ucd-ident", "unicode-casing", "wasm-bindgen", "which", @@ -3964,6 +4052,16 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "639ce8ef6d2ba56be0383a94dd13b92138d58de44c62618303bb798fa92bdc00" +[[package]] +name = "tinystr" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -4085,111 +4183,6 @@ dependencies = [ "libc", ] -[[package]] -name = "unic-char-property" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221" -dependencies = [ - "unic-char-range", -] - -[[package]] -name = "unic-char-range" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc" - -[[package]] -name = "unic-common" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc" - -[[package]] -name = "unic-normal" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f09d64d33589a94628bc2aeb037f35c2e25f3f049c7348b5aa5580b48e6bba62" -dependencies = [ - "unic-ucd-normal", -] - -[[package]] -name = "unic-ucd-age" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c8cfdfe71af46b871dc6af2c24fcd360e2f3392ee4c5111877f2947f311671c" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-bidi" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1d568b51222484e1f8209ce48caa6b430bf352962b877d592c29ab31fb53d8c" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-category" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b8d4591f5fcfe1bd4453baaf803c40e1b1e69ff8455c47620440b46efef91c0" -dependencies = [ - "matches", - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-hangul" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb1dc690e19010e1523edb9713224cba5ef55b54894fe33424439ec9a40c0054" -dependencies = [ - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-ident" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e230a37c0381caa9219d67cf063aa3a375ffed5bf541a452db16e744bdab6987" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-normal" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86aed873b8202d22b13859dda5fe7c001d271412c31d411fd9b827e030569410" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-hangul", - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-version" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4" -dependencies = [ - "unic-common", -] - [[package]] name = "unicode-bidi-mirroring" version = "0.4.0" @@ -4283,6 +4276,18 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" version = "0.2.2" @@ -4814,6 +4819,18 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + [[package]] name = "x509-cert" version = "0.2.5" @@ -4851,6 +4868,29 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8aa498d22c9bbaf482329839bc5620c46be275a19a812e9a22a2b07529a642a" +[[package]] +name = "yoke" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.8.34" @@ -4871,6 +4911,27 @@ dependencies = [ "syn", ] +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zeroize" version = "1.8.2" @@ -4891,6 +4952,39 @@ dependencies = [ "syn", ] +[[package]] +name = "zerotrie" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zlib-rs" version = "0.5.5" diff --git a/Cargo.toml b/Cargo.toml index 3b4069d47a1..87c0c506a53 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -222,13 +222,10 @@ strum = "0.28" strum_macros = "0.28" syn = "2" thiserror = "2.0" +icu_properties = "2" +icu_normalizer = "2" unicode-casing = "0.1.1" -unic-char-property = "0.9.0" -unic-normal = "0.9.0" unic-ucd-age = "0.9.0" -unic-ucd-bidi = "0.9.0" -unic-ucd-category = "0.9.0" -unic-ucd-ident = "0.9.0" unicode_names2 = "2.0.0" unicode-bidi-mirroring = "0.4" widestring = "1.2.0" diff --git a/crates/literal/Cargo.toml b/crates/literal/Cargo.toml index bd6a2699742..3f0bec33c30 100644 --- a/crates/literal/Cargo.toml +++ b/crates/literal/Cargo.toml @@ -15,7 +15,7 @@ hexf-parse = "0.2.1" is-macro.workspace = true lexical-parse-float = { version = "1.0.6", features = ["format"] } num-traits = { workspace = true } -unic-ucd-category = { workspace = true } +icu_properties = { workspace = true } [dev-dependencies] rand = { workspace = true } diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml index df29f09a5a7..1c6b0684a70 100644 --- a/crates/stdlib/Cargo.toml +++ b/crates/stdlib/Cargo.toml @@ -78,11 +78,9 @@ constant_time_eq = { workspace = true } ## unicode stuff unicode_names2 = { workspace = true } # update version all at the same time -unic-char-property = { workspace = true } -unic-normal = { workspace = true } -unic-ucd-bidi = { workspace = true } -unic-ucd-category = { workspace = true } -unic-ucd-age = { workspace = true } +icu_properties = { workspace = true } +icu_normalizer = { workspace = true } +#unic-ucd-age = { workspace = true } ucd = "0.1.1" unicode-bidi-mirroring = { workspace = true } diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml index 4eac727c2fa..b721418a4cc 100644 --- a/crates/vm/Cargo.toml +++ b/crates/vm/Cargo.toml @@ -86,10 +86,7 @@ timsort = "0.1.2" # TODO: use unic for this; needed for title case: # https://github.com/RustPython/RustPython/pull/832#discussion_r275428939 unicode-casing = { workspace = true } -# update version all at the same time -unic-ucd-bidi = { workspace = true } -unic-ucd-category = { workspace = true } -unic-ucd-ident = { workspace = true } +icu_properties = { workspace = true } [target.'cfg(unix)'.dependencies] rustix = { workspace = true } From 36c388619f6565801895ae7bb5899135a43ff139 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 02:47:21 +0200 Subject: [PATCH 02/12] Fix literal --- crates/literal/src/char.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/crates/literal/src/char.rs b/crates/literal/src/char.rs index cd64f6dfa9e..5b446cc1a19 100644 --- a/crates/literal/src/char.rs +++ b/crates/literal/src/char.rs @@ -1,4 +1,4 @@ -use unic_ucd_category::GeneralCategory; +use icu_properties::props::{EnumeratedProperty, GeneralCategory}; /// According to python following categories aren't printable: /// * Cc (Other, Control) @@ -10,6 +10,17 @@ use unic_ucd_category::GeneralCategory; /// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) /// * Zs (Separator, Space) other than ASCII space('\x20'). pub fn is_printable(c: char) -> bool { - let cat = GeneralCategory::of(c); - !(cat.is_other() || cat.is_separator()) + let cat = GeneralCategory::for_char(c); + + !matches!( + cat, + GeneralCategory::SpaceSeparator + | GeneralCategory::LineSeparator + | GeneralCategory::ParagraphSeparator + | GeneralCategory::Control + | GeneralCategory::Format + | GeneralCategory::Surrogate + | GeneralCategory::PrivateUse + | GeneralCategory::Unassigned + ) } From 23335bddc4e5dab8bdea6b500a19c7e7d37a3054 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 02:59:31 +0200 Subject: [PATCH 03/12] Fix some of vm --- crates/vm/src/builtins/str.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 8707c5cf769..564684bafa9 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -43,9 +43,11 @@ use rustpython_common::{ str::DeduceStrKind, wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, }; -use unic_ucd_bidi::BidiClass; -use unic_ucd_category::GeneralCategory; -use unic_ucd_ident::{is_xid_continue, is_xid_start}; + +use icu_properties::{ + CodePointSetData, + props::{BidiClass, GeneralCategory, XidContinue, XidStart}, +}; use unicode_casing::CharExt; impl<'a> TryFromBorrowedObject<'a> for String { @@ -1355,9 +1357,15 @@ impl PyStr { pub fn isidentifier(&self) -> bool { let Some(s) = self.to_str() else { return false }; let mut chars = s.chars(); - let is_identifier_start = chars.next().is_some_and(|c| c == '_' || is_xid_start(c)); + + let xid_start = CodePointSetData::new::(); + let is_identifier_start = chars + .next() + .is_some_and(|c| c == '_' || xid_start.contains(c)); + // a string is not an identifier if it has whitespace or starts with a number - is_identifier_start && chars.all(is_xid_continue) + let xid_continue = CodePointSetData::new::(); + is_identifier_start && chars.all(|c| xid_continue.contains(c)) } // https://docs.python.org/3/library/stdtypes.html#str.translate From de8cd0ad564fc3ea9d2f58c71807a940f8384997 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 03:14:42 +0200 Subject: [PATCH 04/12] Fix vm --- crates/vm/src/builtins/str.rs | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 564684bafa9..c4f2d2a3d91 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -46,7 +46,7 @@ use rustpython_common::{ use icu_properties::{ CodePointSetData, - props::{BidiClass, GeneralCategory, XidContinue, XidStart}, + props::{BidiClass, EnumeratedProperty, GeneralCategory, XidContinue, XidStart}, }; use unicode_casing::CharExt; @@ -968,7 +968,9 @@ impl PyStr { #[pymethod] fn isdecimal(&self) -> bool { !self.data.is_empty() - && self.char_all(|c| GeneralCategory::of(c) == GeneralCategory::DecimalNumber) + && self.char_all(|c| { + matches!(GeneralCategory::for_char(c), GeneralCategory::DecimalNumber) + }) } fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult { @@ -1093,11 +1095,17 @@ impl PyStr { #[pymethod] fn isspace(&self) -> bool { - use unic_ucd_bidi::bidi_class::abbr_names::*; !self.data.is_empty() && self.char_all(|c| { - GeneralCategory::of(c) == GeneralCategory::SpaceSeparator - || matches!(BidiClass::of(c), WS | B | S) + matches!( + GeneralCategory::for_char(c), + GeneralCategory::SpaceSeparator + ) || matches!( + BidiClass::for_char(c), + BidiClass::WhiteSpace + | BidiClass::ParagraphSeparator + | BidiClass::SegmentSeparator + ) }) } From 15b4a828e710d70c1b1fae52b224ca6d75aa4e09 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 10:08:59 +0200 Subject: [PATCH 05/12] Fix stdlib --- Cargo.lock | 42 +++++++++++++++++++ crates/stdlib/Cargo.toml | 2 +- crates/stdlib/src/unicodedata.rs | 72 ++++++++++++++++++++++++-------- 3 files changed, 97 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b34f510f2c..ebb3d2541af 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3422,6 +3422,7 @@ dependencies = [ "termios", "tk-sys", "ucd", + "unic-ucd-age", "unicode-bidi-mirroring", "unicode_names2 2.0.0", "uuid", @@ -4183,6 +4184,47 @@ dependencies = [ "libc", ] +[[package]] +name = "unic-char-property" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221" +dependencies = [ + "unic-char-range", +] + +[[package]] +name = "unic-char-range" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc" + +[[package]] +name = "unic-common" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc" + +[[package]] +name = "unic-ucd-age" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c8cfdfe71af46b871dc6af2c24fcd360e2f3392ee4c5111877f2947f311671c" +dependencies = [ + "unic-char-property", + "unic-char-range", + "unic-ucd-version", +] + +[[package]] +name = "unic-ucd-version" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4" +dependencies = [ + "unic-common", +] + [[package]] name = "unicode-bidi-mirroring" version = "0.4.0" diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml index 1c6b0684a70..63ce29a8c24 100644 --- a/crates/stdlib/Cargo.toml +++ b/crates/stdlib/Cargo.toml @@ -80,7 +80,7 @@ unicode_names2 = { workspace = true } # update version all at the same time icu_properties = { workspace = true } icu_normalizer = { workspace = true } -#unic-ucd-age = { workspace = true } +unic-ucd-age = { workspace = true } ucd = "0.1.1" unicode-bidi-mirroring = { workspace = true } diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs index 5664fd0c36e..eab9afbc8ab 100644 --- a/crates/stdlib/src/unicodedata.rs +++ b/crates/stdlib/src/unicodedata.rs @@ -40,15 +40,18 @@ mod unicodedata { builtins::{PyModule, PyStrRef}, function::OptionalArg, }; + + use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; + use icu_properties::{ + CodePointSetData, + props::{ + BidiClass, BidiMirrored, EnumeratedProperty, GeneralCategory, NamedEnumeratedProperty, + }, + }; use itertools::Itertools; use rustpython_common::wtf8::{CodePoint, Wtf8Buf}; use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType}; - use unic_char_property::EnumeratedCharProperty; - use unic_normal::StrNormalForm; use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion}; - use unic_ucd_bidi::BidiClass; - use unic_ucd_category::GeneralCategory; - use unicode_bidi_mirroring::is_mirroring; pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py) -> PyResult<()> { __module_exec(vm, module); @@ -117,9 +120,9 @@ mod unicodedata { .extract_char(character, vm)? .map_or(GeneralCategory::Unassigned, |c| { c.to_char() - .map_or(GeneralCategory::Surrogate, GeneralCategory::of) + .map_or(GeneralCategory::Surrogate, GeneralCategory::for_char) }) - .abbr_name() + .short_name() .to_owned()) } @@ -165,8 +168,8 @@ mod unicodedata { let bidi = match self.extract_char(character, vm)? { Some(c) => c .to_char() - .map_or(BidiClass::LeftToRight, BidiClass::of) - .abbr_name(), + .map_or(BidiClass::LeftToRight, BidiClass::for_char) + .short_name(), None => "", }; Ok(bidi) @@ -190,10 +193,26 @@ mod unicodedata { fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { let text = unistr.as_wtf8(); let normalized_text = match form { - Nfc => text.map_utf8(|s| s.nfc()).collect(), - Nfkc => text.map_utf8(|s| s.nfkc()).collect(), - Nfd => text.map_utf8(|s| s.nfd()).collect(), - Nfkd => text.map_utf8(|s| s.nfkd()).collect(), + Nfc => { + let normalizer = ComposingNormalizerBorrowed::new_nfc(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfkc => { + let normalizer = ComposingNormalizerBorrowed::new_nfkc(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfd => { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfkd => { + let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } }; Ok(normalized_text) } @@ -202,10 +221,26 @@ mod unicodedata { fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { let text = unistr.as_wtf8(); let normalized: Wtf8Buf = match form { - Nfc => text.map_utf8(|s| s.nfc()).collect(), - Nfkc => text.map_utf8(|s| s.nfkc()).collect(), - Nfd => text.map_utf8(|s| s.nfd()).collect(), - Nfkd => text.map_utf8(|s| s.nfkd()).collect(), + Nfc => { + let normalizer = ComposingNormalizerBorrowed::new_nfc(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfkc => { + let normalizer = ComposingNormalizerBorrowed::new_nfkc(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfd => { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfkd => { + let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } }; Ok(text == &*normalized) } @@ -216,7 +251,8 @@ mod unicodedata { Some(c) => { if let Some(ch) = c.to_char() { // Check if the character is mirrored in bidirectional text using Unicode standard - Ok(if is_mirroring(ch) { 1 } else { 0 }) + let bidi_mirrored = CodePointSetData::new::(); + Ok(if bidi_mirrored.contains(ch) { 1 } else { 0 }) } else { Ok(0) } From 4789fc0ccb8bf7ba26487aa1767e978d986608df Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 10:23:00 +0200 Subject: [PATCH 06/12] Migrate more at stdlib --- Cargo.lock | 7 ------- crates/stdlib/Cargo.toml | 1 - crates/stdlib/src/unicodedata.rs | 32 +++++++++----------------------- 3 files changed, 9 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ebb3d2541af..5fd981fd135 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3423,7 +3423,6 @@ dependencies = [ "tk-sys", "ucd", "unic-ucd-age", - "unicode-bidi-mirroring", "unicode_names2 2.0.0", "uuid", "webpki-roots", @@ -4225,12 +4224,6 @@ dependencies = [ "unic-common", ] -[[package]] -name = "unicode-bidi-mirroring" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfa6e8c60bb66d49db113e0125ee8711b7647b5579dc7f5f19c42357ed039fe" - [[package]] name = "unicode-casing" version = "0.1.1" diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml index 63ce29a8c24..f828507d6cf 100644 --- a/crates/stdlib/Cargo.toml +++ b/crates/stdlib/Cargo.toml @@ -82,7 +82,6 @@ icu_properties = { workspace = true } icu_normalizer = { workspace = true } unic-ucd-age = { workspace = true } ucd = "0.1.1" -unicode-bidi-mirroring = { workspace = true } # compression adler32 = "1.2.0" diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs index eab9afbc8ab..391e0dada90 100644 --- a/crates/stdlib/src/unicodedata.rs +++ b/crates/stdlib/src/unicodedata.rs @@ -45,12 +45,13 @@ mod unicodedata { use icu_properties::{ CodePointSetData, props::{ - BidiClass, BidiMirrored, EnumeratedProperty, GeneralCategory, NamedEnumeratedProperty, + BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty, + GeneralCategory, NamedEnumeratedProperty, }, }; use itertools::Itertools; use rustpython_common::wtf8::{CodePoint, Wtf8Buf}; - use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType}; + use ucd::{Codepoint, DecompositionType, Number, NumericType}; use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion}; pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py) -> PyResult<()> { @@ -185,8 +186,8 @@ mod unicodedata { Ok(self .extract_char(character, vm)? .and_then(|c| c.to_char()) - .map_or(EastAsianWidth::Neutral, |c| c.east_asian_width()) - .abbr_name()) + .map_or(EastAsianWidth::Neutral, |c| EastAsianWidth::for_char(c)) + .short_name()) } #[pymethod] @@ -262,11 +263,13 @@ mod unicodedata { } #[pymethod] - fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { Ok(self .extract_char(character, vm)? .and_then(|c| c.to_char()) - .map_or(0, |ch| ch.canonical_combining_class() as i32)) + .map_or(0, |ch| { + CanonicalCombiningClass::for_char(ch).to_icu4c_value() + })) } #[pymethod] @@ -375,23 +378,6 @@ mod unicodedata { } } - trait EastAsianWidthAbbrName { - fn abbr_name(&self) -> &'static str; - } - - impl EastAsianWidthAbbrName for EastAsianWidth { - fn abbr_name(&self) -> &'static str { - match self { - Self::Narrow => "Na", - Self::Wide => "W", - Self::Neutral => "N", - Self::Ambiguous => "A", - Self::FullWidth => "F", - Self::HalfWidth => "H", - } - } - } - #[pyattr] fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef { Ucd { From b0583ab24c299da647a7953a5c6e055b72471576 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 10:38:21 +0200 Subject: [PATCH 07/12] Use shortform --- crates/vm/src/builtins/str.rs | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index c4f2d2a3d91..cf2c008fc32 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -44,9 +44,8 @@ use rustpython_common::{ wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, }; -use icu_properties::{ - CodePointSetData, - props::{BidiClass, EnumeratedProperty, GeneralCategory, XidContinue, XidStart}, +use icu_properties::props::{ + BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart, }; use unicode_casing::CharExt; @@ -1366,14 +1365,12 @@ impl PyStr { let Some(s) = self.to_str() else { return false }; let mut chars = s.chars(); - let xid_start = CodePointSetData::new::(); let is_identifier_start = chars .next() - .is_some_and(|c| c == '_' || xid_start.contains(c)); + .is_some_and(|c| c == '_' || XidStart::for_char(c)); // a string is not an identifier if it has whitespace or starts with a number - let xid_continue = CodePointSetData::new::(); - is_identifier_start && chars.all(|c| xid_continue.contains(c)) + is_identifier_start && chars.all(|c| XidContinue::for_char(c)) } // https://docs.python.org/3/library/stdtypes.html#str.translate From 47dadaf57ca88811be022771e7204c8094a7b060 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:33:56 +0200 Subject: [PATCH 08/12] Mark failing test --- Lib/test/test_str.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_str.py b/Lib/test/test_str.py index 11e2abb82c5..68037923283 100644 --- a/Lib/test/test_str.py +++ b/Lib/test/test_str.py @@ -854,6 +854,7 @@ def test_isprintable(self): self.assertTrue('\U0001F46F'.isprintable()) self.assertFalse('\U000E0020'.isprintable()) + @unittest.expectedFailure # TODO: RUSTPYTHON @support.requires_resource('cpu') def test_isprintable_invariant(self): for codepoint in range(sys.maxunicode + 1): From 273ad65c84d22ee50eba128619ae395e5b5deea9 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:36:14 +0200 Subject: [PATCH 09/12] Fix test marks --- Lib/test/test_unicodedata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index ceae20e8cb2..bb54818ecb0 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -232,7 +232,6 @@ def test_issue10254(self): b = 'C\u0338' * 20 + '\xC7' self.assertEqual(self.db.normalize('NFC', a), b) - @unittest.expectedFailure # TODO: RUSTPYTHON; ? + def test_issue29456(self): # Fix #29456 u1176_str_a = '\u1100\u1176\u11a8' @@ -389,6 +388,7 @@ def unistr(data): data = [int(x, 16) for x in data.split(" ")] return "".join([chr(x) for x in data]) + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: False is not true : 13055 @requires_resource('network') @requires_resource('cpu') def test_normalization(self): From 5a776f57ed853195874507862d1c428911d10601 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:44:55 +0200 Subject: [PATCH 10/12] cargo shear --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 87c0c506a53..7bd8b8f3374 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -227,7 +227,6 @@ icu_normalizer = "2" unicode-casing = "0.1.1" unic-ucd-age = "0.9.0" unicode_names2 = "2.0.0" -unicode-bidi-mirroring = "0.4" widestring = "1.2.0" windows-sys = "0.61.2" wasm-bindgen = "0.2.106" From 0cc47eb68d5b9f3b0a2bfe16a8a7323f8a5c38b8 Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:47:27 +0200 Subject: [PATCH 11/12] Clippy --- crates/vm/src/builtins/str.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index cf2c008fc32..b31dc6ccc9d 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -1370,7 +1370,7 @@ impl PyStr { .is_some_and(|c| c == '_' || XidStart::for_char(c)); // a string is not an identifier if it has whitespace or starts with a number - is_identifier_start && chars.all(|c| XidContinue::for_char(c)) + is_identifier_start && chars.all(XidContinue::for_char) } // https://docs.python.org/3/library/stdtypes.html#str.translate From 3245cd147b9c05844f5a8d9407e2842c0ab8b00c Mon Sep 17 00:00:00 2001 From: ShaharNaveh <50263213+ShaharNaveh@users.noreply.github.com> Date: Wed, 1 Apr 2026 11:57:06 +0200 Subject: [PATCH 12/12] clippy --- crates/stdlib/src/unicodedata.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs index 391e0dada90..6ee5b0c2ee8 100644 --- a/crates/stdlib/src/unicodedata.rs +++ b/crates/stdlib/src/unicodedata.rs @@ -186,7 +186,7 @@ mod unicodedata { Ok(self .extract_char(character, vm)? .and_then(|c| c.to_char()) - .map_or(EastAsianWidth::Neutral, |c| EastAsianWidth::for_char(c)) + .map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char) .short_name()) }