diff --git a/Cargo.lock b/Cargo.lock index 1045146ee0d..5fd981fd135 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1503,6 +1503,90 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" + +[[package]] +name = "icu_properties" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" + +[[package]] +name = "icu_provider" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + [[package]] name = "indexmap" version = "2.13.0" @@ -1824,6 +1908,12 @@ version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" +[[package]] +name = "litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + [[package]] name = "lock_api" version = "0.4.14" @@ -1944,12 +2034,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" -[[package]] -name = "matches" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" - [[package]] name = "md-5" version = "0.10.6" @@ -2479,6 +2563,15 @@ dependencies = [ "portable-atomic", ] +[[package]] +name = "potential_utf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +dependencies = [ + "zerovec", +] + [[package]] name = "powerfmt" version = "0.2.0" @@ -3091,7 +3184,7 @@ dependencies = [ [[package]] name = "rustpython-compiler-source" -version = "0.5.0+deprecated" +version = "0.4.1+deprecated" dependencies = [ "rustpython-ruff_source_file", "rustpython-ruff_text_size", @@ -3149,12 +3242,12 @@ name = "rustpython-literal" version = "0.5.0" dependencies = [ "hexf-parse", + "icu_properties", "is-macro", "lexical-parse-float", "num-traits", "rand 0.9.2", "rustpython-wtf8", - "unic-ucd-category", ] [[package]] @@ -3275,6 +3368,8 @@ dependencies = [ "gethostname", "hex", "hmac", + "icu_normalizer", + "icu_properties", "indexmap", "itertools 0.14.0", "libc", @@ -3327,12 +3422,7 @@ dependencies = [ "termios", "tk-sys", "ucd", - "unic-char-property", - "unic-normal", "unic-ucd-age", - "unic-ucd-bidi", - "unic-ucd-category", - "unicode-bidi-mirroring", "unicode_names2 2.0.0", "uuid", "webpki-roots", @@ -3368,6 +3458,7 @@ dependencies = [ "glob", "half", "hex", + "icu_properties", "indexmap", "is-macro", "itertools 0.14.0", @@ -3411,9 +3502,6 @@ dependencies = [ "thiserror 2.0.18", "timsort", "uname", - "unic-ucd-bidi", - "unic-ucd-category", - "unic-ucd-ident", "unicode-casing", "wasm-bindgen", "which", @@ -3964,6 +4052,16 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "639ce8ef6d2ba56be0383a94dd13b92138d58de44c62618303bb798fa92bdc00" +[[package]] +name = "tinystr" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -4106,15 +4204,6 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc" -[[package]] -name = "unic-normal" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f09d64d33589a94628bc2aeb037f35c2e25f3f049c7348b5aa5580b48e6bba62" -dependencies = [ - "unic-ucd-normal", -] - [[package]] name = "unic-ucd-age" version = "0.9.0" @@ -4126,61 +4215,6 @@ dependencies = [ "unic-ucd-version", ] -[[package]] -name = "unic-ucd-bidi" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1d568b51222484e1f8209ce48caa6b430bf352962b877d592c29ab31fb53d8c" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-category" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b8d4591f5fcfe1bd4453baaf803c40e1b1e69ff8455c47620440b46efef91c0" -dependencies = [ - "matches", - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-hangul" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb1dc690e19010e1523edb9713224cba5ef55b54894fe33424439ec9a40c0054" -dependencies = [ - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-ident" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e230a37c0381caa9219d67cf063aa3a375ffed5bf541a452db16e744bdab6987" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-version", -] - -[[package]] -name = "unic-ucd-normal" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86aed873b8202d22b13859dda5fe7c001d271412c31d411fd9b827e030569410" -dependencies = [ - "unic-char-property", - "unic-char-range", - "unic-ucd-hangul", - "unic-ucd-version", -] - [[package]] name = "unic-ucd-version" version = "0.9.0" @@ -4190,12 +4224,6 @@ dependencies = [ "unic-common", ] -[[package]] -name = "unicode-bidi-mirroring" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dfa6e8c60bb66d49db113e0125ee8711b7647b5579dc7f5f19c42357ed039fe" - [[package]] name = "unicode-casing" version = "0.1.1" @@ -4283,6 +4311,18 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" version = "0.2.2" @@ -4814,6 +4854,18 @@ version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + [[package]] name = "x509-cert" version = "0.2.5" @@ -4851,6 +4903,29 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8aa498d22c9bbaf482329839bc5620c46be275a19a812e9a22a2b07529a642a" +[[package]] +name = "yoke" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.8.34" @@ -4871,6 +4946,27 @@ dependencies = [ "syn", ] +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + [[package]] name = "zeroize" version = "1.8.2" @@ -4891,6 +4987,39 @@ dependencies = [ "syn", ] +[[package]] +name = "zerotrie" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "zlib-rs" version = "0.5.5" diff --git a/Cargo.toml b/Cargo.toml index 3b4069d47a1..7bd8b8f3374 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -222,15 +222,11 @@ strum = "0.28" strum_macros = "0.28" syn = "2" thiserror = "2.0" +icu_properties = "2" +icu_normalizer = "2" unicode-casing = "0.1.1" -unic-char-property = "0.9.0" -unic-normal = "0.9.0" unic-ucd-age = "0.9.0" -unic-ucd-bidi = "0.9.0" -unic-ucd-category = "0.9.0" -unic-ucd-ident = "0.9.0" unicode_names2 = "2.0.0" -unicode-bidi-mirroring = "0.4" widestring = "1.2.0" windows-sys = "0.61.2" wasm-bindgen = "0.2.106" diff --git a/Lib/test/test_str.py b/Lib/test/test_str.py index 11e2abb82c5..68037923283 100644 --- a/Lib/test/test_str.py +++ b/Lib/test/test_str.py @@ -854,6 +854,7 @@ def test_isprintable(self): self.assertTrue('\U0001F46F'.isprintable()) self.assertFalse('\U000E0020'.isprintable()) + @unittest.expectedFailure # TODO: RUSTPYTHON @support.requires_resource('cpu') def test_isprintable_invariant(self): for codepoint in range(sys.maxunicode + 1): diff --git a/Lib/test/test_unicodedata.py b/Lib/test/test_unicodedata.py index ceae20e8cb2..bb54818ecb0 100644 --- a/Lib/test/test_unicodedata.py +++ b/Lib/test/test_unicodedata.py @@ -232,7 +232,6 @@ def test_issue10254(self): b = 'C\u0338' * 20 + '\xC7' self.assertEqual(self.db.normalize('NFC', a), b) - @unittest.expectedFailure # TODO: RUSTPYTHON; ? + def test_issue29456(self): # Fix #29456 u1176_str_a = '\u1100\u1176\u11a8' @@ -389,6 +388,7 @@ def unistr(data): data = [int(x, 16) for x in data.split(" ")] return "".join([chr(x) for x in data]) + @unittest.expectedFailure # TODO: RUSTPYTHON; AssertionError: False is not true : 13055 @requires_resource('network') @requires_resource('cpu') def test_normalization(self): diff --git a/crates/literal/Cargo.toml b/crates/literal/Cargo.toml index bd6a2699742..3f0bec33c30 100644 --- a/crates/literal/Cargo.toml +++ b/crates/literal/Cargo.toml @@ -15,7 +15,7 @@ hexf-parse = "0.2.1" is-macro.workspace = true lexical-parse-float = { version = "1.0.6", features = ["format"] } num-traits = { workspace = true } -unic-ucd-category = { workspace = true } +icu_properties = { workspace = true } [dev-dependencies] rand = { workspace = true } diff --git a/crates/literal/src/char.rs b/crates/literal/src/char.rs index cd64f6dfa9e..5b446cc1a19 100644 --- a/crates/literal/src/char.rs +++ b/crates/literal/src/char.rs @@ -1,4 +1,4 @@ -use unic_ucd_category::GeneralCategory; +use icu_properties::props::{EnumeratedProperty, GeneralCategory}; /// According to python following categories aren't printable: /// * Cc (Other, Control) @@ -10,6 +10,17 @@ use unic_ucd_category::GeneralCategory; /// * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR) /// * Zs (Separator, Space) other than ASCII space('\x20'). pub fn is_printable(c: char) -> bool { - let cat = GeneralCategory::of(c); - !(cat.is_other() || cat.is_separator()) + let cat = GeneralCategory::for_char(c); + + !matches!( + cat, + GeneralCategory::SpaceSeparator + | GeneralCategory::LineSeparator + | GeneralCategory::ParagraphSeparator + | GeneralCategory::Control + | GeneralCategory::Format + | GeneralCategory::Surrogate + | GeneralCategory::PrivateUse + | GeneralCategory::Unassigned + ) } diff --git a/crates/stdlib/Cargo.toml b/crates/stdlib/Cargo.toml index df29f09a5a7..f828507d6cf 100644 --- a/crates/stdlib/Cargo.toml +++ b/crates/stdlib/Cargo.toml @@ -78,13 +78,10 @@ constant_time_eq = { workspace = true } ## unicode stuff unicode_names2 = { workspace = true } # update version all at the same time -unic-char-property = { workspace = true } -unic-normal = { workspace = true } -unic-ucd-bidi = { workspace = true } -unic-ucd-category = { workspace = true } +icu_properties = { workspace = true } +icu_normalizer = { workspace = true } unic-ucd-age = { workspace = true } ucd = "0.1.1" -unicode-bidi-mirroring = { workspace = true } # compression adler32 = "1.2.0" diff --git a/crates/stdlib/src/unicodedata.rs b/crates/stdlib/src/unicodedata.rs index 5664fd0c36e..6ee5b0c2ee8 100644 --- a/crates/stdlib/src/unicodedata.rs +++ b/crates/stdlib/src/unicodedata.rs @@ -40,15 +40,19 @@ mod unicodedata { builtins::{PyModule, PyStrRef}, function::OptionalArg, }; + + use icu_normalizer::{ComposingNormalizerBorrowed, DecomposingNormalizerBorrowed}; + use icu_properties::{ + CodePointSetData, + props::{ + BidiClass, BidiMirrored, CanonicalCombiningClass, EastAsianWidth, EnumeratedProperty, + GeneralCategory, NamedEnumeratedProperty, + }, + }; use itertools::Itertools; use rustpython_common::wtf8::{CodePoint, Wtf8Buf}; - use ucd::{Codepoint, DecompositionType, EastAsianWidth, Number, NumericType}; - use unic_char_property::EnumeratedCharProperty; - use unic_normal::StrNormalForm; + use ucd::{Codepoint, DecompositionType, Number, NumericType}; use unic_ucd_age::{Age, UNICODE_VERSION, UnicodeVersion}; - use unic_ucd_bidi::BidiClass; - use unic_ucd_category::GeneralCategory; - use unicode_bidi_mirroring::is_mirroring; pub(crate) fn module_exec(vm: &VirtualMachine, module: &Py) -> PyResult<()> { __module_exec(vm, module); @@ -117,9 +121,9 @@ mod unicodedata { .extract_char(character, vm)? .map_or(GeneralCategory::Unassigned, |c| { c.to_char() - .map_or(GeneralCategory::Surrogate, GeneralCategory::of) + .map_or(GeneralCategory::Surrogate, GeneralCategory::for_char) }) - .abbr_name() + .short_name() .to_owned()) } @@ -165,8 +169,8 @@ mod unicodedata { let bidi = match self.extract_char(character, vm)? { Some(c) => c .to_char() - .map_or(BidiClass::LeftToRight, BidiClass::of) - .abbr_name(), + .map_or(BidiClass::LeftToRight, BidiClass::for_char) + .short_name(), None => "", }; Ok(bidi) @@ -182,18 +186,34 @@ mod unicodedata { Ok(self .extract_char(character, vm)? .and_then(|c| c.to_char()) - .map_or(EastAsianWidth::Neutral, |c| c.east_asian_width()) - .abbr_name()) + .map_or(EastAsianWidth::Neutral, EastAsianWidth::for_char) + .short_name()) } #[pymethod] fn normalize(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { let text = unistr.as_wtf8(); let normalized_text = match form { - Nfc => text.map_utf8(|s| s.nfc()).collect(), - Nfkc => text.map_utf8(|s| s.nfkc()).collect(), - Nfd => text.map_utf8(|s| s.nfd()).collect(), - Nfkd => text.map_utf8(|s| s.nfkd()).collect(), + Nfc => { + let normalizer = ComposingNormalizerBorrowed::new_nfc(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfkc => { + let normalizer = ComposingNormalizerBorrowed::new_nfkc(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfd => { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfkd => { + let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } }; Ok(normalized_text) } @@ -202,10 +222,26 @@ mod unicodedata { fn is_normalized(&self, form: super::NormalizeForm, unistr: PyStrRef) -> PyResult { let text = unistr.as_wtf8(); let normalized: Wtf8Buf = match form { - Nfc => text.map_utf8(|s| s.nfc()).collect(), - Nfkc => text.map_utf8(|s| s.nfkc()).collect(), - Nfd => text.map_utf8(|s| s.nfd()).collect(), - Nfkd => text.map_utf8(|s| s.nfkd()).collect(), + Nfc => { + let normalizer = ComposingNormalizerBorrowed::new_nfc(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfkc => { + let normalizer = ComposingNormalizerBorrowed::new_nfkc(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfd => { + let normalizer = DecomposingNormalizerBorrowed::new_nfd(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } + Nfkd => { + let normalizer = DecomposingNormalizerBorrowed::new_nfkd(); + text.map_utf8(|s| normalizer.normalize_iter(s.chars())) + .collect() + } }; Ok(text == &*normalized) } @@ -216,7 +252,8 @@ mod unicodedata { Some(c) => { if let Some(ch) = c.to_char() { // Check if the character is mirrored in bidirectional text using Unicode standard - Ok(if is_mirroring(ch) { 1 } else { 0 }) + let bidi_mirrored = CodePointSetData::new::(); + Ok(if bidi_mirrored.contains(ch) { 1 } else { 0 }) } else { Ok(0) } @@ -226,11 +263,13 @@ mod unicodedata { } #[pymethod] - fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { + fn combining(&self, character: PyStrRef, vm: &VirtualMachine) -> PyResult { Ok(self .extract_char(character, vm)? .and_then(|c| c.to_char()) - .map_or(0, |ch| ch.canonical_combining_class() as i32)) + .map_or(0, |ch| { + CanonicalCombiningClass::for_char(ch).to_icu4c_value() + })) } #[pymethod] @@ -339,23 +378,6 @@ mod unicodedata { } } - trait EastAsianWidthAbbrName { - fn abbr_name(&self) -> &'static str; - } - - impl EastAsianWidthAbbrName for EastAsianWidth { - fn abbr_name(&self) -> &'static str { - match self { - Self::Narrow => "Na", - Self::Wide => "W", - Self::Neutral => "N", - Self::Ambiguous => "A", - Self::FullWidth => "F", - Self::HalfWidth => "H", - } - } - } - #[pyattr] fn ucd_3_2_0(vm: &VirtualMachine) -> PyRef { Ucd { diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml index 4eac727c2fa..b721418a4cc 100644 --- a/crates/vm/Cargo.toml +++ b/crates/vm/Cargo.toml @@ -86,10 +86,7 @@ timsort = "0.1.2" # TODO: use unic for this; needed for title case: # https://github.com/RustPython/RustPython/pull/832#discussion_r275428939 unicode-casing = { workspace = true } -# update version all at the same time -unic-ucd-bidi = { workspace = true } -unic-ucd-category = { workspace = true } -unic-ucd-ident = { workspace = true } +icu_properties = { workspace = true } [target.'cfg(unix)'.dependencies] rustix = { workspace = true } diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 8707c5cf769..b31dc6ccc9d 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -43,9 +43,10 @@ use rustpython_common::{ str::DeduceStrKind, wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, }; -use unic_ucd_bidi::BidiClass; -use unic_ucd_category::GeneralCategory; -use unic_ucd_ident::{is_xid_continue, is_xid_start}; + +use icu_properties::props::{ + BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, XidContinue, XidStart, +}; use unicode_casing::CharExt; impl<'a> TryFromBorrowedObject<'a> for String { @@ -966,7 +967,9 @@ impl PyStr { #[pymethod] fn isdecimal(&self) -> bool { !self.data.is_empty() - && self.char_all(|c| GeneralCategory::of(c) == GeneralCategory::DecimalNumber) + && self.char_all(|c| { + matches!(GeneralCategory::for_char(c), GeneralCategory::DecimalNumber) + }) } fn __mod__(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult { @@ -1091,11 +1094,17 @@ impl PyStr { #[pymethod] fn isspace(&self) -> bool { - use unic_ucd_bidi::bidi_class::abbr_names::*; !self.data.is_empty() && self.char_all(|c| { - GeneralCategory::of(c) == GeneralCategory::SpaceSeparator - || matches!(BidiClass::of(c), WS | B | S) + matches!( + GeneralCategory::for_char(c), + GeneralCategory::SpaceSeparator + ) || matches!( + BidiClass::for_char(c), + BidiClass::WhiteSpace + | BidiClass::ParagraphSeparator + | BidiClass::SegmentSeparator + ) }) } @@ -1355,9 +1364,13 @@ impl PyStr { pub fn isidentifier(&self) -> bool { let Some(s) = self.to_str() else { return false }; let mut chars = s.chars(); - let is_identifier_start = chars.next().is_some_and(|c| c == '_' || is_xid_start(c)); + + let is_identifier_start = chars + .next() + .is_some_and(|c| c == '_' || XidStart::for_char(c)); + // a string is not an identifier if it has whitespace or starts with a number - is_identifier_start && chars.all(is_xid_continue) + is_identifier_start && chars.all(XidContinue::for_char) } // https://docs.python.org/3/library/stdtypes.html#str.translate