diff --git a/Cargo.lock b/Cargo.lock index 508dc691457..5caf17d6f9f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1514,6 +1514,28 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_casemap" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "070f98b5b82798fcb93654bf96ed9f40064fc44c86f51a09ea711092cd5cc5be" +dependencies = [ + "icu_casemap_data", + "icu_collections", + "icu_locale_core", + "icu_properties", + "icu_provider", + "potential_utf", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_casemap_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "846b0857ca091204be3c874bc93daaf89d4777e8d2d20b0d3ffe8f671d98014b" + [[package]] name = "icu_collections" version = "2.2.0" @@ -1522,12 +1544,28 @@ checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" dependencies = [ "displaydoc", "potential_utf", + "serde", "utf8_iter", "yoke", "zerofrom", "zerovec", ] +[[package]] +name = "icu_locale" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5a396343c7208121dc86e35623d3dfe19814a7613cfd14964994cdc9c9a2e26" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_locale_data", + "icu_provider", + "potential_utf", + "tinystr", + "zerovec", +] + [[package]] name = "icu_locale_core" version = "2.2.0" @@ -1536,11 +1574,18 @@ checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" dependencies = [ "displaydoc", "litemap", + "serde", "tinystr", "writeable", "zerovec", ] +[[package]] +name = "icu_locale_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fdcc9ac77c6d74ff5cf6e65ef3181d6af32003b16fce3a77fb451d2f695993" + [[package]] name = "icu_normalizer" version = "2.2.0" @@ -1592,6 +1637,8 @@ checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" dependencies = [ "displaydoc", "icu_locale_core", + "serde", + "stable_deref_trait", "writeable", "yoke", "zerofrom", @@ -2607,6 +2654,8 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" dependencies = [ + "serde_core", + "writeable", "zerovec", ] @@ -3524,6 +3573,8 @@ dependencies = [ "glob", "half", "hex", + "icu_casemap", + "icu_locale", "icu_properties", "indexmap", "is-macro", @@ -3567,11 +3618,11 @@ dependencies = [ "strum_macros", "thiserror 2.0.18", "timsort", - "unicode-casing", "wasm-bindgen", "which", "widestring", "windows-sys 0.61.2", + "writeable", ] [[package]] @@ -4139,6 +4190,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" dependencies = [ "displaydoc", + "serde_core", "zerovec", ] @@ -4295,12 +4347,6 @@ dependencies = [ "unic-common", ] -[[package]] -name = "unicode-casing" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061dbb8cc7f108532b6087a0065eff575e892a4bcb503dc57323a197457cc202" - [[package]] name = "unicode-ident" version = "1.0.22" @@ -4992,6 +5038,7 @@ dependencies = [ "displaydoc", "yoke", "zerofrom", + "zerovec", ] [[package]] @@ -5000,6 +5047,7 @@ version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" dependencies = [ + "serde", "yoke", "zerofrom", "zerovec-derive", diff --git a/Cargo.toml b/Cargo.toml index d926b5f5e2e..53886778bab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -288,11 +288,12 @@ termios = "0.3.3" thiserror = "2.0" timsort = "0.1.2" tk-sys = { git = "https://github.com/arihant2math/tkinter.git", tag = "v0.2.0" } +icu_casemap = "2" +icu_locale = "2" icu_properties = "2" icu_normalizer = "2" uuid = "1.23.1" ucd = "0.1.1" -unicode-casing = "0.1.1" unic-ucd-age = "0.9.0" unicode_names2 = "2.0.0" widestring = "1.2.0" @@ -303,6 +304,7 @@ which = "8" x509-cert = "0.2.5" x509-parser = "0.18" xml = "1.2" +writeable = "0.6" # Lints diff --git a/Lib/test/test_str.py b/Lib/test/test_str.py index 6d0e935c1c6..702650e1f2e 100644 --- a/Lib/test/test_str.py +++ b/Lib/test/test_str.py @@ -938,7 +938,6 @@ def test_upper(self): self.assertEqual('\U0008fffe'.upper(), '\U0008fffe') self.assertEqual('\u2177'.upper(), '\u2167') - @unittest.expectedFailure # TODO: RUSTPYTHON; ? ^ def test_capitalize(self): string_tests.StringLikeTest.test_capitalize(self) self.assertEqual('\U0001044F'.capitalize(), '\U00010427') @@ -956,7 +955,6 @@ def test_capitalize(self): self.assertEqual('finnish'.capitalize(), 'Finnish') self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2') - @unittest.expectedFailure # TODO: RUSTPYTHON; ? ^ def test_title(self): super().test_title() self.assertEqual('\U0001044F'.title(), '\U00010427') diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml index 16425fda8e2..869f3418f11 100644 --- a/crates/vm/Cargo.toml +++ b/crates/vm/Cargo.toml @@ -82,10 +82,10 @@ result-like = { workspace = true } timsort = { workspace = true } ## unicode stuff -# TODO: use unic for this; needed for title case: -# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939 -unicode-casing = { workspace = true } +icu_casemap = { workspace = true } +icu_locale = { workspace = true } icu_properties = { workspace = true } +writeable = { workspace = true } [target.'cfg(unix)'.dependencies] rustix = { workspace = true } diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 402cde304ab..60c588ba47c 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -27,6 +27,7 @@ use crate::{ AsMapping, AsNumber, AsSequence, Comparable, Constructor, Hashable, IterNext, Iterable, PyComparisonOp, Representable, SelfIter, }, + utils::VecFmtWriter, }; use alloc::{borrow::Cow, fmt}; use ascii::{AsciiChar, AsciiStr, AsciiString}; @@ -44,11 +45,13 @@ use rustpython_common::{ wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, }; +use icu_casemap::TitlecaseMapper; +use icu_locale::LanguageIdentifier; use icu_properties::props::{ - BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup, - Lowercase, NumericType, Uppercase, XidContinue, XidStart, + BidiClass, BinaryProperty, CaseIgnorable, Cased, EnumeratedProperty, GeneralCategory, + GeneralCategoryGroup, Lowercase, NumericType, Uppercase, XidContinue, XidStart, }; -use unicode_casing::CharExt; +use writeable::Writeable; impl<'a> TryFromBorrowedObject<'a> for String { fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult { @@ -776,25 +779,38 @@ impl PyStr { s.into() } PyKindStr::Utf8(s) => { - let mut chars = s.chars(); - let mut out = String::with_capacity(s.len()); - if let Some(c) = chars.next() { - out.extend(c.to_titlecase()); - out.push_str(&chars.as_str().to_lowercase()); + let mut chars = s.char_indices(); + let mut out = VecFmtWriter(Vec::with_capacity(s.len())); + titlecase_first(s, &mut chars, &mut out); + for (i, ch) in chars { + lowercase_or_sigma(ch, s, i, &mut out); } - out.into() + unsafe { Wtf8Buf::from_bytes_unchecked(out.0) } } PyKindStr::Wtf8(s) => { - let mut out = Wtf8Buf::with_capacity(s.len()); - let mut chars = s.code_points(); - if let Some(ch) = chars.next() { - match ch.to_char() { - Some(ch) => out.extend(ch.to_titlecase()), - None => out.push(ch), + let mut out = VecFmtWriter(Vec::with_capacity(s.len())); + let mut chunks = s.as_bytes().utf8_chunks(); + + if let Some(first) = chunks.next() { + let s = first.valid(); + let mut chars = s.char_indices(); + titlecase_first(s, &mut chars, &mut out); + for (i, ch) in chars { + lowercase_or_sigma(ch, s, i, &mut out); + } + out.0.extend(first.invalid()); + } + // This loop is only hit if the WTF-8 buffer contains invalid Unicode. Otherwise, + // everything is handled above without chunking. + for chunk in chunks { + let s = chunk.valid(); + for (i, ch) in s.char_indices() { + lowercase_or_sigma(ch, s, i, &mut out); } - out.push_wtf8(&chars.as_wtf8().to_lowercase()); + out.0.extend(chunk.invalid()); } - out + + unsafe { Wtf8Buf::from_bytes_unchecked(out.0) } } } } @@ -1061,30 +1077,28 @@ impl PyStr { #[pymethod] fn title(&self) -> Wtf8Buf { - let mut title = Wtf8Buf::with_capacity(self.data.len()); - let mut previous_is_cased = false; - for c_orig in self.as_wtf8().code_points() { - let c = c_orig.to_char_lossy(); - if c.is_lowercase() { - if !previous_is_cased { - title.extend(c.to_titlecase()); - } else { - title.push_char(c); - } - previous_is_cased = true; - } else if c.is_uppercase() || c.is_titlecase() { - if previous_is_cased { - title.extend(c.to_lowercase()); - } else { - title.extend(c.to_titlecase()); + match self.as_str_kind() { + PyKindStr::Ascii(_) => unsafe { + Wtf8Buf::from_bytes_unchecked(crate::bytes_inner::title_ascii(self.as_bytes())) + }, + PyKindStr::Utf8(s) => { + let mut out = VecFmtWriter(Vec::with_capacity(s.len())); + titlecase_string(s, &mut out); + // SAFETY: `s` is valid UTF-8 and titlecase_string only works on Unicode. + unsafe { Wtf8Buf::from_bytes_unchecked(out.0) } + } + PyKindStr::Wtf8(s) => { + let mut out = VecFmtWriter(Vec::with_capacity(s.len())); + for chunk in s.as_bytes().utf8_chunks() { + titlecase_string(chunk.valid(), &mut out); + out.0.extend(chunk.invalid()); } - previous_is_cased = true; - } else { - previous_is_cased = false; - title.push(c_orig); + // SAFETY: + // * `s` is valid WTF-8; surrogate bytes were appended without processing. + // * TitlecaseMapper produces valid UTF-8. + unsafe { Wtf8Buf::from_bytes_unchecked(out.0) } } } - title } #[pymethod] @@ -1316,7 +1330,9 @@ impl PyStr { let mut cased = false; let mut previous_is_cased = false; for c in self.as_wtf8().code_points().map(CodePoint::to_char_lossy) { - if c.is_uppercase() || c.is_titlecase() { + if c.is_uppercase() + || GeneralCategoryGroup::TitlecaseLetter.contains(GeneralCategory::for_char(c)) + { if previous_is_cased { return false; } @@ -1555,6 +1571,84 @@ impl PyStr { } } +/// Title case first char if it is cased or write as is. +/// +/// This matches CPython's behavior: +/// "123abc" -> "123abc" +/// "abc" -> "Abc" +fn titlecase_first(s: &str, chars: &mut core::str::CharIndices<'_>, out: &mut VecFmtWriter) { + if let Some((first_pos, first_ch)) = chars.next() { + let first = &s[..first_pos + first_ch.len_utf8()]; + let tm = TitlecaseMapper::new(); + tm.titlecase_segment(first, &LanguageIdentifier::UNKNOWN, Default::default()) + .write_to(out) + .expect("Writing to an in-memory buffer cannot fail."); + } +} + +/// Title case a string following CPython conventions. +/// +/// CPython title cases each char in a segment. A "segment" is split by case ignorable characters +/// rather than whitespace. +/// "123abc" -> "123Abc" +/// "123abc456def" -> "123Abc456Def" +/// "123 abc" -> "123 Abc" +fn titlecase_string(s: &str, out: &mut VecFmtWriter) { + let mut previous_is_cased = false; + let mapper = TitlecaseMapper::new(); + for (i, ch) in s.char_indices() { + if previous_is_cased { + lowercase_or_sigma(ch, s, i, out); + } else { + let s = &s[i..i + ch.len_utf8()]; + mapper + .titlecase_segment(s, &LanguageIdentifier::UNKNOWN, Default::default()) + .write_to(out) + .expect("Writing to an in-memory buffer cannot fail."); + } + + previous_is_cased = Cased::for_char(ch); + } +} + +fn lowercase_or_sigma(ch: char, s: &str, i: usize, out: &mut VecFmtWriter) { + let sigma = 'Σ'; + if ch == sigma { + let sigma_cased = handle_capital_sigma(s, i); + let mut buf = [0u8; 4]; + let s = sigma_cased.encode_utf8(&mut buf); + out.0.extend(s.as_bytes()); + } else { + for ch in ch.to_lowercase() { + let mut buf = [0u8; 4]; + let s = ch.encode_utf8(&mut buf); + out.0.extend(s.as_bytes()); + } + } +} + +// Handle context-sensitive sigma. +// +// CPython handles sigma as a special case. This is more efficient than using icu4x to scan the +// entire string with CaseMapper because CaseMapper would allocate to produce a new string. The +// icu4x crates are robust but CPython's capitalize() is NOT so we can skip the extra allocs. +fn handle_capital_sigma(s: &str, i: usize) -> char { + let (left, rest) = s.split_at(i); + let right = &rest['Σ'.len_utf8()..]; + + // Check if any chars before or after sigma are cased. + let before = left + .chars() + .rev() + .find(|&ch| !CaseIgnorable::for_char(ch)) + .is_some_and(Cased::for_char); + let after = right + .chars() + .find(|&ch| !CaseIgnorable::for_char(ch)) + .is_some_and(Cased::for_char); + if before && !after { 'ς' } else { 'σ' } +} + impl PyRef { #[must_use] pub fn is_empty(&self) -> bool { diff --git a/crates/vm/src/bytes_inner.rs b/crates/vm/src/bytes_inner.rs index c8645245611..f1057d18826 100644 --- a/crates/vm/src/bytes_inner.rs +++ b/crates/vm/src/bytes_inner.rs @@ -920,28 +920,9 @@ impl PyBytesInner { } } + #[inline] pub fn title(&self) -> Vec { - let mut res = vec![]; - let mut spaced = true; - - for i in &self.elements { - match i { - b'A'..=b'Z' | b'a'..=b'z' => { - if spaced { - res.push(i.to_ascii_uppercase()); - spaced = false - } else { - res.push(i.to_ascii_lowercase()); - } - } - _ => { - res.push(*i); - spaced = true - } - } - } - - res + title_ascii(self.as_bytes()) } pub fn cformat(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult> { @@ -1236,3 +1217,24 @@ pub(crate) fn bytes_to_hex( pub(crate) const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } + +/// ASCII-only title casing. +/// +/// This is purposely naive as is CPython's implementation. +pub(crate) fn title_ascii(bytes: &[u8]) -> Vec { + let mut next_upper = true; + let mut out = Vec::with_capacity(bytes.len()); + for &b in bytes { + let b = if !b.is_ascii_alphabetic() { + next_upper = true; + b + } else if next_upper { + next_upper = false; + b.to_ascii_uppercase() + } else { + b.to_ascii_lowercase() + }; + out.push(b); + } + out +} diff --git a/crates/vm/src/utils.rs b/crates/vm/src/utils.rs index b5117ddd8d1..51e27123fc8 100644 --- a/crates/vm/src/utils.rs +++ b/crates/vm/src/utils.rs @@ -1,3 +1,5 @@ +use core::fmt; + use rustpython_common::wtf8::{Wtf8, Wtf8Buf}; use crate::{ @@ -72,3 +74,16 @@ where Ok(repr) } + +/// Wrapper around a bytes vector that implements [`fmt::Write`]. +/// +/// # Safety +/// Don't assume the contents of the internal vector are valid UTF-8/WTF-8. +pub(crate) struct VecFmtWriter(pub Vec); + +impl fmt::Write for VecFmtWriter { + fn write_str(&mut self, s: &str) -> fmt::Result { + self.0.extend(s.bytes()); + Ok(()) + } +} diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py index 8cfb4f82b47..fde9deb8e0b 100644 --- a/extra_tests/snippets/builtin_str.py +++ b/extra_tests/snippets/builtin_str.py @@ -72,7 +72,7 @@ assert not a.isnumeric() assert a.istitle() assert "\u1c89".istitle() -# assert "DZ".title() == "Dz" +assert "DZ".title() == "Dz" assert a.isalpha() assert not "\u093f".isalpha() @@ -124,6 +124,12 @@ c = "hallo" assert c.capitalize() == "Hallo" +assert "ßello".capitalize() == "Ssello" +assert "İstanbul".capitalize() == "İstanbul" +assert "a\u0301bc".capitalize() == "Ábc" +assert "ΣΙΓΜΑ".capitalize() == "Σιγμα" +assert "😀hello".capitalize() == "😀hello" +assert "élan".capitalize() == "Élan" assert c.center(11, "-") == "---hallo---" assert ["koki".center(i, "|") for i in range(3, 10)] == [ "koki",