From 9d8d34e5e77602dec556832f090809e83ca8a354 Mon Sep 17 00:00:00 2001 From: Josh Megnauth Date: Sun, 26 Apr 2026 13:53:04 -0400 Subject: [PATCH 1/4] PyBytes.title should be ASCII-only. --- crates/vm/src/builtins/str.rs | 9 ++++++ crates/vm/src/bytes_inner.rs | 44 +++++++++++++++-------------- extra_tests/snippets/builtin_str.py | 2 +- 3 files changed, 33 insertions(+), 22 deletions(-) diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 402cde304ab..abedd498a1a 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -1061,6 +1061,15 @@ impl PyStr { #[pymethod] fn title(&self) -> Wtf8Buf { + match self.as_str_kind() { + PyKindStr::Ascii(_) => unsafe { + Wtf8Buf::from_bytes_unchecked(crate::bytes_inner::title_ascii(self.as_bytes())) + }, + PyKindStr::Wtf8(_) | PyKindStr::Utf8(_) => self.title_non_ascii(), + } + } + + fn title_non_ascii(&self) -> Wtf8Buf { let mut title = Wtf8Buf::with_capacity(self.data.len()); let mut previous_is_cased = false; for c_orig in self.as_wtf8().code_points() { diff --git a/crates/vm/src/bytes_inner.rs b/crates/vm/src/bytes_inner.rs index c8645245611..98fbd2568aa 100644 --- a/crates/vm/src/bytes_inner.rs +++ b/crates/vm/src/bytes_inner.rs @@ -920,28 +920,9 @@ impl PyBytesInner { } } + #[inline] pub fn title(&self) -> Vec { - let mut res = vec![]; - let mut spaced = true; - - for i in &self.elements { - match i { - b'A'..=b'Z' | b'a'..=b'z' => { - if spaced { - res.push(i.to_ascii_uppercase()); - spaced = false - } else { - res.push(i.to_ascii_lowercase()); - } - } - _ => { - res.push(*i); - spaced = true - } - } - } - - res + title_ascii(self.as_bytes()) } pub fn cformat(&self, values: PyObjectRef, vm: &VirtualMachine) -> PyResult> { @@ -1236,3 +1217,24 @@ pub(crate) fn bytes_to_hex( pub(crate) const fn is_py_ascii_whitespace(b: u8) -> bool { matches!(b, b'\t' | b'\n' | b'\x0C' | b'\r' | b' ' | b'\x0B') } + +/// ASCII-only title casing. 
+/// +/// This is purposely naive as is CPython's implementation. +pub fn title_ascii(bytes: &[u8]) -> Vec { + let mut next_upper = true; + let mut out = Vec::with_capacity(bytes.len()); + for &b in bytes { + let b = if !b.is_ascii_alphabetic() { + next_upper = true; + b + } else if next_upper { + next_upper = false; + b.to_ascii_uppercase() + } else { + b.to_ascii_lowercase() + }; + out.push(b); + } + out +} diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py index 8cfb4f82b47..3e054fd43b2 100644 --- a/extra_tests/snippets/builtin_str.py +++ b/extra_tests/snippets/builtin_str.py @@ -72,7 +72,7 @@ assert not a.isnumeric() assert a.istitle() assert "\u1c89".istitle() -# assert "DZ".title() == "Dz" +assert "DZ".title() == "Dz" assert a.isalpha() assert not "\u093f".isalpha() From edc87f24d5584611c3bfd9c72ca2f9e6e917de66 Mon Sep 17 00:00:00 2001 From: Josh Megnauth Date: Tue, 28 Apr 2026 13:47:32 -0400 Subject: [PATCH 2/4] Use icu_casemap over unicode-casing for titles `icu_casemap` is consistently maintained, official, and tracks the latest Unicode versions. RustPython is also using other `icu4x` crates, so using `icu_casemap` is more consistent. As with islower and isupper, tracking the latest Unicode version is important because character definitions shift over time which causes discrepancies between RustPython and CPython. This commit fixes title(). 
--- Cargo.lock | 55 +++++++++++++++++++++++++++++++++++ Cargo.toml | 3 ++ crates/vm/Cargo.toml | 3 ++ crates/vm/src/builtins/str.rs | 28 +++++++++++++++++- crates/vm/src/utils.rs | 15 ++++++++++ 5 files changed, 103 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 508dc691457..ac38484b859 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1514,6 +1514,28 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_casemap" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "070f98b5b82798fcb93654bf96ed9f40064fc44c86f51a09ea711092cd5cc5be" +dependencies = [ + "icu_casemap_data", + "icu_collections", + "icu_locale_core", + "icu_properties", + "icu_provider", + "potential_utf", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_casemap_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "846b0857ca091204be3c874bc93daaf89d4777e8d2d20b0d3ffe8f671d98014b" + [[package]] name = "icu_collections" version = "2.2.0" @@ -1522,12 +1544,28 @@ checksum = "2984d1cd16c883d7935b9e07e44071dca8d917fd52ecc02c04d5fa0b5a3f191c" dependencies = [ "displaydoc", "potential_utf", + "serde", "utf8_iter", "yoke", "zerofrom", "zerovec", ] +[[package]] +name = "icu_locale" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5a396343c7208121dc86e35623d3dfe19814a7613cfd14964994cdc9c9a2e26" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_locale_data", + "icu_provider", + "potential_utf", + "tinystr", + "zerovec", +] + [[package]] name = "icu_locale_core" version = "2.2.0" @@ -1536,11 +1574,18 @@ checksum = "92219b62b3e2b4d88ac5119f8904c10f8f61bf7e95b640d25ba3075e6cac2c29" dependencies = [ "displaydoc", "litemap", + "serde", "tinystr", "writeable", "zerovec", ] +[[package]] +name = "icu_locale_data" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d5fdcc9ac77c6d74ff5cf6e65ef3181d6af32003b16fce3a77fb451d2f695993" + [[package]] name = "icu_normalizer" version = "2.2.0" @@ -1592,6 +1637,8 @@ checksum = "139c4cf31c8b5f33d7e199446eff9c1e02decfc2f0eec2c8d71f65befa45b421" dependencies = [ "displaydoc", "icu_locale_core", + "serde", + "stable_deref_trait", "writeable", "yoke", "zerofrom", @@ -2607,6 +2654,8 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" dependencies = [ + "serde_core", + "writeable", "zerovec", ] @@ -3524,6 +3573,8 @@ dependencies = [ "glob", "half", "hex", + "icu_casemap", + "icu_locale", "icu_properties", "indexmap", "is-macro", @@ -3572,6 +3623,7 @@ dependencies = [ "which", "widestring", "windows-sys 0.61.2", + "writeable", ] [[package]] @@ -4139,6 +4191,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8323304221c2a851516f22236c5722a72eaa19749016521d6dff0824447d96d" dependencies = [ "displaydoc", + "serde_core", "zerovec", ] @@ -4992,6 +5045,7 @@ dependencies = [ "displaydoc", "yoke", "zerofrom", + "zerovec", ] [[package]] @@ -5000,6 +5054,7 @@ version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90f911cbc359ab6af17377d242225f4d75119aec87ea711a880987b18cd7b239" dependencies = [ + "serde", "yoke", "zerofrom", "zerovec-derive", diff --git a/Cargo.toml b/Cargo.toml index d926b5f5e2e..d16e799526c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -288,6 +288,8 @@ termios = "0.3.3" thiserror = "2.0" timsort = "0.1.2" tk-sys = { git = "https://github.com/arihant2math/tkinter.git", tag = "v0.2.0" } +icu_casemap = "2" +icu_locale = "2" icu_properties = "2" icu_normalizer = "2" uuid = "1.23.1" @@ -303,6 +305,7 @@ which = "8" x509-cert = "0.2.5" x509-parser = "0.18" xml = "1.2" +writeable = "0.6" # Lints diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml index 16425fda8e2..754dd508db2 100644 --- 
a/crates/vm/Cargo.toml +++ b/crates/vm/Cargo.toml @@ -85,7 +85,10 @@ timsort = { workspace = true } # TODO: use unic for this; needed for title case: # https://github.com/RustPython/RustPython/pull/832#discussion_r275428939 unicode-casing = { workspace = true } +icu_casemap = { workspace = true } +icu_locale = { workspace = true } icu_properties = { workspace = true } +writeable = { workspace = true } [target.'cfg(unix)'.dependencies] rustix = { workspace = true } diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index abedd498a1a..2b47bc350c3 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -27,6 +27,7 @@ use crate::{ AsMapping, AsNumber, AsSequence, Comparable, Constructor, Hashable, IterNext, Iterable, PyComparisonOp, Representable, SelfIter, }, + utils::VecFmtWriter, }; use alloc::{borrow::Cow, fmt}; use ascii::{AsciiChar, AsciiStr, AsciiString}; @@ -44,11 +45,14 @@ use rustpython_common::{ wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, }; +use icu_casemap::TitlecaseMapper; +use icu_locale::LanguageIdentifier; use icu_properties::props::{ BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup, Lowercase, NumericType, Uppercase, XidContinue, XidStart, }; use unicode_casing::CharExt; +use writeable::Writeable; impl<'a> TryFromBorrowedObject<'a> for String { fn try_from_borrowed_object(vm: &VirtualMachine, obj: &'a PyObject) -> PyResult { @@ -1065,7 +1069,29 @@ impl PyStr { PyKindStr::Ascii(_) => unsafe { Wtf8Buf::from_bytes_unchecked(crate::bytes_inner::title_ascii(self.as_bytes())) }, - PyKindStr::Wtf8(_) | PyKindStr::Utf8(_) => self.title_non_ascii(), + PyKindStr::Utf8(s) => TitlecaseMapper::new() + .titlecase_segment_to_string(s, &LanguageIdentifier::UNKNOWN, Default::default()) + .to_string() + .into(), + PyKindStr::Wtf8(s) => { + let mut buf = VecFmtWriter(Vec::with_capacity(s.len())); + let mapper = TitlecaseMapper::new(); + for chunk in 
s.as_bytes().utf8_chunks() { + mapper + .titlecase_segment( + chunk.valid(), + &LanguageIdentifier::UNKNOWN, + Default::default(), + ) + .write_to(&mut buf) + .expect("Writing to an in-memory buffer cannot fail."); + buf.0.extend(chunk.invalid()); + } + // SAFETY: + // * `s` is valid WTF-8; surrogate bytes were appended without processing. + // * TitlecaseMapper produces valid UTF-8. + unsafe { Wtf8Buf::from_bytes_unchecked(buf.0) } + } } } diff --git a/crates/vm/src/utils.rs b/crates/vm/src/utils.rs index b5117ddd8d1..51e27123fc8 100644 --- a/crates/vm/src/utils.rs +++ b/crates/vm/src/utils.rs @@ -1,3 +1,5 @@ +use core::fmt; + use rustpython_common::wtf8::{Wtf8, Wtf8Buf}; use crate::{ @@ -72,3 +74,16 @@ where Ok(repr) } + +/// Wrapper around a bytes vector that implements [`fmt::Write`]. +/// +/// # Safety +/// Don't assume the contents of the internal vector are valid UTF-8/WTF-8. +pub(crate) struct VecFmtWriter(pub Vec); + +impl fmt::Write for VecFmtWriter { + fn write_str(&mut self, s: &str) -> fmt::Result { + self.0.extend(s.bytes()); + Ok(()) + } +} From 58d8b7cbae74de426faf9ba218514aee71bb228d Mon Sep 17 00:00:00 2001 From: Josh Megnauth Date: Wed, 29 Apr 2026 20:30:28 -0400 Subject: [PATCH 3/4] Use icu_casemap for capitalize() I dropped unicode-casing because it's cleaner to use icu4x for everything. `icu4x` will also stay up to date whereas unicode-casing will need to be periodically updated with new Unicode tables. Dropping unicode-casing also removes some binary bloat due to the tables. `capitalize()` mimics CPython behavior more closely now as well. Notably, I implemented CPython's sigma edge case handler. 
--- Cargo.lock | 7 -- Cargo.toml | 1 - Lib/test/test_str.py | 1 - crates/vm/Cargo.toml | 3 - crates/vm/src/builtins/str.rs | 126 +++++++++++++++++----------- extra_tests/snippets/builtin_str.py | 6 ++ 6 files changed, 85 insertions(+), 59 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ac38484b859..5caf17d6f9f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3618,7 +3618,6 @@ dependencies = [ "strum_macros", "thiserror 2.0.18", "timsort", - "unicode-casing", "wasm-bindgen", "which", "widestring", @@ -4348,12 +4347,6 @@ dependencies = [ "unic-common", ] -[[package]] -name = "unicode-casing" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "061dbb8cc7f108532b6087a0065eff575e892a4bcb503dc57323a197457cc202" - [[package]] name = "unicode-ident" version = "1.0.22" diff --git a/Cargo.toml b/Cargo.toml index d16e799526c..53886778bab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -294,7 +294,6 @@ icu_properties = "2" icu_normalizer = "2" uuid = "1.23.1" ucd = "0.1.1" -unicode-casing = "0.1.1" unic-ucd-age = "0.9.0" unicode_names2 = "2.0.0" widestring = "1.2.0" diff --git a/Lib/test/test_str.py b/Lib/test/test_str.py index 6d0e935c1c6..f94ac25cbe6 100644 --- a/Lib/test/test_str.py +++ b/Lib/test/test_str.py @@ -938,7 +938,6 @@ def test_upper(self): self.assertEqual('\U0008fffe'.upper(), '\U0008fffe') self.assertEqual('\u2177'.upper(), '\u2167') - @unittest.expectedFailure # TODO: RUSTPYTHON; ? 
^ def test_capitalize(self): string_tests.StringLikeTest.test_capitalize(self) self.assertEqual('\U0001044F'.capitalize(), '\U00010427') diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml index 754dd508db2..869f3418f11 100644 --- a/crates/vm/Cargo.toml +++ b/crates/vm/Cargo.toml @@ -82,9 +82,6 @@ result-like = { workspace = true } timsort = { workspace = true } ## unicode stuff -# TODO: use unic for this; needed for title case: -# https://github.com/RustPython/RustPython/pull/832#discussion_r275428939 -unicode-casing = { workspace = true } icu_casemap = { workspace = true } icu_locale = { workspace = true } icu_properties = { workspace = true } diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 2b47bc350c3..6fd090d36e9 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -48,10 +48,9 @@ use rustpython_common::{ use icu_casemap::TitlecaseMapper; use icu_locale::LanguageIdentifier; use icu_properties::props::{ - BidiClass, BinaryProperty, EnumeratedProperty, GeneralCategory, GeneralCategoryGroup, - Lowercase, NumericType, Uppercase, XidContinue, XidStart, + BidiClass, BinaryProperty, CaseIgnorable, Cased, EnumeratedProperty, GeneralCategory, + GeneralCategoryGroup, Lowercase, NumericType, Uppercase, XidContinue, XidStart, }; -use unicode_casing::CharExt; use writeable::Writeable; impl<'a> TryFromBorrowedObject<'a> for String { @@ -780,25 +779,33 @@ impl PyStr { s.into() } PyKindStr::Utf8(s) => { - let mut chars = s.chars(); - let mut out = String::with_capacity(s.len()); - if let Some(c) = chars.next() { - out.extend(c.to_titlecase()); - out.push_str(&chars.as_str().to_lowercase()); - } - out.into() + let mut chars = s.char_indices(); + let mut out = VecFmtWriter(Vec::with_capacity(s.len())); + titlecase_first(s, &mut chars, &mut out); + lowercase_and_sigma(s, &mut chars, &mut out); + unsafe { Wtf8Buf::from_bytes_unchecked(out.0) } } PyKindStr::Wtf8(s) => { - let mut out = 
Wtf8Buf::with_capacity(s.len()); - let mut chars = s.code_points(); - if let Some(ch) = chars.next() { - match ch.to_char() { - Some(ch) => out.extend(ch.to_titlecase()), - None => out.push(ch), - } - out.push_wtf8(&chars.as_wtf8().to_lowercase()); + let mut out = VecFmtWriter(Vec::with_capacity(s.len())); + let mut chunks = s.as_bytes().utf8_chunks(); + + if let Some(first) = chunks.next() { + let s = first.valid(); + let mut chars = s.char_indices(); + titlecase_first(s, &mut chars, &mut out); + lowercase_and_sigma(s, &mut chars, &mut out); + out.0.extend(first.invalid()); } - out + // This loop is only hit if the WTF-8 buffer contains invalid Unicode. Otherwise, + // everything is handled above without chunking. + for chunk in chunks { + let s = chunk.valid(); + let mut chars = s.char_indices(); + lowercase_and_sigma(s, &mut chars, &mut out); + out.0.extend(chunk.invalid()); + } + + unsafe { Wtf8Buf::from_bytes_unchecked(out.0) } } } } @@ -1095,33 +1102,6 @@ impl PyStr { } } - fn title_non_ascii(&self) -> Wtf8Buf { - let mut title = Wtf8Buf::with_capacity(self.data.len()); - let mut previous_is_cased = false; - for c_orig in self.as_wtf8().code_points() { - let c = c_orig.to_char_lossy(); - if c.is_lowercase() { - if !previous_is_cased { - title.extend(c.to_titlecase()); - } else { - title.push_char(c); - } - previous_is_cased = true; - } else if c.is_uppercase() || c.is_titlecase() { - if previous_is_cased { - title.extend(c.to_lowercase()); - } else { - title.extend(c.to_titlecase()); - } - previous_is_cased = true; - } else { - previous_is_cased = false; - title.push(c_orig); - } - } - title - } - #[pymethod] fn swapcase(&self) -> Wtf8Buf { let mut swapped_str = Wtf8Buf::with_capacity(self.data.len()); @@ -1351,7 +1331,9 @@ impl PyStr { let mut cased = false; let mut previous_is_cased = false; for c in self.as_wtf8().code_points().map(CodePoint::to_char_lossy) { - if c.is_uppercase() || c.is_titlecase() { + if c.is_uppercase() + || 
+ || GeneralCategoryGroup::TitlecaseLetter.contains(GeneralCategory::for_char(c)) + { if previous_is_cased { return false; } @@ -1590,6 +1572,56 @@ impl PyStr { } } +fn titlecase_first(s: &str, chars: &mut core::str::CharIndices<'_>, out: &mut VecFmtWriter) { + if let Some((first_pos, first_ch)) = chars.next() { + let first = &s[..first_pos + first_ch.len_utf8()]; + let tm = TitlecaseMapper::new(); + tm.titlecase_segment(first, &LanguageIdentifier::UNKNOWN, Default::default()) + .write_to(out) + .expect("Writing to an in-memory buffer cannot fail."); + } +} + +fn lowercase_and_sigma(s: &str, chars: &mut core::str::CharIndices<'_>, out: &mut VecFmtWriter) { + let sigma = 'Σ'; + for (i, ch) in chars { + if ch == sigma { + let sigma_cased = handle_capital_sigma(s, i); + let mut buf = [0u8; 4]; + let s = sigma_cased.encode_utf8(&mut buf); + out.0.extend(s.as_bytes()); + } else { + for ch in ch.to_lowercase() { + let mut buf = [0u8; 4]; + let s = ch.encode_utf8(&mut buf); + out.0.extend(s.as_bytes()); + } + } + } +} + +// Handle context-sensitive sigma. +// +// CPython handles sigma as a special case. This is more efficient than using icu4x to scan the +// entire string with CaseMapper because CaseMapper would allocate to produce a new string. The +// icu4x crates are robust but CPython's capitalize() is NOT so we can skip the extra allocs. +fn handle_capital_sigma(s: &str, i: usize) -> char { + let (left, rest) = s.split_at(i); + let right = &rest['Σ'.len_utf8()..]; + + // Check whether the nearest non-case-ignorable char on each side of sigma is cased. 
+ let before = left + .chars() + .rev() + .find(|&ch| !CaseIgnorable::for_char(ch)) + .is_some_and(Cased::for_char); + let after = right + .chars() + .find(|&ch| !CaseIgnorable::for_char(ch)) + .is_some_and(Cased::for_char); + if before && !after { 'ς' } else { 'σ' } +} + impl PyRef { #[must_use] pub fn is_empty(&self) -> bool { diff --git a/extra_tests/snippets/builtin_str.py b/extra_tests/snippets/builtin_str.py index 3e054fd43b2..fde9deb8e0b 100644 --- a/extra_tests/snippets/builtin_str.py +++ b/extra_tests/snippets/builtin_str.py @@ -124,6 +124,12 @@ c = "hallo" assert c.capitalize() == "Hallo" +assert "ßello".capitalize() == "Ssello" +assert "İstanbul".capitalize() == "İstanbul" +assert "a\u0301bc".capitalize() == "Ábc" +assert "ΣΙΓΜΑ".capitalize() == "Σιγμα" +assert "😀hello".capitalize() == "😀hello" +assert "élan".capitalize() == "Élan" assert c.center(11, "-") == "---hallo---" assert ["koki".center(i, "|") for i in range(3, 10)] == [ "koki", From ce75f328cec83c3ec715abeced2d66fa401fd666 Mon Sep 17 00:00:00 2001 From: Josh Megnauth Date: Fri, 1 May 2026 20:38:45 -0400 Subject: [PATCH 4/4] Match CPython's title() exactly --- Lib/test/test_str.py | 1 - crates/vm/src/builtins/str.rs | 89 +++++++++++++++++++++++------------ crates/vm/src/bytes_inner.rs | 2 +- 3 files changed, 59 insertions(+), 33 deletions(-) diff --git a/Lib/test/test_str.py b/Lib/test/test_str.py index f94ac25cbe6..702650e1f2e 100644 --- a/Lib/test/test_str.py +++ b/Lib/test/test_str.py @@ -955,7 +955,6 @@ def test_capitalize(self): self.assertEqual('finnish'.capitalize(), 'Finnish') self.assertEqual('A\u0345\u03a3'.capitalize(), 'A\u0345\u03c2') - @unittest.expectedFailure # TODO: RUSTPYTHON; ? 
^ def test_title(self): super().test_title() self.assertEqual('\U0001044F'.title(), '\U00010427') diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 6fd090d36e9..60c588ba47c 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -782,7 +782,9 @@ impl PyStr { let mut chars = s.char_indices(); let mut out = VecFmtWriter(Vec::with_capacity(s.len())); titlecase_first(s, &mut chars, &mut out); - lowercase_and_sigma(s, &mut chars, &mut out); + for (i, ch) in chars { + lowercase_or_sigma(ch, s, i, &mut out); + } unsafe { Wtf8Buf::from_bytes_unchecked(out.0) } } PyKindStr::Wtf8(s) => { @@ -793,15 +795,18 @@ impl PyStr { let s = first.valid(); let mut chars = s.char_indices(); titlecase_first(s, &mut chars, &mut out); - lowercase_and_sigma(s, &mut chars, &mut out); + for (i, ch) in chars { + lowercase_or_sigma(ch, s, i, &mut out); + } out.0.extend(first.invalid()); } // This loop is only hit if the WTF-8 buffer contains invalid Unicode. Otherwise, // everything is handled above without chunking. for chunk in chunks { let s = chunk.valid(); - let mut chars = s.char_indices(); - lowercase_and_sigma(s, &mut chars, &mut out); + for (i, ch) in s.char_indices() { + lowercase_or_sigma(ch, s, i, &mut out); + } out.0.extend(chunk.invalid()); } @@ -1076,28 +1081,22 @@ impl PyStr { PyKindStr::Ascii(_) => unsafe { Wtf8Buf::from_bytes_unchecked(crate::bytes_inner::title_ascii(self.as_bytes())) }, - PyKindStr::Utf8(s) => TitlecaseMapper::new() - .titlecase_segment_to_string(s, &LanguageIdentifier::UNKNOWN, Default::default()) - .to_string() - .into(), + PyKindStr::Utf8(s) => { + let mut out = VecFmtWriter(Vec::with_capacity(s.len())); + titlecase_string(s, &mut out); + // SAFETY: `s` is valid UTF-8 and titlecase_string only works on Unicode. 
+ unsafe { Wtf8Buf::from_bytes_unchecked(out.0) } + } PyKindStr::Wtf8(s) => { - let mut buf = VecFmtWriter(Vec::with_capacity(s.len())); - let mapper = TitlecaseMapper::new(); + let mut out = VecFmtWriter(Vec::with_capacity(s.len())); for chunk in s.as_bytes().utf8_chunks() { - mapper - .titlecase_segment( - chunk.valid(), - &LanguageIdentifier::UNKNOWN, - Default::default(), - ) - .write_to(&mut buf) - .expect("Writing to an in-memory buffer cannot fail."); - buf.0.extend(chunk.invalid()); + titlecase_string(chunk.valid(), &mut out); + out.0.extend(chunk.invalid()); } // SAFETY: // * `s` is valid WTF-8; surrogate bytes were appended without processing. // * TitlecaseMapper produces valid UTF-8. - unsafe { Wtf8Buf::from_bytes_unchecked(buf.0) } + unsafe { Wtf8Buf::from_bytes_unchecked(out.0) } } } } @@ -1572,6 +1571,11 @@ impl PyStr { } } +/// Title case first char if it is cased or write as is. +/// +/// This matches CPython's behavior: +/// "123abc" -> "123abc" +/// "abc" -> "Abc" fn titlecase_first(s: &str, chars: &mut core::str::CharIndices<'_>, out: &mut VecFmtWriter) { if let Some((first_pos, first_ch)) = chars.next() { let first = &s[..first_pos + first_ch.len_utf8()]; @@ -1582,20 +1586,43 @@ fn titlecase_first(s: &str, chars: &mut core::str::CharIndices<'_>, out: &mut Ve } } -fn lowercase_and_sigma(s: &str, chars: &mut core::str::CharIndices<'_>, out: &mut VecFmtWriter) { +/// Title case a string following CPython conventions. +/// +/// CPython title cases the first char of each segment and lowercases the rest. A "segment" is +/// delimited by uncased characters rather than whitespace. 
+/// "123abc" -> "123Abc" +/// "123abc456def" -> "123Abc456Def" +/// "123 abc" -> "123 Abc" +fn titlecase_string(s: &str, out: &mut VecFmtWriter) { + let mut previous_is_cased = false; + let mapper = TitlecaseMapper::new(); + for (i, ch) in s.char_indices() { + if previous_is_cased { + lowercase_or_sigma(ch, s, i, out); + } else { + let s = &s[i..i + ch.len_utf8()]; + mapper + .titlecase_segment(s, &LanguageIdentifier::UNKNOWN, Default::default()) + .write_to(out) + .expect("Writing to an in-memory buffer cannot fail."); + } + + previous_is_cased = Cased::for_char(ch); + } +} + +fn lowercase_or_sigma(ch: char, s: &str, i: usize, out: &mut VecFmtWriter) { let sigma = 'Σ'; - for (i, ch) in chars { - if ch == sigma { - let sigma_cased = handle_capital_sigma(s, i); + if ch == sigma { + let sigma_cased = handle_capital_sigma(s, i); + let mut buf = [0u8; 4]; + let s = sigma_cased.encode_utf8(&mut buf); + out.0.extend(s.as_bytes()); + } else { + for ch in ch.to_lowercase() { let mut buf = [0u8; 4]; - let s = sigma_cased.encode_utf8(&mut buf); + let s = ch.encode_utf8(&mut buf); out.0.extend(s.as_bytes()); - } else { - for ch in ch.to_lowercase() { - let mut buf = [0u8; 4]; - let s = ch.encode_utf8(&mut buf); - out.0.extend(s.as_bytes()); - } } } } diff --git a/crates/vm/src/bytes_inner.rs b/crates/vm/src/bytes_inner.rs index 98fbd2568aa..f1057d18826 100644 --- a/crates/vm/src/bytes_inner.rs +++ b/crates/vm/src/bytes_inner.rs @@ -1221,7 +1221,7 @@ pub(crate) const fn is_py_ascii_whitespace(b: u8) -> bool { /// ASCII-only title casing. /// /// This is purposely naive as is CPython's implementation. -pub fn title_ascii(bytes: &[u8]) -> Vec { +pub(crate) fn title_ascii(bytes: &[u8]) -> Vec { let mut next_upper = true; let mut out = Vec::with_capacity(bytes.len()); for &b in bytes {