From 7884239b4c14650d6585b232efa25b559d5a4d3f Mon Sep 17 00:00:00 2001 From: Josh Megnauth Date: Mon, 4 May 2026 12:30:52 -0400 Subject: [PATCH] Use icu4x for casefold() For ASCII, casefold() is equivalent to lowercasing the string. Case folding is a Unicode operation defined by the standard. Both CPython and RustPython follow the standard without any frills. The icu4x set of crates implements case folding as per the standard, so RustPython can use the ICU crates instead of pulling in a separate crate for one operation. This also has the extra benefit of consistency - consistent Unicode version and consistent crate usage. --- Cargo.lock | 10 ---------- Cargo.toml | 1 - crates/vm/Cargo.toml | 1 - crates/vm/src/builtins/str.rs | 37 +++++++++++++++++++++++------------ 4 files changed, 24 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4e9b54bb8aa..470f9bc919b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -429,15 +429,6 @@ dependencies = [ "libbz2-rs-sys", ] -[[package]] -name = "caseless" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6fd507454086c8edfd769ca6ada439193cdb209c7681712ef6275cccbfe5d8" -dependencies = [ - "unicode-normalization", -] - [[package]] name = "cast" version = "0.3.0" @@ -3561,7 +3552,6 @@ dependencies = [ "ascii", "bitflags 2.11.0", "bstr", - "caseless", "chrono", "constant_time_eq", "crossbeam-utils", diff --git a/Cargo.toml b/Cargo.toml index 80976b1e1c7..4ba2fef2e48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -186,7 +186,6 @@ bitflags = "2.11.0" bitflagset = "0.0.3" bstr = "1" bzip2 = "0.6" -caseless = "0.2.2" chrono = { version = "0.4.44", default-features = false, features = ["clock", "std"] } console_error_panic_hook = "0.1" constant_time_eq = "0.4" diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml index 869f3418f11..e04e65fce2d 100644 --- a/crates/vm/Cargo.toml +++ b/crates/vm/Cargo.toml @@ -73,7 +73,6 @@ strum_macros = { workspace = true } thiserror = { workspace = true 
} memchr = { workspace = true } -caseless = { workspace = true } flamer = { workspace = true, optional = true } half = { workspace = true } psm = { workspace = true } diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 960c3581301..212b14c7487 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -45,10 +45,10 @@ use rustpython_common::{ hash, lock::PyMutex, str::DeduceStrKind, - wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, + wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Concat}, }; -use icu_casemap::TitlecaseMapper; +use icu_casemap::{CaseMapper, TitlecaseMapper}; use icu_locale::LanguageIdentifier; use icu_properties::props::{ BidiClass, BinaryProperty, CaseIgnorable, Cased, EnumeratedProperty, GeneralCategory, @@ -743,20 +743,31 @@ impl PyStr { } } - // casefold is much more aggressive than lower + // Case folding is a Unicode standard operation to erase case differences. + // + // Lower, upper, and title case are special properties. Case folding erases those + // differences. For ASCII, case folding is the same as lower case but other scripts have + // their own, well-defined mappings. 
#[pymethod] fn casefold(&self) -> Self { match self.as_str_kind() { - PyKindStr::Ascii(s) => caseless::default_case_fold_str(s.as_str()).into(), - PyKindStr::Utf8(s) => caseless::default_case_fold_str(s).into(), - PyKindStr::Wtf8(w) => w - .chunks() - .map(|c| match c { - Wtf8Chunk::Utf8(s) => Wtf8Buf::from_string(caseless::default_case_fold_str(s)), - Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c), - }) - .collect::<Wtf8Buf>() - .into(), + PyKindStr::Ascii(s) => s.to_ascii_lowercase().into(), + PyKindStr::Utf8(s) => CaseMapper::new().fold_string(s).to_string().into(), + PyKindStr::Wtf8(w) => { + let mut out = VecFmtWriter(Vec::with_capacity(w.len())); + let mapper = CaseMapper::new(); + for chunk in w.as_bytes().utf8_chunks() { + mapper + .fold(chunk.valid()) + .write_to(&mut out) + .expect("Writing to an in-memory buffer cannot fail."); + out.0.extend(chunk.invalid()); + } + // SAFETY: + // * CaseMapper only produces valid UTF-8 + // * Surrogates are appended as-is + unsafe { Wtf8Buf::from_bytes_unchecked(out.0) }.into() + } } }