diff --git a/Cargo.lock b/Cargo.lock index 4e9b54bb8aa..470f9bc919b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -429,15 +429,6 @@ dependencies = [ "libbz2-rs-sys", ] -[[package]] -name = "caseless" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6fd507454086c8edfd769ca6ada439193cdb209c7681712ef6275cccbfe5d8" -dependencies = [ - "unicode-normalization", -] - [[package]] name = "cast" version = "0.3.0" @@ -3561,7 +3552,6 @@ dependencies = [ "ascii", "bitflags 2.11.0", "bstr", - "caseless", "chrono", "constant_time_eq", "crossbeam-utils", diff --git a/Cargo.toml b/Cargo.toml index 80976b1e1c7..4ba2fef2e48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -186,7 +186,6 @@ bitflags = "2.11.0" bitflagset = "0.0.3" bstr = "1" bzip2 = "0.6" -caseless = "0.2.2" chrono = { version = "0.4.44", default-features = false, features = ["clock", "std"] } console_error_panic_hook = "0.1" constant_time_eq = "0.4" diff --git a/crates/vm/Cargo.toml b/crates/vm/Cargo.toml index 869f3418f11..e04e65fce2d 100644 --- a/crates/vm/Cargo.toml +++ b/crates/vm/Cargo.toml @@ -73,7 +73,6 @@ strum_macros = { workspace = true } thiserror = { workspace = true } memchr = { workspace = true } -caseless = { workspace = true } flamer = { workspace = true, optional = true } half = { workspace = true } psm = { workspace = true } diff --git a/crates/vm/src/builtins/str.rs b/crates/vm/src/builtins/str.rs index 960c3581301..212b14c7487 100644 --- a/crates/vm/src/builtins/str.rs +++ b/crates/vm/src/builtins/str.rs @@ -45,10 +45,10 @@ use rustpython_common::{ hash, lock::PyMutex, str::DeduceStrKind, - wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Chunk, Wtf8Concat}, + wtf8::{CodePoint, Wtf8, Wtf8Buf, Wtf8Concat}, }; -use icu_casemap::TitlecaseMapper; +use icu_casemap::{CaseMapper, TitlecaseMapper}; use icu_locale::LanguageIdentifier; use icu_properties::props::{ BidiClass, BinaryProperty, CaseIgnorable, Cased, EnumeratedProperty, GeneralCategory, @@ -743,20 +743,31 @@ impl PyStr { } } - // casefold is much more aggressive than lower + // Case folding is a Unicode standard operation to erase case differences. + // + // Lower, upper, and title case are special properties. Case folding erases those + // differences. For ASCII, case folding is the same as lower case but other scripts have + // their own, well-defined mappings. #[pymethod] fn casefold(&self) -> Self { match self.as_str_kind() { - PyKindStr::Ascii(s) => caseless::default_case_fold_str(s.as_str()).into(), - PyKindStr::Utf8(s) => caseless::default_case_fold_str(s).into(), - PyKindStr::Wtf8(w) => w - .chunks() - .map(|c| match c { - Wtf8Chunk::Utf8(s) => Wtf8Buf::from_string(caseless::default_case_fold_str(s)), - Wtf8Chunk::Surrogate(c) => Wtf8Buf::from(c), - }) - .collect::() - .into(), + PyKindStr::Ascii(s) => s.to_ascii_lowercase().into(), + PyKindStr::Utf8(s) => CaseMapper::new().fold_string(s).to_string().into(), + PyKindStr::Wtf8(w) => { + let mut out = VecFmtWriter(Vec::with_capacity(w.len())); + let mapper = CaseMapper::new(); + for chunk in w.as_bytes().utf8_chunks() { + mapper + .fold(chunk.valid()) + .write_to(&mut out) + .expect("Writing to an in-memory buffer cannot fail."); + out.0.extend(chunk.invalid()); + } + // SAFETY: + // * CaseMapper only produces valid UTF-8 + // * Surrogates are appended as-is + unsafe { Wtf8Buf::from_bytes_unchecked(out.0) }.into() + } } }