From 3f9648eed48cd8b6cd35d0ae2ee5bbe25fa735ac Mon Sep 17 00:00:00 2001 From: Owen Jacobson Date: Mon, 21 Oct 2024 00:36:44 -0400 Subject: Canonicalize login and channel names. Canonicalization does two things: * It prevents duplicate names that differ only by case or only by normalization/encoding sequence; and * It makes certain name-based comparisons "case-insensitive" (generalizing via Unicode's case-folding rules). This change is complicated, as it means that every name now needs to be stored in two forms. Unfortunately, this is _very likely_ a breaking schema change. The migrations in this commit perform a best-effort attempt to canonicalize existing channel or login names, but it's likely any existing channels or logins with non-ASCII characters will not be canonicalized correctly. Since clients look at all channel names and all login names on boot, and since the code in this commit verifies canonicalization when reading from the database, this will effectively make the server unusable until any incorrectly-canonicalized values are either manually canonicalized, or removed. It might be possible to do better with [the `icu` sqlite3 extension][icu], but (a) I'm not convinced of that and (b) this commit is already huge; adding database extension support would make it far larger. [icu]: https://sqlite.org/src/dir/ext/icu For some references on why it's worth storing usernames this way, see and the referenced talk, as well as . Bennett's treatment of this issue is, to my eye, much more readable than the referenced Unicode technical reports, and I'm inclined to trust his opinion given that he maintains a widely-used, internet-facing user registration library for Django. 
--- src/normalize/mod.rs | 36 ++++++++++++++++ src/normalize/string.rs | 112 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 src/normalize/mod.rs create mode 100644 src/normalize/string.rs (limited to 'src/normalize') diff --git a/src/normalize/mod.rs b/src/normalize/mod.rs new file mode 100644 index 0000000..6294201 --- /dev/null +++ b/src/normalize/mod.rs @@ -0,0 +1,36 @@ +mod string; + +pub mod nfc { + use std::string::String as StdString; + + use unicode_normalization::UnicodeNormalization as _; + + pub type String = super::string::String; + + #[derive(Clone, Debug, Default, Eq, PartialEq)] + pub struct Nfc; + + impl super::string::Normalize for Nfc { + fn normalize(&self, value: &str) -> StdString { + value.nfc().collect() + } + } +} + +pub mod ident { + use std::string::String as StdString; + + use unicode_casefold::UnicodeCaseFold as _; + use unicode_normalization::UnicodeNormalization as _; + + pub type String = super::string::String; + + #[derive(Clone, Debug, Default, Eq, PartialEq)] + pub struct Ident; + + impl super::string::Normalize for Ident { + fn normalize(&self, value: &str) -> StdString { + value.case_fold().nfkc().collect() + } + } +} diff --git a/src/normalize/string.rs b/src/normalize/string.rs new file mode 100644 index 0000000..a0d178c --- /dev/null +++ b/src/normalize/string.rs @@ -0,0 +1,112 @@ +use std::{fmt, string::String as StdString}; + +use sqlx::{ + encode::{Encode, IsNull}, + Database, Decode, Type, +}; + +pub trait Normalize: Clone + Default { + fn normalize(&self, value: &str) -> StdString; +} + +#[derive(Clone, Debug, Default, Eq, PartialEq, serde::Serialize, serde::Deserialize)] +#[serde(into = "StdString", from = "StdString")] +#[serde(bound = "N: Normalize")] +pub struct String(StdString, N); + +impl fmt::Display for String { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let Self(value, _) = self; + value.fmt(f) + } +} + +impl From for String +where + S: 
AsRef, + N: Normalize, +{ + fn from(value: S) -> Self { + let normalizer = N::default(); + let value = normalizer.normalize(value.as_ref()); + + Self(value, normalizer) + } +} + +impl From> for StdString { + fn from(value: String) -> Self { + let String(value, _) = value; + value + } +} + +impl std::ops::Deref for String { + type Target = StdString; + + fn deref(&self) -> &Self::Target { + let Self(value, _) = self; + value + } +} + +// Type is manually implemented so that we can implement Decode to do +// normalization on read. Implementation is otherwise based on +// `#[derive(sqlx::Type)]` with the `#[sqlx(transparent)]` attribute. +impl Type for String +where + DB: Database, + StdString: Type, +{ + fn type_info() -> ::TypeInfo { + >::type_info() + } + + fn compatible(ty: &::TypeInfo) -> bool { + >::compatible(ty) + } +} + +impl<'r, DB, N> Decode<'r, DB> for String +where + DB: Database, + StdString: Decode<'r, DB>, + N: Normalize, +{ + fn decode(value: ::ValueRef<'r>) -> Result { + let value = StdString::decode(value)?; + Ok(Self::from(value)) + } +} + +impl<'q, DB, N> Encode<'q, DB> for String +where + DB: Database, + StdString: Encode<'q, DB>, +{ + fn encode_by_ref( + &self, + buf: &mut ::ArgumentBuffer<'q>, + ) -> Result { + let Self(value, _) = self; + value.encode_by_ref(buf) + } + + fn encode( + self, + buf: &mut ::ArgumentBuffer<'q>, + ) -> Result { + let Self(value, _) = self; + value.encode(buf) + } + + fn produces(&self) -> Option<::TypeInfo> { + let Self(value, _) = self; + value.produces() + } + + fn size_hint(&self) -> usize { + let Self(value, _) = self; + value.size_hint() + } +} -- cgit v1.2.3