Canonicalize login and channel names.

Canonicalization does two things: * It prevents duplicate names that differ only by case or only by normalization/encoding sequence; and * It makes certain name-based comparisons "case-insensitive" (generalizing via Unicode's case-folding rules). This change is complicated, as it means that every name now needs to be stored in two forms. Unfortunately, this is _very likely_ a breaking schema change. The migrations in this commit perform a best-effort attempt to canonicalize existing channel or login names, but it's likely any existing channels or logins with non-ASCII characters will not be canonicalize correctly. Since clients look at all channel names and all login names on boot, and since the code in this commit verifies canonicalization when reading from the database, this will effectively make the server un-usuable until any incorrectly-canonicalized values are either manually canonicalized, or removed It might be possible to do better with [the `icu` sqlite3 extension][icu], but (a) I'm not convinced of that and (b) this commit is already huge; adding database extension support would make it far larger. [icu]: https://sqlite.org/src/dir/ext/icu For some references on why it's worth storing usernames this way, see <https://www.b-list.org/weblog/2018/nov/26/case/> and the refernced talk, as well as <https://www.b-list.org/weblog/2018/feb/11/usernames/>. Bennett's treatment of this issue is, to my eye, much more readable than the referenced Unicode technical reports, and I'm inclined to trust his opinion given that he maintains a widely-used, internet-facing user registration library for Django.
author: Owen Jacobson <owen@grimoire.ca> 2024-10-21 00:36:44 -0400
committer: Owen Jacobson <owen@grimoire.ca> 2024-10-22 10:58:11 -0400
commit: 3f9648eed48cd8b6cd35d0ae2ee5bbe25fa735ac (patch)
tree: 8ecdd86cd9e09d8a3bd55ec1f72888a81498cc38 /migrations
parent: 379e97c2cb145bc3a495aa14746273d83b508214 (diff)
1 files changed, 189 insertions, 0 deletions
diff --git a/migrations/20241019191531_canonical_names.sql b/migrations/20241019191531_canonical_names.sql
new file mode 100644
index 0000000..ab7cbf4
--- /dev/null
+++ b/migrations/20241019191531_canonical_names.sql
@@ -0,0 +1,189 @@
+alter table login
+rename to old_login;
+alter table token
+rename to old_token;
+alter table invite
+rename to old_invite;
+alter table channel
+rename to old_channel;
+alter table channel_deleted
+rename to old_channel_deleted;
+alter table message
+rename to old_message;
+alter table message_deleted
+rename to old_message_deleted;
+
+create table login (
+	id text
+		not null
+		primary key,
+	display_name text
+		not null,
+	canonical_name text
+		not null
+		unique,
+	password_hash text
+		not null,
+	created_sequence bigint
+		unique
+		not null,
+	created_at text
+		not null
+);
+
+insert into login (id, display_name, canonical_name, password_hash, created_sequence, created_at)
+select
+	id,
+	-- This isn't strictly correct, as existing names are not guaranteed to be
+	-- normalized, and as sqlite's built-in `lower` only operates on ASCII, but
+	-- without any way to do case folding and normalization in sqlite3, and
+	-- without any way to call out into Rust to do the work, this is the best we
+	-- can do. Normalization issues will produce errors at runtime when the
+	-- relevant rows are loaded.
+	name as display_name,
+	lower(name) as canonical_name,
+	password_hash,
+	created_sequence,
+	created_at
+from old_login;
+
+create table token (
+	id text
+		not null
+		primary key,
+	secret text
+		not null
+		unique,
+	login text
+		not null
+		references login (id),
+	issued_at text
+		not null,
+	last_used_at text
+		not null
+);
+
+insert into token (id, secret, login, issued_at, last_used_at)
+select id, secret, login, issued_at, last_used_at from old_token;
+
+create table invite (
+	id text
+		primary key
+		not null,
+	issuer text
+		not null
+		references login (id),
+	issued_at text
+		not null
+);
+
+insert into invite (id, issuer, issued_at)
+select id, issuer, issued_at
+from old_invite;
+
+create table channel (
+	id text
+		not null
+		primary key,
+	created_sequence bigint
+		unique
+		not null,
+	created_at text
+		not null
+);
+
+create table channel_name (
+	id text
+		not null
+		primary key
+		references channel (id),
+	display_name
+		not null,
+	canonical_name
+		not null
+		unique
+);
+
+insert into channel (id, created_sequence, created_at)
+select id, created_sequence, created_at from old_channel;
+
+insert into channel_name (id, display_name, canonical_name)
+select
+	id,
+	-- This isn't strictly correct, for the same reasons as above.
+	name as display_name,
+	name as canonical_name
+from old_channel
+where name is not null;
+
+create table channel_deleted (
+	id text
+		not null
+		primary key
+		references channel (id),
+	deleted_sequence bigint
+		unique
+		not null,
+	deleted_at text
+		not null
+);
+
+insert into channel_deleted (id, deleted_sequence, deleted_at)
+select id, deleted_sequence, deleted_at
+from old_channel_deleted;
+
+create table message (
+	id text
+		not null
+		primary key,
+	-- Starting from the code changes in this rev, values in this column will be
+	-- in NFC. However, we don't actually need to normalize historical values;
+	-- they're delivered to clients "verbatim" with respect to how they were sent,
+	-- which causes no harm.
+	channel text
+		not null
+		references channel (id),
+	sender text
+		not null
+		references login (id),
+	sent_sequence bigint
+		unique
+		not null,
+	sent_at text
+		not null,
+	body text
+		null
+);
+
+insert into message (id, channel, sender, sent_sequence, sent_at, body)
+select id, channel, sender, sent_sequence, sent_at, body
+from old_message;
+
+create table message_deleted (
+	id text
+		not null
+		primary key
+		references message (id),
+	deleted_sequence bigint
+		unique
+		not null,
+	deleted_at text
+		not null
+);
+
+insert into message_deleted (id, deleted_sequence, deleted_at)
+select id, deleted_sequence, deleted_at
+from old_message_deleted;
+
+drop table old_message_deleted;
+drop table old_message;
+drop table old_channel_deleted;
+drop table old_channel;
+drop table old_invite;
+drop table old_token;
+drop table old_login;
+
+create index token_issued_at
+on token (issued_at);
+create index message_sent_at
+on message (sent_at);
author	Owen Jacobson <owen@grimoire.ca>	2024-10-21 00:36:44 -0400
committer	Owen Jacobson <owen@grimoire.ca>	2024-10-22 10:58:11 -0400
commit	3f9648eed48cd8b6cd35d0ae2ee5bbe25fa735ac (patch)
tree	8ecdd86cd9e09d8a3bd55ec1f72888a81498cc38 /migrations
parent	379e97c2cb145bc3a495aa14746273d83b508214 (diff)