summaryrefslogtreecommitdiff
path: root/migrations/20241030152013_channel_message_last_event_id.sql
diff options
context:
space:
mode:
authorOwen Jacobson <owen@grimoire.ca>2024-10-30 11:32:52 -0400
committerOwen Jacobson <owen@grimoire.ca>2024-10-30 12:09:41 -0400
commit06c839436900ce07ec5c53175b01f3c5011e507c (patch)
treec9c1d2045982d65427b5345ae91609f4d224edb8 /migrations/20241030152013_channel_message_last_event_id.sql
parent50a382528288248381b07c25719cbc9a519b4c81 (diff)
Track an index-friendly sequence range for both channels and messages.
This is meant to limit the amount of messages that event replay needs to examine. Previously, the query required a table scan; table scans on `message` can be quite large, and should really be avoided. The new schema allows replays to be carried out using an index scan. The premise of this change is that, for each (channel, message), there is a range of event sequence numbers that the (channel, message) may appear in. We'll notate that as `[start, end]` in the general case, but they are: * for active channels, `[ch.created_sequence, ch.created_sequence]`. * for deleted channels, `[ch.created_sequence, ch_del.deleted_sequence]`. * for active messages, `[mg.sent_sequence, mg.sent_sequence]`. * for deleted messages, `[mg.sent_seqeunce, mg_del.deleted_sequence]`. (The two "active" ranges may grow in future releases, to support things like channel renames and message editing. That won't change the logic, but those two things will need to update the new `last_sequence` field.) There are two families of operations that need to retrieve based on these ranges: * Boot creates a snapshot as of a specific `resume_at` sequence number, and thus must include any record whose `start` falls on or before `resume_at`. We can't exclude records whose `end` is also before it, as their terminal state may be one that is included in boot (eg. active channels). * Event replay needs to include any events that fall after the same `resume_at`, and thus must include events from any record whose `end` falls strictly after `resume_at`. We can't exclude records whose `start` is also strictly after `resume_at`, as we'd omit them from replay, inappropriately, if we did. This gives three interesting cases: 1. Record fully after `resume_at`: event sequence --increasing--> x-a … x … x+k … resume_at start end This record should be omitted by boot, but included for event replay. 2. Record fully before `resume_at`: event sequence --increasing--> x … x+k … x+a start end resume_at This record should be omitted for event replay, but included for boot. 3. Record overlapping `resume_at`: event sequence --increasing--> x … x+a … x+k start resume_at end This record needs to be included for both boot and event replay. However, the bounds of that range were previously stored in two tables (`channel` and `channel_deleted`, or `message` and `message_deleted`, respectively), which sqlite (indeed most SQL implementations) cannot index. This forced a table scan, leading to the program considering every possible (channel, message) during event replay. This commit adds a `last_sequence` field to channels and messages, which is set to the above values as channels and messages are operated on. This field is indexed, and queries can use it to rapidly identify relevant rows for event replay, cutting down the amount of reading needed to generate events on resume.
Diffstat (limited to 'migrations/20241030152013_channel_message_last_event_id.sql')
-rw-r--r--migrations/20241030152013_channel_message_last_event_id.sql141
1 files changed, 141 insertions, 0 deletions
diff --git a/migrations/20241030152013_channel_message_last_event_id.sql b/migrations/20241030152013_channel_message_last_event_id.sql
new file mode 100644
index 0000000..dd6e66b
--- /dev/null
+++ b/migrations/20241030152013_channel_message_last_event_id.sql
@@ -0,0 +1,141 @@
+alter table channel
+rename to old_channel;
+alter table channel_name
+rename to old_channel_name;
+alter table channel_deleted
+rename to old_channel_deleted;
+alter table message
+rename to old_message;
+alter table message_deleted
+rename to old_message_deleted;
+
+create table channel (
+ id text
+ not null
+ primary key,
+ created_sequence bigint
+ unique
+ not null,
+ created_at text
+ not null,
+ last_sequence bigint
+ not null
+);
+
+insert into channel (id, created_sequence, created_at, last_sequence)
+select
+ ch.id,
+ ch.created_sequence,
+ ch.created_at,
+ max(ch.created_sequence, coalesce(del.deleted_sequence, ch.created_sequence)) as last_seqeuence
+from old_channel as ch
+left join old_channel_deleted as del
+ using (id);
+
+create table channel_name (
+ id text
+ not null
+ primary key
+ references channel (id),
+ display_name
+ not null,
+ canonical_name
+ not null
+ unique
+);
+
+insert into channel_name (id, display_name, canonical_name)
+select id, display_name, canonical_name
+from old_channel_name;
+
+create table channel_deleted (
+ id text
+ not null
+ primary key
+ references channel (id),
+ deleted_sequence bigint
+ unique
+ not null,
+ deleted_at text
+ not null
+);
+
+insert into channel_deleted (id, deleted_sequence, deleted_at)
+select id, deleted_sequence, deleted_at
+from old_channel_deleted;
+
+create table message (
+ id text
+ not null
+ primary key,
+ channel text
+ not null
+ references channel (id),
+ sender text
+ not null
+ references login (id),
+ sent_sequence bigint
+ unique
+ not null,
+ sent_at text
+ not null,
+ body text
+ null,
+ last_sequence bigint
+ not null
+);
+
+insert into message (id, channel, sender, sent_sequence, sent_at, body, last_sequence)
+select
+ msg.id,
+ msg.channel,
+ msg.sender,
+ msg.sent_sequence,
+ msg.sent_at,
+ msg.body,
+ max(msg.sent_sequence, coalesce(del.deleted_sequence, msg.sent_sequence)) as last_sequence
+from
+ old_message as msg
+ left join old_message_deleted as del
+ using (id);
+
+create table message_deleted (
+ id text
+ not null
+ primary key
+ references message (id),
+ deleted_sequence bigint
+ unique
+ not null,
+ deleted_at text
+ not null
+);
+
+insert into message_deleted (id, deleted_sequence, deleted_at)
+select id, deleted_sequence, deleted_at
+from old_message_deleted;
+
+drop table old_message_deleted;
+drop table old_message;
+drop table old_channel_deleted;
+drop table old_channel_name;
+drop table old_channel;
+
+-- recreate existing indices
+create index message_sent_at
+on message (sent_at);
+create index message_deleted_deleted_at
+on message_deleted (deleted_at);
+create index message_channel
+on message(channel);
+create index channel_created_sequence
+on channel (created_sequence);
+create index channel_created_at
+on channel (created_at);
+
+-- new indices
+create index channel_last_sequence
+on channel (last_sequence);
+
+create index message_last_sequence
+on message (last_sequence);