From 8390c0eb305f4b00691e8a2f739f50c66795c26c Mon Sep 17 00:00:00 2001 From: Dull Bananas Date: Mon, 18 Nov 2024 17:14:55 -0700 Subject: [PATCH] Make diff check work just like before --- Cargo.lock | 15 +- crates/db_schema/Cargo.toml | 2 +- crates/db_schema/src/schema_setup.rs | 17 +- .../db_schema/src/schema_setup/diff_check.rs | 270 +----------------- 4 files changed, 34 insertions(+), 270 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 38fb48f67..10defe305 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1361,6 +1361,19 @@ version = "0.1.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8" +[[package]] +name = "diffutils" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8d7ce619b5c0e13f7543dc2c203a7e6fa37e0111d876339aada7ec9540a58d5" +dependencies = [ + "chrono", + "diff", + "regex", + "same-file", + "unicode-width", +] + [[package]] name = "digest" version = "0.10.7" @@ -2627,7 +2640,7 @@ dependencies = [ "diesel-derive-newtype", "diesel_ltree", "diesel_migrations", - "diff", + "diffutils", "futures-util", "i-love-jesus", "lemmy_utils", diff --git a/crates/db_schema/Cargo.toml b/crates/db_schema/Cargo.toml index 2939228b6..2ea816ef5 100644 --- a/crates/db_schema/Cargo.toml +++ b/crates/db_schema/Cargo.toml @@ -86,4 +86,4 @@ tuplex = { workspace = true, optional = true } [dev-dependencies] serial_test = { workspace = true } pretty_assertions = { workspace = true } -diff = "0.1.13" +diffutils = "0.4.2" diff --git a/crates/db_schema/src/schema_setup.rs b/crates/db_schema/src/schema_setup.rs index 2b6bc4a76..610af8f05 100644 --- a/crates/db_schema/src/schema_setup.rs +++ b/crates/db_schema/src/schema_setup.rs @@ -60,7 +60,7 @@ const REPLACEABLE_SCHEMA_PATH: &str = "crates/db_schema/replaceable_schema"; struct MigrationHarnessWrapper<'a> { conn: &'a mut PgConnection, #[cfg(test)] - enable_diff_check: bool, + 
diff_checked_migration_name: Option<String>,
 }
 
 impl<'a> MigrationHarnessWrapper<'a> {
@@ -86,7 +86,7 @@ impl<'a> MigrationHarness<Pg> for MigrationHarnessWrapper<'a> {
     migration: &dyn Migration<Pg>,
   ) -> diesel::migration::Result<MigrationVersion<'static>> {
     #[cfg(test)]
-    if self.enable_diff_check {
+    if self.diff_checked_migration_name == Some(migration.name().to_string()) {
       let before = diff_check::get_dump();
 
       self.run_migration_inner(migration)?;
@@ -271,7 +271,11 @@ fn run_selected_migrations(
   let mut wrapper = MigrationHarnessWrapper {
     conn,
     #[cfg(test)]
-    enable_diff_check: options.enable_diff_check,
+    diff_checked_migration_name: options
+      .enable_diff_check
+      .then(|| diesel::migration::MigrationSource::<Pg>::migrations(&migrations()))
+      .transpose()?
+      .and_then(|migrations| Some(migrations.last()?.name().to_string())),
   };
 
   if options.revert {
@@ -325,7 +329,8 @@ mod tests {
     // Start with consistent state by dropping everything
     conn.batch_execute("DROP OWNED BY CURRENT_USER;")?;
 
-    // Run all migrations and check for mistakes in down.sql files
+    // Run all migrations, make sure the newest migration can be redone, and check the newest
+    // down.sql file
     assert_eq!(
       run(o().run().enable_diff_check())?,
       ReplaceableSchemaRebuilt
@@ -345,10 +350,8 @@ mod tests {
     );
     assert_eq!(run(o().run().limit(1))?, ReplaceableSchemaRebuilt);
 
-    // Revert all migrations
-    assert_eq!(run(o().revert())?, ReplaceableSchemaNotRebuilt);
-
     // This should throw an error saying to use lemmy_server instead of diesel CLI
+    conn.batch_execute("DROP OWNED BY CURRENT_USER;")?;
     assert!(matches!(
       conn.run_pending_migrations(migrations()),
       Err(e) if e.to_string().contains("lemmy_server")
diff --git a/crates/db_schema/src/schema_setup/diff_check.rs b/crates/db_schema/src/schema_setup/diff_check.rs
index ea9def7ef..7f6652c13 100644
--- a/crates/db_schema/src/schema_setup/diff_check.rs
+++ b/crates/db_schema/src/schema_setup/diff_check.rs
@@ -1,10 +1,6 @@
 #![expect(clippy::expect_used)]
 use lemmy_utils::settings::SETTINGS;
-use std::{
-  
borrow::Cow,
-  collections::btree_set::{self, BTreeSet},
-  process::{Command, Stdio},
-};
+use std::process::{Command, Stdio};
 
 // It's not possible to call `export_snapshot()` for each dump and run the dumps in parallel with
 // the `--snapshot` flag. Don't waste your time!!!!
@@ -16,21 +12,12 @@ pub fn get_dump() -> String {
       // Specify database URL
       "--dbname",
       &db_url,
-      // Allow differences in row data and old fast tables
-      "--schema-only",
-      "--exclude-table=comment_aggregates_fast",
-      "--exclude-table=community_aggregates_fast",
-      "--exclude-table=post_aggregates_fast",
-      "--exclude-table=user_fast",
-      // Ignore some things to reduce the amount of queries done by pg_dump
+      // Disable some things
       "--no-owner",
       "--no-privileges",
-      "--no-comments",
-      "--no-publications",
-      "--no-security-labels",
-      "--no-subscriptions",
       "--no-table-access-method",
-      "--no-tablespaces",
+      "--schema-only",
+      "--no-sync",
     ])
     .stderr(Stdio::inherit())
     .output()
@@ -42,251 +29,12 @@ pub fn get_dump() -> String {
   String::from_utf8(output.stdout).expect("pg_dump output is not valid UTF-8 text")
 }
 
-const PATTERN_LEN: usize = 19;
-
 pub fn check_dump_diff(before: String, after: String, label: &str) {
-  // Performance optimization
-  if after == before {
-    return;
-  }
+  if before != after {
+    let diff_bytes =
+      diffutilslib::unified_diff(before.as_bytes(), after.as_bytes(), &Default::default());
+    let diff = String::from_utf8_lossy(&diff_bytes);
 
-  // Borrow checker requires this to be a separate variable
-  let normalized_chunk_vecs = [&before, &after]
-    // Remove identical items
-    .map(|dump| chunks(dump).collect::<BTreeSet<_>>())
-    .differences()
-    // Remove items without unwanted types of differences (if migrations are correct, then this
-    // removes everything)
-    .map(|chunks| chunks.map(|&i| normalize_chunk(i)).collect::<BTreeSet<_>>());
-  let [only_in_before, only_in_after] = normalized_chunk_vecs // Imagine that this line doesn't exist
-    .differences()
-    .map(|chunks| chunks.map(|i| &**i).collect::<Vec<_>>());
-
-  if 
only_in_before.is_empty() && only_in_after.is_empty() {
-    return;
-  }
-
-  // Build the panic message
-
-  let after_has_more = only_in_before.len() < only_in_after.len();
-  let [chunks, mut other_chunks] = if after_has_more {
-    [only_in_before, only_in_after]
-  } else {
-    [only_in_after, only_in_before]
-  };
-
-  let diffs = chunks
-    .into_iter()
-    .chain(std::iter::repeat(""))
-    .map_while(|chunk| {
-      let (most_similar_chunk_index, most_similar_chunk) = other_chunks
-        .iter()
-        .enumerate()
-        .max_by_key(|(_, other_chunk)| {
-          if sql_command_name(chunk) != sql_command_name(other_chunk) {
-            0
-          } else {
-            similarity(chunk, other_chunk)
-          }
-        })?;
-
-      let diff_lines = if after_has_more {
-        diff::lines(most_similar_chunk, chunk)
-      } else {
-        diff::lines(chunk, most_similar_chunk)
-      };
-
-      other_chunks.swap_remove(most_similar_chunk_index);
-
-      Some(
-        diff_lines
-          .into_iter()
-          .flat_map(|line| match line {
-            diff::Result::Left(s) => ["- ", s, "\n"],
-            diff::Result::Right(s) => ["+ ", s, "\n"],
-            diff::Result::Both(s, _) => [" ", s, "\n"],
-          })
-          .chain(["\n"])
-          .collect::<String>(),
-      )
-    });
-
-  panic!(
-    "{}",
-    std::iter::once(format!("{label}\n\n"))
-      .chain(diffs)
-      .collect::<String>()
-  );
-}
-
-trait Differences<T> {
-  fn differences(&self) -> [btree_set::Difference<'_, T>; 2];
-}
-
-impl<T: Ord> Differences<T> for [BTreeSet<T>; 2] {
-  /// Items only in `a`, and items only in `b`
-  fn differences(&self) -> [btree_set::Difference<'_, T>; 2] {
-    let [a, b] = self;
-    [a.difference(b), b.difference(a)]
+    panic!("{label}\n\n{diff}");
   }
 }
-
-fn sql_command_name(chunk: &str) -> &str {
-  chunk
-    .split_once(|c: char| c.is_lowercase())
-    .unwrap_or_default()
-    .0
-}
-
-fn similarity(chunk: &str, other_chunk: &str) -> usize {
-  diff::chars(chunk, other_chunk)
-    .into_iter()
-    .filter(|i| {
-      match i {
-        diff::Result::Both(c, _) => {
-          // Prevent whitespace from affecting similarity level
-          !c.is_whitespace()
-            && (
-              // Increase accuracy for some trigger function diffs
-              c.is_lowercase()
-                // 
Preserve differences in names that contain a number
-                || c.is_numeric()
-            )
-        }
-        _ => false,
-      }
-    })
-    .count()
-}
-
-fn normalize_chunk(chunk: &str) -> Cow<'_, str> {
-  let mut chunk = Cow::Borrowed(chunk);
-
-  let stripped_lines = chunk
-    .lines()
-    .map(|line| line.strip_suffix(',').unwrap_or(line));
-
-  // Sort column names, so differences in column order are ignored
-  if chunk.starts_with("CREATE TABLE ") {
-    let mut lines = stripped_lines.collect::<Vec<_>>();
-
-    sort_within_sections(&mut lines, |line| {
-      match line.chars().next() {
-        // CREATE
-        Some('C') => 0,
-        // Indented column name
-        Some(' ') => 1,
-        // End
-        Some(')') => 2,
-        _ => panic!("unrecognized part of `CREATE TABLE` statement: {line}"),
-      }
-    });
-
-    chunk = Cow::Owned(lines.join("\n"));
-  }
-
-  // Replace timestamps with a constant string, so differences in timestamps are ignored
-  for index in 0.. {
-    // Performance optimization
-    let Some(byte) = chunk.as_bytes().get(index) else {
-      break;
-    };
-    if !byte.is_ascii_digit() {
-      continue;
-    }
-
-    // Check for this pattern: 0000-00-00 00:00:00
-    let Some((
-      &[a0, a1, a2, a3, b0, a4, a5, b1, a6, a7, b2, a8, a9, b3, a10, a11, b4, a12, a13],
-      remaining,
-    )) = chunk
-      .get(index..)
-      .and_then(|s| s.as_bytes().split_first_chunk::<PATTERN_LEN>())
-    else {
-      break;
-    };
-
-    if [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13]
-      .into_iter()
-      .all(|byte| byte.is_ascii_digit())
-      && [b0, b1, b2, b3, b4] == *b"-- ::"
-    {
-      // Replace the part of the string that has the checked pattern and an optional fractional part
-      let len_after = if let Some((b'.', s)) = remaining.split_first() {
-        1 + s.iter().position(|c| !c.is_ascii_digit()).unwrap_or(0)
-      } else {
-        0
-      };
-      // Length of replacement string is likely to match previous string
-      // (there's up to 6 digits after the decimal point)
-      chunk.to_mut().replace_range(
-        index..(index + PATTERN_LEN + len_after),
-        "AAAAAAAAAAAAAAAAAAAAAAAAAA",
-      );
-    }
-  }
-
-  chunk
-}
-
-fn sort_within_sections<T: Ord + ?Sized>(vec: &mut [&T], mut section: impl FnMut(&T) -> u8) {
-  vec.sort_unstable_by_key(|&i| (section(i), i));
-}
-
-fn chunks(dump: &str) -> impl Iterator<Item = &str> {
-  let mut remaining = dump;
-  std::iter::from_fn(move || {
-    remaining = remaining.trim_start();
-    while let Some(s) = remove_skipped_item_from_beginning(remaining) {
-      remaining = s.trim_start();
-    }
-
-    // `trim_start` guarantees that `result` is not empty
-    let (result, after_result) = remaining.split_once("\n\n")?;
-    remaining = after_result;
-    Some(result)
-  })
-}
-
-fn remove_skipped_item_from_beginning(s: &str) -> Option<&str> {
-  // Skip commented line
-  if let Some(after) = s.strip_prefix("--") {
-    Some(after_first_occurence(after, "\n"))
-  }
-  // Skip old views and fast table triggers
-  else if let Some(after) = s
-    .strip_prefix("CREATE VIEW ")
-    .or_else(|| s.strip_prefix("CREATE OR REPLACE VIEW "))
-    .or_else(|| s.strip_prefix("CREATE MATERIALIZED VIEW "))
-    .or_else(|| {
-      s.strip_prefix("CREATE FUNCTION public.")
-        .and_then(after_skipped_trigger_name)
-        .and_then(|a| a.strip_prefix('('))
-    })
-    .or_else(|| {
-      s.strip_prefix("CREATE TRIGGER ")
-        .and_then(after_skipped_trigger_name)
-        .and_then(|a| a.strip_prefix(' '))
-    })
-  {
-    
Some(after_first_occurence(after, "\n\n")) - } else { - None - } -} - -fn after_first_occurence<'a>(s: &'a str, pat: &str) -> &'a str { - s.split_once(pat).unwrap_or_default().1 -} - -fn after_skipped_trigger_name(s: &str) -> Option<&str> { - s.strip_prefix("refresh_comment_like") - .or_else(|| s.strip_prefix("refresh_comment")) - .or_else(|| s.strip_prefix("refresh_community_follower")) - .or_else(|| s.strip_prefix("refresh_community_user_ban")) - .or_else(|| s.strip_prefix("refresh_community")) - .or_else(|| s.strip_prefix("refresh_post_like")) - .or_else(|| s.strip_prefix("refresh_post")) - .or_else(|| s.strip_prefix("refresh_private_message")) - .or_else(|| s.strip_prefix("refresh_user")) -}