Make diff check work just like before

2024-11-30 00:01:25 +00:00 · 2024-11-18 17:14:55 -07:00 · 2024-11-18 17:14:55 -07:00 · 8390c0eb30
commit 8390c0eb30
parent 2aead25bc6
4 changed files with 34 additions and 270 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -1361,6 +1361,19 @@ version = "0.1.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56254986775e3233ffa9c4d7d3faaf6d36a2c09d30b20687e9f88bc8bafc16c8"

+[[package]]
+name = "diffutils"
+version = "0.4.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c8d7ce619b5c0e13f7543dc2c203a7e6fa37e0111d876339aada7ec9540a58d5"
+dependencies = [
+ "chrono",
+ "diff",
+ "regex",
+ "same-file",
+ "unicode-width",
+]
+
 [[package]]
 name = "digest"
 version = "0.10.7"
@ -2627,7 +2640,7 @@ dependencies = [
 "diesel-derive-newtype",
 "diesel_ltree",
 "diesel_migrations",
- "diff",
+ "diffutils",
 "futures-util",
 "i-love-jesus",
 "lemmy_utils",
--- a/crates/db_schema/Cargo.toml
+++ b/crates/db_schema/Cargo.toml
@ -86,4 +86,4 @@ tuplex = { workspace = true, optional = true }
 [dev-dependencies]
 serial_test = { workspace = true }
 pretty_assertions = { workspace = true }
-diff = "0.1.13"
+diffutils = "0.4.2"
--- a/crates/db_schema/src/schema_setup.rs
+++ b/crates/db_schema/src/schema_setup.rs
@ -60,7 +60,7 @@ const REPLACEABLE_SCHEMA_PATH: &str = "crates/db_schema/replaceable_schema";
 struct MigrationHarnessWrapper<'a> {
  conn: &'a mut PgConnection,
  #[cfg(test)]
-  enable_diff_check: bool,
+  diff_checked_migration_name: Option<String>,
 }

 impl<'a> MigrationHarnessWrapper<'a> {
@ -86,7 +86,7 @@ impl<'a> MigrationHarness<Pg> for MigrationHarnessWrapper<'a> {
    migration: &dyn Migration<Pg>,
  ) -> diesel::migration::Result<MigrationVersion<'static>> {
    #[cfg(test)]
-    if self.enable_diff_check {
+    if self.diff_checked_migration_name == Some(migration.name().to_string()) {
      let before = diff_check::get_dump();

      self.run_migration_inner(migration)?;
@ -271,7 +271,11 @@ fn run_selected_migrations(
  let mut wrapper = MigrationHarnessWrapper {
    conn,
    #[cfg(test)]
-    enable_diff_check: options.enable_diff_check,
+    diff_checked_migration_name: options
+      .enable_diff_check
+      .then(|| diesel::migration::MigrationSource::<Pg>::migrations(&migrations()))
+      .transpose()?
+      .and_then(|migrations| Some(migrations.last()?.name().to_string())),
  };

  if options.revert {
@ -325,7 +329,8 @@ mod tests {
    // Start with consistent state by dropping everything
    conn.batch_execute("DROP OWNED BY CURRENT_USER;")?;

-    // Run all migrations and check for mistakes in down.sql files
+    // Run all migrations, make sure the newest migration can be redone, and check the newest
+    // down.sql file
    assert_eq!(
      run(o().run().enable_diff_check())?,
      ReplaceableSchemaRebuilt
@ -345,10 +350,8 @@ mod tests {
    );
    assert_eq!(run(o().run().limit(1))?, ReplaceableSchemaRebuilt);

-    // Revert all migrations
-    assert_eq!(run(o().revert())?, ReplaceableSchemaNotRebuilt);
-
    // This should throw an error saying to use lemmy_server instead of diesel CLI
+    conn.batch_execute("DROP OWNED BY CURRENT_USER;")?;
    assert!(matches!(
      conn.run_pending_migrations(migrations()),
      Err(e) if e.to_string().contains("lemmy_server")
--- a/crates/db_schema/src/schema_setup/diff_check.rs
+++ b/crates/db_schema/src/schema_setup/diff_check.rs
@ -1,10 +1,6 @@
 #![expect(clippy::expect_used)]
 use lemmy_utils::settings::SETTINGS;
-use std::{
-  borrow::Cow,
-  collections::btree_set::{self, BTreeSet},
-  process::{Command, Stdio},
-};
+use std::process::{Command, Stdio};

 // It's not possible to call `export_snapshot()` for each dump and run the dumps in parallel with
 // the `--snapshot` flag. Don't waste your time!!!!
@ -16,21 +12,12 @@ pub fn get_dump() -> String {
      // Specify database URL
      "--dbname",
      &db_url,
-      // Allow differences in row data and old fast tables
-      "--schema-only",
-      "--exclude-table=comment_aggregates_fast",
-      "--exclude-table=community_aggregates_fast",
-      "--exclude-table=post_aggregates_fast",
-      "--exclude-table=user_fast",
-      // Ignore some things to reduce the amount of queries done by pg_dump
+      // Disable some things
      "--no-owner",
      "--no-privileges",
-      "--no-comments",
-      "--no-publications",
-      "--no-security-labels",
-      "--no-subscriptions",
      "--no-table-access-method",
-      "--no-tablespaces",
+      "--schema-only",
+      "--no-sync",
    ])
    .stderr(Stdio::inherit())
    .output()
@ -42,251 +29,12 @@ pub fn get_dump() -> String {
  String::from_utf8(output.stdout).expect("pg_dump output is not valid UTF-8 text")
 }

-const PATTERN_LEN: usize = 19;
-
 pub fn check_dump_diff(before: String, after: String, label: &str) {
-  // Performance optimization
-  if after == before {
-    return;
-  }
+  if before != after {
+    let diff_bytes =
+      diffutilslib::unified_diff(before.as_bytes(), after.as_bytes(), &Default::default());
+    let diff = String::from_utf8_lossy(&diff_bytes);

-  // Borrow checker requires this to be a separate variable
-  let normalized_chunk_vecs = [&before, &after]
-    // Remove identical items
-    .map(|dump| chunks(dump).collect::<BTreeSet<_>>())
-    .differences()
-    // Remove items without unwanted types of differences (if migrations are correct, then this
-    // removes everything)
-    .map(|chunks| chunks.map(|&i| normalize_chunk(i)).collect::<BTreeSet<_>>());
-  let [only_in_before, only_in_after] = normalized_chunk_vecs // Imagine that this line doesn't exist
-    .differences()
-    .map(|chunks| chunks.map(|i| &**i).collect::<Vec<_>>());
-
-  if only_in_before.is_empty() && only_in_after.is_empty() {
-    return;
-  }
-
-  // Build the panic message
-
-  let after_has_more = only_in_before.len() < only_in_after.len();
-  let [chunks, mut other_chunks] = if after_has_more {
-    [only_in_before, only_in_after]
-  } else {
-    [only_in_after, only_in_before]
-  };
-
-  let diffs = chunks
-    .into_iter()
-    .chain(std::iter::repeat(""))
-    .map_while(|chunk| {
-      let (most_similar_chunk_index, most_similar_chunk) = other_chunks
-        .iter()
-        .enumerate()
-        .max_by_key(|(_, other_chunk)| {
-          if sql_command_name(chunk) != sql_command_name(other_chunk) {
-            0
-          } else {
-            similarity(chunk, other_chunk)
-          }
-        })?;
-
-      let diff_lines = if after_has_more {
-        diff::lines(most_similar_chunk, chunk)
-      } else {
-        diff::lines(chunk, most_similar_chunk)
-      };
-
-      other_chunks.swap_remove(most_similar_chunk_index);
-
-      Some(
-        diff_lines
-          .into_iter()
-          .flat_map(|line| match line {
-            diff::Result::Left(s) => ["- ", s, "\n"],
-            diff::Result::Right(s) => ["+ ", s, "\n"],
-            diff::Result::Both(s, _) => ["  ", s, "\n"],
-          })
-          .chain(["\n"])
-          .collect::<String>(),
-      )
-    });
-
-  panic!(
-    "{}",
-    std::iter::once(format!("{label}\n\n"))
-      .chain(diffs)
-      .collect::<String>()
-  );
-}
-
-trait Differences<T> {
-  fn differences(&self) -> [btree_set::Difference<'_, T>; 2];
-}
-
-impl<T: Ord> Differences<T> for [BTreeSet<T>; 2] {
-  /// Items only in `a`, and items only in `b`
-  fn differences(&self) -> [btree_set::Difference<'_, T>; 2] {
-    let [a, b] = self;
-    [a.difference(b), b.difference(a)]
+    panic!("{label}\n\n{diff}");
  }
 }
-
-fn sql_command_name(chunk: &str) -> &str {
-  chunk
-    .split_once(|c: char| c.is_lowercase())
-    .unwrap_or_default()
-    .0
-}
-
-fn similarity(chunk: &str, other_chunk: &str) -> usize {
-  diff::chars(chunk, other_chunk)
-    .into_iter()
-    .filter(|i| {
-      match i {
-        diff::Result::Both(c, _) => {
-          // Prevent whitespace from affecting similarity level
-          !c.is_whitespace()
-            && (
-              // Increase accuracy for some trigger function diffs
-              c.is_lowercase()
-                  // Preserve differences in names that contain a number
-                  || c.is_numeric()
-            )
-        }
-        _ => false,
-      }
-    })
-    .count()
-}
-
-fn normalize_chunk(chunk: &str) -> Cow<'_, str> {
-  let mut chunk = Cow::Borrowed(chunk);
-
-  let stripped_lines = chunk
-    .lines()
-    .map(|line| line.strip_suffix(',').unwrap_or(line));
-
-  // Sort column names, so differences in column order are ignored
-  if chunk.starts_with("CREATE TABLE ") {
-    let mut lines = stripped_lines.collect::<Vec<_>>();
-
-    sort_within_sections(&mut lines, |line| {
-      match line.chars().next() {
-        // CREATE
-        Some('C') => 0,
-        // Indented column name
-        Some(' ') => 1,
-        // End
-        Some(')') => 2,
-        _ => panic!("unrecognized part of `CREATE TABLE` statement: {line}"),
-      }
-    });
-
-    chunk = Cow::Owned(lines.join("\n"));
-  }
-
-  // Replace timestamps with a constant string, so differences in timestamps are ignored
-  for index in 0.. {
-    // Performance optimization
-    let Some(byte) = chunk.as_bytes().get(index) else {
-      break;
-    };
-    if !byte.is_ascii_digit() {
-      continue;
-    }
-
-    // Check for this pattern: 0000-00-00 00:00:00
-    let Some((
-      &[a0, a1, a2, a3, b0, a4, a5, b1, a6, a7, b2, a8, a9, b3, a10, a11, b4, a12, a13],
-      remaining,
-    )) = chunk
-      .get(index..)
-      .and_then(|s| s.as_bytes().split_first_chunk::<PATTERN_LEN>())
-    else {
-      break;
-    };
-
-    if [a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13]
-      .into_iter()
-      .all(|byte| byte.is_ascii_digit())
-      && [b0, b1, b2, b3, b4] == *b"-- ::"
-    {
-      // Replace the part of the string that has the checked pattern and an optional fractional part
-      let len_after = if let Some((b'.', s)) = remaining.split_first() {
-        1 + s.iter().position(|c| !c.is_ascii_digit()).unwrap_or(0)
-      } else {
-        0
-      };
-      // Length of replacement string is likely to match previous string
-      // (there's up to 6 digits after the decimal point)
-      chunk.to_mut().replace_range(
-        index..(index + PATTERN_LEN + len_after),
-        "AAAAAAAAAAAAAAAAAAAAAAAAAA",
-      );
-    }
-  }
-
-  chunk
-}
-
-fn sort_within_sections<T: Ord + ?Sized>(vec: &mut [&T], mut section: impl FnMut(&T) -> u8) {
-  vec.sort_unstable_by_key(|&i| (section(i), i));
-}
-
-fn chunks(dump: &str) -> impl Iterator<Item = &str> {
-  let mut remaining = dump;
-  std::iter::from_fn(move || {
-    remaining = remaining.trim_start();
-    while let Some(s) = remove_skipped_item_from_beginning(remaining) {
-      remaining = s.trim_start();
-    }
-
-    // `trim_start` guarantees that `result` is not empty
-    let (result, after_result) = remaining.split_once("\n\n")?;
-    remaining = after_result;
-    Some(result)
-  })
-}
-
-fn remove_skipped_item_from_beginning(s: &str) -> Option<&str> {
-  // Skip commented line
-  if let Some(after) = s.strip_prefix("--") {
-    Some(after_first_occurence(after, "\n"))
-  }
-  // Skip old views and fast table triggers
-  else if let Some(after) = s
-    .strip_prefix("CREATE VIEW ")
-    .or_else(|| s.strip_prefix("CREATE OR REPLACE VIEW "))
-    .or_else(|| s.strip_prefix("CREATE MATERIALIZED VIEW "))
-    .or_else(|| {
-      s.strip_prefix("CREATE FUNCTION public.")
-        .and_then(after_skipped_trigger_name)
-        .and_then(|a| a.strip_prefix('('))
-    })
-    .or_else(|| {
-      s.strip_prefix("CREATE TRIGGER ")
-        .and_then(after_skipped_trigger_name)
-        .and_then(|a| a.strip_prefix(' '))
-    })
-  {
-    Some(after_first_occurence(after, "\n\n"))
-  } else {
-    None
-  }
-}
-
-fn after_first_occurence<'a>(s: &'a str, pat: &str) -> &'a str {
-  s.split_once(pat).unwrap_or_default().1
-}
-
-fn after_skipped_trigger_name(s: &str) -> Option<&str> {
-  s.strip_prefix("refresh_comment_like")
-    .or_else(|| s.strip_prefix("refresh_comment"))
-    .or_else(|| s.strip_prefix("refresh_community_follower"))
-    .or_else(|| s.strip_prefix("refresh_community_user_ban"))
-    .or_else(|| s.strip_prefix("refresh_community"))
-    .or_else(|| s.strip_prefix("refresh_post_like"))
-    .or_else(|| s.strip_prefix("refresh_post"))
-    .or_else(|| s.strip_prefix("refresh_private_message"))
-    .or_else(|| s.strip_prefix("refresh_user"))
-}