Use different counts for denominator in scaled rank for posts (#5261)

* Change 'Scaled' sort to use total interactions instead of monthly users * Misc SQL fixes * Fix formatting of error message * Run rustfmt
2025-02-04 16:14:48 +00:00 · 2025-02-03 16:23:00 -05:00 · 2025-02-03 16:23:00 -05:00 · 5240576de8
commit 5240576de8
parent 8f73c10e01
8 changed files with 66 additions and 23 deletions
--- a/crates/db_schema/replaceable_schema/utils.sql
+++ b/crates/db_schema/replaceable_schema/utils.sql
@ -33,16 +33,16 @@ now() - published) < '7 days' THEN
        0.0
    END;
-CREATE FUNCTION r.scaled_rank (score numeric, published timestamp with time zone, users_active_month numeric)
+CREATE FUNCTION r.scaled_rank (score numeric, published timestamp with time zone, interactions_month numeric)
    RETURNS double precision
    LANGUAGE sql
    IMMUTABLE PARALLEL SAFE
    -- Add 2 to avoid divide by zero errors
    -- Default for score = 1, active users = 1, and now, is (0.1728 / log(2 + 1)) = 0.3621
-    -- There may need to be a scale factor multiplied to users_active_month, to make
+    -- There may need to be a scale factor multiplied to interactions_month, to make
    -- the log curve less pronounced. This can be tuned in the future.
    RETURN (
-        r.hot_rank (score, published) / log(2 + users_active_month)
+        r.hot_rank (score, published) / log(2 + interactions_month)
 );
 -- For tables with `deleted` and `removed` columns, this function determines which rows to include in a count.
@ -212,6 +212,27 @@ GROUP BY
 END;
 $$;
 -- Community aggregate function for adding up total number of interactions
 CREATE OR REPLACE FUNCTION r.community_aggregates_interactions (i text)
    RETURNS TABLE (
        count_ bigint,
        community_id_ integer)
    LANGUAGE plpgsql
    AS $$
 BEGIN
    RETURN query
    SELECT
        COALESCE(sum(comments + upvotes + downvotes)::bigint, 0) AS count_,
        community_id AS community_id_
    FROM
        post_aggregates
    WHERE
        published >= (CURRENT_TIMESTAMP - i::interval)
    GROUP BY
        community_id;
 END;
 $$;
 -- Edit site aggregates to include voters and people who have read posts as active users
 CREATE OR REPLACE FUNCTION r.site_aggregates_activity (i text)
    RETURNS integer
--- a/crates/db_schema/src/aggregates/post_aggregates.rs
+++ b/crates/db_schema/src/aggregates/post_aggregates.rs
@ -22,9 +22,10 @@ impl PostAggregates {
    // Diesel can't update based on a join, which is necessary for the scaled_rank
    // https://github.com/diesel-rs/diesel/issues/1478
-    // Just select the users_active_month manually for now, since its a single post anyway
+    // Just select the metrics we need manually, for now, since its a single post anyway
-    let users_active_month = community_aggregates::table
+
-      .select(community_aggregates::users_active_month)
+    let interactions_month = community_aggregates::table
      .select(community_aggregates::interactions_month)
      .inner_join(post::table.on(community_aggregates::community_id.eq(post::community_id)))
      .filter(post::id.eq(post_id))
      .first::<i64>(conn)
@ -40,7 +41,7 @@ impl PostAggregates {
        post_aggregates::scaled_rank.eq(scaled_rank(
          post_aggregates::score,
          post_aggregates::published,
-          users_active_month,
+          interactions_month,
        )),
      ))
      .get_result::<Self>(conn)
--- a/crates/db_schema/src/aggregates/structs.rs
+++ b/crates/db_schema/src/aggregates/structs.rs
@ -70,6 +70,9 @@ pub struct CommunityAggregates {
  pub users_active_month: i64,
  /// The number of users with any activity in the last year.
  pub users_active_half_year: i64,
  /// Number of any interactions over the last month.
  #[serde(skip)]
  pub interactions_month: i64,
  #[serde(skip)]
  pub hot_rank: f64,
  pub subscribers_local: i64,
--- a/crates/db_schema/src/schema.rs
+++ b/crates/db_schema/src/schema.rs
@ -252,6 +252,7 @@ diesel::table! {
        users_active_week -> Int8,
        users_active_month -> Int8,
        users_active_half_year -> Int8,
        interactions_month -> Int8,
        hot_rank -> Float8,
        subscribers_local -> Int8,
        report_count -> Int2,
--- a/crates/db_schema/src/utils.rs
+++ b/crates/db_schema/src/utils.rs
@ -531,7 +531,7 @@ pub mod functions {
  define_sql_function! {
    #[sql_name = "r.scaled_rank"]
-    fn scaled_rank(score: BigInt, time: Timestamptz, users_active_month: BigInt) -> Double;
+    fn scaled_rank(score: BigInt, time: Timestamptz, interactions_month: BigInt) -> Double;
  }
  define_sql_function! {
--- a/crates/routes/src/utils/scheduled_tasks.rs
+++ b/crates/routes/src/utils/scheduled_tasks.rs
@ -265,15 +265,20 @@ async fn process_post_aggregates_ranks_in_batches(conn: &mut AsyncPgConnection)
      UPDATE post_aggregates pa
      SET hot_rank = r.hot_rank(pa.score, pa.published),
          hot_rank_active = r.hot_rank(pa.score, pa.newest_comment_time_necro),
-           scaled_rank = r.scaled_rank(pa.score, pa.published, ca.users_active_month)
+          scaled_rank = r.scaled_rank(pa.score, pa.published, ca.interactions_month)
      FROM batch, community_aggregates ca
-         WHERE pa.post_id = batch.post_id and pa.community_id = ca.community_id RETURNING pa.published;
+      WHERE pa.post_id = batch.post_id
-    "#,
+      AND pa.community_id = ca.community_id
      RETURNING pa.published;
 "#,
    )
    .bind::<Timestamptz, _>(previous_batch_last_published)
    .bind::<Integer, _>(update_batch_size)
    .get_results::<HotRanksUpdateResult>(conn)
-    .await.map_err(|e| LemmyErrorType::Unknown(format!("Failed to update {} hot_ranks: {}", "post_aggregates", e)))?;
+    .await
    .map_err(|e| {
      LemmyErrorType::Unknown(format!("Failed to update post_aggregates hot_ranks: {}", e))
    })?;
    processed_rows_count += updated_rows.len();
    previous_batch_result = updated_rows.last().map(|row| row.published);
@ -371,7 +376,7 @@ async fn active_counts(pool: &mut DbPool<'_>) -> LemmyResult<()> {
  for (full_form, abbr) in &intervals {
    let update_site_stmt = format!(
-      "update site_aggregates set users_active_{} = (select * from r.site_aggregates_activity('{}')) where site_id = 1",
+      "update site_aggregates set users_active_{} = (select r.site_aggregates_activity('{}')) where site_id = 1",
      abbr, full_form
    );
    sql_query(update_site_stmt).execute(&mut conn).await?;
@ -380,6 +385,11 @@ async fn active_counts(pool: &mut DbPool<'_>) -> LemmyResult<()> {
    sql_query(update_community_stmt).execute(&mut conn).await?;
  }
  let update_interactions_stmt = "update community_aggregates ca set interactions_month = mv.count_ from r.community_aggregates_interactions('1 month') mv where ca.community_id = mv.community_id_";
  sql_query(update_interactions_stmt)
    .execute(&mut conn)
    .await?;
  info!("Done.");
  Ok(())
 }
--- a/migrations/2025-01-21-000000_interactions_per_month_schema/down.sql
+++ b/migrations/2025-01-21-000000_interactions_per_month_schema/down.sql
@ -0,0 +1,3 @@
 ALTER TABLE community_aggregates
    DROP COLUMN interactions_month;
--- a/migrations/2025-01-21-000000_interactions_per_month_schema/up.sql
+++ b/migrations/2025-01-21-000000_interactions_per_month_schema/up.sql
@ -0,0 +1,4 @@
 -- Add the interactions_month column
 ALTER TABLE community_aggregates
    ADD COLUMN interactions_month bigint NOT NULL DEFAULT 0;
		`@ -0,0 +1,3 @@`
							`ALTER TABLE community_aggregates`
							`DROP COLUMN interactions_month;`