// lemmy/crates/federate/src/lib.rs

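//! Manages the federation send queue: spawns one [`InstanceWorker`] per federated instance,
//! shards instances across processes (see [`Opts`]), periodically re-reads the instance list
//! from the database, and collects per-instance queue statistics for logging.
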
use crate::{util::CancellableTask, worker::InstanceWorker};
use activitypub_federation::config::FederationConfig;
use lemmy_api_common::{
  context::LemmyContext,
  lemmy_utils::settings::structs::FederationWorkerConfig,
};
use lemmy_db_schema::{newtypes::InstanceId, source::instance::Instance};
use lemmy_utils::error::LemmyResult;
use stats::receive_print_stats;
use std::{collections::HashMap, time::Duration};
use tokio::{
  sync::mpsc::{unbounded_channel, UnboundedSender},
  task::JoinHandle,
  time::sleep,
};
use tokio_util::sync::CancellationToken;
use tracing::info;
use util::FederationQueueStateWithDomain;
mod inboxes;
mod send;
mod stats;
mod util;
mod worker;
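
/// Maximum time to wait for a worker task to finish shutting down after it has been cancelled.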
static WORKER_EXIT_TIMEOUT: Duration = Duration::from_secs(30);
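
// how often the set of federated instances is re-read from the database (shorter in debug builds)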
#[cfg(debug_assertions)]
static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(5);
#[cfg(not(debug_assertions))]
static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(60);
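
/// Options for splitting federation sending across multiple processes.
///
/// A hypothetical sketch for the second of three sending processes:
///
/// ```ignore
/// let opts = Opts { process_count: 3, process_index: 2 };
/// ```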
#[derive(Clone)]
pub struct Opts {
  /// how many processes you are starting in total
  pub process_count: i32,
  /// the index of this process (1-based: from 1 to process_count)
  pub process_index: i32,
}
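
/// Spawns and supervises one [`InstanceWorker`] per federated instance assigned to this
/// process, plus a task that receives and prints federation queue statistics.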
pub struct SendManager {
  opts: Opts,
  workers: HashMap<InstanceId, CancellableTask>,
  context: FederationConfig<LemmyContext>,
  stats_sender: UnboundedSender<FederationQueueStateWithDomain>,
  exit_print: JoinHandle<()>,
  federation_worker_config: FederationWorkerConfig,
}
impl SendManager {
  fn new(
    opts: Opts,
    context: FederationConfig<LemmyContext>,
    federation_worker_config: FederationWorkerConfig,
  ) -> Self {
    assert!(opts.process_count > 0);
    assert!(opts.process_index > 0);
    assert!(opts.process_index <= opts.process_count);

    let (stats_sender, stats_receiver) = unbounded_channel();
    Self {
      opts,
      workers: HashMap::new(),
      stats_sender,
      exit_print: tokio::spawn(receive_print_stats(
        context.inner_pool().clone(),
        stats_receiver,
      )),
      context,
      federation_worker_config,
    }
  }
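
  /// Runs the send manager as a [`CancellableTask`]: the returned handle can be used to shut it
  /// down, and the inner loop is restarted by `CancellableTask` if it exits with an error.
  ///
  /// A minimal usage sketch, assuming a `FederationConfig<LemmyContext>` and a
  /// `FederationWorkerConfig` have already been built elsewhere:
  ///
  /// ```ignore
  /// let task = SendManager::run(
  ///   Opts { process_count: 1, process_index: 1 },
  ///   federation_config,
  ///   worker_config,
  /// );
  /// // ... later, during shutdown:
  /// task.cancel().await?;
  /// ```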
  pub fn run(
    opts: Opts,
    context: FederationConfig<LemmyContext>,
    config: FederationWorkerConfig,
  ) -> CancellableTask {
    CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |cancel| {
      let opts = opts.clone();
      let config = config.clone();
      let context = context.clone();
      let mut manager = Self::new(opts, context, config);
      async move {
        let result = manager.do_loop(cancel).await;
        // the loop function will only return if there is (a) an internal error (e.g. db connection
        // failure) or (b) it was cancelled from outside.
        if let Err(e) = result {
          // don't let this error bubble up, just log it, so the below cancel function will run
          // regardless
          tracing::error!("SendManager failed: {e}");
        }
        // cancel all the dependent workers as well to ensure they don't get orphaned and keep
        // running.
        manager.cancel().await?;
        LemmyResult::Ok(())
        // if the task was not intentionally cancelled, then this whole lambda will be run again by
        // CancellableTask after this
      }
    })
  }
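
  /// Main loop: reads the list of federated instances, starts an [`InstanceWorker`] for every
  /// alive and allowed instance assigned to this process, stops workers for instances that have
  /// become dead or disallowed, then waits [`INSTANCES_RECHECK_DELAY`] and repeats until
  /// cancelled.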
  async fn do_loop(&mut self, cancel: CancellationToken) -> LemmyResult<()> {
    let process_index = self.opts.process_index - 1;
    info!(
      "Starting federation workers for process count {} and index {}",
      self.opts.process_count, process_index
    );
    let local_domain = self.context.settings().get_hostname_without_port()?;
    let mut pool = self.context.pool();

    loop {
      let mut total_count = 0;
      let mut dead_count = 0;
      let mut disallowed_count = 0;

      for (instance, allowed, is_dead) in
        Instance::read_federated_with_blocked_and_dead(&mut pool).await?
      {
        if instance.domain == local_domain {
          continue;
        }
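        // instances are sharded across processes: this process only handles instances whose id
        // modulo process_count equals its zero-based index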
        if instance.id.inner() % self.opts.process_count != process_index {
          continue;
        }
        total_count += 1;
        if !allowed {
          disallowed_count += 1;
        }
        if is_dead {
          dead_count += 1;
        }

        let should_federate = allowed && !is_dead;
        if should_federate {
          if self.workers.contains_key(&instance.id) {
            // worker already running
            continue;
          }
          // create new worker
          let context = self.context.clone();
          let stats_sender = self.stats_sender.clone();
          let federation_worker_config = self.federation_worker_config.clone();

          self.workers.insert(
            instance.id,
            CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |stop| {
              // if the instance worker ends unexpectedly due to internal/db errors, this lambda is
              // rerun by CancellableTask.
              let instance = instance.clone();
              InstanceWorker::init_and_loop(
                instance,
                context.clone(),
                federation_worker_config.clone(),
                stop,
                stats_sender.clone(),
              )
            }),
          );
        } else {
          // instance is dead or disallowed, stop any worker that is still running for it
          if let Some(worker) = self.workers.remove(&instance.id) {
            if let Err(e) = worker.cancel().await {
              tracing::error!("error stopping worker: {e}");
            }
          }
        }
      }
      let worker_count = self.workers.len();
      tracing::info!(
        "Federating to {worker_count}/{total_count} instances ({dead_count} dead, {disallowed_count} disallowed)"
      );
      tokio::select! {
        () = sleep(INSTANCES_RECHECK_DELAY) => {},
        _ = cancel.cancelled() => { return Ok(()) }
      }
    }
  }
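
  /// Shuts the manager down: dropping the stats sender lets the stats task exit, all instance
  /// workers are cancelled concurrently, and finally the stats task is awaited.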
  pub async fn cancel(self) -> LemmyResult<()> {
    drop(self.stats_sender);
    tracing::warn!(
      "Waiting for {} workers ({:.2?} max)",
      self.workers.len(),
      WORKER_EXIT_TIMEOUT
    );
    // the cancel futures need to be awaited concurrently for the shutdown processes to be
    // triggered concurrently
    futures::future::join_all(
      self
        .workers
        .into_values()
        .map(util::CancellableTask::cancel),
    )
    .await;
    self.exit_print.await?;
    Ok(())
  }
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
#[expect(clippy::indexing_slicing)]
mod test {
  use super::*;
  use activitypub_federation::config::Data;
  use chrono::DateTime;
  use lemmy_db_schema::source::{
    federation_allowlist::FederationAllowList,
    federation_blocklist::FederationBlockList,
    instance::InstanceForm,
  };
  use lemmy_utils::error::LemmyError;
  use serial_test::serial;
  use std::{
    collections::HashSet,
    sync::{Arc, Mutex},
  };
  use tokio::spawn;

  struct TestData {
    send_manager: SendManager,
    context: Data<LemmyContext>,
    instances: Vec<Instance>,
  }
  impl TestData {
    async fn init(process_count: i32, process_index: i32) -> LemmyResult<Self> {
      let context = LemmyContext::init_test_context().await;
      let opts = Opts {
        process_count,
        process_index,
      };
      let federation_config = FederationConfig::builder()
        .domain("local.com")
        .app_data(context.clone())
        .build()
        .await?;
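      // concurrent sends per instance can be overridden for tests via an env var, defaulting to 1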
      let concurrent_sends_per_instance = std::env::var("LEMMY_TEST_FEDERATION_CONCURRENT_SENDS")
        .ok()
        .and_then(|s| s.parse().ok())
        .unwrap_or(1);
      let federation_worker_config = FederationWorkerConfig {
        concurrent_sends_per_instance,
      };

      let pool = &mut context.pool();
      let instances = vec![
        Instance::read_or_create(pool, "alpha.com".to_string()).await?,
        Instance::read_or_create(pool, "beta.com".to_string()).await?,
        Instance::read_or_create(pool, "gamma.com".to_string()).await?,
      ];

      let send_manager = SendManager::new(opts, federation_config, federation_worker_config);
      Ok(Self {
        send_manager,
        context,
        instances,
      })
    }

    async fn run(&mut self) -> LemmyResult<()> {
      // start it and cancel after workers are running
      let cancel = CancellationToken::new();
      let cancel_ = cancel.clone();
      spawn(async move {
        sleep(Duration::from_millis(100)).await;
        cancel_.cancel();
      });
      self.send_manager.do_loop(cancel.clone()).await?;
      Ok(())
    }

    async fn cleanup(self) -> LemmyResult<()> {
      self.send_manager.cancel().await?;
      Instance::delete_all(&mut self.context.pool()).await?;
      Ok(())
    }
  }
  /// Basic test with default params and only active/allowed instances
  #[tokio::test]
  #[serial]
  async fn test_send_manager() -> LemmyResult<()> {
    let mut data = TestData::init(1, 1).await?;
    data.run().await?;

    assert_eq!(3, data.send_manager.workers.len());
    let workers: HashSet<_> = data.send_manager.workers.keys().cloned().collect();
    let instances: HashSet<_> = data.instances.iter().map(|i| i.id).collect();
    assert_eq!(instances, workers);

    data.cleanup().await?;
    Ok(())
  }

  /// Running with multiple processes should start correct workers
  #[tokio::test]
  #[serial]
  async fn test_send_manager_processes() -> LemmyResult<()> {
    let active = Arc::new(Mutex::new(vec![]));
    let execute = |count, index, active: Arc<Mutex<Vec<InstanceId>>>| async move {
      let mut data = TestData::init(count, index).await?;
      data.run().await?;
      assert_eq!(1, data.send_manager.workers.len());
      for k in data.send_manager.workers.keys() {
        active.lock().unwrap().push(*k);
      }
      data.cleanup().await?;
      Ok::<(), LemmyError>(())
    };
    execute(3, 1, active.clone()).await?;
    execute(3, 2, active.clone()).await?;
    execute(3, 3, active.clone()).await?;

    // Should run exactly three workers
    assert_eq!(3, active.lock().unwrap().len());
    Ok(())
  }

  /// Use blocklist, should not send to blocked instances
  #[tokio::test]
  #[serial]
  async fn test_send_manager_blocked() -> LemmyResult<()> {
    let mut data = TestData::init(1, 1).await?;
    let domain = data.instances[0].domain.clone();
    FederationBlockList::replace(&mut data.context.pool(), Some(vec![domain])).await?;
    data.run().await?;

    let workers = &data.send_manager.workers;
    assert_eq!(2, workers.len());
    assert!(workers.contains_key(&data.instances[1].id));
    assert!(workers.contains_key(&data.instances[2].id));

    data.cleanup().await?;
    Ok(())
  }

  /// Use allowlist, should only send to allowed instance
  #[tokio::test]
  #[serial]
  async fn test_send_manager_allowed() -> LemmyResult<()> {
    let mut data = TestData::init(1, 1).await?;
    let domain = data.instances[0].domain.clone();
    FederationAllowList::replace(&mut data.context.pool(), Some(vec![domain])).await?;
    data.run().await?;

    let workers = &data.send_manager.workers;
    assert_eq!(1, workers.len());
    assert!(workers.contains_key(&data.instances[0].id));

    data.cleanup().await?;
    Ok(())
  }

  /// Mark instance as dead, there should be no worker created for it
  #[tokio::test]
  #[serial]
  async fn test_send_manager_dead() -> LemmyResult<()> {
    let mut data = TestData::init(1, 1).await?;

    let instance = &data.instances[0];
    let form = InstanceForm {
      updated: DateTime::from_timestamp(0, 0),
      ..InstanceForm::new(instance.domain.clone())
    };
    Instance::update(&mut data.context.pool(), instance.id, form).await?;
    data.run().await?;

    let workers = &data.send_manager.workers;
    assert_eq!(2, workers.len());
    assert!(workers.contains_key(&data.instances[1].id));
    assert!(workers.contains_key(&data.instances[2].id));

    data.cleanup().await?;
    Ok(())
  }
}