lemmy/crates/federate/src/lib.rs
phiresky a08642f813
federation: parallel sending per instance (#4623)
* federation: parallel sending

* federation: some comments

* lint and set force_write true when a request fails

* inbox_urls return vec

* split inbox functions into separate file

* cleanup

* extract sending task code to separate file

* move federation concurrent config to config file

* off by one issue

* improve msg

* fix both permanent stopping of federation queues and multiple creation of the same federation queues

* fix after merge

* lint fix

* Update crates/federate/src/send.rs

Co-authored-by: dullbananas <dull.bananas0@gmail.com>

* comment about reverse ordering

* remove crashable, comment

* comment

* move comment

* run federation tests twice

* fix test run

* prettier

* fix config default

* upgrade rust to 1.78 to fix diesel cli

* fix clippy

* delay

* add debug to make localhost urls not valid in ap crate, add some debug logs

* federation tests: ensure server stop after test and random activity id

* ci fix

* add test to federate 100 events

* fix send 100 test

* different data every time so activities are distinguishable

* allow out of order receives in test

* lint

* comment about https://github.com/LemmyNet/lemmy/pull/4623#discussion_r1565437391

* move sender for clarity, add comment

* move more things to members

* update test todo comment, use same env var as worker test but default to 1

* remove else below continue

* some more cleanup

* handle todo about smooth exit

* add federate inboxes collector tests

* lint

* actor max length

* don't reset fail count if activity skipped

* fix some comments

* reuse vars

* format

* Update .woodpecker.yml

* fix recheck time

* fix inboxes tests under fast mode

* format

* make i32 and ugly casts

* clippy

---------

Co-authored-by: dullbananas <dull.bananas0@gmail.com>
2024-07-21 11:50:50 -04:00

372 lines
11 KiB
Rust

use crate::{util::CancellableTask, worker::InstanceWorker};
use activitypub_federation::config::FederationConfig;
use lemmy_api_common::{
context::LemmyContext,
lemmy_utils::settings::structs::FederationWorkerConfig,
};
use lemmy_db_schema::{newtypes::InstanceId, source::instance::Instance};
use lemmy_utils::error::LemmyResult;
use stats::receive_print_stats;
use std::{collections::HashMap, time::Duration};
use tokio::{
sync::mpsc::{unbounded_channel, UnboundedSender},
task::JoinHandle,
time::sleep,
};
use tokio_util::sync::CancellationToken;
use tracing::info;
use util::FederationQueueStateWithDomain;
mod inboxes;
mod send;
mod stats;
mod util;
mod worker;
static WORKER_EXIT_TIMEOUT: Duration = Duration::from_secs(30);
#[cfg(debug_assertions)]
static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(5);
#[cfg(not(debug_assertions))]
static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(60);
#[derive(Clone)]
pub struct Opts {
/// how many processes you are starting in total
pub process_count: i32,
/// the index of this process (1-based: 1 - process_count)
pub process_index: i32,
}
pub struct SendManager {
opts: Opts,
workers: HashMap<InstanceId, CancellableTask>,
context: FederationConfig<LemmyContext>,
stats_sender: UnboundedSender<FederationQueueStateWithDomain>,
exit_print: JoinHandle<()>,
federation_worker_config: FederationWorkerConfig,
}
impl SendManager {
fn new(
opts: Opts,
context: FederationConfig<LemmyContext>,
federation_worker_config: FederationWorkerConfig,
) -> Self {
assert!(opts.process_count > 0);
assert!(opts.process_index > 0);
assert!(opts.process_index <= opts.process_count);
let (stats_sender, stats_receiver) = unbounded_channel();
Self {
opts,
workers: HashMap::new(),
stats_sender,
exit_print: tokio::spawn(receive_print_stats(
context.inner_pool().clone(),
stats_receiver,
)),
context,
federation_worker_config,
}
}
pub fn run(
opts: Opts,
context: FederationConfig<LemmyContext>,
config: FederationWorkerConfig,
) -> CancellableTask {
CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |cancel| {
let opts = opts.clone();
let config = config.clone();
let context = context.clone();
let mut manager = Self::new(opts, context, config);
async move {
let result = manager.do_loop(cancel).await;
// the loop function will only return if there is (a) an internal error (e.g. db connection
// failure) or (b) it was cancelled from outside.
if let Err(e) = result {
// don't let this error bubble up, just log it, so the below cancel function will run
// regardless
tracing::error!("SendManager failed: {e}");
}
// cancel all the dependent workers as well to ensure they don't get orphaned and keep
// running.
manager.cancel().await?;
LemmyResult::Ok(())
// if the task was not intentionally cancelled, then this whole lambda will be run again by
// CancellableTask after this
}
})
}
async fn do_loop(&mut self, cancel: CancellationToken) -> LemmyResult<()> {
let process_index = self.opts.process_index - 1;
info!(
"Starting federation workers for process count {} and index {}",
self.opts.process_count, process_index
);
let local_domain = self.context.settings().get_hostname_without_port()?;
let mut pool = self.context.pool();
loop {
let mut total_count = 0;
let mut dead_count = 0;
let mut disallowed_count = 0;
for (instance, allowed, is_dead) in
Instance::read_federated_with_blocked_and_dead(&mut pool).await?
{
if instance.domain == local_domain {
continue;
}
if instance.id.inner() % self.opts.process_count != process_index {
continue;
}
total_count += 1;
if !allowed {
disallowed_count += 1;
}
if is_dead {
dead_count += 1;
}
let should_federate = allowed && !is_dead;
if should_federate {
if self.workers.contains_key(&instance.id) {
// worker already running
continue;
}
// create new worker
let context = self.context.clone();
let stats_sender = self.stats_sender.clone();
let federation_worker_config = self.federation_worker_config.clone();
self.workers.insert(
instance.id,
CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |stop| {
// if the instance worker ends unexpectedly due to internal/db errors, this lambda is
// rerun by cancellabletask.
let instance = instance.clone();
InstanceWorker::init_and_loop(
instance,
context.clone(),
federation_worker_config.clone(),
stop,
stats_sender.clone(),
)
}),
);
} else if !should_federate {
if let Some(worker) = self.workers.remove(&instance.id) {
if let Err(e) = worker.cancel().await {
tracing::error!("error stopping worker: {e}");
}
}
}
}
let worker_count = self.workers.len();
tracing::info!("Federating to {worker_count}/{total_count} instances ({dead_count} dead, {disallowed_count} disallowed)");
tokio::select! {
() = sleep(INSTANCES_RECHECK_DELAY) => {},
_ = cancel.cancelled() => { return Ok(()) }
}
}
}
pub async fn cancel(self) -> LemmyResult<()> {
drop(self.stats_sender);
tracing::warn!(
"Waiting for {} workers ({:.2?} max)",
self.workers.len(),
WORKER_EXIT_TIMEOUT
);
// the cancel futures need to be awaited concurrently for the shutdown processes to be triggered
// concurrently
futures::future::join_all(
self
.workers
.into_values()
.map(util::CancellableTask::cancel),
)
.await;
self.exit_print.await?;
Ok(())
}
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
#[allow(clippy::indexing_slicing)]
mod test {
use super::*;
use activitypub_federation::config::Data;
use chrono::DateTime;
use lemmy_db_schema::source::{
federation_allowlist::FederationAllowList,
federation_blocklist::FederationBlockList,
instance::InstanceForm,
};
use lemmy_utils::error::LemmyError;
use serial_test::serial;
use std::{
collections::HashSet,
sync::{Arc, Mutex},
};
use tokio::spawn;
struct TestData {
send_manager: SendManager,
context: Data<LemmyContext>,
instances: Vec<Instance>,
}
impl TestData {
async fn init(process_count: i32, process_index: i32) -> LemmyResult<Self> {
let context = LemmyContext::init_test_context().await;
let opts = Opts {
process_count,
process_index,
};
let federation_config = FederationConfig::builder()
.domain("local.com")
.app_data(context.clone())
.build()
.await?;
let concurrent_sends_per_instance = std::env::var("LEMMY_TEST_FEDERATION_CONCURRENT_SENDS")
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or(1);
let federation_worker_config = FederationWorkerConfig {
concurrent_sends_per_instance,
};
let pool = &mut context.pool();
let instances = vec![
Instance::read_or_create(pool, "alpha.com".to_string()).await?,
Instance::read_or_create(pool, "beta.com".to_string()).await?,
Instance::read_or_create(pool, "gamma.com".to_string()).await?,
];
let send_manager = SendManager::new(opts, federation_config, federation_worker_config);
Ok(Self {
send_manager,
context,
instances,
})
}
async fn run(&mut self) -> LemmyResult<()> {
// start it and cancel after workers are running
let cancel = CancellationToken::new();
let cancel_ = cancel.clone();
spawn(async move {
sleep(Duration::from_millis(100)).await;
cancel_.cancel();
});
self.send_manager.do_loop(cancel.clone()).await?;
Ok(())
}
async fn cleanup(self) -> LemmyResult<()> {
self.send_manager.cancel().await?;
Instance::delete_all(&mut self.context.pool()).await?;
Ok(())
}
}
/// Basic test with default params and only active/allowed instances
#[tokio::test]
#[serial]
async fn test_send_manager() -> LemmyResult<()> {
let mut data = TestData::init(1, 1).await?;
data.run().await?;
assert_eq!(3, data.send_manager.workers.len());
let workers: HashSet<_> = data.send_manager.workers.keys().cloned().collect();
let instances: HashSet<_> = data.instances.iter().map(|i| i.id).collect();
assert_eq!(instances, workers);
data.cleanup().await?;
Ok(())
}
/// Running with multiple processes should start correct workers
#[tokio::test]
#[serial]
async fn test_send_manager_processes() -> LemmyResult<()> {
let active = Arc::new(Mutex::new(vec![]));
let execute = |count, index, active: Arc<Mutex<Vec<InstanceId>>>| async move {
let mut data = TestData::init(count, index).await?;
data.run().await?;
assert_eq!(1, data.send_manager.workers.len());
for k in data.send_manager.workers.keys() {
active.lock().unwrap().push(*k);
}
data.cleanup().await?;
Ok::<(), LemmyError>(())
};
execute(3, 1, active.clone()).await?;
execute(3, 2, active.clone()).await?;
execute(3, 3, active.clone()).await?;
// Should run exactly three workers
assert_eq!(3, active.lock().unwrap().len());
Ok(())
}
/// Use blocklist, should not send to blocked instances
#[tokio::test]
#[serial]
async fn test_send_manager_blocked() -> LemmyResult<()> {
let mut data = TestData::init(1, 1).await?;
let domain = data.instances[0].domain.clone();
FederationBlockList::replace(&mut data.context.pool(), Some(vec![domain])).await?;
data.run().await?;
let workers = &data.send_manager.workers;
assert_eq!(2, workers.len());
assert!(workers.contains_key(&data.instances[1].id));
assert!(workers.contains_key(&data.instances[2].id));
data.cleanup().await?;
Ok(())
}
/// Use allowlist, should only send to allowed instance
#[tokio::test]
#[serial]
async fn test_send_manager_allowed() -> LemmyResult<()> {
let mut data = TestData::init(1, 1).await?;
let domain = data.instances[0].domain.clone();
FederationAllowList::replace(&mut data.context.pool(), Some(vec![domain])).await?;
data.run().await?;
let workers = &data.send_manager.workers;
assert_eq!(1, workers.len());
assert!(workers.contains_key(&data.instances[0].id));
data.cleanup().await?;
Ok(())
}
/// Mark instance as dead, there should be no worker created for it
#[tokio::test]
#[serial]
async fn test_send_manager_dead() -> LemmyResult<()> {
let mut data = TestData::init(1, 1).await?;
let instance = &data.instances[0];
let form = InstanceForm::builder()
.domain(instance.domain.clone())
.updated(DateTime::from_timestamp(0, 0))
.build();
Instance::update(&mut data.context.pool(), instance.id, form).await?;
data.run().await?;
let workers = &data.send_manager.workers;
assert_eq!(2, workers.len());
assert!(workers.contains_key(&data.instances[1].id));
assert!(workers.contains_key(&data.instances[2].id));
data.cleanup().await?;
Ok(())
}
}