Fix federate loop (#4330)
* make activity channel infallible
* clippy
* federate: make cancellabletask loop itself
This commit is contained in:
parent 53147596b4
commit 024ab7d530
3 changed files with 48 additions and 61 deletions
|
@@ -23,6 +23,7 @@ static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(5);
|
||||||
#[cfg(not(debug_assertions))]
|
#[cfg(not(debug_assertions))]
|
||||||
static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(60);
|
static INSTANCES_RECHECK_DELAY: Duration = Duration::from_secs(60);
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
pub struct Opts {
|
pub struct Opts {
|
||||||
/// how many processes you are starting in total
|
/// how many processes you are starting in total
|
||||||
pub process_count: i32,
|
pub process_count: i32,
|
||||||
|
@@ -36,7 +37,7 @@ async fn start_stop_federation_workers(
|
||||||
federation_config: FederationConfig<LemmyContext>,
|
federation_config: FederationConfig<LemmyContext>,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let mut workers = HashMap::<InstanceId, CancellableTask<_>>::new();
|
let mut workers = HashMap::<InstanceId, CancellableTask>::new();
|
||||||
|
|
||||||
let (stats_sender, stats_receiver) = unbounded_channel();
|
let (stats_sender, stats_receiver) = unbounded_channel();
|
||||||
let exit_print = tokio::spawn(receive_print_stats(pool.clone(), stats_receiver));
|
let exit_print = tokio::spawn(receive_print_stats(pool.clone(), stats_receiver));
|
||||||
|
@@ -66,40 +67,30 @@ async fn start_stop_federation_workers(
|
||||||
let should_federate = allowed && !is_dead;
|
let should_federate = allowed && !is_dead;
|
||||||
if should_federate {
|
if should_federate {
|
||||||
if workers.contains_key(&instance.id) {
|
if workers.contains_key(&instance.id) {
|
||||||
if workers
|
// worker already running
|
||||||
.get(&instance.id)
|
continue;
|
||||||
.map(util::CancellableTask::has_ended)
|
|
||||||
.unwrap_or(false)
|
|
||||||
{
|
|
||||||
// task must have errored out, remove and recreate it
|
|
||||||
let worker = workers
|
|
||||||
.remove(&instance.id)
|
|
||||||
.expect("just checked contains_key");
|
|
||||||
tracing::error!(
|
|
||||||
"worker for {} has stopped, recreating: {:?}",
|
|
||||||
instance.domain,
|
|
||||||
worker.cancel().await
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// create new worker
|
// create new worker
|
||||||
|
let config = federation_config.clone();
|
||||||
let stats_sender = stats_sender.clone();
|
let stats_sender = stats_sender.clone();
|
||||||
let context = federation_config.to_request_data();
|
|
||||||
let pool = pool.clone();
|
let pool = pool.clone();
|
||||||
workers.insert(
|
workers.insert(
|
||||||
instance.id,
|
instance.id,
|
||||||
CancellableTask::spawn(WORKER_EXIT_TIMEOUT, |stop| async move {
|
CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |stop| {
|
||||||
InstanceWorker::init_and_loop(
|
let instance = instance.clone();
|
||||||
instance,
|
let req_data = config.clone().to_request_data();
|
||||||
context,
|
let stats_sender = stats_sender.clone();
|
||||||
&mut DbPool::Pool(&pool),
|
let pool = pool.clone();
|
||||||
stop,
|
async move {
|
||||||
stats_sender,
|
InstanceWorker::init_and_loop(
|
||||||
)
|
instance,
|
||||||
.await?;
|
req_data,
|
||||||
Ok(())
|
&mut DbPool::Pool(&pool),
|
||||||
|
stop,
|
||||||
|
stats_sender,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
}),
|
}),
|
||||||
);
|
);
|
||||||
} else if !should_federate {
|
} else if !should_federate {
|
||||||
|
@@ -135,9 +126,12 @@ pub fn start_stop_federation_workers_cancellable(
|
||||||
opts: Opts,
|
opts: Opts,
|
||||||
pool: ActualDbPool,
|
pool: ActualDbPool,
|
||||||
config: FederationConfig<LemmyContext>,
|
config: FederationConfig<LemmyContext>,
|
||||||
) -> CancellableTask<()> {
|
) -> CancellableTask {
|
||||||
CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |c| {
|
CancellableTask::spawn(WORKER_EXIT_TIMEOUT, move |stop| {
|
||||||
start_stop_federation_workers(opts, pool, config, c)
|
let opts = opts.clone();
|
||||||
|
let pool = pool.clone();
|
||||||
|
let config = config.clone();
|
||||||
|
async move { start_stop_federation_workers(opts, pool, config, stop).await }
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@@ -20,12 +20,7 @@ use moka::future::Cache;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use reqwest::Url;
|
use reqwest::Url;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use std::{
|
use std::{fmt::Debug, future::Future, pin::Pin, sync::Arc, time::Duration};
|
||||||
future::Future,
|
|
||||||
pin::Pin,
|
|
||||||
sync::{Arc, RwLock},
|
|
||||||
time::Duration,
|
|
||||||
};
|
|
||||||
use tokio::{task::JoinHandle, time::sleep};
|
use tokio::{task::JoinHandle, time::sleep};
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
|
|
||||||
|
@@ -49,41 +44,41 @@ pub(crate) static WORK_FINISHED_RECHECK_DELAY: Lazy<Duration> = Lazy::new(|| {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
pub struct CancellableTask<R: Send + 'static> {
|
/// A task that will be run in an infinite loop, unless it is cancelled.
|
||||||
f: Pin<Box<dyn Future<Output = Result<R, anyhow::Error>> + Send + 'static>>,
|
/// If the task exits without being cancelled, a warning will be logged and the task will be restarted.
|
||||||
ended: Arc<RwLock<bool>>,
|
pub struct CancellableTask {
|
||||||
|
f: Pin<Box<dyn Future<Output = Result<(), anyhow::Error>> + Send + 'static>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<R: Send + 'static> CancellableTask<R> {
|
impl CancellableTask {
|
||||||
/// spawn a task but with graceful shutdown
|
/// spawn a task but with graceful shutdown
|
||||||
pub fn spawn<F>(
|
pub fn spawn<F, R: Debug>(
|
||||||
timeout: Duration,
|
timeout: Duration,
|
||||||
task: impl FnOnce(CancellationToken) -> F,
|
task: impl Fn(CancellationToken) -> F + Send + 'static,
|
||||||
) -> CancellableTask<R>
|
) -> CancellableTask
|
||||||
where
|
where
|
||||||
F: Future<Output = Result<R>> + Send + 'static,
|
F: Future<Output = R> + Send + 'static,
|
||||||
{
|
{
|
||||||
let stop = CancellationToken::new();
|
let stop = CancellationToken::new();
|
||||||
let task = task(stop.clone());
|
let stop2 = stop.clone();
|
||||||
let ended = Arc::new(RwLock::new(false));
|
let task: JoinHandle<()> = tokio::spawn(async move {
|
||||||
let ended_write = ended.clone();
|
loop {
|
||||||
let task: JoinHandle<Result<R>> = tokio::spawn(async move {
|
let res = task(stop2.clone()).await;
|
||||||
match task.await {
|
if stop2.is_cancelled() {
|
||||||
Ok(o) => Ok(o),
|
return;
|
||||||
Err(e) => {
|
} else {
|
||||||
*ended_write.write().expect("poisoned") = true;
|
tracing::warn!("task exited, restarting: {res:?}");
|
||||||
Err(e)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
let abort = task.abort_handle();
|
let abort = task.abort_handle();
|
||||||
CancellableTask {
|
CancellableTask {
|
||||||
ended,
|
|
||||||
f: Box::pin(async move {
|
f: Box::pin(async move {
|
||||||
stop.cancel();
|
stop.cancel();
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
r = task => {
|
r = task => {
|
||||||
Ok(r.context("could not join")??)
|
r.context("could not join")?;
|
||||||
|
Ok(())
|
||||||
},
|
},
|
||||||
_ = sleep(timeout) => {
|
_ = sleep(timeout) => {
|
||||||
abort.abort();
|
abort.abort();
|
||||||
|
@@ -96,12 +91,9 @@ impl<R: Send + 'static> CancellableTask<R> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// send the cancel signal, wait up to the timeout for the task to stop gracefully, otherwise abort it
|
/// send the cancel signal, wait up to the timeout for the task to stop gracefully, otherwise abort it
|
||||||
pub async fn cancel(self) -> Result<R, anyhow::Error> {
|
pub async fn cancel(self) -> Result<(), anyhow::Error> {
|
||||||
self.f.await
|
self.f.await
|
||||||
}
|
}
|
||||||
pub fn has_ended(&self) -> bool {
|
|
||||||
*self.ended.read().expect("poisoned")
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// assuming apub priv key and ids are immutable, then we don't need to have TTL
|
/// assuming apub priv key and ids are immutable, then we don't need to have TTL
|
||||||
|
|
|
@@ -206,6 +206,7 @@ impl InstanceWorker {
|
||||||
.await
|
.await
|
||||||
.context("failed figuring out inbox urls")?;
|
.context("failed figuring out inbox urls")?;
|
||||||
if inbox_urls.is_empty() {
|
if inbox_urls.is_empty() {
|
||||||
|
tracing::debug!("{}: {:?} no inboxes", self.instance.domain, activity.id);
|
||||||
self.state.last_successful_id = Some(activity.id);
|
self.state.last_successful_id = Some(activity.id);
|
||||||
self.state.last_successful_published_time = Some(activity.published);
|
self.state.last_successful_published_time = Some(activity.published);
|
||||||
return Ok(());
|
return Ok(());
|
||||||
|
|
Loading…
Reference in a new issue