Exclude test instances from crawl

This commit is contained in:
Felix Ableitner 2021-04-27 15:09:02 +02:00
parent d92de61d9c
commit 60563fc7d9
2 changed files with 10 additions and 2 deletions

View file

@ -1,6 +1,6 @@
use crate::federated_instances::GetSiteResponse; use crate::federated_instances::GetSiteResponse;
use crate::node_info::NodeInfo; use crate::node_info::NodeInfo;
use crate::REQUEST_TIMEOUT; use crate::{EXCLUDE_INSTANCES, REQUEST_TIMEOUT};
use anyhow::anyhow; use anyhow::anyhow;
use anyhow::Error; use anyhow::Error;
use futures::try_join; use futures::try_join;
@ -21,7 +21,9 @@ pub async fn crawl(
let mut failed_instances = 0; let mut failed_instances = 0;
while let Some(current_instance) = pending_instances.pop_back() { while let Some(current_instance) = pending_instances.pop_back() {
crawled_instances.push(current_instance.domain.clone()); crawled_instances.push(current_instance.domain.clone());
if current_instance.depth > max_depth { if current_instance.depth > max_depth
|| EXCLUDE_INSTANCES.contains(&&**&current_instance.domain)
{
continue; continue;
} }
match fetch_instance_details(&current_instance.domain).await { match fetch_instance_details(&current_instance.domain).await {

View file

@ -7,3 +7,9 @@ pub mod node_info;
/// Fixed ten-second duration. Presumably the per-request timeout applied when
/// fetching instance data (the crawl module imports it) — confirm at the use site.
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
/// Default start value, `"lemmy.ml"`. Presumably the seed instance(s) the crawl
/// begins from when none are supplied — TODO confirm how the string is parsed.
pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml"; pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml";
/// Default maximum crawl depth, kept as the string `"1"`. Presumably parsed into
/// a number by the caller before being compared against an instance's depth — verify.
pub const DEFAULT_MAX_CRAWL_DEPTH: &str = "1"; pub const DEFAULT_MAX_CRAWL_DEPTH: &str = "1";
/// Domains of test instances that the crawler skips.
///
/// The crawl loop compares each pending instance's domain against this list
/// and moves on without fetching details when it finds a match. Matching is
/// exact string equality, so subdomains or differently-cased spellings of
/// these hosts are not excluded.
pub const EXCLUDE_INSTANCES: &[&str] = &[
    "ds9.lemmy.ml",
    "enterprise.lemmy.ml",
    "voyager.lemmy.ml",
    "test.lemmy.ml",
];