diff --git a/src/crawl.rs b/src/crawl.rs
index c0753b6..52a4b95 100644
--- a/src/crawl.rs
+++ b/src/crawl.rs
@@ -1,6 +1,6 @@
 use crate::federated_instances::GetSiteResponse;
 use crate::node_info::NodeInfo;
-use crate::{EXCLUDE_INSTANCES, REQUEST_TIMEOUT};
+use crate::REQUEST_TIMEOUT;
 use anyhow::anyhow;
 use anyhow::Error;
 use futures::try_join;
@@ -10,6 +10,7 @@ use std::collections::VecDeque;
 
 pub async fn crawl(
     start_instances: Vec<String>,
+    exclude: Vec<String>,
     max_depth: i32,
 ) -> Result<(Vec<InstanceDetails>, i32), Error> {
     let mut pending_instances: VecDeque<CrawlInstance> = start_instances
@@ -21,9 +22,7 @@ pub async fn crawl(
     let mut failed_instances = 0;
     while let Some(current_instance) = pending_instances.pop_back() {
         crawled_instances.push(current_instance.domain.clone());
-        if current_instance.depth > max_depth
-            || EXCLUDE_INSTANCES.contains(&&**&current_instance.domain)
-        {
+        if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) {
             continue;
         }
         match fetch_instance_details(&current_instance.domain).await {
@@ -48,7 +47,7 @@ pub async fn crawl(
     // Sort by active monthly users descending
     instance_details.sort_by_key(|i| i.users_active_month);
     instance_details.reverse();
-    
+
     Ok((instance_details, failed_instances))
 }
 
diff --git a/src/lib.rs b/src/lib.rs
index ac7af40..358a10e 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,9 +7,5 @@ pub mod node_info;
 pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
 pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml";
 pub const DEFAULT_MAX_CRAWL_DEPTH: &str = "1";
-pub const EXCLUDE_INSTANCES: &'static [&str] = &[
-    "ds9.lemmy.ml",
-    "enterprise.lemmy.ml",
-    "voyager.lemmy.ml",
-    "test.lemmy.ml",
-];
+pub const EXCLUDE_INSTANCES: &str =
+    "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml";
diff --git a/src/main.rs b/src/main.rs
index 7bd0f93..6a039dd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,7 +1,7 @@
 use anyhow::Error;
 use clap::{App, Arg};
 use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
-use lemmy_stats_crawler::{DEFAULT_MAX_CRAWL_DEPTH, DEFAULT_START_INSTANCES};
+use lemmy_stats_crawler::{DEFAULT_MAX_CRAWL_DEPTH, DEFAULT_START_INSTANCES, EXCLUDE_INSTANCES};
 use serde::Serialize;
 
 #[tokio::main]
@@ -12,6 +12,7 @@ pub async fn main() -> Result<(), Error> {
                 .long("start-instances")
                 .takes_value(true),
         )
+        .arg(Arg::with_name("exclude").long("exclude").takes_value(true))
        .arg(
             Arg::with_name("max-crawl-depth")
                 .long("max-crawl-depth")
@@ -22,7 +23,13 @@ pub async fn main() -> Result<(), Error> {
         .value_of("start-instances")
         .unwrap_or(DEFAULT_START_INSTANCES)
         .split(',')
-        .map(|s| s.to_string())
+        .map(|s| s.trim().to_string())
+        .collect();
+    let exclude: Vec<String> = matches
+        .value_of("exclude")
+        .unwrap_or(EXCLUDE_INSTANCES)
+        .split(',')
+        .map(|s| s.trim().to_string())
         .collect();
     let max_crawl_depth: i32 = matches
         .value_of("max-crawl-depth")
@@ -30,7 +37,8 @@ pub async fn main() -> Result<(), Error> {
         .parse()?;
 
     eprintln!("Crawling...");
-    let (instance_details, failed_instances) = crawl(start_instances, max_crawl_depth).await?;
+    let (instance_details, failed_instances) =
+        crawl(start_instances, exclude, max_crawl_depth).await?;
     let total_stats = aggregate(instance_details, failed_instances);
     println!("{}", serde_json::to_string_pretty(&total_stats)?);
 