diff --git a/Cargo.lock b/Cargo.lock index 82c71a0..e606f7b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -205,6 +205,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anyhow" version = "1.0.57" @@ -444,26 +453,17 @@ dependencies = [ [[package]] name = "clap" -version = "3.1.15" +version = "2.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85a35a599b11c089a7f49105658d089b8f2cf0882993c17daf6de15285c2c35d" +checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c" dependencies = [ + "ansi_term", "atty", "bitflags", - "clap_lex", - "indexmap", - "strsim", - "termcolor", + "strsim 0.8.0", "textwrap", -] - -[[package]] -name = "clap_lex" -version = "0.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a37c35f1112dad5e6e0b1adaff798507497a18fceeb30cceb3bae7d1427b9213" -dependencies = [ - "os_str_bytes", + "unicode-width", + "vec_map", ] [[package]] @@ -556,7 +556,7 @@ dependencies = [ "ident_case", "proc-macro2 1.0.37", "quote 1.0.18", - "strsim", + "strsim 0.10.0", "syn 1.0.92", ] @@ -1030,6 +1030,15 @@ version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" +[[package]] +name = "heck" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" +dependencies = [ + "unicode-segmentation", +] + [[package]] name = "heck" version = "0.4.0" @@ -1318,7 +1327,6 @@ version = "0.1.0" dependencies = [ "anyhow", "async-recursion", - "clap", "derive-new", "futures", "lemmy_api_common", @@ -1328,6 +1336,8 @@ dependencies = [ "semver", "serde", "serde_json", + "stderrlog", + "structopt", "tokio", ] @@ -1831,12 +1841,6 @@ dependencies = [ "vcpkg", ] -[[package]] -name = "os_str_bytes" -version = "6.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" - [[package]] name = "parking_lot" version = "0.11.2" @@ -2065,6 +2069,30 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2 1.0.37", + "quote 1.0.18", + "syn 1.0.92", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2 1.0.37", + "quote 1.0.18", + "version_check", +] + [[package]] name = "proc-macro2" version = "0.4.30" @@ -2612,6 +2640,19 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "stderrlog" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "45a53e2eff3e94a019afa6265e8ee04cb05b9d33fe9f5078b14e4e391d155a38" +dependencies = [ + "atty", + "chrono", + "log", + "termcolor", + "thread_local", +] + [[package]] name = "string_cache" version = "0.8.4" @@ -2638,12 +2679,42 @@ dependencies = [ "quote 1.0.18", ] +[[package]] +name = "strsim" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a" + [[package]] name = "strsim" version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "structopt" +version = "0.3.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10" +dependencies = [ + "clap", + "lazy_static", + "structopt-derive", +] + +[[package]] +name = "structopt-derive" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0" +dependencies = [ + "heck 0.3.3", + "proc-macro-error", + "proc-macro2 1.0.37", + "quote 1.0.18", + "syn 1.0.92", +] + [[package]] name = "strum" version = "0.24.0" @@ -2656,7 +2727,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6878079b17446e4d3eba6192bb0a2950d5b14f0ed8424b852310e5a94345d0ef" dependencies = [ - "heck", + "heck 0.4.0", "proc-macro2 1.0.37", "quote 1.0.18", "rustversion", @@ -2730,9 +2801,12 @@ dependencies = [ [[package]] name = "textwrap" -version = "0.15.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" +checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" +dependencies = [ + "unicode-width", +] [[package]] name = "thiserror" @@ -2756,11 +2830,11 @@ dependencies = [ [[package]] name = "thread_local" -version = "1.1.4" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" +checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14" dependencies = [ - "once_cell", + "lazy_static", ] [[package]] @@ -2943,9 +3017,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.11" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4bc28f93baff38037f64e6f43d34cfa1605f27a49c34e8a04c5e78b0babf2596" +checksum = "77be66445c4eeebb934a7340f227bfe7b338173d3f8c00a60a5a58005c9faecf" dependencies = [ "sharded-slab", "thread_local", @@ -3016,6 +3090,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-segmentation" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" + [[package]] name = "unicode-width" version = "0.1.9" @@ -3087,6 +3167,12 @@ version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" +[[package]] +name = "vec_map" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191" + [[package]] name = "version_check" version = "0.9.4" diff --git a/Cargo.toml b/Cargo.toml index 1e20612..8b1ed68 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,10 +11,11 @@ anyhow = "1.0.57" tokio = { version = "1.18.1", features = ["macros", "rt-multi-thread"] } futures = "0.3.21" serde_json = "1.0.81" -clap = "3.1.15" semver = "1.0.9" once_cell = "1.10.0" lemmy_api_common = "0.16.0" async-recursion = "1.0.0" log = "0.4.17" derive-new = "0.5.9" +stderrlog = "0.5.1" +structopt = "0.3.26" diff --git a/src/crawl.rs b/src/crawl.rs index f7f9394..86c77ed 100644 --- a/src/crawl.rs +++ b/src/crawl.rs @@ -4,7 +4,7 @@ use anyhow::Error; use async_recursion::async_recursion; use futures::future::join_all; use lemmy_api_common::site::GetSiteResponse; -use log::info; +use log::debug; use semver::Version; use serde::Serialize; use std::collections::HashSet; @@ -12,10 +12,11 @@ use std::ops::Deref; use std::sync::Arc; use tokio::sync::Mutex; -#[derive(Serialize, Debug)] -pub struct InstanceDetails { - pub domain: String, - pub site_info: GetSiteResponse, +#[derive(new)] +pub struct CrawlJob { + domain: String, + current_distance: i32, + params: Arc, } #[derive(new)] @@ -26,33 +27,62 @@ pub struct CrawlParams { crawled_instances: Arc>>, } -#[derive(new)] -pub struct CrawlJob { - domain: String, - current_depth: i32, - params: Arc, +#[derive(Serialize, Debug)] +pub struct InstanceDetails { + pub domain: String, + pub site_info: GetSiteResponse, } impl CrawlJob { #[async_recursion] - pub async fn crawl(self) -> Result>, Error> { - // need to acquire and release mutix before recursing, otherwise it will deadlock + pub async fn crawl(self) -> Vec> { + // need to acquire and release mutex before recursing, otherwise it will deadlock { let mut crawled_instances = self.params.crawled_instances.deref().lock().await; if crawled_instances.contains(&self.domain) { - return Ok(vec![]); + return vec![]; } else { crawled_instances.insert(self.domain.clone()); } } - if self.current_depth > self.params.max_depth + if self.current_distance > self.params.max_depth || self.params.exclude_domains.contains(&self.domain) { - return Ok(vec![]); + return vec![]; } - info!("Starting crawl for {}", &self.domain); + debug!("Starting crawl for {}, distance {}", &self.domain, &self.current_distance); + let site_info = match self.fetch_instance_details().await { + Ok(o) => o, + Err(e) => return vec![Err(e)], + }; + + if site_info.1 < self.params.min_lemmy_version { + return vec![]; + } + + let mut result = vec![]; + if let Some(federated) = &site_info.0.federated_instances { + for domain in federated.linked.iter() { + let crawl_job = + CrawlJob::new(domain.clone(), self.current_distance + 1, self.params.clone()); + result.push(crawl_job.crawl()); + } + } + + let mut result2: Vec> = + join_all(result).await.into_iter().flatten().collect(); + debug!("Successfully finished crawl for {}", &self.domain); + result2.push(Ok(InstanceDetails { + domain: self.domain, + site_info: site_info.0, + })); + + result2 + } + + async fn fetch_instance_details(&self) -> Result<(GetSiteResponse, Version), Error> { let site_info_url = format!("https://{}/api/v3/site", &self.domain); let site_info = CLIENT .get(&site_info_url) @@ -61,33 +91,7 @@ impl CrawlJob { .await? .json::() .await?; - let version = Version::parse(&site_info.version)?; - if version < self.params.min_lemmy_version { - return Ok(vec![]); - } - - let mut result = vec![]; - if let Some(federated) = &site_info.federated_instances { - for domain in federated.linked.iter() { - let crawl_job = - CrawlJob::new(domain.clone(), self.current_depth + 1, self.params.clone()); - result.push(crawl_job.crawl()); - } - } - - let mut result2: Vec> = join_all(result) - .await - .into_iter() - .filter_map(|r| r.ok()) - .flat_map(|r| r.into_iter()) - .collect(); - info!("Successfully finished crawl for {}", &self.domain); - result2.push(Ok(InstanceDetails { - domain: self.domain, - site_info, - })); - - Ok(result2) + Ok((site_info, version)) } } diff --git a/src/defaults.rs b/src/defaults.rs deleted file mode 100644 index f9b09bb..0000000 --- a/src/defaults.rs +++ /dev/null @@ -1,4 +0,0 @@ -pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml"; -pub const DEFAULT_MAX_CRAWL_DEPTH: &str = "20"; -pub const EXCLUDE_INSTANCES: &str = - "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml"; diff --git a/src/lib.rs b/src/lib.rs index 5fb76a0..b655209 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,7 +13,6 @@ use std::time::Duration; use tokio::sync::Mutex; pub mod crawl; -pub mod defaults; pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); @@ -22,12 +21,12 @@ static CLIENT: Lazy = Lazy::new(Client::default); pub async fn start_crawl( start_instances: Vec, exclude_domains: Vec, - max_depth: i32, + max_distance: i32, ) -> Result, Error> { let params = Arc::new(CrawlParams::new( min_lemmy_version().await?, exclude_domains, - max_depth, + max_distance, Arc::new(Mutex::new(HashSet::new())), )); let mut jobs = vec![]; @@ -36,12 +35,11 @@ pub async fn start_crawl( jobs.push(job.crawl()); } - // TODO: optionally log the errors + // TODO: log the errors let mut instance_details: Vec = join_all(jobs) .await .into_iter() - .filter_map(|r| r.ok()) - .flat_map(|r| r.into_iter()) + .flatten() .filter_map(|r| r.ok()) .collect(); diff --git a/src/main.rs b/src/main.rs index 20bc41c..3973317 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,52 +1,52 @@ use anyhow::Error; -use clap::{Arg, Command}; use lemmy_stats_crawler::crawl::InstanceDetails; -use lemmy_stats_crawler::defaults::{ - DEFAULT_MAX_CRAWL_DEPTH, DEFAULT_START_INSTANCES, EXCLUDE_INSTANCES, -}; use lemmy_stats_crawler::start_crawl; use serde::Serialize; +use structopt::StructOpt; + +#[derive(StructOpt, Debug)] +#[structopt()] +struct Parameters { + #[structopt(short, long, default_value = "lemmy.ml")] + start_instances: Vec, + #[structopt(short, long, default_value = "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml")] + exclude_instances: Vec, + #[structopt(short, long, default_value = "20")] + max_crawl_distance: i32, + + /// Silence all output + #[structopt(short, long)] + quiet: bool, + /// Verbose mode (-v, -vv, -vvv, etc) + #[structopt(short = "v", long = "verbose", parse(from_occurrences))] + verbose: usize, +} #[tokio::main] pub async fn main() -> Result<(), Error> { - let matches = Command::new("Lemmy Stats Crawler") - .arg( - Arg::new("start-instances") - .long("start-instances") - .takes_value(true), - ) - .arg(Arg::new("exclude").long("exclude").takes_value(true)) - .arg( - Arg::new("max-crawl-depth") - .long("max-crawl-depth") - .takes_value(true), - ) - .get_matches(); - let start_instances: Vec = matches - .value_of("start-instances") - .unwrap_or(DEFAULT_START_INSTANCES) - .split(',') - .map(|s| s.trim().to_string()) - .collect(); - let exclude: Vec = matches - .value_of("exclude") - .unwrap_or(EXCLUDE_INSTANCES) - .split(',') - .map(|s| s.trim().to_string()) - .collect(); - let max_crawl_depth: i32 = matches - .value_of("max-crawl-depth") - .unwrap_or(DEFAULT_MAX_CRAWL_DEPTH) - .parse()?; + let params = Parameters::from_args(); + + stderrlog::new() + .module(module_path!()) + .quiet(params.quiet) + .verbosity(params.verbose) + .init()?; eprintln!("Crawling..."); - let instance_details = start_crawl(start_instances, exclude, max_crawl_depth).await?; + let instance_details = start_crawl( + params.start_instances, + params.exclude_instances, + params.max_crawl_distance, + ) + .await?; let total_stats = aggregate(instance_details); println!("{}", serde_json::to_string_pretty(&total_stats)?); Ok(()) } +// TODO: lemmy stores these numbers in SiteAggregates, would be good to simply use that as a member +// (to avoid many members). but SiteAggregates also has id, site_id fields #[derive(Serialize)] struct TotalStats { crawled_instances: i32,