Recursive, parallel crawl #11

Merged
nutomic merged 2 commits from recursive-crawl into main 2022-05-13 11:37:53 +00:00
6 changed files with 204 additions and 119 deletions
Showing only changes of commit 0079e72759 - Show all commits

148
Cargo.lock generated
View file

@ -205,6 +205,15 @@ dependencies = [
"memchr", "memchr",
] ]
[[package]]
name = "ansi_term"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
dependencies = [
"winapi",
]
[[package]] [[package]]
name = "anyhow" name = "anyhow"
version = "1.0.57" version = "1.0.57"
@ -444,26 +453,17 @@ dependencies = [
[[package]] [[package]]
name = "clap" name = "clap"
version = "3.1.15" version = "2.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85a35a599b11c089a7f49105658d089b8f2cf0882993c17daf6de15285c2c35d" checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
dependencies = [ dependencies = [
"ansi_term",
"atty", "atty",
"bitflags", "bitflags",
"clap_lex", "strsim 0.8.0",
"indexmap",
"strsim",
"termcolor",
"textwrap", "textwrap",
] "unicode-width",
"vec_map",
[[package]]
name = "clap_lex"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a37c35f1112dad5e6e0b1adaff798507497a18fceeb30cceb3bae7d1427b9213"
dependencies = [
"os_str_bytes",
] ]
[[package]] [[package]]
@ -556,7 +556,7 @@ dependencies = [
"ident_case", "ident_case",
"proc-macro2 1.0.37", "proc-macro2 1.0.37",
"quote 1.0.18", "quote 1.0.18",
"strsim", "strsim 0.10.0",
"syn 1.0.92", "syn 1.0.92",
] ]
@ -1030,6 +1030,15 @@ version = "0.11.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
[[package]]
name = "heck"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
dependencies = [
"unicode-segmentation",
]
[[package]] [[package]]
name = "heck" name = "heck"
version = "0.4.0" version = "0.4.0"
@ -1318,7 +1327,6 @@ version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-recursion", "async-recursion",
"clap",
"derive-new", "derive-new",
"futures", "futures",
"lemmy_api_common", "lemmy_api_common",
@ -1328,6 +1336,8 @@ dependencies = [
"semver", "semver",
"serde", "serde",
"serde_json", "serde_json",
"stderrlog",
"structopt",
"tokio", "tokio",
] ]
@ -1831,12 +1841,6 @@ dependencies = [
"vcpkg", "vcpkg",
] ]
[[package]]
name = "os_str_bytes"
version = "6.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64"
[[package]] [[package]]
name = "parking_lot" name = "parking_lot"
version = "0.11.2" version = "0.11.2"
@ -2065,6 +2069,30 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro-error"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
dependencies = [
"proc-macro-error-attr",
"proc-macro2 1.0.37",
"quote 1.0.18",
"syn 1.0.92",
"version_check",
]
[[package]]
name = "proc-macro-error-attr"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
dependencies = [
"proc-macro2 1.0.37",
"quote 1.0.18",
"version_check",
]
[[package]] [[package]]
name = "proc-macro2" name = "proc-macro2"
version = "0.4.30" version = "0.4.30"
@ -2612,6 +2640,19 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
[[package]]
name = "stderrlog"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "45a53e2eff3e94a019afa6265e8ee04cb05b9d33fe9f5078b14e4e391d155a38"
dependencies = [
"atty",
"chrono",
"log",
"termcolor",
"thread_local",
]
[[package]] [[package]]
name = "string_cache" name = "string_cache"
version = "0.8.4" version = "0.8.4"
@ -2638,12 +2679,42 @@ dependencies = [
"quote 1.0.18", "quote 1.0.18",
] ]
[[package]]
name = "strsim"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
[[package]] [[package]]
name = "strsim" name = "strsim"
version = "0.10.0" version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
[[package]]
name = "structopt"
version = "0.3.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10"
dependencies = [
"clap",
"lazy_static",
"structopt-derive",
]
[[package]]
name = "structopt-derive"
version = "0.4.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0"
dependencies = [
"heck 0.3.3",
"proc-macro-error",
"proc-macro2 1.0.37",
"quote 1.0.18",
"syn 1.0.92",
]
[[package]] [[package]]
name = "strum" name = "strum"
version = "0.24.0" version = "0.24.0"
@ -2656,7 +2727,7 @@ version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6878079b17446e4d3eba6192bb0a2950d5b14f0ed8424b852310e5a94345d0ef" checksum = "6878079b17446e4d3eba6192bb0a2950d5b14f0ed8424b852310e5a94345d0ef"
dependencies = [ dependencies = [
"heck", "heck 0.4.0",
"proc-macro2 1.0.37", "proc-macro2 1.0.37",
"quote 1.0.18", "quote 1.0.18",
"rustversion", "rustversion",
@ -2730,9 +2801,12 @@ dependencies = [
[[package]] [[package]]
name = "textwrap" name = "textwrap"
version = "0.15.0" version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb" checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
dependencies = [
"unicode-width",
]
[[package]] [[package]]
name = "thiserror" name = "thiserror"
@ -2756,11 +2830,11 @@ dependencies = [
[[package]] [[package]]
name = "thread_local" name = "thread_local"
version = "1.1.4" version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180" checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
dependencies = [ dependencies = [
"once_cell", "lazy_static",
] ]
[[package]] [[package]]
@ -2943,9 +3017,9 @@ dependencies = [
[[package]] [[package]]
name = "tracing-subscriber" name = "tracing-subscriber"
version = "0.3.11" version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bc28f93baff38037f64e6f43d34cfa1605f27a49c34e8a04c5e78b0babf2596" checksum = "77be66445c4eeebb934a7340f227bfe7b338173d3f8c00a60a5a58005c9faecf"
dependencies = [ dependencies = [
"sharded-slab", "sharded-slab",
"thread_local", "thread_local",
@ -3016,6 +3090,12 @@ dependencies = [
"tinyvec", "tinyvec",
] ]
[[package]]
name = "unicode-segmentation"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99"
[[package]] [[package]]
name = "unicode-width" name = "unicode-width"
version = "0.1.9" version = "0.1.9"
@ -3087,6 +3167,12 @@ version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
[[package]]
name = "vec_map"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
[[package]] [[package]]
name = "version_check" name = "version_check"
version = "0.9.4" version = "0.9.4"

View file

@ -11,10 +11,11 @@ anyhow = "1.0.57"
tokio = { version = "1.18.1", features = ["macros", "rt-multi-thread"] } tokio = { version = "1.18.1", features = ["macros", "rt-multi-thread"] }
futures = "0.3.21" futures = "0.3.21"
serde_json = "1.0.81" serde_json = "1.0.81"
clap = "3.1.15"
semver = "1.0.9" semver = "1.0.9"
once_cell = "1.10.0" once_cell = "1.10.0"
lemmy_api_common = "0.16.0" lemmy_api_common = "0.16.0"
async-recursion = "1.0.0" async-recursion = "1.0.0"
log = "0.4.17" log = "0.4.17"
derive-new = "0.5.9" derive-new = "0.5.9"
stderrlog = "0.5.1"
structopt = "0.3.26"

View file

@ -4,7 +4,7 @@ use anyhow::Error;
use async_recursion::async_recursion; use async_recursion::async_recursion;
use futures::future::join_all; use futures::future::join_all;
use lemmy_api_common::site::GetSiteResponse; use lemmy_api_common::site::GetSiteResponse;
use log::info; use log::debug;
use semver::Version; use semver::Version;
use serde::Serialize; use serde::Serialize;
use std::collections::HashSet; use std::collections::HashSet;
@ -12,10 +12,11 @@ use std::ops::Deref;
use std::sync::Arc; use std::sync::Arc;
use tokio::sync::Mutex; use tokio::sync::Mutex;
#[derive(Serialize, Debug)] #[derive(new)]
pub struct InstanceDetails { pub struct CrawlJob {
pub domain: String, domain: String,
pub site_info: GetSiteResponse, current_distance: i32,
params: Arc<CrawlParams>,
} }
#[derive(new)] #[derive(new)]
@ -26,33 +27,62 @@ pub struct CrawlParams {
crawled_instances: Arc<Mutex<HashSet<String>>>, crawled_instances: Arc<Mutex<HashSet<String>>>,
} }
#[derive(new)] #[derive(Serialize, Debug)]
pub struct CrawlJob { pub struct InstanceDetails {
domain: String, pub domain: String,
current_depth: i32, pub site_info: GetSiteResponse,
params: Arc<CrawlParams>,
} }
impl CrawlJob { impl CrawlJob {
#[async_recursion] #[async_recursion]
pub async fn crawl(self) -> Result<Vec<Result<InstanceDetails, Error>>, Error> { pub async fn crawl(self) -> Vec<Result<InstanceDetails, Error>> {
// need to acquire and release mutix before recursing, otherwise it will deadlock // need to acquire and release mutex before recursing, otherwise it will deadlock
{ {
let mut crawled_instances = self.params.crawled_instances.deref().lock().await; let mut crawled_instances = self.params.crawled_instances.deref().lock().await;
if crawled_instances.contains(&self.domain) { if crawled_instances.contains(&self.domain) {
return Ok(vec![]); return vec![];
} else { } else {
crawled_instances.insert(self.domain.clone()); crawled_instances.insert(self.domain.clone());
} }
} }
if self.current_depth > self.params.max_depth if self.current_distance > self.params.max_depth
|| self.params.exclude_domains.contains(&self.domain) || self.params.exclude_domains.contains(&self.domain)
{ {
return Ok(vec![]); return vec![];
} }
info!("Starting crawl for {}", &self.domain);
debug!("Starting crawl for {}, distance {}", &self.domain, &self.current_distance);
let site_info = match self.fetch_instance_details().await {
Ok(o) => o,
Err(e) => return vec![Err(e)],
};
if site_info.1 < self.params.min_lemmy_version {
return vec![];
}
let mut result = vec![];
if let Some(federated) = &site_info.0.federated_instances {
for domain in federated.linked.iter() {
let crawl_job =
CrawlJob::new(domain.clone(), self.current_distance + 1, self.params.clone());
result.push(crawl_job.crawl());
}
}
let mut result2: Vec<Result<InstanceDetails, Error>> =
join_all(result).await.into_iter().flatten().collect();
debug!("Successfully finished crawl for {}", &self.domain);
result2.push(Ok(InstanceDetails {
domain: self.domain,
site_info: site_info.0,
}));
result2
}
async fn fetch_instance_details(&self) -> Result<(GetSiteResponse, Version), Error> {
let site_info_url = format!("https://{}/api/v3/site", &self.domain); let site_info_url = format!("https://{}/api/v3/site", &self.domain);
let site_info = CLIENT let site_info = CLIENT
.get(&site_info_url) .get(&site_info_url)
@ -61,33 +91,7 @@ impl CrawlJob {
.await? .await?
.json::<GetSiteResponse>() .json::<GetSiteResponse>()
.await?; .await?;
let version = Version::parse(&site_info.version)?; let version = Version::parse(&site_info.version)?;
if version < self.params.min_lemmy_version { Ok((site_info, version))
return Ok(vec![]);
}
let mut result = vec![];
if let Some(federated) = &site_info.federated_instances {
for domain in federated.linked.iter() {
let crawl_job =
CrawlJob::new(domain.clone(), self.current_depth + 1, self.params.clone());
result.push(crawl_job.crawl());
}
}
let mut result2: Vec<Result<InstanceDetails, Error>> = join_all(result)
.await
.into_iter()
.filter_map(|r| r.ok())
.flat_map(|r| r.into_iter())
.collect();
info!("Successfully finished crawl for {}", &self.domain);
result2.push(Ok(InstanceDetails {
domain: self.domain,
site_info,
}));
Ok(result2)
} }
} }

View file

@ -1,4 +0,0 @@
pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml";
pub const DEFAULT_MAX_CRAWL_DEPTH: &str = "20";
pub const EXCLUDE_INSTANCES: &str =
"ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml";

View file

@ -13,7 +13,6 @@ use std::time::Duration;
use tokio::sync::Mutex; use tokio::sync::Mutex;
pub mod crawl; pub mod crawl;
pub mod defaults;
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
@ -22,12 +21,12 @@ static CLIENT: Lazy<Client> = Lazy::new(Client::default);
pub async fn start_crawl( pub async fn start_crawl(
start_instances: Vec<String>, start_instances: Vec<String>,
exclude_domains: Vec<String>, exclude_domains: Vec<String>,
max_depth: i32, max_distance: i32,
) -> Result<Vec<InstanceDetails>, Error> { ) -> Result<Vec<InstanceDetails>, Error> {
let params = Arc::new(CrawlParams::new( let params = Arc::new(CrawlParams::new(
min_lemmy_version().await?, min_lemmy_version().await?,
exclude_domains, exclude_domains,
max_depth, max_distance,
Arc::new(Mutex::new(HashSet::new())), Arc::new(Mutex::new(HashSet::new())),
)); ));
let mut jobs = vec![]; let mut jobs = vec![];
@ -36,12 +35,11 @@ pub async fn start_crawl(
jobs.push(job.crawl()); jobs.push(job.crawl());
} }
// TODO: optionally log the errors // TODO: log the errors
let mut instance_details: Vec<InstanceDetails> = join_all(jobs) let mut instance_details: Vec<InstanceDetails> = join_all(jobs)
.await .await
.into_iter() .into_iter()
.filter_map(|r| r.ok()) .flatten()
.flat_map(|r| r.into_iter())
.filter_map(|r| r.ok()) .filter_map(|r| r.ok())
.collect(); .collect();

View file

@ -1,52 +1,52 @@
use anyhow::Error; use anyhow::Error;
use clap::{Arg, Command};
use lemmy_stats_crawler::crawl::InstanceDetails; use lemmy_stats_crawler::crawl::InstanceDetails;
use lemmy_stats_crawler::defaults::{
DEFAULT_MAX_CRAWL_DEPTH, DEFAULT_START_INSTANCES, EXCLUDE_INSTANCES,
};
use lemmy_stats_crawler::start_crawl; use lemmy_stats_crawler::start_crawl;
use serde::Serialize; use serde::Serialize;
use structopt::StructOpt;
/// Command line parameters of the crawler.
///
/// The list options accept comma-separated values (`-s a.example,b.example`) and may also be
/// repeated. Values are split on `,` only and are not trimmed, so do not put spaces after the
/// commas.
#[derive(StructOpt, Debug)]
#[structopt()]
struct Parameters {
    /// Instances at which the crawl starts (comma-separated)
    // `use_delimiter` restores the comma splitting that the previous hand-written clap code did.
    #[structopt(short, long, use_delimiter = true, default_value = "lemmy.ml")]
    start_instances: Vec<String>,
    /// Instances that are never crawled (comma-separated)
    // Without `use_delimiter` the whole default string would become ONE list element that can
    // never equal a real domain, so nothing would be excluded. The default must not contain
    // spaces because the split values are not trimmed.
    #[structopt(
        short,
        long,
        use_delimiter = true,
        default_value = "ds9.lemmy.ml,enterprise.lemmy.ml,voyager.lemmy.ml,test.lemmy.ml"
    )]
    exclude_instances: Vec<String>,
    /// Maximum number of federation hops to follow away from a start instance
    #[structopt(short, long, default_value = "20")]
    max_crawl_distance: i32,

    /// Silence all output
    #[structopt(short, long)]
    quiet: bool,
    /// Verbose mode (-v, -vv, -vvv, etc)
    #[structopt(short = "v", long = "verbose", parse(from_occurrences))]
    verbose: usize,
}
#[tokio::main] #[tokio::main]
pub async fn main() -> Result<(), Error> { pub async fn main() -> Result<(), Error> {
let matches = Command::new("Lemmy Stats Crawler") let params = Parameters::from_args();
.arg(
Arg::new("start-instances") stderrlog::new()
.long("start-instances") .module(module_path!())
.takes_value(true), .quiet(params.quiet)
) .verbosity(params.verbose)
.arg(Arg::new("exclude").long("exclude").takes_value(true)) .init()?;
.arg(
Arg::new("max-crawl-depth")
.long("max-crawl-depth")
.takes_value(true),
)
.get_matches();
let start_instances: Vec<String> = matches
.value_of("start-instances")
.unwrap_or(DEFAULT_START_INSTANCES)
.split(',')
.map(|s| s.trim().to_string())
.collect();
let exclude: Vec<String> = matches
.value_of("exclude")
.unwrap_or(EXCLUDE_INSTANCES)
.split(',')
.map(|s| s.trim().to_string())
.collect();
let max_crawl_depth: i32 = matches
.value_of("max-crawl-depth")
.unwrap_or(DEFAULT_MAX_CRAWL_DEPTH)
.parse()?;
eprintln!("Crawling..."); eprintln!("Crawling...");
let instance_details = start_crawl(start_instances, exclude, max_crawl_depth).await?; let instance_details = start_crawl(
params.start_instances,
params.exclude_instances,
params.max_crawl_distance,
)
.await?;
let total_stats = aggregate(instance_details); let total_stats = aggregate(instance_details);
println!("{}", serde_json::to_string_pretty(&total_stats)?); println!("{}", serde_json::to_string_pretty(&total_stats)?);
Ok(()) Ok(())
} }
// TODO: Lemmy already stores these numbers in SiteAggregates; it would be good to simply use that
// struct as a single member (to avoid many separate members), but SiteAggregates also has `id`
// and `site_id` fields.
#[derive(Serialize)] #[derive(Serialize)]
struct TotalStats { struct TotalStats {
crawled_instances: i32, crawled_instances: i32,