Recursive, parallel crawl

This commit is contained in:
Felix Ableitner 2022-05-10 02:52:48 +02:00
parent 2e2a4888d0
commit cb5f20f397
3 changed files with 117 additions and 68 deletions

13
Cargo.lock generated
View file

@ -220,6 +220,17 @@ dependencies = [
"event-listener", "event-listener",
] ]
[[package]]
name = "async-recursion"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2cda8f4bcc10624c4e85bc66b3f452cca98cfa5ca002dc83a16aad2367641bea"
dependencies = [
"proc-macro2 1.0.37",
"quote 1.0.18",
"syn 1.0.92",
]
[[package]] [[package]]
name = "async-trait" name = "async-trait"
version = "0.1.53" version = "0.1.53"
@ -1295,9 +1306,11 @@ name = "lemmy-stats-crawler"
version = "0.1.0" version = "0.1.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"async-recursion",
"clap", "clap",
"futures", "futures",
"lemmy_api_common", "lemmy_api_common",
"log",
"once_cell", "once_cell",
"reqwest", "reqwest",
"semver", "semver",

View file

@ -15,3 +15,5 @@ clap = "3.1.15"
semver = "1.0.9" semver = "1.0.9"
once_cell = "1.10.0" once_cell = "1.10.0"
lemmy_api_common = "0.16.0" lemmy_api_common = "0.16.0"
async-recursion = "1.0.0"
log = "0.4.17"

View file

@ -1,56 +1,46 @@
use crate::REQUEST_TIMEOUT; use crate::REQUEST_TIMEOUT;
use anyhow::anyhow;
use anyhow::Error; use anyhow::Error;
use async_recursion::async_recursion;
use futures::future::try_join_all;
use lemmy_api_common::site::GetSiteResponse; use lemmy_api_common::site::GetSiteResponse;
use log::info;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use reqwest::Client; use reqwest::Client;
use semver::Version; use semver::Version;
use serde::Serialize; use serde::Serialize;
use std::collections::VecDeque; use std::ops::Deref;
use std::sync::Arc;
use tokio::sync::Mutex;
static CLIENT: Lazy<Client> = Lazy::new(Client::default); static CLIENT: Lazy<Client> = Lazy::new(Client::default);
pub async fn crawl( pub async fn crawl(
start_instances: Vec<String>, start_instances: Vec<String>,
exclude: Vec<String>, exclude_domains: Vec<String>,
max_depth: i32, max_depth: i32,
) -> Result<(Vec<InstanceDetails>, i32), Error> { ) -> Result<(Vec<InstanceDetails>, i32), Error> {
let mut pending_instances: VecDeque<CrawlInstance> = start_instances let params = Arc::new(CrawlParams {
.iter() min_lemmy_version: min_lemmy_version().await?,
.map(|s| CrawlInstance::new(s.to_string(), 0)) exclude_domains,
.collect(); max_depth,
let min_lemmy_version = min_lemmy_version().await?; });
let mut crawled_instances = vec![]; let crawled_instances = Arc::new(Mutex::new(vec![]));
let mut instance_details = vec![]; let mut jobs = vec![];
let mut failed_instances = 0; for domain in start_instances.into_iter() {
while let Some(current_instance) = pending_instances.pop_back() { let job = CrawlJob {
crawled_instances.push(current_instance.domain.clone()); domain,
if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) { current_depth: 0,
continue; params: params.clone(),
} crawled_instances: crawled_instances.clone(),
match fetch_instance_details(&current_instance.domain, &min_lemmy_version).await { };
Ok(details) => { jobs.push(job.crawl());
if let Some(federated) = &details.site_info.federated_instances.as_ref() {
for i in &federated.linked {
let is_in_crawled = crawled_instances.contains(i);
let is_in_pending = pending_instances.iter().any(|p| &p.domain == i);
if !is_in_crawled && !is_in_pending {
let ci = CrawlInstance::new(i.clone(), current_instance.depth + 1);
pending_instances.push_back(ci);
}
}
}
instance_details.push(details);
}
Err(e) => {
failed_instances += 1;
eprintln!("Failed to crawl {}: {}", current_instance.domain, e)
}
}
} }
let mut instance_details: Vec<InstanceDetails> =
try_join_all(jobs).await?.into_iter().flatten().collect();
// Sort by active monthly users descending // Sort by active monthly users descending
instance_details.sort_by_key(|i| { instance_details.sort_unstable_by_key(|i| {
i.site_info i.site_info
.site_view .site_view
.as_ref() .as_ref()
@ -59,7 +49,7 @@ pub async fn crawl(
}); });
instance_details.reverse(); instance_details.reverse();
Ok((instance_details, failed_instances)) Ok((instance_details, 0))
} }
#[derive(Serialize, Debug)] #[derive(Serialize, Debug)]
@ -68,41 +58,85 @@ pub struct InstanceDetails {
pub site_info: GetSiteResponse, pub site_info: GetSiteResponse,
} }
struct CrawlInstance { struct CrawlParams {
min_lemmy_version: Version,
exclude_domains: Vec<String>,
max_depth: i32,
}
struct CrawlJob {
domain: String, domain: String,
depth: i32, current_depth: i32,
params: Arc<CrawlParams>,
crawled_instances: Arc<Mutex<Vec<String>>>,
} }
impl CrawlInstance { impl CrawlJob {
pub fn new(domain: String, depth: i32) -> CrawlInstance { #[async_recursion]
CrawlInstance { domain, depth } pub async fn crawl(self) -> Result<Vec<InstanceDetails>, Error> {
// need to acquire and release mutex before recursing, otherwise it will deadlock
{
let mut crawled_instances = self.crawled_instances.deref().lock().await;
if crawled_instances.contains(&self.domain) {
return Ok(vec![]);
} else {
crawled_instances.push(self.domain.clone());
}
} }
}
async fn fetch_instance_details( if self.current_depth > self.params.max_depth
domain: &str, || self.params.exclude_domains.contains(&self.domain)
min_lemmy_version: &Version, {
) -> Result<InstanceDetails, Error> { return Ok(vec![]);
let client = Client::default(); }
info!("Starting crawl for {}", &self.domain);
let site_info_url = format!("https://{}/api/v3/site", domain); let site_info_url = format!("https://{}/api/v3/site", &self.domain);
let site_info = client let site_info = CLIENT
.get(&site_info_url) .get(&site_info_url)
.timeout(REQUEST_TIMEOUT) .timeout(REQUEST_TIMEOUT)
.send() .send()
.await? .await
.json::<GetSiteResponse>() .ok();
.await?;
let version = Version::parse(&site_info.version)?; if let Some(site_info2) = site_info {
if &version < min_lemmy_version { let site_info3 = site_info2.json::<GetSiteResponse>().await.ok();
return Err(anyhow!("lemmy version is too old ({})", version)); if let Some(site_info4) = site_info3 {
let version = Version::parse(&site_info4.version).ok();
if let Some(version) = version {
if version < self.params.min_lemmy_version {
return Ok(vec![]);
}
} }
Ok(InstanceDetails { let mut result = vec![];
domain: domain.to_owned(), if let Some(federated) = &site_info4.federated_instances {
site_info, for domain in federated.linked.iter() {
}) let crawl_job = CrawlJob {
domain: domain.clone(),
current_depth: self.current_depth + 1,
params: self.params.clone(),
crawled_instances: self.crawled_instances.clone(),
};
result.push(crawl_job.crawl());
}
}
let mut result2: Vec<InstanceDetails> =
try_join_all(result).await?.into_iter().flatten().collect();
info!("Successfully finished crawl for {}", &self.domain);
result2.push(InstanceDetails {
domain: self.domain,
site_info: site_info4,
});
return Ok(result2);
}
}
Ok(vec![])
}
} }
/// calculate minimum allowed lemmy version based on current version. in case of current version /// calculate minimum allowed lemmy version based on current version. in case of current version