From f01e077020702ed4aea93bd415235ac977aa37da Mon Sep 17 00:00:00 2001
From: Felix Ableitner
Date: Mon, 15 Mar 2021 22:19:48 +0100
Subject: [PATCH] Add max depth parameter for crawl

---
 src/crawl.rs | 139 ++++++++++++++++++++++++++++++---------------------
 src/lib.rs   |   3 +-
 src/main.rs  |   8 +--
 3 files changed, 87 insertions(+), 63 deletions(-)

diff --git a/src/crawl.rs b/src/crawl.rs
index 715dde9..8f5937e 100644
--- a/src/crawl.rs
+++ b/src/crawl.rs
@@ -1,79 +1,102 @@
-use anyhow::Error;
-use futures::try_join;
 use crate::federated_instances::GetSiteResponse;
 use crate::node_info::NodeInfo;
+use crate::REQUEST_TIMEOUT;
+use anyhow::Error;
+use futures::try_join;
 use reqwest::Client;
 use serde::Serialize;
 use std::collections::VecDeque;
-use crate::REQUEST_TIMEOUT;
 
-pub async fn crawl(start_instances: Vec<String>) -> Result<Vec<InstanceDetails>, Error> {
-  let mut pending_instances = VecDeque::from(start_instances);
-  let mut crawled_instances = vec![];
-  let mut instance_details = vec![];
-  while let Some(current_instance) = pending_instances.pop_back() {
-    crawled_instances.push(current_instance.clone());
-    match fetch_instance_details(&current_instance).await {
-      Ok(details) => {
-        instance_details.push(details.to_owned());
-        for i in details.linked_instances {
-          if !crawled_instances.contains(&i) && !pending_instances.contains(&i) {
-            pending_instances.push_back(i);
-          }
+pub async fn crawl(
+    start_instances: Vec<String>,
+    max_depth: i32,
+) -> Result<Vec<InstanceDetails>, Error> {
+    let mut pending_instances: VecDeque<CrawlInstance> = start_instances
+        .iter()
+        .map(|s| CrawlInstance::new(s.to_string(), 0))
+        .collect();
+    let mut crawled_instances = vec![];
+    let mut instance_details = vec![];
+    while let Some(current_instance) = pending_instances.pop_back() {
+        crawled_instances.push(current_instance.domain.clone());
+        if current_instance.depth > max_depth {
+            continue;
+        }
+        match fetch_instance_details(&current_instance.domain).await {
+            Ok(details) => {
+                instance_details.push(details.to_owned());
+                for i in details.linked_instances {
+                    let is_in_crawled = crawled_instances.contains(&i);
+                    let is_in_pending = pending_instances.iter().any(|p| p.domain == i);
+                    if !is_in_crawled && !is_in_pending {
+                        let ci = CrawlInstance::new(i, current_instance.depth + 1);
+                        pending_instances.push_back(ci);
+                    }
+                }
+            }
+            Err(e) => eprintln!("Failed to crawl {}: {}", current_instance.domain, e),
         }
-      }
-      Err(e) => eprintln!("Failed to crawl {}: {}", current_instance, e),
     }
-  }
-  Ok(instance_details)
+    Ok(instance_details)
 }
 
 #[derive(Serialize, Clone)]
 pub struct InstanceDetails {
-  pub domain: String,
-  pub name: String,
-  pub version: String,
-  pub icon: Option<String>,
-  pub online_users: i32,
-  pub total_users: i64,
-  pub users_active_halfyear: i64,
-  pub users_active_month: i64,
-  pub open_registrations: bool,
-  pub linked_instances_count: i32,
-  // The following fields are only used for aggregation, but not shown in output
-  #[serde(skip)]
-  pub linked_instances: Vec<String>,
+    pub domain: String,
+    pub name: String,
+    pub version: String,
+    pub icon: Option<String>,
+    pub online_users: i32,
+    pub total_users: i64,
+    pub users_active_halfyear: i64,
+    pub users_active_month: i64,
+    pub open_registrations: bool,
+    pub linked_instances_count: i32,
+    // The following fields are only used for aggregation, but not shown in output
+    #[serde(skip)]
+    pub linked_instances: Vec<String>,
+}
+
+struct CrawlInstance {
+    domain: String,
+    depth: i32,
+}
+
+impl CrawlInstance {
+    pub fn new(domain: String, depth: i32) -> CrawlInstance {
+        CrawlInstance { domain, depth }
+    }
 }
 
 async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error> {
-  let client = Client::default();
+    let client = Client::default();
 
-  let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain);
-  let node_info_request = client.get(&node_info_url).timeout(REQUEST_TIMEOUT).send();
+    let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain);
+    let node_info_request = client.get(&node_info_url).timeout(REQUEST_TIMEOUT).send();
 
-  let site_info_url = format!("https://{}/api/v2/site", domain);
-  let site_info_request = client.get(&site_info_url).timeout(REQUEST_TIMEOUT).send();
+    let site_info_url = format!("https://{}/api/v2/site", domain);
+    let site_info_request = client.get(&site_info_url).timeout(REQUEST_TIMEOUT).send();
 
-  let (node_info, site_info) = try_join!(node_info_request, site_info_request)?;
-  let node_info: NodeInfo = node_info.json().await?;
-  let site_info: GetSiteResponse = site_info.json().await?;
+    let (node_info, site_info) = try_join!(node_info_request, site_info_request)?;
+    let node_info: NodeInfo = node_info.json().await?;
+    let site_info: GetSiteResponse = site_info.json().await?;
 
-  let linked_instances = site_info
-    .federated_instances
-    .map(|f| f.linked)
-    .unwrap_or(vec![]);
-  Ok(InstanceDetails {
-    domain: domain.to_owned(),
-    name: site_info.site_view.site.name,
-    version: node_info.software.version,
-    icon: site_info.site_view.site.icon,
-    online_users: site_info.online as i32,
-    total_users: node_info.usage.users.total,
-    users_active_halfyear: node_info.usage.users.active_halfyear,
-    users_active_month: node_info.usage.users.active_month,
-    open_registrations: node_info.open_registrations,
-    linked_instances_count: linked_instances.len() as i32,
-    linked_instances,
-  })
+    let linked_instances = site_info
+        .federated_instances
+        .map(|f| f.linked)
+        .unwrap_or(vec![]);
+    Ok(InstanceDetails {
+        domain: domain.to_owned(),
+        name: site_info.site_view.site.name,
+        version: node_info.software.version,
+        icon: site_info.site_view.site.icon,
+        online_users: site_info.online as i32,
+        total_users: node_info.usage.users.total,
+        users_active_halfyear: node_info.usage.users.active_halfyear,
+        users_active_month: node_info.usage.users.active_month,
+        open_registrations: node_info.open_registrations,
+        linked_instances_count: linked_instances.len() as i32,
+        linked_instances,
+    })
 }
diff --git a/src/lib.rs b/src/lib.rs
index cd76424..0fcb565 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,8 +1,9 @@
 use std::time::Duration;
 
+pub mod crawl;
 pub mod federated_instances;
 pub mod node_info;
-pub mod crawl;
 
 pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
 pub const START_INSTANCES: [&'static str; 1] = ["lemmy.ml"];
+pub const MAX_CRAWL_DEPTH: i32 = 2;
diff --git a/src/main.rs b/src/main.rs
index c84e127..8800a2c 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,14 +1,14 @@
 use anyhow::Error;
-use serde::Serialize;
-use lemmy_stats_crawler::START_INSTANCES;
 use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
+use lemmy_stats_crawler::{MAX_CRAWL_DEPTH, START_INSTANCES};
+use serde::Serialize;
 
 #[tokio::main]
 pub async fn main() -> Result<(), Error> {
   let start_instances = START_INSTANCES.iter().map(|s| s.to_string()).collect();
 
   eprintln!("Crawling...");
-  let instance_details = crawl(start_instances).await?;
+  let instance_details = crawl(start_instances, MAX_CRAWL_DEPTH).await?;
   let total_stats = aggregate(instance_details);
 
   println!("{}", serde_json::to_string(&total_stats)?);
@@ -38,4 +38,4 @@ fn aggregate(instance_details: Vec<InstanceDetails>) -> TotalStats {
     total_online_users,
     instance_details,
   }
-}
\ No newline at end of file
+}
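
Note (appended for this write-up, not part of the patch): `crawl` pops and
pushes at the back of `pending_instances`, so the traversal is effectively
depth-first, and an instance discovered past `max_depth` is still recorded in
`crawled_instances` (which stops it from being re-queued) but is skipped
before any fetch happens. The standalone sketch below reproduces that queue
discipline with a static link map standing in for `fetch_instance_details`;
the helper `crawl_order` and the link-map shape are hypothetical, made up for
illustration.

use std::collections::VecDeque;

// Simplified stand-in for the patch's `CrawlInstance`: a domain plus the
// depth at which it was first discovered.
struct CrawlInstance {
    domain: String,
    depth: i32,
}

// Hypothetical helper mirroring the patched `crawl` loop, minus the network
// I/O. Returns the domains that would actually be fetched.
fn crawl_order(start: &str, links: &[(&str, &[&str])], max_depth: i32) -> Vec<String> {
    let mut pending: VecDeque<CrawlInstance> = VecDeque::new();
    pending.push_back(CrawlInstance {
        domain: start.to_string(),
        depth: 0,
    });
    let mut crawled: Vec<String> = vec![];
    let mut fetched: Vec<String> = vec![];
    while let Some(current) = pending.pop_back() {
        crawled.push(current.domain.clone());
        if current.depth > max_depth {
            continue; // discovered and marked as crawled, but never fetched
        }
        fetched.push(current.domain.clone());
        // Static stand-in for fetch_instance_details: look up linked instances.
        let linked = links
            .iter()
            .find(|(domain, _)| *domain == current.domain)
            .map(|(_, linked)| *linked)
            .unwrap_or(&[]);
        for i in linked {
            let seen = crawled.iter().any(|c| c == i) || pending.iter().any(|p| p.domain == *i);
            if !seen {
                pending.push_back(CrawlInstance {
                    domain: i.to_string(),
                    depth: current.depth + 1,
                });
            }
        }
    }
    fetched
}

fn main() {
    // a -> b -> c -> d: with max_depth = 2, "d" is discovered at depth 3,
    // marked as crawled, and skipped.
    let links: &[(&str, &[&str])] = &[("a", &["b"]), ("b", &["c"]), ("c", &["d"])];
    assert_eq!(crawl_order("a", links, 2), ["a", "b", "c"]);
}

With MAX_CRAWL_DEPTH = 2 in lib.rs, the crawler therefore fetches the seed
instances, their linked instances, and the instances linked from those, while
anything three hops out is only recorded as seen.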