diff --git a/src/crawl.rs b/src/crawl.rs
new file mode 100644
index 0000000..54615e2
--- /dev/null
+++ b/src/crawl.rs
@@ -0,0 +1,80 @@
+use anyhow::Error;
+use futures::try_join;
+use crate::federated_instances::GetSiteResponse;
+use crate::node_info::NodeInfo;
+use reqwest::Client;
+use serde::Serialize;
+use std::collections::VecDeque;
+use crate::REQUEST_TIMEOUT;
+
+pub async fn crawl(start_instances: Vec<String>) -> Result<Vec<InstanceDetails>, Error> {
+    let mut pending_instances = VecDeque::from(start_instances);
+    let mut instance_details = vec![];
+    while let Some(current_instance) = pending_instances.pop_back() {
+        match fetch_instance_details(&current_instance).await {
+            Ok(details) => {
+                instance_details.push(details.to_owned());
+                // add all unknown, linked instances to pending
+                let crawled_instances: &Vec<&str> =
+                    &instance_details.iter().map(|i| i.domain.as_ref()).collect();
+                for i in details.linked_instances {
+                    if !crawled_instances.contains(&&*i) && !pending_instances.contains(&i) {
+                        pending_instances.push_back(i);
+                    }
+                }
+            }
+            Err(e) => eprintln!("Failed to crawl {}: {}", current_instance, e),
+        }
+    }
+
+    Ok(instance_details)
+}
+
+#[derive(Serialize, Clone)]
+pub struct InstanceDetails {
+    pub domain: String,
+    pub name: String,
+    pub version: String,
+    pub icon: Option<String>,
+    pub online_users: i32,
+    pub total_users: i64,
+    pub users_active_halfyear: i64,
+    pub users_active_month: i64,
+    pub open_registrations: bool,
+    pub linked_instances_count: i32,
+    // The following fields are only used for aggregation, but not shown in output
+    #[serde(skip)]
+    pub linked_instances: Vec<String>,
+}
+
+async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error> {
+    let client = Client::default();
+
+    let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain);
+    let node_info_request = client.get(&node_info_url).timeout(REQUEST_TIMEOUT).send();
+
+    let site_info_url = format!("https://{}/api/v2/site", domain);
+    let site_info_request = client.get(&site_info_url).timeout(REQUEST_TIMEOUT).send();
+
+    let (node_info, site_info) = try_join!(node_info_request, site_info_request)?;
+    let node_info: NodeInfo = node_info.json().await?;
+    let site_info: GetSiteResponse = site_info.json().await?;
+
+    let linked_instances = site_info
+        .federated_instances
+        .map(|f| f.linked)
+        .unwrap_or(vec![]);
+    Ok(InstanceDetails {
+        domain: domain.to_owned(),
+        name: site_info.site_view.site.name,
+        version: node_info.version,
+        icon: site_info.site_view.site.icon,
+        online_users: site_info.online as i32,
+        total_users: node_info.usage.users.total,
+        users_active_halfyear: node_info.usage.users.active_halfyear,
+        users_active_month: node_info.usage.users.active_month,
+        open_registrations: node_info.open_registrations,
+        linked_instances_count: linked_instances.len() as i32,
+        linked_instances,
+    })
+}
diff --git a/src/lib.rs b/src/lib.rs
index 34f82e2..cd76424 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,2 +1,8 @@
+use std::time::Duration;
+
 pub mod federated_instances;
 pub mod node_info;
+pub mod crawl;
+
+pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+pub const START_INSTANCES: [&'static str; 1] = ["lemmy.ml"];
diff --git a/src/main.rs b/src/main.rs
index b5ac432..c84e127 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,14 +1,7 @@
 use anyhow::Error;
-use futures::try_join;
-use lemmy_stats_crawler::federated_instances::GetSiteResponse;
-use lemmy_stats_crawler::node_info::NodeInfo;
-use reqwest::Client;
 use serde::Serialize;
-use std::collections::VecDeque;
-use tokio::time::Duration;
-
-const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
-const START_INSTANCES: [&'static str; 1] = ["lemmy.ml"];
+use lemmy_stats_crawler::START_INSTANCES;
+use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
 
 #[tokio::main]
 pub async fn main() -> Result<(), Error> {
@@ -45,76 +38,4 @@ fn aggregate(instance_details: Vec<InstanceDetails>) -> TotalStats {
         total_online_users,
         instance_details,
     }
-}
-
-async fn crawl(start_instances: Vec<String>) -> Result<Vec<InstanceDetails>, Error> {
-    let mut pending_instances = VecDeque::from(start_instances);
-    let mut instance_details = vec![];
-    while let Some(current_instance) = pending_instances.pop_back() {
-        match fetch_instance_details(&current_instance).await {
-            Ok(details) => {
-                instance_details.push(details.to_owned());
-                // add all unknown, linked instances to pending
-                let crawled_instances: &Vec<&str> =
-                    &instance_details.iter().map(|i| i.domain.as_ref()).collect();
-                for i in details.linked_instances {
-                    if !crawled_instances.contains(&&*i) && !pending_instances.contains(&i) {
-                        pending_instances.push_back(i);
-                    }
-                }
-            }
-            Err(e) => eprintln!("Failed to crawl {}: {}", current_instance, e),
-        }
-    }
-
-    Ok(instance_details)
-}
-
-#[derive(Serialize, Clone)]
-struct InstanceDetails {
-    domain: String,
-    name: String,
-    version: String,
-    icon: Option<String>,
-    online_users: i32,
-    total_users: i64,
-    users_active_halfyear: i64,
-    users_active_month: i64,
-    open_registrations: bool,
-    linked_instances_count: i32,
-    // The following fields are only used for aggregation, but not shown in output
-    #[serde(skip)]
-    linked_instances: Vec<String>,
-}
-
-async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error> {
-    let client = Client::default();
-
-    let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain);
-    let node_info_request = client.get(&node_info_url).timeout(REQUEST_TIMEOUT).send();
-
-    let site_info_url = format!("https://{}/api/v2/site", domain);
-    let site_info_request = client.get(&site_info_url).timeout(REQUEST_TIMEOUT).send();
-
-    let (node_info, site_info) = try_join!(node_info_request, site_info_request)?;
-    let node_info: NodeInfo = node_info.json().await?;
-    let site_info: GetSiteResponse = site_info.json().await?;
-
-    let linked_instances = site_info
-        .federated_instances
-        .map(|f| f.linked)
-        .unwrap_or(vec![]);
-    Ok(InstanceDetails {
-        domain: domain.to_owned(),
-        name: site_info.site_view.site.name,
-        version: node_info.version,
-        icon: site_info.site_view.site.icon,
-        online_users: site_info.online as i32,
-        total_users: node_info.usage.users.total,
-        users_active_halfyear: node_info.usage.users.active_halfyear,
-        users_active_month: node_info.usage.users.active_month,
-        open_registrations: node_info.open_registrations,
-        linked_instances_count: linked_instances.len() as i32,
-        linked_instances,
-    })
-}
+}
\ No newline at end of file