lemmy-stats-crawler/src/crawl.rs

107 lines
3.6 KiB
Rust
Raw Normal View History

2021-03-11 13:34:44 +00:00
use crate::federated_instances::GetSiteResponse;
use crate::node_info::NodeInfo;
2021-03-15 21:19:48 +00:00
use crate::REQUEST_TIMEOUT;
use anyhow::Error;
use futures::try_join;
2021-03-11 13:34:44 +00:00
use reqwest::Client;
use serde::Serialize;
use std::collections::VecDeque;
2021-03-15 21:19:48 +00:00
pub async fn crawl(
start_instances: Vec<String>,
max_depth: i32,
2021-03-17 02:04:57 +00:00
) -> Result<(Vec<InstanceDetails>, i32), Error> {
2021-03-15 21:19:48 +00:00
let mut pending_instances: VecDeque<CrawlInstance> = start_instances
.iter()
.map(|s| CrawlInstance::new(s.to_string(), 0))
.collect();
let mut crawled_instances = vec![];
let mut instance_details = vec![];
2021-03-17 02:04:57 +00:00
let mut failed_instances = 0;
2021-03-15 21:19:48 +00:00
while let Some(current_instance) = pending_instances.pop_back() {
crawled_instances.push(current_instance.domain.clone());
if current_instance.depth > max_depth {
continue;
}
match fetch_instance_details(&current_instance.domain).await {
Ok(details) => {
instance_details.push(details.to_owned());
for i in details.linked_instances {
let is_in_crawled = crawled_instances.contains(&i);
let is_in_pending = pending_instances.iter().any(|p| p.domain == i);
if !is_in_crawled && !is_in_pending {
let ci = CrawlInstance::new(i, current_instance.depth + 1);
pending_instances.push_back(ci);
}
}
}
2021-03-17 02:04:57 +00:00
Err(e) => {
failed_instances += 1;
eprintln!("Failed to crawl {}: {}", current_instance.domain, e)
2021-03-17 02:09:21 +00:00
}
2021-03-11 13:34:44 +00:00
}
}
2021-03-17 02:04:57 +00:00
Ok((instance_details, failed_instances))
2021-03-11 13:34:44 +00:00
}
#[derive(Serialize, Clone)]
pub struct InstanceDetails {
2021-03-15 21:19:48 +00:00
pub domain: String,
pub name: String,
pub version: String,
pub icon: Option<String>,
pub online_users: i32,
pub total_users: i64,
pub users_active_halfyear: i64,
pub users_active_month: i64,
pub open_registrations: bool,
pub linked_instances_count: i32,
// The following fields are only used for aggregation, but not shown in output
#[serde(skip)]
pub linked_instances: Vec<String>,
}
/// An instance queued for crawling, together with the depth it was queued at.
#[derive(Debug)]
struct CrawlInstance {
    /// Domain of the instance to fetch.
    domain: String,
    /// Depth assigned when the instance was queued.
    depth: i32,
}

impl CrawlInstance {
    /// Creates a queue entry for `domain` at the given `depth`.
    pub fn new(domain: String, depth: i32) -> Self {
        CrawlInstance { domain, depth }
    }
}
async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error> {
2021-03-15 21:19:48 +00:00
let client = Client::default();
2021-03-11 13:34:44 +00:00
2021-03-15 21:19:48 +00:00
let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain);
let node_info_request = client.get(&node_info_url).timeout(REQUEST_TIMEOUT).send();
2021-03-11 13:34:44 +00:00
2021-03-15 21:19:48 +00:00
let site_info_url = format!("https://{}/api/v2/site", domain);
let site_info_request = client.get(&site_info_url).timeout(REQUEST_TIMEOUT).send();
2021-03-11 13:34:44 +00:00
2021-03-15 21:19:48 +00:00
let (node_info, site_info) = try_join!(node_info_request, site_info_request)?;
let node_info: NodeInfo = node_info.json().await?;
let site_info: GetSiteResponse = site_info.json().await?;
2021-03-11 13:34:44 +00:00
2021-03-15 21:19:48 +00:00
let linked_instances = site_info
.federated_instances
.map(|f| f.linked)
2021-03-17 02:09:21 +00:00
.unwrap_or_default();
2021-03-15 21:19:48 +00:00
Ok(InstanceDetails {
domain: domain.to_owned(),
name: site_info.site_view.site.name,
version: node_info.software.version,
icon: site_info.site_view.site.icon,
online_users: site_info.online as i32,
total_users: node_info.usage.users.total,
users_active_halfyear: node_info.usage.users.active_halfyear,
users_active_month: node_info.usage.users.active_month,
open_registrations: node_info.open_registrations,
linked_instances_count: linked_instances.len() as i32,
linked_instances,
})
2021-03-11 13:34:44 +00:00
}