Add max depth parameter for crawl

Felix Ableitner 2021-03-15 22:19:48 +01:00
parent 5a09fa46c4
commit f01e077020
3 changed files with 87 additions and 63 deletions

@@ -1,79 +1,102 @@
-use anyhow::Error;
-use futures::try_join;
 use crate::federated_instances::GetSiteResponse;
 use crate::node_info::NodeInfo;
+use crate::REQUEST_TIMEOUT;
+use anyhow::Error;
+use futures::try_join;
 use reqwest::Client;
 use serde::Serialize;
 use std::collections::VecDeque;
-use crate::REQUEST_TIMEOUT;
 
-pub async fn crawl(start_instances: Vec<String>) -> Result<Vec<InstanceDetails>, Error> {
-    let mut pending_instances = VecDeque::from(start_instances);
+pub async fn crawl(
+    start_instances: Vec<String>,
+    max_depth: i32,
+) -> Result<Vec<InstanceDetails>, Error> {
+    let mut pending_instances: VecDeque<CrawlInstance> = start_instances
+        .iter()
+        .map(|s| CrawlInstance::new(s.to_string(), 0))
+        .collect();
     let mut crawled_instances = vec![];
     let mut instance_details = vec![];
     while let Some(current_instance) = pending_instances.pop_back() {
-        crawled_instances.push(current_instance.clone());
-        match fetch_instance_details(&current_instance).await {
+        crawled_instances.push(current_instance.domain.clone());
+        if current_instance.depth > max_depth {
+            continue;
+        }
+        match fetch_instance_details(&current_instance.domain).await {
             Ok(details) => {
                 instance_details.push(details.to_owned());
                 for i in details.linked_instances {
-                    if !crawled_instances.contains(&i) && !pending_instances.contains(&i) {
-                        pending_instances.push_back(i);
+                    let is_in_crawled = crawled_instances.contains(&i);
+                    let is_in_pending = pending_instances.iter().any(|p| p.domain == i);
+                    if !is_in_crawled && !is_in_pending {
+                        let ci = CrawlInstance::new(i, current_instance.depth + 1);
+                        pending_instances.push_back(ci);
                     }
                 }
             }
-            Err(e) => eprintln!("Failed to crawl {}: {}", current_instance, e),
+            Err(e) => eprintln!("Failed to crawl {}: {}", current_instance.domain, e),
         }
     }
 
     Ok(instance_details)
 }
 
 #[derive(Serialize, Clone)]
 pub struct InstanceDetails {
     pub domain: String,
     pub name: String,
     pub version: String,
     pub icon: Option<String>,
     pub online_users: i32,
     pub total_users: i64,
     pub users_active_halfyear: i64,
     pub users_active_month: i64,
     pub open_registrations: bool,
     pub linked_instances_count: i32,
     // The following fields are only used for aggregation, but not shown in output
     #[serde(skip)]
     pub linked_instances: Vec<String>,
 }
 
+struct CrawlInstance {
+    domain: String,
+    depth: i32,
+}
+
+impl CrawlInstance {
+    pub fn new(domain: String, depth: i32) -> CrawlInstance {
+        CrawlInstance { domain, depth }
+    }
+}
+
 async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error> {
     let client = Client::default();
 
     let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain);
     let node_info_request = client.get(&node_info_url).timeout(REQUEST_TIMEOUT).send();
 
     let site_info_url = format!("https://{}/api/v2/site", domain);
     let site_info_request = client.get(&site_info_url).timeout(REQUEST_TIMEOUT).send();
 
     let (node_info, site_info) = try_join!(node_info_request, site_info_request)?;
     let node_info: NodeInfo = node_info.json().await?;
     let site_info: GetSiteResponse = site_info.json().await?;
 
     let linked_instances = site_info
         .federated_instances
         .map(|f| f.linked)
         .unwrap_or(vec![]);
 
     Ok(InstanceDetails {
         domain: domain.to_owned(),
         name: site_info.site_view.site.name,
         version: node_info.software.version,
         icon: site_info.site_view.site.icon,
         online_users: site_info.online as i32,
         total_users: node_info.usage.users.total,
         users_active_halfyear: node_info.usage.users.active_halfyear,
         users_active_month: node_info.usage.users.active_month,
         open_registrations: node_info.open_registrations,
         linked_instances_count: linked_instances.len() as i32,
         linked_instances,
     })
 }
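The depth-limited traversal introduced above can be sketched in isolation. The snippet below is a minimal, hypothetical reconstruction, not part of the commit: the Node struct and the links closure stand in for CrawlInstance and the linked_instances returned by fetch_instance_details. Start domains enter the queue at depth 0, every newly discovered domain gets its parent's depth plus one, and anything past max_depth is recorded but not expanded.

use std::collections::VecDeque;

// Hypothetical stand-in for CrawlInstance: a domain plus the depth at which it was discovered.
struct Node {
    domain: String,
    depth: i32,
}

// `links` is a hypothetical lookup standing in for the linked instances returned by the network fetch.
fn traverse(start: Vec<String>, max_depth: i32, links: impl Fn(&str) -> Vec<String>) -> Vec<String> {
    // Seed the queue at depth 0, as crawl() does with CrawlInstance::new(s.to_string(), 0).
    let mut pending: VecDeque<Node> = start
        .into_iter()
        .map(|domain| Node { domain, depth: 0 })
        .collect();
    let mut seen = vec![];
    while let Some(current) = pending.pop_back() {
        seen.push(current.domain.clone());
        // Domains discovered beyond max_depth are recorded but never expanded further.
        if current.depth > max_depth {
            continue;
        }
        for linked in links(&current.domain) {
            let already_seen = seen.contains(&linked);
            let already_pending = pending.iter().any(|p| p.domain == linked);
            if !already_seen && !already_pending {
                pending.push_back(Node {
                    domain: linked,
                    depth: current.depth + 1,
                });
            }
        }
    }
    seen
}

fn main() {
    // Toy graph: "a" links to "b", everything else is a leaf.
    let links = |domain: &str| match domain {
        "a" => vec!["b".to_string()],
        _ => vec![],
    };
    let reached = traverse(vec!["a".to_string()], 2, links);
    assert_eq!(reached, vec!["a".to_string(), "b".to_string()]);
}

In the actual crawl(), domains discovered more than max_depth hops from a start instance are still queued and marked as crawled once popped, but they are never fetched and their own linked instances are never added.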

@@ -1,8 +1,9 @@
 use std::time::Duration;
 
+pub mod crawl;
 pub mod federated_instances;
 pub mod node_info;
-pub mod crawl;
 
 pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
 pub const START_INSTANCES: [&'static str; 1] = ["lemmy.ml"];
+pub const MAX_CRAWL_DEPTH: i32 = 2;

@@ -1,14 +1,14 @@
 use anyhow::Error;
-use serde::Serialize;
-use lemmy_stats_crawler::START_INSTANCES;
 use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
+use lemmy_stats_crawler::{MAX_CRAWL_DEPTH, START_INSTANCES};
+use serde::Serialize;
 
 #[tokio::main]
 pub async fn main() -> Result<(), Error> {
     let start_instances = START_INSTANCES.iter().map(|s| s.to_string()).collect();
     eprintln!("Crawling...");
-    let instance_details = crawl(start_instances).await?;
+    let instance_details = crawl(start_instances, MAX_CRAWL_DEPTH).await?;
     let total_stats = aggregate(instance_details);
     println!("{}", serde_json::to_string(&total_stats)?);