Also calculate counts including federated instances

This commit is contained in:
Felix Ableitner 2022-12-05 16:01:37 +01:00
parent 1343932b2b
commit ea57a16987
6 changed files with 186 additions and 50 deletions

1
Cargo.lock generated
View file

@ -1380,6 +1380,7 @@ dependencies = [
"derive-new", "derive-new",
"futures", "futures",
"lemmy_api_common", "lemmy_api_common",
"lemmy_db_schema",
"log", "log",
"once_cell", "once_cell",
"reqwest", "reqwest",

View file

@ -14,6 +14,7 @@ serde_json = "1.0.89"
semver = "1.0.14" semver = "1.0.14"
once_cell = "1.16.0" once_cell = "1.16.0"
lemmy_api_common = "=0.16.0" lemmy_api_common = "=0.16.0"
lemmy_db_schema = "=0.16.0"
async-recursion = "1.0.0" async-recursion = "1.0.0"
log = "0.4.17" log = "0.4.17"
derive-new = "0.5.9" derive-new = "0.5.9"

View file

@ -1,11 +1,12 @@
use crate::node_info::{NodeInfo, NodeInfoWellKnown};
use crate::CLIENT; use crate::CLIENT;
use anyhow::Error; use anyhow::{anyhow, Error};
use async_recursion::async_recursion; use async_recursion::async_recursion;
use futures::future::join_all; use futures::future::join_all;
use lemmy_api_common::site::GetSiteResponse; use lemmy_api_common::site::GetSiteResponse;
use log::debug; use log::debug;
use reqwest::Url;
use semver::Version; use semver::Version;
use serde::Serialize;
use std::collections::HashSet; use std::collections::HashSet;
use std::ops::Deref; use std::ops::Deref;
use std::sync::Arc; use std::sync::Arc;
@ -26,15 +27,16 @@ pub struct CrawlParams {
crawled_instances: Arc<Mutex<HashSet<String>>>, crawled_instances: Arc<Mutex<HashSet<String>>>,
} }
#[derive(Serialize, Debug)] #[derive(Debug)]
pub struct InstanceDetails { pub struct CrawlResult {
pub domain: String, pub domain: String,
pub site_info: GetSiteResponse, pub node_info: NodeInfo,
pub site_info: Option<GetSiteResponse>,
} }
impl CrawlJob { impl CrawlJob {
#[async_recursion] #[async_recursion]
pub async fn crawl(self) -> Vec<Result<InstanceDetails, Error>> { pub async fn crawl(self) -> Vec<Result<CrawlResult, Error>> {
// need to acquire and release mutex before recursing, otherwise it will deadlock // need to acquire and release mutex before recursing, otherwise it will deadlock
{ {
let mut crawled_instances = self.params.crawled_instances.deref().lock().await; let mut crawled_instances = self.params.crawled_instances.deref().lock().await;
@ -55,47 +57,79 @@ impl CrawlJob {
"Starting crawl for {}, distance {}", "Starting crawl for {}, distance {}",
&self.domain, &self.current_distance &self.domain, &self.current_distance
); );
let site_info = match self.fetch_instance_details().await { let (node_info, site_info) = match self.fetch_instance_details().await {
Ok(o) => o, Ok(o) => o,
Err(e) => return vec![Err(e)], Err(e) => return vec![Err(e)],
}; };
let mut crawl_result = CrawlResult {
domain: self.domain.clone(),
node_info,
site_info: None,
};
if site_info.1 < self.params.min_lemmy_version { if let Some(site_info) = site_info {
return vec![]; match Version::parse(&site_info.version) {
} Ok(version) => {
if version < self.params.min_lemmy_version {
let mut result = vec![]; return vec![Ok(crawl_result)];
if let Some(federated) = &site_info.0.federated_instances { }
for domain in federated.linked.iter() { }
let crawl_job = CrawlJob::new( Err(e) => return vec![Err(e.into())],
domain.clone(),
self.current_distance + 1,
self.params.clone(),
);
result.push(crawl_job.crawl());
} }
let mut result = vec![];
if let Some(federated) = &site_info.federated_instances {
for domain in federated.linked.iter() {
let crawl_job = CrawlJob::new(
domain.clone(),
self.current_distance + 1,
self.params.clone(),
);
result.push(crawl_job.crawl());
}
}
let mut result2: Vec<Result<CrawlResult, Error>> =
join_all(result).await.into_iter().flatten().collect();
debug!("Successfully finished crawl for {}", &self.domain);
crawl_result.site_info = Some(site_info);
result2.push(Ok(crawl_result));
result2
} else {
vec![Ok(crawl_result)]
} }
let mut result2: Vec<Result<InstanceDetails, Error>> =
join_all(result).await.into_iter().flatten().collect();
debug!("Successfully finished crawl for {}", &self.domain);
result2.push(Ok(InstanceDetails {
domain: self.domain,
site_info: site_info.0,
}));
result2
} }
async fn fetch_instance_details(&self) -> Result<(GetSiteResponse, Version), Error> { async fn fetch_instance_details(&self) -> Result<(NodeInfo, Option<GetSiteResponse>), Error> {
let site_info_url = format!("https://{}/api/v3/site", &self.domain); let rel_node_info: Url = Url::parse("http://nodeinfo.diaspora.software/ns/schema/2.0")
.expect("parse nodeinfo relation url");
let node_info_well_known = CLIENT
.get(&format!("https://{}/.well-known/nodeinfo", &self.domain))
.send()
.await?
.json::<NodeInfoWellKnown>()
.await?;
let node_info_url = node_info_well_known
.links
.into_iter()
.find(|l| l.rel == rel_node_info)
.ok_or_else(|| anyhow!("failed to find nodeinfo link for {}", &self.domain))?
.href;
let node_info = CLIENT
.get(node_info_url)
.send()
.await?
.json::<NodeInfo>()
.await?;
let site_info = CLIENT let site_info = CLIENT
.get(&site_info_url) .get(&format!("https://{}/api/v3/site", &self.domain))
.send() .send()
.await? .await?
.json::<GetSiteResponse>() .json::<GetSiteResponse>()
.await?; .await
let version = Version::parse(&site_info.version)?; .ok();
Ok((site_info, version)) Ok((node_info, site_info))
} }
} }

View file

@ -1,19 +1,23 @@
#[macro_use] #[macro_use]
extern crate derive_new; extern crate derive_new;
use crate::crawl::{CrawlJob, CrawlParams, InstanceDetails}; use crate::crawl::{CrawlJob, CrawlParams, CrawlResult};
use crate::node_info::{NodeInfo, NodeInfoUsage, NodeInfoUsers};
use anyhow::Error; use anyhow::Error;
use futures::future::join_all; use futures::future::join_all;
use lemmy_api_common::site::GetSiteResponse;
use log::warn; use log::warn;
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use reqwest::{Client, ClientBuilder}; use reqwest::{Client, ClientBuilder};
use semver::Version; use semver::Version;
use serde::Serialize;
use std::collections::HashSet; use std::collections::HashSet;
use std::sync::Arc; use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use tokio::sync::Mutex; use tokio::sync::Mutex;
pub mod crawl; pub mod crawl;
mod node_info;
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
@ -22,14 +26,21 @@ static CLIENT: Lazy<Client> = Lazy::new(|| {
.timeout(REQUEST_TIMEOUT) .timeout(REQUEST_TIMEOUT)
.user_agent("lemmy-stats-crawler") .user_agent("lemmy-stats-crawler")
.build() .build()
.unwrap() .expect("build reqwest client")
}); });
/// Serializable per-instance entry of the final crawl output.
///
/// Built by `calculate_federated_site_aggregates` from the raw crawl results; only instances
/// whose Lemmy site info could be fetched get an entry.
#[derive(Serialize, Debug)]
pub struct CrawlResult2 {
/// Domain the instance was crawled under.
pub domain: String,
/// Response of the instance's `/api/v3/site` endpoint.
pub site_info: GetSiteResponse,
/// Nodeinfo usage summed over this instance and every crawled instance it links to.
/// `None` when the site info carries no `federated_instances`.
pub federated_counts: Option<NodeInfoUsage>,
}
pub async fn start_crawl( pub async fn start_crawl(
start_instances: Vec<String>, start_instances: Vec<String>,
exclude_domains: Vec<String>, exclude_domains: Vec<String>,
max_distance: i32, max_distance: i32,
) -> Result<Vec<InstanceDetails>, Error> { ) -> Result<Vec<CrawlResult2>, Error> {
let params = Arc::new(CrawlParams::new( let params = Arc::new(CrawlParams::new(
min_lemmy_version().await?, min_lemmy_version().await?,
exclude_domains, exclude_domains,
@ -42,8 +53,7 @@ pub async fn start_crawl(
jobs.push(job.crawl()); jobs.push(job.crawl());
} }
// TODO: log the errors let crawl_results: Vec<CrawlResult> = join_all(jobs)
let mut instance_details: Vec<InstanceDetails> = join_all(jobs)
.await .await
.into_iter() .into_iter()
.flatten() .flatten()
@ -52,20 +62,20 @@ pub async fn start_crawl(
warn!("{}", e) warn!("{}", e)
} }
}) })
.filter_map(|r| r.ok()) .filter_map(Result::ok)
.collect(); .collect();
let mut crawl_results = calculate_federated_site_aggregates(crawl_results)?;
// Sort by active monthly users descending // Sort by active monthly users descending
instance_details.sort_unstable_by_key(|i| { crawl_results.sort_unstable_by_key(|i| {
i.site_info i.site_info
.site_view .site_view
.as_ref() .as_ref()
.map(|s| s.counts.users_active_month) .map(|s| s.counts.users_active_month)
.unwrap_or(0) .unwrap_or(0)
}); });
instance_details.reverse(); crawl_results.reverse();
Ok(crawl_results)
Ok(instance_details)
} }
/// calculate minimum allowed lemmy version based on current version. in case of current version /// calculate minimum allowed lemmy version based on current version. in case of current version
@ -82,3 +92,47 @@ async fn min_lemmy_version() -> Result<Version, Error> {
version.minor -= 1; version.minor -= 1;
Ok(version) Ok(version)
} }
fn calculate_federated_site_aggregates(
crawl_results: Vec<CrawlResult>,
) -> Result<Vec<CrawlResult2>, Error> {
let node_info: Vec<(String, NodeInfo)> = crawl_results
.iter()
.map(|c| (c.domain.clone(), c.node_info.clone()))
.collect();
let lemmy_instances: Vec<(String, GetSiteResponse)> = crawl_results
.into_iter()
.filter_map(|c| {
let domain = c.domain;
c.site_info.map(|c2| (domain, c2))
})
.collect();
let mut ret = vec![];
for instance in &lemmy_instances {
let federated_counts = if let Some(federated_instances) = &instance.1.federated_instances {
node_info
.iter()
.filter(|i| federated_instances.linked.contains(&i.0) || i.0 == instance.0)
.map(|i| i.1.usage.clone())
.reduce(|a, b| NodeInfoUsage {
users: NodeInfoUsers {
total: a.users.total + b.users.total,
active_halfyear: a.users.active_halfyear + b.users.active_halfyear,
active_month: a.users.active_month + b.users.active_month,
},
posts: a.posts + b.posts,
comments: a.comments + b.comments,
})
} else {
None
};
// TODO: workaround because GetSiteResponse doesnt implement clone
let site_info = serde_json::from_str(&serde_json::to_string(&instance.1)?)?;
ret.push(CrawlResult2 {
domain: instance.0.clone(),
site_info,
federated_counts,
});
}
Ok(ret)
}

View file

@ -1,6 +1,5 @@
use anyhow::Error; use anyhow::Error;
use lemmy_stats_crawler::crawl::InstanceDetails; use lemmy_stats_crawler::{start_crawl, CrawlResult2};
use lemmy_stats_crawler::start_crawl;
use serde::Serialize; use serde::Serialize;
use structopt::StructOpt; use structopt::StructOpt;
@ -57,10 +56,10 @@ struct TotalStats {
users_active_week: i64, users_active_week: i64,
users_active_month: i64, users_active_month: i64,
users_active_halfyear: i64, users_active_halfyear: i64,
instance_details: Vec<InstanceDetails>, instance_details: Vec<CrawlResult2>,
} }
fn aggregate(instance_details: Vec<InstanceDetails>) -> TotalStats { fn aggregate(instance_details: Vec<CrawlResult2>) -> TotalStats {
let mut online_users = 0; let mut online_users = 0;
let mut total_users = 0; let mut total_users = 0;
let mut users_active_day = 0; let mut users_active_day = 0;

47
src/node_info.rs Normal file
View file

@ -0,0 +1,47 @@
use reqwest::Url;
use serde::{Deserialize, Serialize};
/// Body of an instance's `/.well-known/nodeinfo` discovery document.
#[derive(Deserialize, Debug)]
pub struct NodeInfoWellKnown {
/// Links advertised by the discovery document, one per relation/target pair.
pub links: Vec<NodeInfoWellKnownLinks>,
}
/// A single entry of the `links` array in the nodeinfo discovery document.
#[derive(Deserialize, Debug)]
pub struct NodeInfoWellKnownLinks {
/// Relation URL identifying what `href` points at; the crawler looks for the
/// `http://nodeinfo.diaspora.software/ns/schema/2.0` relation.
pub rel: Url,
/// URL of the document the relation refers to.
pub href: Url,
}
/// Nodeinfo document of an instance, deserialized from JSON with camelCase keys.
///
/// All fields except `usage`'s contents are required: deserialization fails if any of them
/// is missing from the document.
#[derive(Deserialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct NodeInfo {
/// Value of the document's top-level `version` key — presumably the nodeinfo schema
/// version; confirm against the spec.
pub version: String,
/// Name and version of the software the instance reports.
pub software: NodeInfoSoftware,
/// Entries of the document's `protocols` array.
pub protocols: Vec<String>,
/// Usage counters reported by the instance.
pub usage: NodeInfoUsage,
/// Read from the `openRegistrations` key.
pub open_registrations: bool,
}
/// The `software` object of a nodeinfo document.
#[derive(Deserialize, Debug, Clone)]
pub struct NodeInfoSoftware {
/// Software name as reported by the instance.
pub name: String,
/// Software version string as reported by the instance; kept unparsed.
pub version: String,
}
/// The `usage` object of a nodeinfo document.
///
/// Because of the container-level `default`, any key missing from the input falls back to
/// the field's `Default` value instead of failing deserialization.
#[derive(Deserialize, Serialize, Debug, Clone, Default)]
#[serde(rename_all = "camelCase", default)]
pub struct NodeInfoUsage {
/// User counters.
pub users: NodeInfoUsers,
/// Read from the `localPosts` key; the rename applies to deserialization only, so the
/// field is written back out as `posts`.
#[serde(rename(deserialize = "localPosts"))]
pub posts: i64,
/// Read from the `localComments` key; the rename applies to deserialization only, so the
/// field is written back out as `comments`.
#[serde(rename(deserialize = "localComments"))]
pub comments: i64,
}
/// The `users` object inside a nodeinfo `usage` section.
///
/// Keys are camelCase (`total`, `activeHalfyear`, `activeMonth`); a missing key yields `0`
/// because of the container-level `default`.
#[derive(Deserialize, Serialize, Debug, Clone, Default)]
#[serde(rename_all = "camelCase", default)]
pub struct NodeInfoUsers {
/// Total user count reported by the instance.
pub total: i64,
/// Reported under `activeHalfyear`.
pub active_halfyear: i64,
/// Reported under `activeMonth`.
pub active_month: i64,
}