Merge pull request 'Also calculate counts including federated instances' (#12) from federated-counts into main
Reviewed-on: #12
This commit is contained in:
commit
bcb852cfeb
6 changed files with 186 additions and 50 deletions
1
Cargo.lock
generated
1
Cargo.lock
generated
|
@ -1380,6 +1380,7 @@ dependencies = [
|
||||||
"derive-new",
|
"derive-new",
|
||||||
"futures",
|
"futures",
|
||||||
"lemmy_api_common",
|
"lemmy_api_common",
|
||||||
|
"lemmy_db_schema",
|
||||||
"log",
|
"log",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
|
|
|
@ -14,6 +14,7 @@ serde_json = "1.0.89"
|
||||||
semver = "1.0.14"
|
semver = "1.0.14"
|
||||||
once_cell = "1.16.0"
|
once_cell = "1.16.0"
|
||||||
lemmy_api_common = "=0.16.0"
|
lemmy_api_common = "=0.16.0"
|
||||||
|
lemmy_db_schema = "=0.16.0"
|
||||||
async-recursion = "1.0.0"
|
async-recursion = "1.0.0"
|
||||||
log = "0.4.17"
|
log = "0.4.17"
|
||||||
derive-new = "0.5.9"
|
derive-new = "0.5.9"
|
||||||
|
|
76
src/crawl.rs
76
src/crawl.rs
|
@ -1,11 +1,12 @@
|
||||||
|
use crate::node_info::{NodeInfo, NodeInfoWellKnown};
|
||||||
use crate::CLIENT;
|
use crate::CLIENT;
|
||||||
use anyhow::Error;
|
use anyhow::{anyhow, Error};
|
||||||
use async_recursion::async_recursion;
|
use async_recursion::async_recursion;
|
||||||
use futures::future::join_all;
|
use futures::future::join_all;
|
||||||
use lemmy_api_common::site::GetSiteResponse;
|
use lemmy_api_common::site::GetSiteResponse;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
|
use reqwest::Url;
|
||||||
use semver::Version;
|
use semver::Version;
|
||||||
use serde::Serialize;
|
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::ops::Deref;
|
use std::ops::Deref;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
@ -26,15 +27,16 @@ pub struct CrawlParams {
|
||||||
crawled_instances: Arc<Mutex<HashSet<String>>>,
|
crawled_instances: Arc<Mutex<HashSet<String>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Debug)]
|
#[derive(Debug)]
|
||||||
pub struct InstanceDetails {
|
pub struct CrawlResult {
|
||||||
pub domain: String,
|
pub domain: String,
|
||||||
pub site_info: GetSiteResponse,
|
pub node_info: NodeInfo,
|
||||||
|
pub site_info: Option<GetSiteResponse>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CrawlJob {
|
impl CrawlJob {
|
||||||
#[async_recursion]
|
#[async_recursion]
|
||||||
pub async fn crawl(self) -> Vec<Result<InstanceDetails, Error>> {
|
pub async fn crawl(self) -> Vec<Result<CrawlResult, Error>> {
|
||||||
// need to acquire and release mutex before recursing, otherwise it will deadlock
|
// need to acquire and release mutex before recursing, otherwise it will deadlock
|
||||||
{
|
{
|
||||||
let mut crawled_instances = self.params.crawled_instances.deref().lock().await;
|
let mut crawled_instances = self.params.crawled_instances.deref().lock().await;
|
||||||
|
@ -55,17 +57,28 @@ impl CrawlJob {
|
||||||
"Starting crawl for {}, distance {}",
|
"Starting crawl for {}, distance {}",
|
||||||
&self.domain, &self.current_distance
|
&self.domain, &self.current_distance
|
||||||
);
|
);
|
||||||
let site_info = match self.fetch_instance_details().await {
|
let (node_info, site_info) = match self.fetch_instance_details().await {
|
||||||
Ok(o) => o,
|
Ok(o) => o,
|
||||||
Err(e) => return vec![Err(e)],
|
Err(e) => return vec![Err(e)],
|
||||||
};
|
};
|
||||||
|
let mut crawl_result = CrawlResult {
|
||||||
|
domain: self.domain.clone(),
|
||||||
|
node_info,
|
||||||
|
site_info: None,
|
||||||
|
};
|
||||||
|
|
||||||
if site_info.1 < self.params.min_lemmy_version {
|
if let Some(site_info) = site_info {
|
||||||
return vec![];
|
match Version::parse(&site_info.version) {
|
||||||
|
Ok(version) => {
|
||||||
|
if version < self.params.min_lemmy_version {
|
||||||
|
return vec![Ok(crawl_result)];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => return vec![Err(e.into())],
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut result = vec![];
|
let mut result = vec![];
|
||||||
if let Some(federated) = &site_info.0.federated_instances {
|
if let Some(federated) = &site_info.federated_instances {
|
||||||
for domain in federated.linked.iter() {
|
for domain in federated.linked.iter() {
|
||||||
let crawl_job = CrawlJob::new(
|
let crawl_job = CrawlJob::new(
|
||||||
domain.clone(),
|
domain.clone(),
|
||||||
|
@ -76,26 +89,47 @@ impl CrawlJob {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut result2: Vec<Result<InstanceDetails, Error>> =
|
let mut result2: Vec<Result<CrawlResult, Error>> =
|
||||||
join_all(result).await.into_iter().flatten().collect();
|
join_all(result).await.into_iter().flatten().collect();
|
||||||
debug!("Successfully finished crawl for {}", &self.domain);
|
debug!("Successfully finished crawl for {}", &self.domain);
|
||||||
result2.push(Ok(InstanceDetails {
|
crawl_result.site_info = Some(site_info);
|
||||||
domain: self.domain,
|
result2.push(Ok(crawl_result));
|
||||||
site_info: site_info.0,
|
|
||||||
}));
|
|
||||||
|
|
||||||
result2
|
result2
|
||||||
|
} else {
|
||||||
|
vec![Ok(crawl_result)]
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn fetch_instance_details(&self) -> Result<(GetSiteResponse, Version), Error> {
|
async fn fetch_instance_details(&self) -> Result<(NodeInfo, Option<GetSiteResponse>), Error> {
|
||||||
let site_info_url = format!("https://{}/api/v3/site", &self.domain);
|
let rel_node_info: Url = Url::parse("http://nodeinfo.diaspora.software/ns/schema/2.0")
|
||||||
|
.expect("parse nodeinfo relation url");
|
||||||
|
let node_info_well_known = CLIENT
|
||||||
|
.get(&format!("https://{}/.well-known/nodeinfo", &self.domain))
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.json::<NodeInfoWellKnown>()
|
||||||
|
.await?;
|
||||||
|
let node_info_url = node_info_well_known
|
||||||
|
.links
|
||||||
|
.into_iter()
|
||||||
|
.find(|l| l.rel == rel_node_info)
|
||||||
|
.ok_or_else(|| anyhow!("failed to find nodeinfo link for {}", &self.domain))?
|
||||||
|
.href;
|
||||||
|
let node_info = CLIENT
|
||||||
|
.get(node_info_url)
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.json::<NodeInfo>()
|
||||||
|
.await?;
|
||||||
|
|
||||||
let site_info = CLIENT
|
let site_info = CLIENT
|
||||||
.get(&site_info_url)
|
.get(&format!("https://{}/api/v3/site", &self.domain))
|
||||||
.send()
|
.send()
|
||||||
.await?
|
.await?
|
||||||
.json::<GetSiteResponse>()
|
.json::<GetSiteResponse>()
|
||||||
.await?;
|
.await
|
||||||
let version = Version::parse(&site_info.version)?;
|
.ok();
|
||||||
Ok((site_info, version))
|
Ok((node_info, site_info))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
74
src/lib.rs
74
src/lib.rs
|
@ -1,19 +1,23 @@
|
||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate derive_new;
|
extern crate derive_new;
|
||||||
|
|
||||||
use crate::crawl::{CrawlJob, CrawlParams, InstanceDetails};
|
use crate::crawl::{CrawlJob, CrawlParams, CrawlResult};
|
||||||
|
use crate::node_info::{NodeInfo, NodeInfoUsage, NodeInfoUsers};
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use futures::future::join_all;
|
use futures::future::join_all;
|
||||||
|
use lemmy_api_common::site::GetSiteResponse;
|
||||||
use log::warn;
|
use log::warn;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use reqwest::{Client, ClientBuilder};
|
use reqwest::{Client, ClientBuilder};
|
||||||
use semver::Version;
|
use semver::Version;
|
||||||
|
use serde::Serialize;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use tokio::sync::Mutex;
|
use tokio::sync::Mutex;
|
||||||
|
|
||||||
pub mod crawl;
|
pub mod crawl;
|
||||||
|
mod node_info;
|
||||||
|
|
||||||
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
|
@ -22,14 +26,21 @@ static CLIENT: Lazy<Client> = Lazy::new(|| {
|
||||||
.timeout(REQUEST_TIMEOUT)
|
.timeout(REQUEST_TIMEOUT)
|
||||||
.user_agent("lemmy-stats-crawler")
|
.user_agent("lemmy-stats-crawler")
|
||||||
.build()
|
.build()
|
||||||
.unwrap()
|
.expect("build reqwest client")
|
||||||
});
|
});
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
pub struct CrawlResult2 {
|
||||||
|
pub domain: String,
|
||||||
|
pub site_info: GetSiteResponse,
|
||||||
|
pub federated_counts: Option<NodeInfoUsage>,
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn start_crawl(
|
pub async fn start_crawl(
|
||||||
start_instances: Vec<String>,
|
start_instances: Vec<String>,
|
||||||
exclude_domains: Vec<String>,
|
exclude_domains: Vec<String>,
|
||||||
max_distance: i32,
|
max_distance: i32,
|
||||||
) -> Result<Vec<InstanceDetails>, Error> {
|
) -> Result<Vec<CrawlResult2>, Error> {
|
||||||
let params = Arc::new(CrawlParams::new(
|
let params = Arc::new(CrawlParams::new(
|
||||||
min_lemmy_version().await?,
|
min_lemmy_version().await?,
|
||||||
exclude_domains,
|
exclude_domains,
|
||||||
|
@ -42,8 +53,7 @@ pub async fn start_crawl(
|
||||||
jobs.push(job.crawl());
|
jobs.push(job.crawl());
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: log the errors
|
let crawl_results: Vec<CrawlResult> = join_all(jobs)
|
||||||
let mut instance_details: Vec<InstanceDetails> = join_all(jobs)
|
|
||||||
.await
|
.await
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.flatten()
|
.flatten()
|
||||||
|
@ -52,20 +62,20 @@ pub async fn start_crawl(
|
||||||
warn!("{}", e)
|
warn!("{}", e)
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.filter_map(|r| r.ok())
|
.filter_map(Result::ok)
|
||||||
.collect();
|
.collect();
|
||||||
|
let mut crawl_results = calculate_federated_site_aggregates(crawl_results)?;
|
||||||
|
|
||||||
// Sort by active monthly users descending
|
// Sort by active monthly users descending
|
||||||
instance_details.sort_unstable_by_key(|i| {
|
crawl_results.sort_unstable_by_key(|i| {
|
||||||
i.site_info
|
i.site_info
|
||||||
.site_view
|
.site_view
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.map(|s| s.counts.users_active_month)
|
.map(|s| s.counts.users_active_month)
|
||||||
.unwrap_or(0)
|
.unwrap_or(0)
|
||||||
});
|
});
|
||||||
instance_details.reverse();
|
crawl_results.reverse();
|
||||||
|
Ok(crawl_results)
|
||||||
Ok(instance_details)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// calculate minimum allowed lemmy version based on current version. in case of current version
|
/// calculate minimum allowed lemmy version based on current version. in case of current version
|
||||||
|
@ -82,3 +92,47 @@ async fn min_lemmy_version() -> Result<Version, Error> {
|
||||||
version.minor -= 1;
|
version.minor -= 1;
|
||||||
Ok(version)
|
Ok(version)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn calculate_federated_site_aggregates(
|
||||||
|
crawl_results: Vec<CrawlResult>,
|
||||||
|
) -> Result<Vec<CrawlResult2>, Error> {
|
||||||
|
let node_info: Vec<(String, NodeInfo)> = crawl_results
|
||||||
|
.iter()
|
||||||
|
.map(|c| (c.domain.clone(), c.node_info.clone()))
|
||||||
|
.collect();
|
||||||
|
let lemmy_instances: Vec<(String, GetSiteResponse)> = crawl_results
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|c| {
|
||||||
|
let domain = c.domain;
|
||||||
|
c.site_info.map(|c2| (domain, c2))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let mut ret = vec![];
|
||||||
|
for instance in &lemmy_instances {
|
||||||
|
let federated_counts = if let Some(federated_instances) = &instance.1.federated_instances {
|
||||||
|
node_info
|
||||||
|
.iter()
|
||||||
|
.filter(|i| federated_instances.linked.contains(&i.0) || i.0 == instance.0)
|
||||||
|
.map(|i| i.1.usage.clone())
|
||||||
|
.reduce(|a, b| NodeInfoUsage {
|
||||||
|
users: NodeInfoUsers {
|
||||||
|
total: a.users.total + b.users.total,
|
||||||
|
active_halfyear: a.users.active_halfyear + b.users.active_halfyear,
|
||||||
|
active_month: a.users.active_month + b.users.active_month,
|
||||||
|
},
|
||||||
|
posts: a.posts + b.posts,
|
||||||
|
comments: a.comments + b.comments,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
// TODO: workaround because GetSiteResponse doesnt implement clone
|
||||||
|
let site_info = serde_json::from_str(&serde_json::to_string(&instance.1)?)?;
|
||||||
|
ret.push(CrawlResult2 {
|
||||||
|
domain: instance.0.clone(),
|
||||||
|
site_info,
|
||||||
|
federated_counts,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Ok(ret)
|
||||||
|
}
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use lemmy_stats_crawler::crawl::InstanceDetails;
|
use lemmy_stats_crawler::{start_crawl, CrawlResult2};
|
||||||
use lemmy_stats_crawler::start_crawl;
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use structopt::StructOpt;
|
use structopt::StructOpt;
|
||||||
|
|
||||||
|
@ -57,10 +56,10 @@ struct TotalStats {
|
||||||
users_active_week: i64,
|
users_active_week: i64,
|
||||||
users_active_month: i64,
|
users_active_month: i64,
|
||||||
users_active_halfyear: i64,
|
users_active_halfyear: i64,
|
||||||
instance_details: Vec<InstanceDetails>,
|
instance_details: Vec<CrawlResult2>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn aggregate(instance_details: Vec<InstanceDetails>) -> TotalStats {
|
fn aggregate(instance_details: Vec<CrawlResult2>) -> TotalStats {
|
||||||
let mut online_users = 0;
|
let mut online_users = 0;
|
||||||
let mut total_users = 0;
|
let mut total_users = 0;
|
||||||
let mut users_active_day = 0;
|
let mut users_active_day = 0;
|
||||||
|
|
47
src/node_info.rs
Normal file
47
src/node_info.rs
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
use reqwest::Url;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
#[derive(Deserialize, Debug)]
|
||||||
|
pub struct NodeInfoWellKnown {
|
||||||
|
pub links: Vec<NodeInfoWellKnownLinks>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Debug)]
|
||||||
|
pub struct NodeInfoWellKnownLinks {
|
||||||
|
pub rel: Url,
|
||||||
|
pub href: Url,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Debug, Clone)]
|
||||||
|
#[serde(rename_all = "camelCase")]
|
||||||
|
pub struct NodeInfo {
|
||||||
|
pub version: String,
|
||||||
|
pub software: NodeInfoSoftware,
|
||||||
|
pub protocols: Vec<String>,
|
||||||
|
pub usage: NodeInfoUsage,
|
||||||
|
pub open_registrations: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Debug, Clone)]
|
||||||
|
pub struct NodeInfoSoftware {
|
||||||
|
pub name: String,
|
||||||
|
pub version: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone, Default)]
|
||||||
|
#[serde(rename_all = "camelCase", default)]
|
||||||
|
pub struct NodeInfoUsage {
|
||||||
|
pub users: NodeInfoUsers,
|
||||||
|
#[serde(rename(deserialize = "localPosts"))]
|
||||||
|
pub posts: i64,
|
||||||
|
#[serde(rename(deserialize = "localComments"))]
|
||||||
|
pub comments: i64,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Serialize, Debug, Clone, Default)]
|
||||||
|
#[serde(rename_all = "camelCase", default)]
|
||||||
|
pub struct NodeInfoUsers {
|
||||||
|
pub total: i64,
|
||||||
|
pub active_halfyear: i64,
|
||||||
|
pub active_month: i64,
|
||||||
|
}
|
Loading…
Reference in a new issue