Compare commits
14 commits
min-versio
...
main
Author | SHA1 | Date | |
---|---|---|---|
bcb852cfeb | |||
ea57a16987 | |||
1343932b2b | |||
1d1823e27d | |||
418db7831f | |||
d11febc7e8 | |||
7ede38f584 | |||
21cf61f847 | |||
|
575672cbe3 | ||
|
0c3ba08d6c | ||
0079e72759 | |||
c254e50211 | |||
2e2a4888d0 | |||
|
8509c19f50 |
7 changed files with 3021 additions and 616 deletions
3021
Cargo.lock
generated
3021
Cargo.lock
generated
File diff suppressed because it is too large
Load diff
24
Cargo.toml
24
Cargo.toml
|
@ -5,12 +5,18 @@ authors = ["Felix Ableitner"]
|
||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
reqwest = { version = "0.10.10", default-features = false, features = ["json", "rustls-tls"] }
|
reqwest = { version = "0.11.13", default-features = false, features = ["json", "rustls-tls"] }
|
||||||
serde = { version = "1.0.123", features = ["derive"] }
|
serde = { version = "1.0.149", features = ["derive"] }
|
||||||
anyhow = "1.0.38"
|
anyhow = "1.0.66"
|
||||||
tokio = { version = "0.2.25", features = ["rt-threaded", "macros"] }
|
tokio = { version = "1.22.0", features = ["macros", "rt-multi-thread"] }
|
||||||
futures = "0.3.13"
|
futures = "0.3.25"
|
||||||
serde_json = "1.0.64"
|
serde_json = "1.0.89"
|
||||||
clap = "2.33.3"
|
semver = "1.0.14"
|
||||||
semver = "1.0.7"
|
once_cell = "1.16.0"
|
||||||
once_cell = "1.10.0"
|
lemmy_api_common = "=0.16.0"
|
||||||
|
lemmy_db_schema = "=0.16.0"
|
||||||
|
async-recursion = "1.0.0"
|
||||||
|
log = "0.4.17"
|
||||||
|
derive-new = "0.5.9"
|
||||||
|
stderrlog = "0.5.4"
|
||||||
|
structopt = "0.3.26"
|
||||||
|
|
279
src/crawl.rs
279
src/crawl.rs
|
@ -1,180 +1,135 @@
|
||||||
use crate::federated_instances::GetSiteResponse;
|
use crate::node_info::{NodeInfo, NodeInfoWellKnown};
|
||||||
use crate::node_info::NodeInfo;
|
use crate::CLIENT;
|
||||||
use crate::REQUEST_TIMEOUT;
|
use anyhow::{anyhow, Error};
|
||||||
use anyhow::anyhow;
|
use async_recursion::async_recursion;
|
||||||
use anyhow::Error;
|
use futures::future::join_all;
|
||||||
use futures::try_join;
|
use lemmy_api_common::site::GetSiteResponse;
|
||||||
use once_cell::sync::Lazy;
|
use log::debug;
|
||||||
use reqwest::Client;
|
use reqwest::Url;
|
||||||
use semver::Version;
|
use semver::Version;
|
||||||
use serde::Serialize;
|
use std::collections::HashSet;
|
||||||
use std::collections::VecDeque;
|
use std::ops::Deref;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::Mutex;
|
||||||
|
|
||||||
static CLIENT: Lazy<Client> = Lazy::new(Client::default);
|
#[derive(new)]
|
||||||
|
pub struct CrawlJob {
|
||||||
pub async fn crawl(
|
|
||||||
start_instances: Vec<String>,
|
|
||||||
exclude: Vec<String>,
|
|
||||||
max_depth: i32,
|
|
||||||
) -> Result<(Vec<InstanceDetails>, i32), Error> {
|
|
||||||
let mut pending_instances: VecDeque<CrawlInstance> = start_instances
|
|
||||||
.iter()
|
|
||||||
.map(|s| CrawlInstance::new(s.to_string(), 0))
|
|
||||||
.collect();
|
|
||||||
let min_lemmy_version = min_lemmy_version().await?;
|
|
||||||
let mut crawled_instances = vec![];
|
|
||||||
let mut instance_details = vec![];
|
|
||||||
let mut failed_instances = 0;
|
|
||||||
while let Some(current_instance) = pending_instances.pop_back() {
|
|
||||||
crawled_instances.push(current_instance.domain.clone());
|
|
||||||
if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
match fetch_instance_details(&current_instance.domain, &min_lemmy_version).await {
|
|
||||||
Ok(details) => {
|
|
||||||
instance_details.push(details.to_owned());
|
|
||||||
for i in details.linked_instances {
|
|
||||||
let is_in_crawled = crawled_instances.contains(&i);
|
|
||||||
let is_in_pending = pending_instances.iter().any(|p| p.domain == i);
|
|
||||||
if !is_in_crawled && !is_in_pending {
|
|
||||||
let ci = CrawlInstance::new(i, current_instance.depth + 1);
|
|
||||||
pending_instances.push_back(ci);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
failed_instances += 1;
|
|
||||||
eprintln!("Failed to crawl {}: {}", current_instance.domain, e)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort by active monthly users descending
|
|
||||||
instance_details.sort_by_key(|i| i.users_active_month);
|
|
||||||
instance_details.reverse();
|
|
||||||
|
|
||||||
Ok((instance_details, failed_instances))
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Serialize, Clone)]
|
|
||||||
pub struct InstanceDetails {
|
|
||||||
pub domain: String,
|
|
||||||
pub name: String,
|
|
||||||
pub description: Option<String>,
|
|
||||||
pub version: String,
|
|
||||||
pub icon: Option<String>,
|
|
||||||
pub online_users: i32,
|
|
||||||
pub total_users: i64,
|
|
||||||
pub users_active_halfyear: i64,
|
|
||||||
pub users_active_month: i64,
|
|
||||||
pub open_registrations: bool,
|
|
||||||
pub linked_instances_count: i32,
|
|
||||||
pub require_application: bool,
|
|
||||||
// The following fields are only used for aggregation, but not shown in output
|
|
||||||
#[serde(skip)]
|
|
||||||
pub linked_instances: Vec<String>,
|
|
||||||
}
|
|
||||||
|
|
||||||
struct CrawlInstance {
|
|
||||||
domain: String,
|
domain: String,
|
||||||
depth: i32,
|
current_distance: i32,
|
||||||
|
params: Arc<CrawlParams>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CrawlInstance {
|
#[derive(new)]
|
||||||
pub fn new(domain: String, depth: i32) -> CrawlInstance {
|
pub struct CrawlParams {
|
||||||
CrawlInstance { domain, depth }
|
min_lemmy_version: Version,
|
||||||
}
|
exclude_domains: Vec<String>,
|
||||||
|
max_depth: i32,
|
||||||
|
crawled_instances: Arc<Mutex<HashSet<String>>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn fetch_instance_details(
|
#[derive(Debug)]
|
||||||
domain: &str,
|
pub struct CrawlResult {
|
||||||
min_lemmy_version: &Version,
|
pub domain: String,
|
||||||
) -> Result<InstanceDetails, Error> {
|
pub node_info: NodeInfo,
|
||||||
let client = Client::default();
|
pub site_info: Option<GetSiteResponse>,
|
||||||
|
}
|
||||||
|
|
||||||
let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain);
|
impl CrawlJob {
|
||||||
let node_info_request = client.get(&node_info_url).timeout(REQUEST_TIMEOUT).send();
|
#[async_recursion]
|
||||||
|
pub async fn crawl(self) -> Vec<Result<CrawlResult, Error>> {
|
||||||
let site_info_url_v2 = format!("https://{}/api/v2/site", domain);
|
// need to acquire and release mutex before recursing, otherwise it will deadlock
|
||||||
let site_info_request_v2 = client
|
{
|
||||||
.get(&site_info_url_v2)
|
let mut crawled_instances = self.params.crawled_instances.deref().lock().await;
|
||||||
.timeout(REQUEST_TIMEOUT)
|
if crawled_instances.contains(&self.domain) {
|
||||||
.send();
|
return vec![];
|
||||||
let site_info_url_v3 = format!("https://{}/api/v3/site", domain);
|
|
||||||
let site_info_request_v3 = client
|
|
||||||
.get(&site_info_url_v3)
|
|
||||||
.timeout(REQUEST_TIMEOUT)
|
|
||||||
.send();
|
|
||||||
|
|
||||||
let (node_info, site_info_v2, site_info_v3) = try_join!(
|
|
||||||
node_info_request,
|
|
||||||
site_info_request_v2,
|
|
||||||
site_info_request_v3
|
|
||||||
)?;
|
|
||||||
let node_info: NodeInfo = node_info.json().await?;
|
|
||||||
if node_info.software.name != "lemmy" {
|
|
||||||
return Err(anyhow!("not a lemmy instance"));
|
|
||||||
}
|
|
||||||
let version = Version::parse(&node_info.software.version)?;
|
|
||||||
if &version < min_lemmy_version {
|
|
||||||
return Err(anyhow!("lemmy version is too old ({})", version));
|
|
||||||
}
|
|
||||||
let site_info_v2 = site_info_v2.json::<GetSiteResponse>().await.ok();
|
|
||||||
let site_info_v3 = site_info_v3.json::<GetSiteResponse>().await.ok();
|
|
||||||
let mut site_info: GetSiteResponse = if let Some(site_info_v2) = site_info_v2 {
|
|
||||||
site_info_v2
|
|
||||||
} else if let Some(site_info_v3) = site_info_v3 {
|
|
||||||
site_info_v3
|
|
||||||
} else {
|
} else {
|
||||||
return Err(anyhow!("Failed to read site_info"));
|
crawled_instances.insert(self.domain.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if self.current_distance > self.params.max_depth
|
||||||
|
|| self.params.exclude_domains.contains(&self.domain)
|
||||||
|
{
|
||||||
|
return vec![];
|
||||||
|
}
|
||||||
|
|
||||||
|
debug!(
|
||||||
|
"Starting crawl for {}, distance {}",
|
||||||
|
&self.domain, &self.current_distance
|
||||||
|
);
|
||||||
|
let (node_info, site_info) = match self.fetch_instance_details().await {
|
||||||
|
Ok(o) => o,
|
||||||
|
Err(e) => return vec![Err(e)],
|
||||||
|
};
|
||||||
|
let mut crawl_result = CrawlResult {
|
||||||
|
domain: self.domain.clone(),
|
||||||
|
node_info,
|
||||||
|
site_info: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
if let Some(description) = &site_info.site_view.site.description {
|
if let Some(site_info) = site_info {
|
||||||
if description.len() > 150 {
|
match Version::parse(&site_info.version) {
|
||||||
site_info.site_view.site.description = None;
|
Ok(version) => {
|
||||||
|
if version < self.params.min_lemmy_version {
|
||||||
|
return vec![Ok(crawl_result)];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => return vec![Err(e.into())],
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut result = vec![];
|
||||||
|
if let Some(federated) = &site_info.federated_instances {
|
||||||
|
for domain in federated.linked.iter() {
|
||||||
|
let crawl_job = CrawlJob::new(
|
||||||
|
domain.clone(),
|
||||||
|
self.current_distance + 1,
|
||||||
|
self.params.clone(),
|
||||||
|
);
|
||||||
|
result.push(crawl_job.crawl());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let require_application = site_info
|
let mut result2: Vec<Result<CrawlResult, Error>> =
|
||||||
.site_view
|
join_all(result).await.into_iter().flatten().collect();
|
||||||
.site
|
debug!("Successfully finished crawl for {}", &self.domain);
|
||||||
.require_application
|
crawl_result.site_info = Some(site_info);
|
||||||
.unwrap_or(false);
|
result2.push(Ok(crawl_result));
|
||||||
let linked_instances: Vec<String> = site_info
|
|
||||||
.federated_instances
|
|
||||||
.map(|f| f.linked)
|
|
||||||
.unwrap_or_default()
|
|
||||||
.iter()
|
|
||||||
.map(|l| l.to_lowercase())
|
|
||||||
.collect();
|
|
||||||
Ok(InstanceDetails {
|
|
||||||
domain: domain.to_owned(),
|
|
||||||
name: site_info.site_view.site.name,
|
|
||||||
description: site_info.site_view.site.description,
|
|
||||||
version: node_info.software.version,
|
|
||||||
icon: site_info.site_view.site.icon,
|
|
||||||
online_users: site_info.online as i32,
|
|
||||||
total_users: node_info.usage.users.total,
|
|
||||||
users_active_halfyear: node_info.usage.users.active_halfyear,
|
|
||||||
users_active_month: node_info.usage.users.active_month,
|
|
||||||
open_registrations: node_info.open_registrations,
|
|
||||||
linked_instances_count: linked_instances.len() as i32,
|
|
||||||
require_application,
|
|
||||||
linked_instances,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
/// calculate minimum allowed lemmy version based on current version. in case of current version
|
result2
|
||||||
/// 0.16.3, the minimum from this function is 0.15.3. this is to avoid rejecting all instances on
|
} else {
|
||||||
/// the previous version when a major lemmy release is published.
|
vec![Ok(crawl_result)]
|
||||||
async fn min_lemmy_version() -> Result<Version, Error> {
|
}
|
||||||
let lemmy_version_url = "https://raw.githubusercontent.com/LemmyNet/lemmy-ansible/main/VERSION";
|
}
|
||||||
let req = CLIENT
|
|
||||||
.get(lemmy_version_url)
|
async fn fetch_instance_details(&self) -> Result<(NodeInfo, Option<GetSiteResponse>), Error> {
|
||||||
.timeout(REQUEST_TIMEOUT)
|
let rel_node_info: Url = Url::parse("http://nodeinfo.diaspora.software/ns/schema/2.0")
|
||||||
|
.expect("parse nodeinfo relation url");
|
||||||
|
let node_info_well_known = CLIENT
|
||||||
|
.get(&format!("https://{}/.well-known/nodeinfo", &self.domain))
|
||||||
.send()
|
.send()
|
||||||
|
.await?
|
||||||
|
.json::<NodeInfoWellKnown>()
|
||||||
.await?;
|
.await?;
|
||||||
let mut version = Version::parse(req.text().await?.trim())?;
|
let node_info_url = node_info_well_known
|
||||||
version.minor -= 1;
|
.links
|
||||||
Ok(version)
|
.into_iter()
|
||||||
|
.find(|l| l.rel == rel_node_info)
|
||||||
|
.ok_or_else(|| anyhow!("failed to find nodeinfo link for {}", &self.domain))?
|
||||||
|
.href;
|
||||||
|
let node_info = CLIENT
|
||||||
|
.get(node_info_url)
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.json::<NodeInfo>()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
let site_info = CLIENT
|
||||||
|
.get(&format!("https://{}/api/v3/site", &self.domain))
|
||||||
|
.send()
|
||||||
|
.await?
|
||||||
|
.json::<GetSiteResponse>()
|
||||||
|
.await
|
||||||
|
.ok();
|
||||||
|
Ok((node_info, site_info))
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,28 +0,0 @@
|
||||||
use serde::Deserialize;
|
|
||||||
|
|
||||||
#[derive(Deserialize, Debug, Clone)]
|
|
||||||
pub struct GetSiteResponse {
|
|
||||||
pub site_view: SiteView,
|
|
||||||
pub online: usize,
|
|
||||||
pub federated_instances: Option<FederatedInstances>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize, Debug, Clone)]
|
|
||||||
pub struct FederatedInstances {
|
|
||||||
pub linked: Vec<String>,
|
|
||||||
pub allowed: Option<Vec<String>>,
|
|
||||||
pub blocked: Option<Vec<String>>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize, Debug, Clone)]
|
|
||||||
pub struct SiteView {
|
|
||||||
pub site: Site,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Deserialize, Debug, Clone)]
|
|
||||||
pub struct Site {
|
|
||||||
pub name: String,
|
|
||||||
pub icon: Option<String>,
|
|
||||||
pub description: Option<String>,
|
|
||||||
pub require_application: Option<bool>,
|
|
||||||
}
|
|
141
src/lib.rs
141
src/lib.rs
|
@ -1,11 +1,138 @@
|
||||||
|
#[macro_use]
|
||||||
|
extern crate derive_new;
|
||||||
|
|
||||||
|
use crate::crawl::{CrawlJob, CrawlParams, CrawlResult};
|
||||||
|
use crate::node_info::{NodeInfo, NodeInfoUsage, NodeInfoUsers};
|
||||||
|
use anyhow::Error;
|
||||||
|
use futures::future::join_all;
|
||||||
|
use lemmy_api_common::site::GetSiteResponse;
|
||||||
|
use log::warn;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use reqwest::{Client, ClientBuilder};
|
||||||
|
use semver::Version;
|
||||||
|
use serde::Serialize;
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
use tokio::sync::Mutex;
|
||||||
|
|
||||||
pub mod crawl;
|
pub mod crawl;
|
||||||
pub mod federated_instances;
|
mod node_info;
|
||||||
pub mod node_info;
|
|
||||||
|
|
||||||
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml";
|
|
||||||
pub const DEFAULT_MAX_CRAWL_DEPTH: &str = "20";
|
static CLIENT: Lazy<Client> = Lazy::new(|| {
|
||||||
pub const EXCLUDE_INSTANCES: &str =
|
ClientBuilder::new()
|
||||||
"ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml";
|
.timeout(REQUEST_TIMEOUT)
|
||||||
|
.user_agent("lemmy-stats-crawler")
|
||||||
|
.build()
|
||||||
|
.expect("build reqwest client")
|
||||||
|
});
|
||||||
|
|
||||||
|
#[derive(Serialize, Debug)]
|
||||||
|
pub struct CrawlResult2 {
|
||||||
|
pub domain: String,
|
||||||
|
pub site_info: GetSiteResponse,
|
||||||
|
pub federated_counts: Option<NodeInfoUsage>,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn start_crawl(
|
||||||
|
start_instances: Vec<String>,
|
||||||
|
exclude_domains: Vec<String>,
|
||||||
|
max_distance: i32,
|
||||||
|
) -> Result<Vec<CrawlResult2>, Error> {
|
||||||
|
let params = Arc::new(CrawlParams::new(
|
||||||
|
min_lemmy_version().await?,
|
||||||
|
exclude_domains,
|
||||||
|
max_distance,
|
||||||
|
Arc::new(Mutex::new(HashSet::new())),
|
||||||
|
));
|
||||||
|
let mut jobs = vec![];
|
||||||
|
for domain in start_instances.into_iter() {
|
||||||
|
let job = CrawlJob::new(domain, 0, params.clone());
|
||||||
|
jobs.push(job.crawl());
|
||||||
|
}
|
||||||
|
|
||||||
|
let crawl_results: Vec<CrawlResult> = join_all(jobs)
|
||||||
|
.await
|
||||||
|
.into_iter()
|
||||||
|
.flatten()
|
||||||
|
.inspect(|r| {
|
||||||
|
if let Err(e) = r {
|
||||||
|
warn!("{}", e)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.filter_map(Result::ok)
|
||||||
|
.collect();
|
||||||
|
let mut crawl_results = calculate_federated_site_aggregates(crawl_results)?;
|
||||||
|
|
||||||
|
// Sort by active monthly users descending
|
||||||
|
crawl_results.sort_unstable_by_key(|i| {
|
||||||
|
i.site_info
|
||||||
|
.site_view
|
||||||
|
.as_ref()
|
||||||
|
.map(|s| s.counts.users_active_month)
|
||||||
|
.unwrap_or(0)
|
||||||
|
});
|
||||||
|
crawl_results.reverse();
|
||||||
|
Ok(crawl_results)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// calculate minimum allowed lemmy version based on current version. in case of current version
|
||||||
|
/// 0.16.3, the minimum from this function is 0.15.3. this is to avoid rejecting all instances on
|
||||||
|
/// the previous version when a major lemmy release is published.
|
||||||
|
async fn min_lemmy_version() -> Result<Version, Error> {
|
||||||
|
let lemmy_version_url = "https://raw.githubusercontent.com/LemmyNet/lemmy-ansible/main/VERSION";
|
||||||
|
let req = CLIENT
|
||||||
|
.get(lemmy_version_url)
|
||||||
|
.timeout(REQUEST_TIMEOUT)
|
||||||
|
.send()
|
||||||
|
.await?;
|
||||||
|
let mut version = Version::parse(req.text().await?.trim())?;
|
||||||
|
version.minor -= 1;
|
||||||
|
Ok(version)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn calculate_federated_site_aggregates(
|
||||||
|
crawl_results: Vec<CrawlResult>,
|
||||||
|
) -> Result<Vec<CrawlResult2>, Error> {
|
||||||
|
let node_info: Vec<(String, NodeInfo)> = crawl_results
|
||||||
|
.iter()
|
||||||
|
.map(|c| (c.domain.clone(), c.node_info.clone()))
|
||||||
|
.collect();
|
||||||
|
let lemmy_instances: Vec<(String, GetSiteResponse)> = crawl_results
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|c| {
|
||||||
|
let domain = c.domain;
|
||||||
|
c.site_info.map(|c2| (domain, c2))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let mut ret = vec![];
|
||||||
|
for instance in &lemmy_instances {
|
||||||
|
let federated_counts = if let Some(federated_instances) = &instance.1.federated_instances {
|
||||||
|
node_info
|
||||||
|
.iter()
|
||||||
|
.filter(|i| federated_instances.linked.contains(&i.0) || i.0 == instance.0)
|
||||||
|
.map(|i| i.1.usage.clone())
|
||||||
|
.reduce(|a, b| NodeInfoUsage {
|
||||||
|
users: NodeInfoUsers {
|
||||||
|
total: a.users.total + b.users.total,
|
||||||
|
active_halfyear: a.users.active_halfyear + b.users.active_halfyear,
|
||||||
|
active_month: a.users.active_month + b.users.active_month,
|
||||||
|
},
|
||||||
|
posts: a.posts + b.posts,
|
||||||
|
comments: a.comments + b.comments,
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
// TODO: workaround because GetSiteResponse doesnt implement clone
|
||||||
|
let site_info = serde_json::from_str(&serde_json::to_string(&instance.1)?)?;
|
||||||
|
ret.push(CrawlResult2 {
|
||||||
|
domain: instance.0.clone(),
|
||||||
|
site_info,
|
||||||
|
federated_counts,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
Ok(ret)
|
||||||
|
}
|
||||||
|
|
108
src/main.rs
108
src/main.rs
|
@ -1,73 +1,91 @@
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use clap::{App, Arg};
|
use lemmy_stats_crawler::{start_crawl, CrawlResult2};
|
||||||
use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
|
|
||||||
use lemmy_stats_crawler::{DEFAULT_MAX_CRAWL_DEPTH, DEFAULT_START_INSTANCES, EXCLUDE_INSTANCES};
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
|
use structopt::StructOpt;
|
||||||
|
|
||||||
|
#[derive(StructOpt, Debug)]
|
||||||
|
#[structopt()]
|
||||||
|
struct Parameters {
|
||||||
|
#[structopt(short, long, use_delimiter = true, default_value = "lemmy.ml")]
|
||||||
|
start_instances: Vec<String>,
|
||||||
|
#[structopt(
|
||||||
|
short,
|
||||||
|
long,
|
||||||
|
use_delimiter = true,
|
||||||
|
default_value = "ds9.lemmy.ml,enterprise.lemmy.ml,voyager.lemmy.ml,test.lemmy.ml"
|
||||||
|
)]
|
||||||
|
exclude_instances: Vec<String>,
|
||||||
|
#[structopt(short, long, default_value = "20")]
|
||||||
|
max_crawl_distance: i32,
|
||||||
|
/// Silence all output
|
||||||
|
#[structopt(short, long)]
|
||||||
|
quiet: bool,
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
pub async fn main() -> Result<(), Error> {
|
pub async fn main() -> Result<(), Error> {
|
||||||
let matches = App::new("Lemmy Stats Crawler")
|
let params = Parameters::from_args();
|
||||||
.arg(
|
|
||||||
Arg::with_name("start-instances")
|
stderrlog::new()
|
||||||
.long("start-instances")
|
.module(module_path!())
|
||||||
.takes_value(true),
|
.quiet(params.quiet)
|
||||||
)
|
.verbosity(1)
|
||||||
.arg(Arg::with_name("exclude").long("exclude").takes_value(true))
|
.init()?;
|
||||||
.arg(
|
|
||||||
Arg::with_name("max-crawl-depth")
|
|
||||||
.long("max-crawl-depth")
|
|
||||||
.takes_value(true),
|
|
||||||
)
|
|
||||||
.get_matches();
|
|
||||||
let start_instances: Vec<String> = matches
|
|
||||||
.value_of("start-instances")
|
|
||||||
.unwrap_or(DEFAULT_START_INSTANCES)
|
|
||||||
.split(',')
|
|
||||||
.map(|s| s.trim().to_string())
|
|
||||||
.collect();
|
|
||||||
let exclude: Vec<String> = matches
|
|
||||||
.value_of("exclude")
|
|
||||||
.unwrap_or(EXCLUDE_INSTANCES)
|
|
||||||
.split(',')
|
|
||||||
.map(|s| s.trim().to_string())
|
|
||||||
.collect();
|
|
||||||
let max_crawl_depth: i32 = matches
|
|
||||||
.value_of("max-crawl-depth")
|
|
||||||
.unwrap_or(DEFAULT_MAX_CRAWL_DEPTH)
|
|
||||||
.parse()?;
|
|
||||||
|
|
||||||
eprintln!("Crawling...");
|
eprintln!("Crawling...");
|
||||||
let (instance_details, failed_instances) =
|
let instance_details = start_crawl(
|
||||||
crawl(start_instances, exclude, max_crawl_depth).await?;
|
params.start_instances,
|
||||||
let total_stats = aggregate(instance_details, failed_instances);
|
params.exclude_instances,
|
||||||
|
params.max_crawl_distance,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let total_stats = aggregate(instance_details);
|
||||||
|
|
||||||
println!("{}", serde_json::to_string_pretty(&total_stats)?);
|
println!("{}", serde_json::to_string_pretty(&total_stats)?);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: lemmy stores these numbers in SiteAggregates, would be good to simply use that as a member
|
||||||
|
// (to avoid many members). but SiteAggregates also has id, site_id fields
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct TotalStats {
|
struct TotalStats {
|
||||||
crawled_instances: i32,
|
crawled_instances: i32,
|
||||||
failed_instances: i32,
|
online_users: usize,
|
||||||
total_users: i64,
|
total_users: i64,
|
||||||
total_online_users: i32,
|
users_active_day: i64,
|
||||||
instance_details: Vec<InstanceDetails>,
|
users_active_week: i64,
|
||||||
|
users_active_month: i64,
|
||||||
|
users_active_halfyear: i64,
|
||||||
|
instance_details: Vec<CrawlResult2>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn aggregate(instance_details: Vec<InstanceDetails>, failed_instances: i32) -> TotalStats {
|
fn aggregate(instance_details: Vec<CrawlResult2>) -> TotalStats {
|
||||||
let mut crawled_instances = 0;
|
let mut online_users = 0;
|
||||||
let mut total_users = 0;
|
let mut total_users = 0;
|
||||||
let mut total_online_users = 0;
|
let mut users_active_day = 0;
|
||||||
|
let mut users_active_week = 0;
|
||||||
|
let mut users_active_month = 0;
|
||||||
|
let mut users_active_halfyear = 0;
|
||||||
|
let mut crawled_instances = 0;
|
||||||
for i in &instance_details {
|
for i in &instance_details {
|
||||||
crawled_instances += 1;
|
crawled_instances += 1;
|
||||||
total_users += i.total_users;
|
online_users += i.site_info.online;
|
||||||
total_online_users += i.online_users;
|
if let Some(site_view) = &i.site_info.site_view {
|
||||||
|
total_users += site_view.counts.users;
|
||||||
|
users_active_day += site_view.counts.users_active_day;
|
||||||
|
users_active_week += site_view.counts.users_active_week;
|
||||||
|
users_active_month += site_view.counts.users_active_month;
|
||||||
|
users_active_halfyear += site_view.counts.users_active_half_year;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
TotalStats {
|
TotalStats {
|
||||||
crawled_instances,
|
crawled_instances,
|
||||||
failed_instances,
|
online_users,
|
||||||
total_users,
|
total_users,
|
||||||
total_online_users,
|
users_active_day,
|
||||||
|
users_active_week,
|
||||||
|
users_active_halfyear,
|
||||||
|
users_active_month,
|
||||||
instance_details,
|
instance_details,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,18 @@
|
||||||
use serde::Deserialize;
|
use reqwest::Url;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
#[derive(Deserialize, Debug)]
|
#[derive(Deserialize, Debug)]
|
||||||
|
pub struct NodeInfoWellKnown {
|
||||||
|
pub links: Vec<NodeInfoWellKnownLinks>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Debug)]
|
||||||
|
pub struct NodeInfoWellKnownLinks {
|
||||||
|
pub rel: Url,
|
||||||
|
pub href: Url,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Deserialize, Debug, Clone)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase")]
|
||||||
pub struct NodeInfo {
|
pub struct NodeInfo {
|
||||||
pub version: String,
|
pub version: String,
|
||||||
|
@ -10,22 +22,24 @@ pub struct NodeInfo {
|
||||||
pub open_registrations: bool,
|
pub open_registrations: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Deserialize, Debug)]
|
#[derive(Deserialize, Debug, Clone)]
|
||||||
pub struct NodeInfoSoftware {
|
pub struct NodeInfoSoftware {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
pub version: String,
|
pub version: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Deserialize, Debug)]
|
#[derive(Deserialize, Serialize, Debug, Clone, Default)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase", default)]
|
||||||
pub struct NodeInfoUsage {
|
pub struct NodeInfoUsage {
|
||||||
pub users: NodeInfoUsers,
|
pub users: NodeInfoUsers,
|
||||||
pub local_posts: i64,
|
#[serde(rename(deserialize = "localPosts"))]
|
||||||
pub local_comments: i64,
|
pub posts: i64,
|
||||||
|
#[serde(rename(deserialize = "localComments"))]
|
||||||
|
pub comments: i64,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Deserialize, Debug)]
|
#[derive(Deserialize, Serialize, Debug, Clone, Default)]
|
||||||
#[serde(rename_all = "camelCase")]
|
#[serde(rename_all = "camelCase", default)]
|
||||||
pub struct NodeInfoUsers {
|
pub struct NodeInfoUsers {
|
||||||
pub total: i64,
|
pub total: i64,
|
||||||
pub active_halfyear: i64,
|
pub active_halfyear: i64,
|
||||||
|
|
Loading…
Reference in a new issue