Compare commits

...

14 commits

Author SHA1 Message Date
bcb852cfeb Merge pull request 'Also calculate counts including federated instances' (#12) from federated-counts into main
Reviewed-on: #12
2022-12-07 20:38:04 +00:00
ea57a16987 Also calculate counts including federated instances 2022-12-07 15:42:08 +01:00
1343932b2b Upgrade dependencies 2022-12-05 15:38:27 +01:00
1d1823e27d Remove --verbose param 2022-10-28 15:50:33 +02:00
418db7831f Log crawl errors 2022-10-28 14:05:14 +02:00
d11febc7e8 Use exact version for Lemmy dependency 2022-05-16 22:59:06 +02:00
7ede38f584 Handle comma separated params correctly 2022-05-16 14:23:32 +02:00
21cf61f847 Add user agent 2022-05-13 16:57:53 +02:00
nutomic
575672cbe3 Merge pull request 'Recursive, parallel crawl' (#11) from recursive-crawl into main
Reviewed-on: https://yerbamate.ml/LemmyNet/lemmy-stats-crawler/pulls/11
2022-05-13 11:37:52 +00:00
nutomic
0c3ba08d6c Merge pull request 'Use lemmy structs' (#10) from lemmy-structs into main
Reviewed-on: https://yerbamate.ml/LemmyNet/lemmy-stats-crawler/pulls/10
2022-05-13 11:37:45 +00:00
0079e72759 Add stderrlog, structopt for better logging and params 2022-05-11 00:56:42 +02:00
c254e50211 Recursive, parallel crawl 2022-05-10 11:49:41 +02:00
2e2a4888d0 Use lemmy structs 2022-05-10 01:29:42 +02:00
nutomic
8509c19f50 Merge pull request 'Add min version check to avoid including old instances' (#9) from min-version into main
Reviewed-on: https://yerbamate.ml/LemmyNet/lemmy-stats-crawler/pulls/9
2022-05-03 12:19:39 +00:00
7 changed files with 3021 additions and 616 deletions

3021
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -5,12 +5,18 @@ authors = ["Felix Ableitner"]
edition = "2018" edition = "2018"
[dependencies] [dependencies]
reqwest = { version = "0.10.10", default-features = false, features = ["json", "rustls-tls"] } reqwest = { version = "0.11.13", default-features = false, features = ["json", "rustls-tls"] }
serde = { version = "1.0.123", features = ["derive"] } serde = { version = "1.0.149", features = ["derive"] }
anyhow = "1.0.38" anyhow = "1.0.66"
tokio = { version = "0.2.25", features = ["rt-threaded", "macros"] } tokio = { version = "1.22.0", features = ["macros", "rt-multi-thread"] }
futures = "0.3.13" futures = "0.3.25"
serde_json = "1.0.64" serde_json = "1.0.89"
clap = "2.33.3" semver = "1.0.14"
semver = "1.0.7" once_cell = "1.16.0"
once_cell = "1.10.0" lemmy_api_common = "=0.16.0"
lemmy_db_schema = "=0.16.0"
async-recursion = "1.0.0"
log = "0.4.17"
derive-new = "0.5.9"
stderrlog = "0.5.4"
structopt = "0.3.26"

View file

@ -1,180 +1,135 @@
use crate::federated_instances::GetSiteResponse; use crate::node_info::{NodeInfo, NodeInfoWellKnown};
use crate::node_info::NodeInfo; use crate::CLIENT;
use crate::REQUEST_TIMEOUT; use anyhow::{anyhow, Error};
use anyhow::anyhow; use async_recursion::async_recursion;
use anyhow::Error; use futures::future::join_all;
use futures::try_join; use lemmy_api_common::site::GetSiteResponse;
use once_cell::sync::Lazy; use log::debug;
use reqwest::Client; use reqwest::Url;
use semver::Version; use semver::Version;
use serde::Serialize; use std::collections::HashSet;
use std::collections::VecDeque; use std::ops::Deref;
use std::sync::Arc;
use tokio::sync::Mutex;
static CLIENT: Lazy<Client> = Lazy::new(Client::default); #[derive(new)]
pub struct CrawlJob {
pub async fn crawl(
start_instances: Vec<String>,
exclude: Vec<String>,
max_depth: i32,
) -> Result<(Vec<InstanceDetails>, i32), Error> {
let mut pending_instances: VecDeque<CrawlInstance> = start_instances
.iter()
.map(|s| CrawlInstance::new(s.to_string(), 0))
.collect();
let min_lemmy_version = min_lemmy_version().await?;
let mut crawled_instances = vec![];
let mut instance_details = vec![];
let mut failed_instances = 0;
while let Some(current_instance) = pending_instances.pop_back() {
crawled_instances.push(current_instance.domain.clone());
if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) {
continue;
}
match fetch_instance_details(&current_instance.domain, &min_lemmy_version).await {
Ok(details) => {
instance_details.push(details.to_owned());
for i in details.linked_instances {
let is_in_crawled = crawled_instances.contains(&i);
let is_in_pending = pending_instances.iter().any(|p| p.domain == i);
if !is_in_crawled && !is_in_pending {
let ci = CrawlInstance::new(i, current_instance.depth + 1);
pending_instances.push_back(ci);
}
}
}
Err(e) => {
failed_instances += 1;
eprintln!("Failed to crawl {}: {}", current_instance.domain, e)
}
}
}
// Sort by active monthly users descending
instance_details.sort_by_key(|i| i.users_active_month);
instance_details.reverse();
Ok((instance_details, failed_instances))
}
#[derive(Serialize, Clone)]
pub struct InstanceDetails {
pub domain: String,
pub name: String,
pub description: Option<String>,
pub version: String,
pub icon: Option<String>,
pub online_users: i32,
pub total_users: i64,
pub users_active_halfyear: i64,
pub users_active_month: i64,
pub open_registrations: bool,
pub linked_instances_count: i32,
pub require_application: bool,
// The following fields are only used for aggregation, but not shown in output
#[serde(skip)]
pub linked_instances: Vec<String>,
}
struct CrawlInstance {
domain: String, domain: String,
depth: i32, current_distance: i32,
params: Arc<CrawlParams>,
} }
impl CrawlInstance { #[derive(new)]
pub fn new(domain: String, depth: i32) -> CrawlInstance { pub struct CrawlParams {
CrawlInstance { domain, depth } min_lemmy_version: Version,
} exclude_domains: Vec<String>,
max_depth: i32,
crawled_instances: Arc<Mutex<HashSet<String>>>,
} }
async fn fetch_instance_details( #[derive(Debug)]
domain: &str, pub struct CrawlResult {
min_lemmy_version: &Version, pub domain: String,
) -> Result<InstanceDetails, Error> { pub node_info: NodeInfo,
let client = Client::default(); pub site_info: Option<GetSiteResponse>,
}
let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain); impl CrawlJob {
let node_info_request = client.get(&node_info_url).timeout(REQUEST_TIMEOUT).send(); #[async_recursion]
pub async fn crawl(self) -> Vec<Result<CrawlResult, Error>> {
let site_info_url_v2 = format!("https://{}/api/v2/site", domain); // need to acquire and release mutex before recursing, otherwise it will deadlock
let site_info_request_v2 = client {
.get(&site_info_url_v2) let mut crawled_instances = self.params.crawled_instances.deref().lock().await;
.timeout(REQUEST_TIMEOUT) if crawled_instances.contains(&self.domain) {
.send(); return vec![];
let site_info_url_v3 = format!("https://{}/api/v3/site", domain);
let site_info_request_v3 = client
.get(&site_info_url_v3)
.timeout(REQUEST_TIMEOUT)
.send();
let (node_info, site_info_v2, site_info_v3) = try_join!(
node_info_request,
site_info_request_v2,
site_info_request_v3
)?;
let node_info: NodeInfo = node_info.json().await?;
if node_info.software.name != "lemmy" {
return Err(anyhow!("not a lemmy instance"));
}
let version = Version::parse(&node_info.software.version)?;
if &version < min_lemmy_version {
return Err(anyhow!("lemmy version is too old ({})", version));
}
let site_info_v2 = site_info_v2.json::<GetSiteResponse>().await.ok();
let site_info_v3 = site_info_v3.json::<GetSiteResponse>().await.ok();
let mut site_info: GetSiteResponse = if let Some(site_info_v2) = site_info_v2 {
site_info_v2
} else if let Some(site_info_v3) = site_info_v3 {
site_info_v3
} else { } else {
return Err(anyhow!("Failed to read site_info")); crawled_instances.insert(self.domain.clone());
}
}
if self.current_distance > self.params.max_depth
|| self.params.exclude_domains.contains(&self.domain)
{
return vec![];
}
debug!(
"Starting crawl for {}, distance {}",
&self.domain, &self.current_distance
);
let (node_info, site_info) = match self.fetch_instance_details().await {
Ok(o) => o,
Err(e) => return vec![Err(e)],
};
let mut crawl_result = CrawlResult {
domain: self.domain.clone(),
node_info,
site_info: None,
}; };
if let Some(description) = &site_info.site_view.site.description { if let Some(site_info) = site_info {
if description.len() > 150 { match Version::parse(&site_info.version) {
site_info.site_view.site.description = None; Ok(version) => {
if version < self.params.min_lemmy_version {
return vec![Ok(crawl_result)];
}
}
Err(e) => return vec![Err(e.into())],
}
let mut result = vec![];
if let Some(federated) = &site_info.federated_instances {
for domain in federated.linked.iter() {
let crawl_job = CrawlJob::new(
domain.clone(),
self.current_distance + 1,
self.params.clone(),
);
result.push(crawl_job.crawl());
} }
} }
let require_application = site_info let mut result2: Vec<Result<CrawlResult, Error>> =
.site_view join_all(result).await.into_iter().flatten().collect();
.site debug!("Successfully finished crawl for {}", &self.domain);
.require_application crawl_result.site_info = Some(site_info);
.unwrap_or(false); result2.push(Ok(crawl_result));
let linked_instances: Vec<String> = site_info
.federated_instances
.map(|f| f.linked)
.unwrap_or_default()
.iter()
.map(|l| l.to_lowercase())
.collect();
Ok(InstanceDetails {
domain: domain.to_owned(),
name: site_info.site_view.site.name,
description: site_info.site_view.site.description,
version: node_info.software.version,
icon: site_info.site_view.site.icon,
online_users: site_info.online as i32,
total_users: node_info.usage.users.total,
users_active_halfyear: node_info.usage.users.active_halfyear,
users_active_month: node_info.usage.users.active_month,
open_registrations: node_info.open_registrations,
linked_instances_count: linked_instances.len() as i32,
require_application,
linked_instances,
})
}
/// calculate minimum allowed lemmy version based on current version. in case of current version result2
/// 0.16.3, the minimum from this function is 0.15.3. this is to avoid rejecting all instances on } else {
/// the previous version when a major lemmy release is published. vec![Ok(crawl_result)]
async fn min_lemmy_version() -> Result<Version, Error> { }
let lemmy_version_url = "https://raw.githubusercontent.com/LemmyNet/lemmy-ansible/main/VERSION"; }
let req = CLIENT
.get(lemmy_version_url) async fn fetch_instance_details(&self) -> Result<(NodeInfo, Option<GetSiteResponse>), Error> {
.timeout(REQUEST_TIMEOUT) let rel_node_info: Url = Url::parse("http://nodeinfo.diaspora.software/ns/schema/2.0")
.expect("parse nodeinfo relation url");
let node_info_well_known = CLIENT
.get(&format!("https://{}/.well-known/nodeinfo", &self.domain))
.send() .send()
.await?
.json::<NodeInfoWellKnown>()
.await?; .await?;
let mut version = Version::parse(req.text().await?.trim())?; let node_info_url = node_info_well_known
version.minor -= 1; .links
Ok(version) .into_iter()
.find(|l| l.rel == rel_node_info)
.ok_or_else(|| anyhow!("failed to find nodeinfo link for {}", &self.domain))?
.href;
let node_info = CLIENT
.get(node_info_url)
.send()
.await?
.json::<NodeInfo>()
.await?;
let site_info = CLIENT
.get(&format!("https://{}/api/v3/site", &self.domain))
.send()
.await?
.json::<GetSiteResponse>()
.await
.ok();
Ok((node_info, site_info))
}
} }

View file

@ -1,28 +0,0 @@
use serde::Deserialize;
/// Hand-written subset of the JSON returned by a Lemmy instance's site API
/// endpoint; only the fields declared here are read during deserialization.
#[derive(Deserialize, Debug, Clone)]
pub struct GetSiteResponse {
// Wrapper holding the site's descriptive metadata.
pub site_view: SiteView,
// Value of the `online` field; presumably the number of users currently
// online -- confirm against the Lemmy API.
pub online: usize,
// Federation lists; `None` when the response does not carry them.
pub federated_instances: Option<FederatedInstances>,
}
/// Federation lists reported by an instance.
#[derive(Deserialize, Debug, Clone)]
pub struct FederatedInstances {
// Domains this instance is linked with; always present in the payload.
pub linked: Vec<String>,
// Optional list named `allowed`; presumably an allowlist of domains -- confirm.
pub allowed: Option<Vec<String>>,
// Optional list named `blocked`; presumably a blocklist of domains -- confirm.
pub blocked: Option<Vec<String>>,
}
/// Wrapper object around the `site` entry of the site API response.
#[derive(Deserialize, Debug, Clone)]
pub struct SiteView {
// Descriptive metadata of the site itself.
pub site: Site,
}
/// Descriptive metadata of a single site, as far as this crawler reads it.
#[derive(Deserialize, Debug, Clone)]
pub struct Site {
// Display name of the site (required in the payload).
pub name: String,
// Optional `icon` value; presumably an image URL -- confirm.
pub icon: Option<String>,
// Optional free-text description.
pub description: Option<String>,
// Optional flag; presumably whether sign-up needs an application -- confirm.
pub require_application: Option<bool>,
}

View file

@ -1,11 +1,138 @@
#[macro_use]
extern crate derive_new;
use crate::crawl::{CrawlJob, CrawlParams, CrawlResult};
use crate::node_info::{NodeInfo, NodeInfoUsage, NodeInfoUsers};
use anyhow::Error;
use futures::future::join_all;
use lemmy_api_common::site::GetSiteResponse;
use log::warn;
use once_cell::sync::Lazy;
use reqwest::{Client, ClientBuilder};
use semver::Version;
use serde::Serialize;
use std::collections::HashSet;
use std::sync::Arc;
use std::time::Duration; use std::time::Duration;
use tokio::sync::Mutex;
pub mod crawl; pub mod crawl;
pub mod federated_instances; mod node_info;
pub mod node_info;
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml";
pub const DEFAULT_MAX_CRAWL_DEPTH: &str = "20"; static CLIENT: Lazy<Client> = Lazy::new(|| {
pub const EXCLUDE_INSTANCES: &str = ClientBuilder::new()
"ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml"; .timeout(REQUEST_TIMEOUT)
.user_agent("lemmy-stats-crawler")
.build()
.expect("build reqwest client")
});
/// Serializable per-instance output record returned by `start_crawl`.
#[derive(Serialize, Debug)]
pub struct CrawlResult2 {
/// Domain of the crawled instance.
pub domain: String,
/// Site API response fetched from that instance.
pub site_info: GetSiteResponse,
/// Usage counts summed over this instance and the crawled instances it
/// links to; `None` when the site response has no federated-instance list
/// (see `calculate_federated_site_aggregates`).
pub federated_counts: Option<NodeInfoUsage>,
}
/// Crawls all `start_instances` concurrently and returns one record per
/// instance that exposed site info, ordered by monthly active users
/// (highest first).
///
/// * `start_instances` - domains used as roots of the crawl (distance 0).
/// * `exclude_domains` - domains handed to the crawl parameters as excluded.
/// * `max_distance` - maximum link distance from a start instance.
///
/// # Errors
///
/// Fails when the minimum Lemmy version cannot be determined or when the
/// federated aggregates cannot be computed. Failures of individual crawl
/// jobs are logged with `warn!` and skipped instead of aborting the crawl.
pub async fn start_crawl(
    start_instances: Vec<String>,
    exclude_domains: Vec<String>,
    max_distance: i32,
) -> Result<Vec<CrawlResult2>, Error> {
    let min_version = min_lemmy_version().await?;
    // One shared "already crawled" set so that no domain is visited twice.
    let visited = Arc::new(Mutex::new(HashSet::new()));
    let params = Arc::new(CrawlParams::new(
        min_version,
        exclude_domains,
        max_distance,
        visited,
    ));

    // Create one root job per start instance; nothing runs until awaited.
    let jobs: Vec<_> = start_instances
        .into_iter()
        .map(|domain| CrawlJob::new(domain, 0, Arc::clone(&params)).crawl())
        .collect();

    let mut successful: Vec<CrawlResult> = Vec::new();
    for outcome in join_all(jobs).await.into_iter().flatten() {
        match outcome {
            Ok(result) => successful.push(result),
            Err(e) => warn!("{}", e),
        }
    }

    let mut crawl_results = calculate_federated_site_aggregates(successful)?;

    // Sort ascending by active monthly users, then flip to get descending.
    crawl_results.sort_unstable_by_key(|entry| {
        entry
            .site_info
            .site_view
            .as_ref()
            .map_or(0, |view| view.counts.users_active_month)
    });
    crawl_results.reverse();
    Ok(crawl_results)
}
/// Calculate the minimum allowed Lemmy version based on the current version.
///
/// The current version is read from the `VERSION` file of the lemmy-ansible
/// repository and its minor component is lowered by one. For a current
/// version of 0.16.3 the minimum returned here is 0.15.3. This avoids
/// rejecting all instances on the previous version when a major Lemmy
/// release is published.
///
/// If the current minor component is already 0 it is left at 0 rather than
/// underflowing.
///
/// # Errors
///
/// Fails when the HTTP request fails or when the response body is not a
/// valid semantic version.
async fn min_lemmy_version() -> Result<Version, Error> {
    let lemmy_version_url = "https://raw.githubusercontent.com/LemmyNet/lemmy-ansible/main/VERSION";
    let response = CLIENT
        .get(lemmy_version_url)
        .timeout(REQUEST_TIMEOUT)
        .send()
        .await?;
    let mut version = Version::parse(response.text().await?.trim())?;
    // `minor` is unsigned: a plain `-= 1` panics in debug builds and wraps to
    // u64::MAX in release builds when upstream publishes an `x.0.y` version.
    version.minor = version.minor.saturating_sub(1);
    Ok(version)
}
fn calculate_federated_site_aggregates(
crawl_results: Vec<CrawlResult>,
) -> Result<Vec<CrawlResult2>, Error> {
let node_info: Vec<(String, NodeInfo)> = crawl_results
.iter()
.map(|c| (c.domain.clone(), c.node_info.clone()))
.collect();
let lemmy_instances: Vec<(String, GetSiteResponse)> = crawl_results
.into_iter()
.filter_map(|c| {
let domain = c.domain;
c.site_info.map(|c2| (domain, c2))
})
.collect();
let mut ret = vec![];
for instance in &lemmy_instances {
let federated_counts = if let Some(federated_instances) = &instance.1.federated_instances {
node_info
.iter()
.filter(|i| federated_instances.linked.contains(&i.0) || i.0 == instance.0)
.map(|i| i.1.usage.clone())
.reduce(|a, b| NodeInfoUsage {
users: NodeInfoUsers {
total: a.users.total + b.users.total,
active_halfyear: a.users.active_halfyear + b.users.active_halfyear,
active_month: a.users.active_month + b.users.active_month,
},
posts: a.posts + b.posts,
comments: a.comments + b.comments,
})
} else {
None
};
// TODO: workaround because GetSiteResponse doesnt implement clone
let site_info = serde_json::from_str(&serde_json::to_string(&instance.1)?)?;
ret.push(CrawlResult2 {
domain: instance.0.clone(),
site_info,
federated_counts,
});
}
Ok(ret)
}

View file

@ -1,73 +1,91 @@
use anyhow::Error; use anyhow::Error;
use clap::{App, Arg}; use lemmy_stats_crawler::{start_crawl, CrawlResult2};
use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
use lemmy_stats_crawler::{DEFAULT_MAX_CRAWL_DEPTH, DEFAULT_START_INSTANCES, EXCLUDE_INSTANCES};
use serde::Serialize; use serde::Serialize;
use structopt::StructOpt;
// Command line parameters of the crawler binary.
// NOTE: plain `//` comments are used on purpose; structopt turns `///` doc
// comments into CLI help text, so adding them would change program output.
#[derive(StructOpt, Debug)]
#[structopt()]
struct Parameters {
// Domains the crawl starts from; several values may be given in one
// delimiter-separated argument (`use_delimiter`). Defaults to "lemmy.ml".
#[structopt(short, long, use_delimiter = true, default_value = "lemmy.ml")]
start_instances: Vec<String>,
// Domains passed to the crawl as excluded; same delimiter handling as above.
#[structopt(
short,
long,
use_delimiter = true,
default_value = "ds9.lemmy.ml,enterprise.lemmy.ml,voyager.lemmy.ml,test.lemmy.ml"
)]
exclude_instances: Vec<String>,
// Maximum link distance from a start instance that is still crawled.
#[structopt(short, long, default_value = "20")]
max_crawl_distance: i32,
/// Silence all output
#[structopt(short, long)]
quiet: bool,
}
#[tokio::main] #[tokio::main]
pub async fn main() -> Result<(), Error> { pub async fn main() -> Result<(), Error> {
let matches = App::new("Lemmy Stats Crawler") let params = Parameters::from_args();
.arg(
Arg::with_name("start-instances") stderrlog::new()
.long("start-instances") .module(module_path!())
.takes_value(true), .quiet(params.quiet)
) .verbosity(1)
.arg(Arg::with_name("exclude").long("exclude").takes_value(true)) .init()?;
.arg(
Arg::with_name("max-crawl-depth")
.long("max-crawl-depth")
.takes_value(true),
)
.get_matches();
let start_instances: Vec<String> = matches
.value_of("start-instances")
.unwrap_or(DEFAULT_START_INSTANCES)
.split(',')
.map(|s| s.trim().to_string())
.collect();
let exclude: Vec<String> = matches
.value_of("exclude")
.unwrap_or(EXCLUDE_INSTANCES)
.split(',')
.map(|s| s.trim().to_string())
.collect();
let max_crawl_depth: i32 = matches
.value_of("max-crawl-depth")
.unwrap_or(DEFAULT_MAX_CRAWL_DEPTH)
.parse()?;
eprintln!("Crawling..."); eprintln!("Crawling...");
let (instance_details, failed_instances) = let instance_details = start_crawl(
crawl(start_instances, exclude, max_crawl_depth).await?; params.start_instances,
let total_stats = aggregate(instance_details, failed_instances); params.exclude_instances,
params.max_crawl_distance,
)
.await?;
let total_stats = aggregate(instance_details);
println!("{}", serde_json::to_string_pretty(&total_stats)?); println!("{}", serde_json::to_string_pretty(&total_stats)?);
Ok(()) Ok(())
} }
// TODO: lemmy stores these numbers in SiteAggregates, would be good to simply use that as a member
// (to avoid many members). but SiteAggregates also has id, site_id fields
#[derive(Serialize)] #[derive(Serialize)]
struct TotalStats { struct TotalStats {
crawled_instances: i32, crawled_instances: i32,
failed_instances: i32, online_users: usize,
total_users: i64, total_users: i64,
total_online_users: i32, users_active_day: i64,
instance_details: Vec<InstanceDetails>, users_active_week: i64,
users_active_month: i64,
users_active_halfyear: i64,
instance_details: Vec<CrawlResult2>,
} }
fn aggregate(instance_details: Vec<InstanceDetails>, failed_instances: i32) -> TotalStats { fn aggregate(instance_details: Vec<CrawlResult2>) -> TotalStats {
let mut crawled_instances = 0; let mut online_users = 0;
let mut total_users = 0; let mut total_users = 0;
let mut total_online_users = 0; let mut users_active_day = 0;
let mut users_active_week = 0;
let mut users_active_month = 0;
let mut users_active_halfyear = 0;
let mut crawled_instances = 0;
for i in &instance_details { for i in &instance_details {
crawled_instances += 1; crawled_instances += 1;
total_users += i.total_users; online_users += i.site_info.online;
total_online_users += i.online_users; if let Some(site_view) = &i.site_info.site_view {
total_users += site_view.counts.users;
users_active_day += site_view.counts.users_active_day;
users_active_week += site_view.counts.users_active_week;
users_active_month += site_view.counts.users_active_month;
users_active_halfyear += site_view.counts.users_active_half_year;
}
} }
TotalStats { TotalStats {
crawled_instances, crawled_instances,
failed_instances, online_users,
total_users, total_users,
total_online_users, users_active_day,
users_active_week,
users_active_halfyear,
users_active_month,
instance_details, instance_details,
} }
} }

View file

@ -1,6 +1,18 @@
use serde::Deserialize; use reqwest::Url;
use serde::{Deserialize, Serialize};
#[derive(Deserialize, Debug)] #[derive(Deserialize, Debug)]
pub struct NodeInfoWellKnown {
pub links: Vec<NodeInfoWellKnownLinks>,
}
#[derive(Deserialize, Debug)]
pub struct NodeInfoWellKnownLinks {
pub rel: Url,
pub href: Url,
}
#[derive(Deserialize, Debug, Clone)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct NodeInfo { pub struct NodeInfo {
pub version: String, pub version: String,
@ -10,22 +22,24 @@ pub struct NodeInfo {
pub open_registrations: bool, pub open_registrations: bool,
} }
#[derive(Deserialize, Debug)] #[derive(Deserialize, Debug, Clone)]
pub struct NodeInfoSoftware { pub struct NodeInfoSoftware {
pub name: String, pub name: String,
pub version: String, pub version: String,
} }
#[derive(Deserialize, Debug)] #[derive(Deserialize, Serialize, Debug, Clone, Default)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase", default)]
pub struct NodeInfoUsage { pub struct NodeInfoUsage {
pub users: NodeInfoUsers, pub users: NodeInfoUsers,
pub local_posts: i64, #[serde(rename(deserialize = "localPosts"))]
pub local_comments: i64, pub posts: i64,
#[serde(rename(deserialize = "localComments"))]
pub comments: i64,
} }
#[derive(Deserialize, Debug)] #[derive(Deserialize, Serialize, Debug, Clone, Default)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase", default)]
pub struct NodeInfoUsers { pub struct NodeInfoUsers {
pub total: i64, pub total: i64,
pub active_halfyear: i64, pub active_halfyear: i64,