Use lemmy structs

This commit is contained in:
Felix Ableitner 2022-05-04 01:04:35 +02:00
parent 8509c19f50
commit 2e2a4888d0
7 changed files with 2602 additions and 515 deletions

2882
Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -5,12 +5,13 @@ authors = ["Felix Ableitner"]
edition = "2018"
[dependencies]
reqwest = { version = "0.10.10", default-features = false, features = ["json", "rustls-tls"] }
serde = { version = "1.0.123", features = ["derive"] }
anyhow = "1.0.38"
tokio = { version = "0.2.25", features = ["rt-threaded", "macros"] }
futures = "0.3.13"
serde_json = "1.0.64"
clap = "2.33.3"
semver = "1.0.7"
reqwest = { version = "0.11.10", default-features = false, features = ["json", "rustls-tls"] }
serde = { version = "1.0.137", features = ["derive"] }
anyhow = "1.0.57"
tokio = { version = "1.18.1", features = ["macros", "rt-multi-thread"] }
futures = "0.3.21"
serde_json = "1.0.81"
clap = "3.1.15"
semver = "1.0.9"
once_cell = "1.10.0"
lemmy_api_common = "0.16.0"

View file

@ -1,9 +1,7 @@
use crate::federated_instances::GetSiteResponse;
use crate::node_info::NodeInfo;
use crate::REQUEST_TIMEOUT;
use anyhow::anyhow;
use anyhow::Error;
use futures::try_join;
use lemmy_api_common::site::GetSiteResponse;
use once_cell::sync::Lazy;
use reqwest::Client;
use semver::Version;
@ -32,15 +30,17 @@ pub async fn crawl(
}
match fetch_instance_details(&current_instance.domain, &min_lemmy_version).await {
Ok(details) => {
instance_details.push(details.to_owned());
for i in details.linked_instances {
let is_in_crawled = crawled_instances.contains(&i);
let is_in_pending = pending_instances.iter().any(|p| p.domain == i);
if !is_in_crawled && !is_in_pending {
let ci = CrawlInstance::new(i, current_instance.depth + 1);
pending_instances.push_back(ci);
if let Some(federated) = &details.site_info.federated_instances.as_ref() {
for i in &federated.linked {
let is_in_crawled = crawled_instances.contains(i);
let is_in_pending = pending_instances.iter().any(|p| &p.domain == i);
if !is_in_crawled && !is_in_pending {
let ci = CrawlInstance::new(i.clone(), current_instance.depth + 1);
pending_instances.push_back(ci);
}
}
}
instance_details.push(details);
}
Err(e) => {
failed_instances += 1;
@ -50,29 +50,22 @@ pub async fn crawl(
}
// Sort by active monthly users descending
instance_details.sort_by_key(|i| i.users_active_month);
instance_details.sort_by_key(|i| {
i.site_info
.site_view
.as_ref()
.map(|s| s.counts.users_active_month)
.unwrap_or(0)
});
instance_details.reverse();
Ok((instance_details, failed_instances))
}
#[derive(Serialize, Clone)]
#[derive(Serialize, Debug)]
pub struct InstanceDetails {
pub domain: String,
pub name: String,
pub description: Option<String>,
pub version: String,
pub icon: Option<String>,
pub online_users: i32,
pub total_users: i64,
pub users_active_halfyear: i64,
pub users_active_month: i64,
pub open_registrations: bool,
pub linked_instances_count: i32,
pub require_application: bool,
// The following fields are only used for aggregation, but not shown in output
#[serde(skip)]
pub linked_instances: Vec<String>,
pub site_info: GetSiteResponse,
}
struct CrawlInstance {
@ -92,75 +85,23 @@ async fn fetch_instance_details(
) -> Result<InstanceDetails, Error> {
let client = Client::default();
let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain);
let node_info_request = client.get(&node_info_url).timeout(REQUEST_TIMEOUT).send();
let site_info_url_v2 = format!("https://{}/api/v2/site", domain);
let site_info_request_v2 = client
.get(&site_info_url_v2)
let site_info_url = format!("https://{}/api/v3/site", domain);
let site_info = client
.get(&site_info_url)
.timeout(REQUEST_TIMEOUT)
.send();
let site_info_url_v3 = format!("https://{}/api/v3/site", domain);
let site_info_request_v3 = client
.get(&site_info_url_v3)
.timeout(REQUEST_TIMEOUT)
.send();
.send()
.await?
.json::<GetSiteResponse>()
.await?;
let (node_info, site_info_v2, site_info_v3) = try_join!(
node_info_request,
site_info_request_v2,
site_info_request_v3
)?;
let node_info: NodeInfo = node_info.json().await?;
if node_info.software.name != "lemmy" {
return Err(anyhow!("not a lemmy instance"));
}
let version = Version::parse(&node_info.software.version)?;
let version = Version::parse(&site_info.version)?;
if &version < min_lemmy_version {
return Err(anyhow!("lemmy version is too old ({})", version));
}
let site_info_v2 = site_info_v2.json::<GetSiteResponse>().await.ok();
let site_info_v3 = site_info_v3.json::<GetSiteResponse>().await.ok();
let mut site_info: GetSiteResponse = if let Some(site_info_v2) = site_info_v2 {
site_info_v2
} else if let Some(site_info_v3) = site_info_v3 {
site_info_v3
} else {
return Err(anyhow!("Failed to read site_info"));
};
if let Some(description) = &site_info.site_view.site.description {
if description.len() > 150 {
site_info.site_view.site.description = None;
}
}
let require_application = site_info
.site_view
.site
.require_application
.unwrap_or(false);
let linked_instances: Vec<String> = site_info
.federated_instances
.map(|f| f.linked)
.unwrap_or_default()
.iter()
.map(|l| l.to_lowercase())
.collect();
Ok(InstanceDetails {
domain: domain.to_owned(),
name: site_info.site_view.site.name,
description: site_info.site_view.site.description,
version: node_info.software.version,
icon: site_info.site_view.site.icon,
online_users: site_info.online as i32,
total_users: node_info.usage.users.total,
users_active_halfyear: node_info.usage.users.active_halfyear,
users_active_month: node_info.usage.users.active_month,
open_registrations: node_info.open_registrations,
linked_instances_count: linked_instances.len() as i32,
require_application,
linked_instances,
site_info,
})
}

View file

@ -1,28 +0,0 @@
use serde::Deserialize;
/// Subset of the JSON body that the crawler reads from an instance's
/// `/api/v2/site` or `/api/v3/site` endpoint.
///
/// Only the fields listed here are deserialized.
#[derive(Deserialize, Debug, Clone)]
pub struct GetSiteResponse {
/// Wrapper holding the site's descriptive metadata.
pub site_view: SiteView,
/// Reported by the crawler as the instance's online-user count.
pub online: usize,
/// Federation lists; the crawler treats `None` as "no linked instances".
pub federated_instances: Option<FederatedInstances>,
}
/// Federation lists carried inside [`GetSiteResponse`].
#[derive(Deserialize, Debug, Clone)]
pub struct FederatedInstances {
/// Domains this instance links to; the crawler queues these as further
/// instances to visit.
pub linked: Vec<String>,
/// NOTE(review): presumably the instance's federation allowlist; it is not
/// read by the crawler code visible here -- confirm before relying on it.
pub allowed: Option<Vec<String>>,
/// NOTE(review): presumably the instance's federation blocklist; it is not
/// read by the crawler code visible here -- confirm before relying on it.
pub blocked: Option<Vec<String>>,
}
/// Wrapper object around [`Site`] inside [`GetSiteResponse`].
#[derive(Deserialize, Debug, Clone)]
pub struct SiteView {
/// The site's descriptive metadata.
pub site: Site,
}
/// Descriptive metadata of a single instance, as read from its site endpoint.
#[derive(Deserialize, Debug, Clone)]
pub struct Site {
/// Instance name, copied into the crawler's per-instance output.
pub name: String,
/// Optional icon value, passed through unchanged by the crawler.
pub icon: Option<String>,
/// Optional description; the crawler discards it when `len()` exceeds 150.
pub description: Option<String>,
/// Optional because a response may omit it; the crawler falls back to
/// `false` in that case.
pub require_application: Option<bool>,
}

View file

@ -1,8 +1,6 @@
use std::time::Duration;
pub mod crawl;
pub mod federated_instances;
pub mod node_info;
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml";

View file

@ -1,20 +1,20 @@
use anyhow::Error;
use clap::{App, Arg};
use clap::{Arg, Command};
use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
use lemmy_stats_crawler::{DEFAULT_MAX_CRAWL_DEPTH, DEFAULT_START_INSTANCES, EXCLUDE_INSTANCES};
use serde::Serialize;
#[tokio::main]
pub async fn main() -> Result<(), Error> {
let matches = App::new("Lemmy Stats Crawler")
let matches = Command::new("Lemmy Stats Crawler")
.arg(
Arg::with_name("start-instances")
Arg::new("start-instances")
.long("start-instances")
.takes_value(true),
)
.arg(Arg::with_name("exclude").long("exclude").takes_value(true))
.arg(Arg::new("exclude").long("exclude").takes_value(true))
.arg(
Arg::with_name("max-crawl-depth")
Arg::new("max-crawl-depth")
.long("max-crawl-depth")
.takes_value(true),
)
@ -49,25 +49,43 @@ pub async fn main() -> Result<(), Error> {
struct TotalStats {
crawled_instances: i32,
failed_instances: i32,
online_users: usize,
total_users: i64,
total_online_users: i32,
users_active_day: i64,
users_active_week: i64,
users_active_month: i64,
users_active_halfyear: i64,
instance_details: Vec<InstanceDetails>,
}
fn aggregate(instance_details: Vec<InstanceDetails>, failed_instances: i32) -> TotalStats {
let mut crawled_instances = 0;
let mut online_users = 0;
let mut total_users = 0;
let mut total_online_users = 0;
let mut users_active_day = 0;
let mut users_active_week = 0;
let mut users_active_month = 0;
let mut users_active_halfyear = 0;
let mut crawled_instances = 0;
for i in &instance_details {
crawled_instances += 1;
total_users += i.total_users;
total_online_users += i.online_users;
online_users += i.site_info.online;
if let Some(site_view) = &i.site_info.site_view {
total_users += site_view.counts.users;
users_active_day += site_view.counts.users_active_day;
users_active_week += site_view.counts.users_active_week;
users_active_month += site_view.counts.users_active_month;
users_active_halfyear += site_view.counts.users_active_half_year;
}
}
TotalStats {
crawled_instances,
failed_instances,
online_users,
total_users,
total_online_users,
users_active_day,
users_active_week,
users_active_halfyear,
users_active_month,
instance_details,
}
}

View file

@ -1,33 +0,0 @@
use serde::Deserialize;
/// Document the crawler fetches from an instance's `/nodeinfo/2.0.json`.
///
/// `rename_all = "camelCase"` means `open_registrations` is read from the
/// JSON key `openRegistrations`.
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct NodeInfo {
/// NOTE(review): presumably the nodeinfo schema version, as opposed to
/// `software.version` -- confirm against the nodeinfo specification.
pub version: String,
/// Name and version of the server software; the crawler rejects any
/// instance whose `software.name` is not `"lemmy"`.
pub software: NodeInfoSoftware,
/// Deserialized but not read by the crawler code visible here.
pub protocols: Vec<String>,
/// Usage statistics (user and local content counters).
pub usage: NodeInfoUsage,
/// Copied into the crawler's per-instance output.
pub open_registrations: bool,
}
/// The `software` object of a [`NodeInfo`] document.
#[derive(Deserialize, Debug)]
pub struct NodeInfoSoftware {
/// Software identifier; the crawler compares it against `"lemmy"`.
pub name: String,
/// Software version string; the crawler parses it as a semver `Version`
/// and compares it with the minimum supported Lemmy version.
pub version: String,
}
/// The `usage` object of a [`NodeInfo`] document.
///
/// With `rename_all = "camelCase"` the counters are read from the JSON keys
/// `localPosts` and `localComments`.
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct NodeInfoUsage {
/// User counters (total and active users).
pub users: NodeInfoUsers,
/// Deserialized but not read by the crawler code visible here.
pub local_posts: i64,
/// Deserialized but not read by the crawler code visible here.
pub local_comments: i64,
}
/// The `usage.users` object of a [`NodeInfo`] document.
///
/// With `rename_all = "camelCase"` the fields are read from the JSON keys
/// `total`, `activeHalfyear` and `activeMonth`.
#[derive(Deserialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct NodeInfoUsers {
/// Total user count, reported by the crawler as `total_users`.
pub total: i64,
/// Reported by the crawler as `users_active_halfyear`.
pub active_halfyear: i64,
/// Reported by the crawler as `users_active_month`; also the key the
/// crawler sorts instances by.
pub active_month: i64,
}