Merge pull request 'Recursive, parallel crawl' (#11) from recursive-crawl into main
Reviewed-on: https://yerbamate.ml/LemmyNet/lemmy-stats-crawler/pulls/11
This commit is contained in:
commit
575672cbe3
5 changed files with 326 additions and 172 deletions
173
Cargo.lock
generated
173
Cargo.lock
generated
|
@ -205,6 +205,15 @@ dependencies = [
|
||||||
"memchr",
|
"memchr",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ansi_term"
|
||||||
|
version = "0.12.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2"
|
||||||
|
dependencies = [
|
||||||
|
"winapi",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "anyhow"
|
name = "anyhow"
|
||||||
version = "1.0.57"
|
version = "1.0.57"
|
||||||
|
@ -220,6 +229,17 @@ dependencies = [
|
||||||
"event-listener",
|
"event-listener",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "async-recursion"
|
||||||
|
version = "1.0.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2cda8f4bcc10624c4e85bc66b3f452cca98cfa5ca002dc83a16aad2367641bea"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2 1.0.37",
|
||||||
|
"quote 1.0.18",
|
||||||
|
"syn 1.0.92",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "async-trait"
|
name = "async-trait"
|
||||||
version = "0.1.53"
|
version = "0.1.53"
|
||||||
|
@ -433,26 +453,17 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clap"
|
name = "clap"
|
||||||
version = "3.1.15"
|
version = "2.34.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "85a35a599b11c089a7f49105658d089b8f2cf0882993c17daf6de15285c2c35d"
|
checksum = "a0610544180c38b88101fecf2dd634b174a62eef6946f84dfc6a7127512b381c"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
|
"ansi_term",
|
||||||
"atty",
|
"atty",
|
||||||
"bitflags",
|
"bitflags",
|
||||||
"clap_lex",
|
"strsim 0.8.0",
|
||||||
"indexmap",
|
|
||||||
"strsim",
|
|
||||||
"termcolor",
|
|
||||||
"textwrap",
|
"textwrap",
|
||||||
]
|
"unicode-width",
|
||||||
|
"vec_map",
|
||||||
[[package]]
|
|
||||||
name = "clap_lex"
|
|
||||||
version = "0.2.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "a37c35f1112dad5e6e0b1adaff798507497a18fceeb30cceb3bae7d1427b9213"
|
|
||||||
dependencies = [
|
|
||||||
"os_str_bytes",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -545,7 +556,7 @@ dependencies = [
|
||||||
"ident_case",
|
"ident_case",
|
||||||
"proc-macro2 1.0.37",
|
"proc-macro2 1.0.37",
|
||||||
"quote 1.0.18",
|
"quote 1.0.18",
|
||||||
"strsim",
|
"strsim 0.10.0",
|
||||||
"syn 1.0.92",
|
"syn 1.0.92",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -560,6 +571,17 @@ dependencies = [
|
||||||
"syn 1.0.92",
|
"syn 1.0.92",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "derive-new"
|
||||||
|
version = "0.5.9"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "3418329ca0ad70234b9735dc4ceed10af4df60eff9c8e7b06cb5e520d92c3535"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2 1.0.37",
|
||||||
|
"quote 1.0.18",
|
||||||
|
"syn 1.0.92",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "derive_more"
|
name = "derive_more"
|
||||||
version = "0.99.17"
|
version = "0.99.17"
|
||||||
|
@ -1008,6 +1030,15 @@ version = "0.11.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
|
checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "heck"
|
||||||
|
version = "0.3.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-segmentation",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "heck"
|
name = "heck"
|
||||||
version = "0.4.0"
|
version = "0.4.0"
|
||||||
|
@ -1295,14 +1326,18 @@ name = "lemmy-stats-crawler"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"clap",
|
"async-recursion",
|
||||||
|
"derive-new",
|
||||||
"futures",
|
"futures",
|
||||||
"lemmy_api_common",
|
"lemmy_api_common",
|
||||||
|
"log",
|
||||||
"once_cell",
|
"once_cell",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"semver",
|
"semver",
|
||||||
"serde",
|
"serde",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"stderrlog",
|
||||||
|
"structopt",
|
||||||
"tokio",
|
"tokio",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -1806,12 +1841,6 @@ dependencies = [
|
||||||
"vcpkg",
|
"vcpkg",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
|
||||||
name = "os_str_bytes"
|
|
||||||
version = "6.0.0"
|
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
||||||
checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64"
|
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "parking_lot"
|
name = "parking_lot"
|
||||||
version = "0.11.2"
|
version = "0.11.2"
|
||||||
|
@ -2040,6 +2069,30 @@ version = "0.1.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro-error"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro-error-attr",
|
||||||
|
"proc-macro2 1.0.37",
|
||||||
|
"quote 1.0.18",
|
||||||
|
"syn 1.0.92",
|
||||||
|
"version_check",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "proc-macro-error-attr"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2 1.0.37",
|
||||||
|
"quote 1.0.18",
|
||||||
|
"version_check",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "proc-macro2"
|
name = "proc-macro2"
|
||||||
version = "0.4.30"
|
version = "0.4.30"
|
||||||
|
@ -2587,6 +2640,19 @@ version = "0.5.2"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
|
checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "stderrlog"
|
||||||
|
version = "0.5.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "45a53e2eff3e94a019afa6265e8ee04cb05b9d33fe9f5078b14e4e391d155a38"
|
||||||
|
dependencies = [
|
||||||
|
"atty",
|
||||||
|
"chrono",
|
||||||
|
"log",
|
||||||
|
"termcolor",
|
||||||
|
"thread_local",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "string_cache"
|
name = "string_cache"
|
||||||
version = "0.8.4"
|
version = "0.8.4"
|
||||||
|
@ -2613,12 +2679,42 @@ dependencies = [
|
||||||
"quote 1.0.18",
|
"quote 1.0.18",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "strsim"
|
||||||
|
version = "0.8.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "8ea5119cdb4c55b55d432abb513a0429384878c15dde60cc77b1c99de1a95a6a"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strsim"
|
name = "strsim"
|
||||||
version = "0.10.0"
|
version = "0.10.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "structopt"
|
||||||
|
version = "0.3.26"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0c6b5c64445ba8094a6ab0c3cd2ad323e07171012d9c98b0b15651daf1787a10"
|
||||||
|
dependencies = [
|
||||||
|
"clap",
|
||||||
|
"lazy_static",
|
||||||
|
"structopt-derive",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "structopt-derive"
|
||||||
|
version = "0.4.18"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "dcb5ae327f9cc13b68763b5749770cb9e048a99bd9dfdfa58d0cf05d5f64afe0"
|
||||||
|
dependencies = [
|
||||||
|
"heck 0.3.3",
|
||||||
|
"proc-macro-error",
|
||||||
|
"proc-macro2 1.0.37",
|
||||||
|
"quote 1.0.18",
|
||||||
|
"syn 1.0.92",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "strum"
|
name = "strum"
|
||||||
version = "0.24.0"
|
version = "0.24.0"
|
||||||
|
@ -2631,7 +2727,7 @@ version = "0.24.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "6878079b17446e4d3eba6192bb0a2950d5b14f0ed8424b852310e5a94345d0ef"
|
checksum = "6878079b17446e4d3eba6192bb0a2950d5b14f0ed8424b852310e5a94345d0ef"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"heck",
|
"heck 0.4.0",
|
||||||
"proc-macro2 1.0.37",
|
"proc-macro2 1.0.37",
|
||||||
"quote 1.0.18",
|
"quote 1.0.18",
|
||||||
"rustversion",
|
"rustversion",
|
||||||
|
@ -2705,9 +2801,12 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "textwrap"
|
name = "textwrap"
|
||||||
version = "0.15.0"
|
version = "0.11.0"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "b1141d4d61095b28419e22cb0bbf02755f5e54e0526f97f1e3d1d160e60885fb"
|
checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060"
|
||||||
|
dependencies = [
|
||||||
|
"unicode-width",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thiserror"
|
name = "thiserror"
|
||||||
|
@ -2731,11 +2830,11 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "thread_local"
|
name = "thread_local"
|
||||||
version = "1.1.4"
|
version = "1.0.1"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "5516c27b78311c50bf42c071425c560ac799b11c30b31f87e3081965fe5e0180"
|
checksum = "d40c6d1b69745a6ec6fb1ca717914848da4b44ae29d9b3080cbee91d72a69b14"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"once_cell",
|
"lazy_static",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
@ -2918,9 +3017,9 @@ dependencies = [
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "tracing-subscriber"
|
name = "tracing-subscriber"
|
||||||
version = "0.3.11"
|
version = "0.3.6"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "4bc28f93baff38037f64e6f43d34cfa1605f27a49c34e8a04c5e78b0babf2596"
|
checksum = "77be66445c4eeebb934a7340f227bfe7b338173d3f8c00a60a5a58005c9faecf"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"sharded-slab",
|
"sharded-slab",
|
||||||
"thread_local",
|
"thread_local",
|
||||||
|
@ -2991,6 +3090,12 @@ dependencies = [
|
||||||
"tinyvec",
|
"tinyvec",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "unicode-segmentation"
|
||||||
|
version = "1.9.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "unicode-width"
|
name = "unicode-width"
|
||||||
version = "0.1.9"
|
version = "0.1.9"
|
||||||
|
@ -3062,6 +3167,12 @@ version = "0.2.15"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "vec_map"
|
||||||
|
version = "0.8.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "f1bddf1187be692e79c5ffeab891132dfb0f236ed36a43c7ed39f1165ee20191"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "version_check"
|
name = "version_check"
|
||||||
version = "0.9.4"
|
version = "0.9.4"
|
||||||
|
|
|
@ -11,7 +11,11 @@ anyhow = "1.0.57"
|
||||||
tokio = { version = "1.18.1", features = ["macros", "rt-multi-thread"] }
|
tokio = { version = "1.18.1", features = ["macros", "rt-multi-thread"] }
|
||||||
futures = "0.3.21"
|
futures = "0.3.21"
|
||||||
serde_json = "1.0.81"
|
serde_json = "1.0.81"
|
||||||
clap = "3.1.15"
|
|
||||||
semver = "1.0.9"
|
semver = "1.0.9"
|
||||||
once_cell = "1.10.0"
|
once_cell = "1.10.0"
|
||||||
lemmy_api_common = "0.16.0"
|
lemmy_api_common = "0.16.0"
|
||||||
|
async-recursion = "1.0.0"
|
||||||
|
log = "0.4.17"
|
||||||
|
derive-new = "0.5.9"
|
||||||
|
stderrlog = "0.5.1"
|
||||||
|
structopt = "0.3.26"
|
||||||
|
|
172
src/crawl.rs
172
src/crawl.rs
|
@ -1,65 +1,30 @@
|
||||||
|
use crate::CLIENT;
|
||||||
use crate::REQUEST_TIMEOUT;
|
use crate::REQUEST_TIMEOUT;
|
||||||
use anyhow::anyhow;
|
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
|
use async_recursion::async_recursion;
|
||||||
|
use futures::future::join_all;
|
||||||
use lemmy_api_common::site::GetSiteResponse;
|
use lemmy_api_common::site::GetSiteResponse;
|
||||||
use once_cell::sync::Lazy;
|
use log::debug;
|
||||||
use reqwest::Client;
|
|
||||||
use semver::Version;
|
use semver::Version;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use std::collections::VecDeque;
|
use std::collections::HashSet;
|
||||||
|
use std::ops::Deref;
|
||||||
|
use std::sync::Arc;
|
||||||
|
use tokio::sync::Mutex;
|
||||||
|
|
||||||
static CLIENT: Lazy<Client> = Lazy::new(Client::default);
|
#[derive(new)]
|
||||||
|
pub struct CrawlJob {
|
||||||
|
domain: String,
|
||||||
|
current_distance: i32,
|
||||||
|
params: Arc<CrawlParams>,
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn crawl(
|
#[derive(new)]
|
||||||
start_instances: Vec<String>,
|
pub struct CrawlParams {
|
||||||
exclude: Vec<String>,
|
min_lemmy_version: Version,
|
||||||
|
exclude_domains: Vec<String>,
|
||||||
max_depth: i32,
|
max_depth: i32,
|
||||||
) -> Result<(Vec<InstanceDetails>, i32), Error> {
|
crawled_instances: Arc<Mutex<HashSet<String>>>,
|
||||||
let mut pending_instances: VecDeque<CrawlInstance> = start_instances
|
|
||||||
.iter()
|
|
||||||
.map(|s| CrawlInstance::new(s.to_string(), 0))
|
|
||||||
.collect();
|
|
||||||
let min_lemmy_version = min_lemmy_version().await?;
|
|
||||||
let mut crawled_instances = vec![];
|
|
||||||
let mut instance_details = vec![];
|
|
||||||
let mut failed_instances = 0;
|
|
||||||
while let Some(current_instance) = pending_instances.pop_back() {
|
|
||||||
crawled_instances.push(current_instance.domain.clone());
|
|
||||||
if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
match fetch_instance_details(&current_instance.domain, &min_lemmy_version).await {
|
|
||||||
Ok(details) => {
|
|
||||||
if let Some(federated) = &details.site_info.federated_instances.as_ref() {
|
|
||||||
for i in &federated.linked {
|
|
||||||
let is_in_crawled = crawled_instances.contains(i);
|
|
||||||
let is_in_pending = pending_instances.iter().any(|p| &p.domain == i);
|
|
||||||
if !is_in_crawled && !is_in_pending {
|
|
||||||
let ci = CrawlInstance::new(i.clone(), current_instance.depth + 1);
|
|
||||||
pending_instances.push_back(ci);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
instance_details.push(details);
|
|
||||||
}
|
|
||||||
Err(e) => {
|
|
||||||
failed_instances += 1;
|
|
||||||
eprintln!("Failed to crawl {}: {}", current_instance.domain, e)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sort by active monthly users descending
|
|
||||||
instance_details.sort_by_key(|i| {
|
|
||||||
i.site_info
|
|
||||||
.site_view
|
|
||||||
.as_ref()
|
|
||||||
.map(|s| s.counts.users_active_month)
|
|
||||||
.unwrap_or(0)
|
|
||||||
});
|
|
||||||
instance_details.reverse();
|
|
||||||
|
|
||||||
Ok((instance_details, failed_instances))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Debug)]
|
#[derive(Serialize, Debug)]
|
||||||
|
@ -68,54 +33,65 @@ pub struct InstanceDetails {
|
||||||
pub site_info: GetSiteResponse,
|
pub site_info: GetSiteResponse,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct CrawlInstance {
|
impl CrawlJob {
|
||||||
domain: String,
|
#[async_recursion]
|
||||||
depth: i32,
|
pub async fn crawl(self) -> Vec<Result<InstanceDetails, Error>> {
|
||||||
}
|
// need to acquire and release mutex before recursing, otherwise it will deadlock
|
||||||
|
{
|
||||||
|
let mut crawled_instances = self.params.crawled_instances.deref().lock().await;
|
||||||
|
if crawled_instances.contains(&self.domain) {
|
||||||
|
return vec![];
|
||||||
|
} else {
|
||||||
|
crawled_instances.insert(self.domain.clone());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl CrawlInstance {
|
if self.current_distance > self.params.max_depth
|
||||||
pub fn new(domain: String, depth: i32) -> CrawlInstance {
|
|| self.params.exclude_domains.contains(&self.domain)
|
||||||
CrawlInstance { domain, depth }
|
{
|
||||||
}
|
return vec![];
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn fetch_instance_details(
|
debug!("Starting crawl for {}, distance {}", &self.domain, &self.current_distance);
|
||||||
domain: &str,
|
let site_info = match self.fetch_instance_details().await {
|
||||||
min_lemmy_version: &Version,
|
Ok(o) => o,
|
||||||
) -> Result<InstanceDetails, Error> {
|
Err(e) => return vec![Err(e)],
|
||||||
let client = Client::default();
|
};
|
||||||
|
|
||||||
let site_info_url = format!("https://{}/api/v3/site", domain);
|
if site_info.1 < self.params.min_lemmy_version {
|
||||||
let site_info = client
|
return vec![];
|
||||||
.get(&site_info_url)
|
}
|
||||||
.timeout(REQUEST_TIMEOUT)
|
|
||||||
.send()
|
|
||||||
.await?
|
|
||||||
.json::<GetSiteResponse>()
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let version = Version::parse(&site_info.version)?;
|
let mut result = vec![];
|
||||||
if &version < min_lemmy_version {
|
if let Some(federated) = &site_info.0.federated_instances {
|
||||||
return Err(anyhow!("lemmy version is too old ({})", version));
|
for domain in federated.linked.iter() {
|
||||||
|
let crawl_job =
|
||||||
|
CrawlJob::new(domain.clone(), self.current_distance + 1, self.params.clone());
|
||||||
|
result.push(crawl_job.crawl());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut result2: Vec<Result<InstanceDetails, Error>> =
|
||||||
|
join_all(result).await.into_iter().flatten().collect();
|
||||||
|
debug!("Successfully finished crawl for {}", &self.domain);
|
||||||
|
result2.push(Ok(InstanceDetails {
|
||||||
|
domain: self.domain,
|
||||||
|
site_info: site_info.0,
|
||||||
|
}));
|
||||||
|
|
||||||
|
result2
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(InstanceDetails {
|
async fn fetch_instance_details(&self) -> Result<(GetSiteResponse, Version), Error> {
|
||||||
domain: domain.to_owned(),
|
let site_info_url = format!("https://{}/api/v3/site", &self.domain);
|
||||||
site_info,
|
let site_info = CLIENT
|
||||||
})
|
.get(&site_info_url)
|
||||||
}
|
.timeout(REQUEST_TIMEOUT)
|
||||||
|
.send()
|
||||||
/// calculate minimum allowed lemmy version based on current version. in case of current version
|
.await?
|
||||||
/// 0.16.3, the minimum from this function is 0.15.3. this is to avoid rejecting all instances on
|
.json::<GetSiteResponse>()
|
||||||
/// the previous version when a major lemmy release is published.
|
.await?;
|
||||||
async fn min_lemmy_version() -> Result<Version, Error> {
|
let version = Version::parse(&site_info.version)?;
|
||||||
let lemmy_version_url = "https://raw.githubusercontent.com/LemmyNet/lemmy-ansible/main/VERSION";
|
Ok((site_info, version))
|
||||||
let req = CLIENT
|
}
|
||||||
.get(lemmy_version_url)
|
|
||||||
.timeout(REQUEST_TIMEOUT)
|
|
||||||
.send()
|
|
||||||
.await?;
|
|
||||||
let mut version = Version::parse(req.text().await?.trim())?;
|
|
||||||
version.minor -= 1;
|
|
||||||
Ok(version)
|
|
||||||
}
|
}
|
||||||
|
|
71
src/lib.rs
71
src/lib.rs
|
@ -1,9 +1,72 @@
|
||||||
|
#[macro_use]
|
||||||
|
extern crate derive_new;
|
||||||
|
|
||||||
|
use crate::crawl::{CrawlJob, CrawlParams, InstanceDetails};
|
||||||
|
use anyhow::Error;
|
||||||
|
use futures::future::join_all;
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
use reqwest::Client;
|
||||||
|
use semver::Version;
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::sync::Arc;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
use tokio::sync::Mutex;
|
||||||
|
|
||||||
pub mod crawl;
|
pub mod crawl;
|
||||||
|
|
||||||
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml";
|
|
||||||
pub const DEFAULT_MAX_CRAWL_DEPTH: &str = "20";
|
static CLIENT: Lazy<Client> = Lazy::new(Client::default);
|
||||||
pub const EXCLUDE_INSTANCES: &str =
|
|
||||||
"ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml";
|
pub async fn start_crawl(
|
||||||
|
start_instances: Vec<String>,
|
||||||
|
exclude_domains: Vec<String>,
|
||||||
|
max_distance: i32,
|
||||||
|
) -> Result<Vec<InstanceDetails>, Error> {
|
||||||
|
let params = Arc::new(CrawlParams::new(
|
||||||
|
min_lemmy_version().await?,
|
||||||
|
exclude_domains,
|
||||||
|
max_distance,
|
||||||
|
Arc::new(Mutex::new(HashSet::new())),
|
||||||
|
));
|
||||||
|
let mut jobs = vec![];
|
||||||
|
for domain in start_instances.into_iter() {
|
||||||
|
let job = CrawlJob::new(domain, 0, params.clone());
|
||||||
|
jobs.push(job.crawl());
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: log the errors
|
||||||
|
let mut instance_details: Vec<InstanceDetails> = join_all(jobs)
|
||||||
|
.await
|
||||||
|
.into_iter()
|
||||||
|
.flatten()
|
||||||
|
.filter_map(|r| r.ok())
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Sort by active monthly users descending
|
||||||
|
instance_details.sort_unstable_by_key(|i| {
|
||||||
|
i.site_info
|
||||||
|
.site_view
|
||||||
|
.as_ref()
|
||||||
|
.map(|s| s.counts.users_active_month)
|
||||||
|
.unwrap_or(0)
|
||||||
|
});
|
||||||
|
instance_details.reverse();
|
||||||
|
|
||||||
|
Ok(instance_details)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// calculate minimum allowed lemmy version based on current version. in case of current version
|
||||||
|
/// 0.16.3, the minimum from this function is 0.15.3. this is to avoid rejecting all instances on
|
||||||
|
/// the previous version when a major lemmy release is published.
|
||||||
|
async fn min_lemmy_version() -> Result<Version, Error> {
|
||||||
|
let lemmy_version_url = "https://raw.githubusercontent.com/LemmyNet/lemmy-ansible/main/VERSION";
|
||||||
|
let req = CLIENT
|
||||||
|
.get(lemmy_version_url)
|
||||||
|
.timeout(REQUEST_TIMEOUT)
|
||||||
|
.send()
|
||||||
|
.await?;
|
||||||
|
let mut version = Version::parse(req.text().await?.trim())?;
|
||||||
|
version.minor -= 1;
|
||||||
|
Ok(version)
|
||||||
|
}
|
||||||
|
|
76
src/main.rs
76
src/main.rs
|
@ -1,54 +1,55 @@
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use clap::{Arg, Command};
|
use lemmy_stats_crawler::crawl::InstanceDetails;
|
||||||
use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
|
use lemmy_stats_crawler::start_crawl;
|
||||||
use lemmy_stats_crawler::{DEFAULT_MAX_CRAWL_DEPTH, DEFAULT_START_INSTANCES, EXCLUDE_INSTANCES};
|
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
|
use structopt::StructOpt;
|
||||||
|
|
||||||
|
#[derive(StructOpt, Debug)]
|
||||||
|
#[structopt()]
|
||||||
|
struct Parameters {
|
||||||
|
#[structopt(short, long, default_value = "lemmy.ml")]
|
||||||
|
start_instances: Vec<String>,
|
||||||
|
#[structopt(short, long, default_value = "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml")]
|
||||||
|
exclude_instances: Vec<String>,
|
||||||
|
#[structopt(short, long, default_value = "20")]
|
||||||
|
max_crawl_distance: i32,
|
||||||
|
|
||||||
|
/// Silence all output
|
||||||
|
#[structopt(short, long)]
|
||||||
|
quiet: bool,
|
||||||
|
/// Verbose mode (-v, -vv, -vvv, etc)
|
||||||
|
#[structopt(short = "v", long = "verbose", parse(from_occurrences))]
|
||||||
|
verbose: usize,
|
||||||
|
}
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
pub async fn main() -> Result<(), Error> {
|
pub async fn main() -> Result<(), Error> {
|
||||||
let matches = Command::new("Lemmy Stats Crawler")
|
let params = Parameters::from_args();
|
||||||
.arg(
|
|
||||||
Arg::new("start-instances")
|
stderrlog::new()
|
||||||
.long("start-instances")
|
.module(module_path!())
|
||||||
.takes_value(true),
|
.quiet(params.quiet)
|
||||||
)
|
.verbosity(params.verbose)
|
||||||
.arg(Arg::new("exclude").long("exclude").takes_value(true))
|
.init()?;
|
||||||
.arg(
|
|
||||||
Arg::new("max-crawl-depth")
|
|
||||||
.long("max-crawl-depth")
|
|
||||||
.takes_value(true),
|
|
||||||
)
|
|
||||||
.get_matches();
|
|
||||||
let start_instances: Vec<String> = matches
|
|
||||||
.value_of("start-instances")
|
|
||||||
.unwrap_or(DEFAULT_START_INSTANCES)
|
|
||||||
.split(',')
|
|
||||||
.map(|s| s.trim().to_string())
|
|
||||||
.collect();
|
|
||||||
let exclude: Vec<String> = matches
|
|
||||||
.value_of("exclude")
|
|
||||||
.unwrap_or(EXCLUDE_INSTANCES)
|
|
||||||
.split(',')
|
|
||||||
.map(|s| s.trim().to_string())
|
|
||||||
.collect();
|
|
||||||
let max_crawl_depth: i32 = matches
|
|
||||||
.value_of("max-crawl-depth")
|
|
||||||
.unwrap_or(DEFAULT_MAX_CRAWL_DEPTH)
|
|
||||||
.parse()?;
|
|
||||||
|
|
||||||
eprintln!("Crawling...");
|
eprintln!("Crawling...");
|
||||||
let (instance_details, failed_instances) =
|
let instance_details = start_crawl(
|
||||||
crawl(start_instances, exclude, max_crawl_depth).await?;
|
params.start_instances,
|
||||||
let total_stats = aggregate(instance_details, failed_instances);
|
params.exclude_instances,
|
||||||
|
params.max_crawl_distance,
|
||||||
|
)
|
||||||
|
.await?;
|
||||||
|
let total_stats = aggregate(instance_details);
|
||||||
|
|
||||||
println!("{}", serde_json::to_string_pretty(&total_stats)?);
|
println!("{}", serde_json::to_string_pretty(&total_stats)?);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO: lemmy stores these numbers in SiteAggregates, would be good to simply use that as a member
|
||||||
|
// (to avoid many members). but SiteAggregates also has id, site_id fields
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
struct TotalStats {
|
struct TotalStats {
|
||||||
crawled_instances: i32,
|
crawled_instances: i32,
|
||||||
failed_instances: i32,
|
|
||||||
online_users: usize,
|
online_users: usize,
|
||||||
total_users: i64,
|
total_users: i64,
|
||||||
users_active_day: i64,
|
users_active_day: i64,
|
||||||
|
@ -58,7 +59,7 @@ struct TotalStats {
|
||||||
instance_details: Vec<InstanceDetails>,
|
instance_details: Vec<InstanceDetails>,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn aggregate(instance_details: Vec<InstanceDetails>, failed_instances: i32) -> TotalStats {
|
fn aggregate(instance_details: Vec<InstanceDetails>) -> TotalStats {
|
||||||
let mut online_users = 0;
|
let mut online_users = 0;
|
||||||
let mut total_users = 0;
|
let mut total_users = 0;
|
||||||
let mut users_active_day = 0;
|
let mut users_active_day = 0;
|
||||||
|
@ -79,7 +80,6 @@ fn aggregate(instance_details: Vec<InstanceDetails>, failed_instances: i32) -> T
|
||||||
}
|
}
|
||||||
TotalStats {
|
TotalStats {
|
||||||
crawled_instances,
|
crawled_instances,
|
||||||
failed_instances,
|
|
||||||
online_users,
|
online_users,
|
||||||
total_users,
|
total_users,
|
||||||
users_active_day,
|
users_active_day,
|
||||||
|
|
Loading…
Reference in a new issue