Merge pull request 'Add min version check to avoid including old instances' (#9) from min-version into main

Reviewed-on: https://yerbamate.ml/LemmyNet/lemmy-stats-crawler/pulls/9
This commit is contained in:
nutomic 2022-05-03 12:19:39 +00:00
commit 8509c19f50
3 changed files with 44 additions and 4 deletions

12
Cargo.lock generated
View file

@ -418,7 +418,9 @@ dependencies = [
"anyhow", "anyhow",
"clap", "clap",
"futures", "futures",
"once_cell",
"reqwest", "reqwest",
"semver",
"serde", "serde",
"serde_json", "serde_json",
"tokio", "tokio",
@ -521,9 +523,9 @@ dependencies = [
[[package]] [[package]]
name = "once_cell" name = "once_cell"
version = "1.7.2" version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "af8b08b04175473088b46763e51ee54da5f9a164bc162f615b91bc179dbf15a3" checksum = "87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9"
[[package]] [[package]]
name = "percent-encoding" name = "percent-encoding"
@ -680,6 +682,12 @@ dependencies = [
"untrusted", "untrusted",
] ]
[[package]]
name = "semver"
version = "1.0.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d65bd28f48be7196d222d95b9243287f48d27aca604e08497513019ff0502cc4"
[[package]] [[package]]
name = "serde" name = "serde"
version = "1.0.124" version = "1.0.124"

View file

@ -12,3 +12,5 @@ tokio = { version = "0.2.25", features = ["rt-threaded", "macros"] }
futures = "0.3.13" futures = "0.3.13"
serde_json = "1.0.64" serde_json = "1.0.64"
clap = "2.33.3" clap = "2.33.3"
semver = "1.0.7"
once_cell = "1.10.0"

View file

@ -4,10 +4,14 @@ use crate::REQUEST_TIMEOUT;
use anyhow::anyhow; use anyhow::anyhow;
use anyhow::Error; use anyhow::Error;
use futures::try_join; use futures::try_join;
use once_cell::sync::Lazy;
use reqwest::Client; use reqwest::Client;
use semver::Version;
use serde::Serialize; use serde::Serialize;
use std::collections::VecDeque; use std::collections::VecDeque;
static CLIENT: Lazy<Client> = Lazy::new(Client::default);
pub async fn crawl( pub async fn crawl(
start_instances: Vec<String>, start_instances: Vec<String>,
exclude: Vec<String>, exclude: Vec<String>,
@ -17,6 +21,7 @@ pub async fn crawl(
.iter() .iter()
.map(|s| CrawlInstance::new(s.to_string(), 0)) .map(|s| CrawlInstance::new(s.to_string(), 0))
.collect(); .collect();
let min_lemmy_version = min_lemmy_version().await?;
let mut crawled_instances = vec![]; let mut crawled_instances = vec![];
let mut instance_details = vec![]; let mut instance_details = vec![];
let mut failed_instances = 0; let mut failed_instances = 0;
@ -25,7 +30,7 @@ pub async fn crawl(
if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) { if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) {
continue; continue;
} }
match fetch_instance_details(&current_instance.domain).await { match fetch_instance_details(&current_instance.domain, &min_lemmy_version).await {
Ok(details) => { Ok(details) => {
instance_details.push(details.to_owned()); instance_details.push(details.to_owned());
for i in details.linked_instances { for i in details.linked_instances {
@ -81,7 +86,10 @@ impl CrawlInstance {
} }
} }
async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error> { async fn fetch_instance_details(
domain: &str,
min_lemmy_version: &Version,
) -> Result<InstanceDetails, Error> {
let client = Client::default(); let client = Client::default();
let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain); let node_info_url = format!("https://{}/nodeinfo/2.0.json", domain);
@ -104,6 +112,13 @@ async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error>
site_info_request_v3 site_info_request_v3
)?; )?;
let node_info: NodeInfo = node_info.json().await?; let node_info: NodeInfo = node_info.json().await?;
if node_info.software.name != "lemmy" {
return Err(anyhow!("not a lemmy instance"));
}
let version = Version::parse(&node_info.software.version)?;
if &version < min_lemmy_version {
return Err(anyhow!("lemmy version is too old ({})", version));
}
let site_info_v2 = site_info_v2.json::<GetSiteResponse>().await.ok(); let site_info_v2 = site_info_v2.json::<GetSiteResponse>().await.ok();
let site_info_v3 = site_info_v3.json::<GetSiteResponse>().await.ok(); let site_info_v3 = site_info_v3.json::<GetSiteResponse>().await.ok();
let mut site_info: GetSiteResponse = if let Some(site_info_v2) = site_info_v2 { let mut site_info: GetSiteResponse = if let Some(site_info_v2) = site_info_v2 {
@ -148,3 +163,18 @@ async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error>
linked_instances, linked_instances,
}) })
} }
/// Calculates the minimum Lemmy version an instance must run to be accepted by the crawler.
///
/// The current version is read from the `VERSION` file of the lemmy-ansible repository and its
/// minor component is lowered by one: for a current version of 0.16.3 the minimum returned by
/// this function is 0.15.3. This avoids rejecting all instances that are still on the previous
/// version right after a new Lemmy release is published.
///
/// # Errors
///
/// Returns an error if the request fails or times out, if the server answers with a non-success
/// status code, or if the response body is not a valid semver version string.
async fn min_lemmy_version() -> Result<Version, Error> {
    let lemmy_version_url = "https://raw.githubusercontent.com/LemmyNet/lemmy-ansible/main/VERSION";
    let response = CLIENT
        .get(lemmy_version_url)
        .timeout(REQUEST_TIMEOUT)
        .send()
        .await?
        // Fail on 4xx/5xx here rather than trying to parse an error page as a version.
        .error_for_status()?;
    let mut version = Version::parse(response.text().await?.trim())?;
    // `minor` is unsigned: a plain `-= 1` would panic (debug) or wrap around (release) when the
    // current release is `x.0.y`, so clamp at zero instead.
    version.minor = version.minor.saturating_sub(1);
    Ok(version)
}