diff --git a/src/crawl.rs b/src/crawl.rs index 86c77ed..8137594 100644 --- a/src/crawl.rs +++ b/src/crawl.rs @@ -1,5 +1,4 @@ use crate::CLIENT; -use crate::REQUEST_TIMEOUT; use anyhow::Error; use async_recursion::async_recursion; use futures::future::join_all; @@ -52,7 +51,10 @@ impl CrawlJob { return vec![]; } - debug!("Starting crawl for {}, distance {}", &self.domain, &self.current_distance); + debug!( + "Starting crawl for {}, distance {}", + &self.domain, &self.current_distance + ); let site_info = match self.fetch_instance_details().await { Ok(o) => o, Err(e) => return vec![Err(e)], @@ -65,8 +67,11 @@ impl CrawlJob { let mut result = vec![]; if let Some(federated) = &site_info.0.federated_instances { for domain in federated.linked.iter() { - let crawl_job = - CrawlJob::new(domain.clone(), self.current_distance + 1, self.params.clone()); + let crawl_job = CrawlJob::new( + domain.clone(), + self.current_distance + 1, + self.params.clone(), + ); result.push(crawl_job.crawl()); } } @@ -86,7 +91,6 @@ impl CrawlJob { let site_info_url = format!("https://{}/api/v3/site", &self.domain); let site_info = CLIENT .get(&site_info_url) - .timeout(REQUEST_TIMEOUT) .send() .await? .json::() diff --git a/src/lib.rs b/src/lib.rs index b655209..6a1d311 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,7 +5,7 @@ use crate::crawl::{CrawlJob, CrawlParams, InstanceDetails}; use anyhow::Error; use futures::future::join_all; use once_cell::sync::Lazy; -use reqwest::Client; +use reqwest::{Client, ClientBuilder}; use semver::Version; use std::collections::HashSet; use std::sync::Arc; @@ -14,9 +14,15 @@ use tokio::sync::Mutex; pub mod crawl; -pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); +const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); -static CLIENT: Lazy = Lazy::new(Client::default); +static CLIENT: Lazy = Lazy::new(|| { + ClientBuilder::new() + .timeout(REQUEST_TIMEOUT) + .user_agent("lemmy-stats-crawler") + .build() + .unwrap() +}); pub async fn start_crawl( start_instances: Vec, diff --git a/src/main.rs b/src/main.rs index 3973317..1ac6887 100644 --- a/src/main.rs +++ b/src/main.rs @@ -9,7 +9,11 @@ use structopt::StructOpt; struct Parameters { #[structopt(short, long, default_value = "lemmy.ml")] start_instances: Vec, - #[structopt(short, long, default_value = "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml")] + #[structopt( + short, + long, + default_value = "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml" + )] exclude_instances: Vec, #[structopt(short, long, default_value = "20")] max_crawl_distance: i32,