Add user agent

Felix Ableitner 2022-05-13 16:57:53 +02:00
parent 575672cbe3
commit 21cf61f847
3 changed files with 23 additions and 9 deletions

File 1 of 3

@@ -1,5 +1,4 @@
 use crate::CLIENT;
-use crate::REQUEST_TIMEOUT;
 use anyhow::Error;
 use async_recursion::async_recursion;
 use futures::future::join_all;
@@ -52,7 +51,10 @@ impl CrawlJob {
             return vec![];
         }
-        debug!("Starting crawl for {}, distance {}", &self.domain, &self.current_distance);
+        debug!(
+            "Starting crawl for {}, distance {}",
+            &self.domain, &self.current_distance
+        );
         let site_info = match self.fetch_instance_details().await {
             Ok(o) => o,
             Err(e) => return vec![Err(e)],
         };
@@ -65,8 +67,11 @@ impl CrawlJob {
         let mut result = vec![];
         if let Some(federated) = &site_info.0.federated_instances {
             for domain in federated.linked.iter() {
-                let crawl_job =
-                    CrawlJob::new(domain.clone(), self.current_distance + 1, self.params.clone());
+                let crawl_job = CrawlJob::new(
+                    domain.clone(),
+                    self.current_distance + 1,
+                    self.params.clone(),
+                );
                 result.push(crawl_job.crawl());
             }
         }
@@ -86,7 +91,6 @@ impl CrawlJob {
         let site_info_url = format!("https://{}/api/v3/site", &self.domain);
         let site_info = CLIENT
             .get(&site_info_url)
-            .timeout(REQUEST_TIMEOUT)
             .send()
             .await?
             .json::<GetSiteResponse>()

File 2 of 3

@@ -5,7 +5,7 @@ use crate::crawl::{CrawlJob, CrawlParams, InstanceDetails};
 use anyhow::Error;
 use futures::future::join_all;
 use once_cell::sync::Lazy;
-use reqwest::Client;
+use reqwest::{Client, ClientBuilder};
 use semver::Version;
 use std::collections::HashSet;
 use std::sync::Arc;
@@ -14,9 +14,15 @@ use tokio::sync::Mutex;
 
 pub mod crawl;
 
-pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
 
-static CLIENT: Lazy<Client> = Lazy::new(Client::default);
+static CLIENT: Lazy<Client> = Lazy::new(|| {
+    ClientBuilder::new()
+        .timeout(REQUEST_TIMEOUT)
+        .user_agent("lemmy-stats-crawler")
+        .build()
+        .unwrap()
+});
 
 pub async fn start_crawl(
     start_instances: Vec<String>,

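For reference, a minimal standalone sketch of the pattern this commit adopts: reqwest's ClientBuilder::timeout and ClientBuilder::user_agent set client-wide defaults, so every request sent through the shared CLIENT gets the timeout and the User-Agent header automatically, which is why the per-request .timeout(REQUEST_TIMEOUT) call is removed in the first file. The URL and the tokio runtime below are illustration-only assumptions, not part of this commit.

// Sketch only (not part of the diff): a shared client with builder-level
// defaults, mirroring the CLIENT defined above. Assumes the reqwest,
// once_cell, and tokio crates; the URL is a placeholder.
use once_cell::sync::Lazy;
use reqwest::{Client, ClientBuilder};
use std::time::Duration;

static CLIENT: Lazy<Client> = Lazy::new(|| {
    ClientBuilder::new()
        // Applies to every request made through this client.
        .timeout(Duration::from_secs(10))
        // Sent as the User-Agent header on every request.
        .user_agent("lemmy-stats-crawler")
        .build()
        .expect("failed to build HTTP client")
});

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // No per-request .timeout() needed; the client default covers it.
    let status = CLIENT
        .get("https://example.com/api/v3/site") // placeholder URL
        .send()
        .await?
        .status();
    println!("{status}");
    Ok(())
}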
File 3 of 3

@@ -9,7 +9,11 @@ use structopt::StructOpt;
 struct Parameters {
     #[structopt(short, long, default_value = "lemmy.ml")]
     start_instances: Vec<String>,
-    #[structopt(short, long, default_value = "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml")]
+    #[structopt(
+        short,
+        long,
+        default_value = "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml"
+    )]
     exclude_instances: Vec<String>,
     #[structopt(short, long, default_value = "20")]
     max_crawl_distance: i32,