Add user agent
parent 575672cbe3
commit 21cf61f847
3 changed files with 23 additions and 9 deletions
src/crawl.rs (14 changes)
@@ -1,5 +1,4 @@
 use crate::CLIENT;
-use crate::REQUEST_TIMEOUT;
 use anyhow::Error;
 use async_recursion::async_recursion;
 use futures::future::join_all;
@@ -52,7 +51,10 @@ impl CrawlJob {
             return vec![];
         }
 
-        debug!("Starting crawl for {}, distance {}", &self.domain, &self.current_distance);
+        debug!(
+            "Starting crawl for {}, distance {}",
+            &self.domain, &self.current_distance
+        );
         let site_info = match self.fetch_instance_details().await {
             Ok(o) => o,
             Err(e) => return vec![Err(e)],
@@ -65,8 +67,11 @@ impl CrawlJob {
         let mut result = vec![];
         if let Some(federated) = &site_info.0.federated_instances {
             for domain in federated.linked.iter() {
-                let crawl_job =
-                    CrawlJob::new(domain.clone(), self.current_distance + 1, self.params.clone());
+                let crawl_job = CrawlJob::new(
+                    domain.clone(),
+                    self.current_distance + 1,
+                    self.params.clone(),
+                );
                 result.push(crawl_job.crawl());
             }
         }
@@ -86,7 +91,6 @@ impl CrawlJob {
         let site_info_url = format!("https://{}/api/v3/site", &self.domain);
         let site_info = CLIENT
            .get(&site_info_url)
-            .timeout(REQUEST_TIMEOUT)
            .send()
            .await?
            .json::<GetSiteResponse>()
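Note on the last hunk: with the timeout now configured on the shared CLIENT (see the src/lib.rs changes below), the per-request .timeout(REQUEST_TIMEOUT) call becomes redundant, because reqwest applies a client-level timeout to every request sent through that client. A minimal standalone sketch of that behavior, assuming the reqwest and tokio crates as dependencies; the example URL is a placeholder, not from this commit:

use reqwest::ClientBuilder;
use std::time::Duration;

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // The timeout is configured once, on the client itself.
    let client = ClientBuilder::new()
        .timeout(Duration::from_secs(10))
        .build()?;

    // Every request sent through this client inherits the 10-second
    // timeout, so no per-request .timeout() call is needed. Setting a
    // per-request timeout is only useful to override the client-wide
    // value for a single call.
    let status = client.get("https://example.com").send().await?.status();
    println!("{}", status);
    Ok(())
}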
src/lib.rs (12 changes)
@@ -5,7 +5,7 @@ use crate::crawl::{CrawlJob, CrawlParams, InstanceDetails};
 use anyhow::Error;
 use futures::future::join_all;
 use once_cell::sync::Lazy;
-use reqwest::Client;
+use reqwest::{Client, ClientBuilder};
 use semver::Version;
 use std::collections::HashSet;
 use std::sync::Arc;
@@ -14,9 +14,15 @@ use tokio::sync::Mutex;
 
 pub mod crawl;
 
-pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
 
-static CLIENT: Lazy<Client> = Lazy::new(Client::default);
+static CLIENT: Lazy<Client> = Lazy::new(|| {
+    ClientBuilder::new()
+        .timeout(REQUEST_TIMEOUT)
+        .user_agent("lemmy-stats-crawler")
+        .build()
+        .unwrap()
+});
 
 pub async fn start_crawl(
     start_instances: Vec<String>,
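This hunk is the core of the commit: CLIENT stays a lazily initialized global, but it is now built with ClientBuilder so the User-Agent header and the default timeout are baked into every request. ClientBuilder::build() only fails on invalid configuration, so the unwrap() runs once, at first use. One way to confirm the header is actually sent is to query an echo service; a hedged sketch, with httpbin.org used purely for illustration:

use once_cell::sync::Lazy;
use reqwest::{Client, ClientBuilder};
use std::time::Duration;

// Same shape as the new definition in src/lib.rs.
static CLIENT: Lazy<Client> = Lazy::new(|| {
    ClientBuilder::new()
        .timeout(Duration::from_secs(10))
        .user_agent("lemmy-stats-crawler")
        .build()
        .unwrap()
});

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // httpbin echoes the request's User-Agent header back in its body.
    let body = CLIENT
        .get("https://httpbin.org/user-agent")
        .send()
        .await?
        .text()
        .await?;
    assert!(body.contains("lemmy-stats-crawler"));
    Ok(())
}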
src/main.rs (6 changes)
@@ -9,7 +9,11 @@ use structopt::StructOpt;
 struct Parameters {
     #[structopt(short, long, default_value = "lemmy.ml")]
     start_instances: Vec<String>,
-    #[structopt(short, long, default_value = "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml")]
+    #[structopt(
+        short,
+        long,
+        default_value = "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml"
+    )]
     exclude_instances: Vec<String>,
     #[structopt(short, long, default_value = "20")]
     max_crawl_distance: i32,
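One caveat about this default: depending on the structopt/clap version, a single comma-separated default_value may or may not be split into separate Vec entries automatically, so the value can arrive as one string unless the flag is passed multiple times or the program splits it explicitly. A hedged sketch of such an explicit split; split_instances is a hypothetical helper, not part of this commit:

// Hypothetical helper, not part of this commit: turn one
// comma-separated argument value into trimmed instance names.
fn split_instances(raw: &str) -> Vec<String> {
    raw.split(',')
        .map(|s| s.trim().to_string())
        .filter(|s| !s.is_empty())
        .collect()
}

fn main() {
    let parsed = split_instances(
        "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml",
    );
    assert_eq!(parsed.len(), 4);
    assert_eq!(parsed[1], "enterprise.lemmy.ml");
}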