Add user agent
This commit is contained in:
parent
575672cbe3
commit
21cf61f847
3 changed files with 23 additions and 9 deletions
14
src/crawl.rs
14
src/crawl.rs
|
@ -1,5 +1,4 @@
|
||||||
use crate::CLIENT;
|
use crate::CLIENT;
|
||||||
use crate::REQUEST_TIMEOUT;
|
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use async_recursion::async_recursion;
|
use async_recursion::async_recursion;
|
||||||
use futures::future::join_all;
|
use futures::future::join_all;
|
||||||
|
@ -52,7 +51,10 @@ impl CrawlJob {
|
||||||
return vec![];
|
return vec![];
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("Starting crawl for {}, distance {}", &self.domain, &self.current_distance);
|
debug!(
|
||||||
|
"Starting crawl for {}, distance {}",
|
||||||
|
&self.domain, &self.current_distance
|
||||||
|
);
|
||||||
let site_info = match self.fetch_instance_details().await {
|
let site_info = match self.fetch_instance_details().await {
|
||||||
Ok(o) => o,
|
Ok(o) => o,
|
||||||
Err(e) => return vec![Err(e)],
|
Err(e) => return vec![Err(e)],
|
||||||
|
@ -65,8 +67,11 @@ impl CrawlJob {
|
||||||
let mut result = vec![];
|
let mut result = vec![];
|
||||||
if let Some(federated) = &site_info.0.federated_instances {
|
if let Some(federated) = &site_info.0.federated_instances {
|
||||||
for domain in federated.linked.iter() {
|
for domain in federated.linked.iter() {
|
||||||
let crawl_job =
|
let crawl_job = CrawlJob::new(
|
||||||
CrawlJob::new(domain.clone(), self.current_distance + 1, self.params.clone());
|
domain.clone(),
|
||||||
|
self.current_distance + 1,
|
||||||
|
self.params.clone(),
|
||||||
|
);
|
||||||
result.push(crawl_job.crawl());
|
result.push(crawl_job.crawl());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -86,7 +91,6 @@ impl CrawlJob {
|
||||||
let site_info_url = format!("https://{}/api/v3/site", &self.domain);
|
let site_info_url = format!("https://{}/api/v3/site", &self.domain);
|
||||||
let site_info = CLIENT
|
let site_info = CLIENT
|
||||||
.get(&site_info_url)
|
.get(&site_info_url)
|
||||||
.timeout(REQUEST_TIMEOUT)
|
|
||||||
.send()
|
.send()
|
||||||
.await?
|
.await?
|
||||||
.json::<GetSiteResponse>()
|
.json::<GetSiteResponse>()
|
||||||
|
|
12
src/lib.rs
12
src/lib.rs
|
@ -5,7 +5,7 @@ use crate::crawl::{CrawlJob, CrawlParams, InstanceDetails};
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use futures::future::join_all;
|
use futures::future::join_all;
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
use reqwest::Client;
|
use reqwest::{Client, ClientBuilder};
|
||||||
use semver::Version;
|
use semver::Version;
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
@ -14,9 +14,15 @@ use tokio::sync::Mutex;
|
||||||
|
|
||||||
pub mod crawl;
|
pub mod crawl;
|
||||||
|
|
||||||
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
|
|
||||||
static CLIENT: Lazy<Client> = Lazy::new(Client::default);
|
static CLIENT: Lazy<Client> = Lazy::new(|| {
|
||||||
|
ClientBuilder::new()
|
||||||
|
.timeout(REQUEST_TIMEOUT)
|
||||||
|
.user_agent("lemmy-stats-crawler")
|
||||||
|
.build()
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
pub async fn start_crawl(
|
pub async fn start_crawl(
|
||||||
start_instances: Vec<String>,
|
start_instances: Vec<String>,
|
||||||
|
|
|
@ -9,7 +9,11 @@ use structopt::StructOpt;
|
||||||
struct Parameters {
|
struct Parameters {
|
||||||
#[structopt(short, long, default_value = "lemmy.ml")]
|
#[structopt(short, long, default_value = "lemmy.ml")]
|
||||||
start_instances: Vec<String>,
|
start_instances: Vec<String>,
|
||||||
#[structopt(short, long, default_value = "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml")]
|
#[structopt(
|
||||||
|
short,
|
||||||
|
long,
|
||||||
|
default_value = "ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml"
|
||||||
|
)]
|
||||||
exclude_instances: Vec<String>,
|
exclude_instances: Vec<String>,
|
||||||
#[structopt(short, long, default_value = "20")]
|
#[structopt(short, long, default_value = "20")]
|
||||||
max_crawl_distance: i32,
|
max_crawl_distance: i32,
|
||||||
|
|
Loading…
Reference in a new issue