Add command line param to exclude instances from crawl
This commit is contained in:
parent
fefd708777
commit
f110554993
3 changed files with 17 additions and 14 deletions
|
@ -1,6 +1,6 @@
|
||||||
use crate::federated_instances::GetSiteResponse;
|
use crate::federated_instances::GetSiteResponse;
|
||||||
use crate::node_info::NodeInfo;
|
use crate::node_info::NodeInfo;
|
||||||
use crate::{EXCLUDE_INSTANCES, REQUEST_TIMEOUT};
|
use crate::REQUEST_TIMEOUT;
|
||||||
use anyhow::anyhow;
|
use anyhow::anyhow;
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use futures::try_join;
|
use futures::try_join;
|
||||||
|
@ -10,6 +10,7 @@ use std::collections::VecDeque;
|
||||||
|
|
||||||
pub async fn crawl(
|
pub async fn crawl(
|
||||||
start_instances: Vec<String>,
|
start_instances: Vec<String>,
|
||||||
|
exclude: Vec<String>,
|
||||||
max_depth: i32,
|
max_depth: i32,
|
||||||
) -> Result<(Vec<InstanceDetails>, i32), Error> {
|
) -> Result<(Vec<InstanceDetails>, i32), Error> {
|
||||||
let mut pending_instances: VecDeque<CrawlInstance> = start_instances
|
let mut pending_instances: VecDeque<CrawlInstance> = start_instances
|
||||||
|
@ -21,9 +22,7 @@ pub async fn crawl(
|
||||||
let mut failed_instances = 0;
|
let mut failed_instances = 0;
|
||||||
while let Some(current_instance) = pending_instances.pop_back() {
|
while let Some(current_instance) = pending_instances.pop_back() {
|
||||||
crawled_instances.push(current_instance.domain.clone());
|
crawled_instances.push(current_instance.domain.clone());
|
||||||
if current_instance.depth > max_depth
|
if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) {
|
||||||
|| EXCLUDE_INSTANCES.contains(&&**&current_instance.domain)
|
|
||||||
{
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
match fetch_instance_details(&current_instance.domain).await {
|
match fetch_instance_details(&current_instance.domain).await {
|
||||||
|
@ -48,7 +47,7 @@ pub async fn crawl(
|
||||||
// Sort by active monthly users descending
|
// Sort by active monthly users descending
|
||||||
instance_details.sort_by_key(|i| i.users_active_month);
|
instance_details.sort_by_key(|i| i.users_active_month);
|
||||||
instance_details.reverse();
|
instance_details.reverse();
|
||||||
|
|
||||||
Ok((instance_details, failed_instances))
|
Ok((instance_details, failed_instances))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,9 +7,5 @@ pub mod node_info;
|
||||||
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml";
|
pub const DEFAULT_START_INSTANCES: &str = "lemmy.ml";
|
||||||
pub const DEFAULT_MAX_CRAWL_DEPTH: &str = "1";
|
pub const DEFAULT_MAX_CRAWL_DEPTH: &str = "1";
|
||||||
pub const EXCLUDE_INSTANCES: &'static [&str] = &[
|
pub const EXCLUDE_INSTANCES: &str =
|
||||||
"ds9.lemmy.ml",
|
"ds9.lemmy.ml, enterprise.lemmy.ml, voyager.lemmy.ml, test.lemmy.ml";
|
||||||
"enterprise.lemmy.ml",
|
|
||||||
"voyager.lemmy.ml",
|
|
||||||
"test.lemmy.ml",
|
|
||||||
];
|
|
||||||
|
|
14
src/main.rs
14
src/main.rs
|
@ -1,7 +1,7 @@
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use clap::{App, Arg};
|
use clap::{App, Arg};
|
||||||
use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
|
use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
|
||||||
use lemmy_stats_crawler::{DEFAULT_MAX_CRAWL_DEPTH, DEFAULT_START_INSTANCES};
|
use lemmy_stats_crawler::{DEFAULT_MAX_CRAWL_DEPTH, DEFAULT_START_INSTANCES, EXCLUDE_INSTANCES};
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
|
@ -12,6 +12,7 @@ pub async fn main() -> Result<(), Error> {
|
||||||
.long("start-instances")
|
.long("start-instances")
|
||||||
.takes_value(true),
|
.takes_value(true),
|
||||||
)
|
)
|
||||||
|
.arg(Arg::with_name("exclude").long("exclude").takes_value(true))
|
||||||
.arg(
|
.arg(
|
||||||
Arg::with_name("max-crawl-depth")
|
Arg::with_name("max-crawl-depth")
|
||||||
.long("max-crawl-depth")
|
.long("max-crawl-depth")
|
||||||
|
@ -22,7 +23,13 @@ pub async fn main() -> Result<(), Error> {
|
||||||
.value_of("start-instances")
|
.value_of("start-instances")
|
||||||
.unwrap_or(DEFAULT_START_INSTANCES)
|
.unwrap_or(DEFAULT_START_INSTANCES)
|
||||||
.split(',')
|
.split(',')
|
||||||
.map(|s| s.to_string())
|
.map(|s| s.trim().to_string())
|
||||||
|
.collect();
|
||||||
|
let exclude: Vec<String> = matches
|
||||||
|
.value_of("exclude")
|
||||||
|
.unwrap_or(EXCLUDE_INSTANCES)
|
||||||
|
.split(',')
|
||||||
|
.map(|s| s.trim().to_string())
|
||||||
.collect();
|
.collect();
|
||||||
let max_crawl_depth: i32 = matches
|
let max_crawl_depth: i32 = matches
|
||||||
.value_of("max-crawl-depth")
|
.value_of("max-crawl-depth")
|
||||||
|
@ -30,7 +37,8 @@ pub async fn main() -> Result<(), Error> {
|
||||||
.parse()?;
|
.parse()?;
|
||||||
|
|
||||||
eprintln!("Crawling...");
|
eprintln!("Crawling...");
|
||||||
let (instance_details, failed_instances) = crawl(start_instances, max_crawl_depth).await?;
|
let (instance_details, failed_instances) =
|
||||||
|
crawl(start_instances, exclude, max_crawl_depth).await?;
|
||||||
let total_stats = aggregate(instance_details, failed_instances);
|
let total_stats = aggregate(instance_details, failed_instances);
|
||||||
|
|
||||||
println!("{}", serde_json::to_string_pretty(&total_stats)?);
|
println!("{}", serde_json::to_string_pretty(&total_stats)?);
|
||||||
|
|
Loading…
Reference in a new issue