Add max depth parameter for crawl
This commit is contained in:
parent
5a09fa46c4
commit
f01e077020
3 changed files with 87 additions and 63 deletions
43
src/crawl.rs
43
src/crawl.rs
|
@ -1,28 +1,40 @@
|
||||||
use anyhow::Error;
|
|
||||||
use futures::try_join;
|
|
||||||
use crate::federated_instances::GetSiteResponse;
|
use crate::federated_instances::GetSiteResponse;
|
||||||
use crate::node_info::NodeInfo;
|
use crate::node_info::NodeInfo;
|
||||||
|
use crate::REQUEST_TIMEOUT;
|
||||||
|
use anyhow::Error;
|
||||||
|
use futures::try_join;
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
use crate::REQUEST_TIMEOUT;
|
|
||||||
|
|
||||||
pub async fn crawl(start_instances: Vec<String>) -> Result<Vec<InstanceDetails>, Error> {
|
pub async fn crawl(
|
||||||
let mut pending_instances = VecDeque::from(start_instances);
|
start_instances: Vec<String>,
|
||||||
|
max_depth: i32,
|
||||||
|
) -> Result<Vec<InstanceDetails>, Error> {
|
||||||
|
let mut pending_instances: VecDeque<CrawlInstance> = start_instances
|
||||||
|
.iter()
|
||||||
|
.map(|s| CrawlInstance::new(s.to_string(), 0))
|
||||||
|
.collect();
|
||||||
let mut crawled_instances = vec![];
|
let mut crawled_instances = vec![];
|
||||||
let mut instance_details = vec![];
|
let mut instance_details = vec![];
|
||||||
while let Some(current_instance) = pending_instances.pop_back() {
|
while let Some(current_instance) = pending_instances.pop_back() {
|
||||||
crawled_instances.push(current_instance.clone());
|
crawled_instances.push(current_instance.domain.clone());
|
||||||
match fetch_instance_details(&current_instance).await {
|
if current_instance.depth > max_depth {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
match fetch_instance_details(&current_instance.domain).await {
|
||||||
Ok(details) => {
|
Ok(details) => {
|
||||||
instance_details.push(details.to_owned());
|
instance_details.push(details.to_owned());
|
||||||
for i in details.linked_instances {
|
for i in details.linked_instances {
|
||||||
if !crawled_instances.contains(&i) && !pending_instances.contains(&i) {
|
let is_in_crawled = crawled_instances.contains(&i);
|
||||||
pending_instances.push_back(i);
|
let is_in_pending = pending_instances.iter().any(|p| p.domain == i);
|
||||||
|
if !is_in_crawled && !is_in_pending {
|
||||||
|
let ci = CrawlInstance::new(i, current_instance.depth + 1);
|
||||||
|
pending_instances.push_back(ci);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => eprintln!("Failed to crawl {}: {}", current_instance, e),
|
Err(e) => eprintln!("Failed to crawl {}: {}", current_instance.domain, e),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,6 +58,17 @@ pub struct InstanceDetails {
|
||||||
pub linked_instances: Vec<String>,
|
pub linked_instances: Vec<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct CrawlInstance {
|
||||||
|
domain: String,
|
||||||
|
depth: i32,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CrawlInstance {
|
||||||
|
pub fn new(domain: String, depth: i32) -> CrawlInstance {
|
||||||
|
CrawlInstance { domain, depth }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error> {
|
async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error> {
|
||||||
let client = Client::default();
|
let client = Client::default();
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
|
pub mod crawl;
|
||||||
pub mod federated_instances;
|
pub mod federated_instances;
|
||||||
pub mod node_info;
|
pub mod node_info;
|
||||||
pub mod crawl;
|
|
||||||
|
|
||||||
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
|
||||||
pub const START_INSTANCES: [&'static str; 1] = ["lemmy.ml"];
|
pub const START_INSTANCES: [&'static str; 1] = ["lemmy.ml"];
|
||||||
|
pub const MAX_CRAWL_DEPTH: i32 = 2;
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
use serde::Serialize;
|
|
||||||
use lemmy_stats_crawler::START_INSTANCES;
|
|
||||||
use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
|
use lemmy_stats_crawler::crawl::{crawl, InstanceDetails};
|
||||||
|
use lemmy_stats_crawler::{MAX_CRAWL_DEPTH, START_INSTANCES};
|
||||||
|
use serde::Serialize;
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
pub async fn main() -> Result<(), Error> {
|
pub async fn main() -> Result<(), Error> {
|
||||||
let start_instances = START_INSTANCES.iter().map(|s| s.to_string()).collect();
|
let start_instances = START_INSTANCES.iter().map(|s| s.to_string()).collect();
|
||||||
|
|
||||||
eprintln!("Crawling...");
|
eprintln!("Crawling...");
|
||||||
let instance_details = crawl(start_instances).await?;
|
let instance_details = crawl(start_instances, MAX_CRAWL_DEPTH).await?;
|
||||||
let total_stats = aggregate(instance_details);
|
let total_stats = aggregate(instance_details);
|
||||||
|
|
||||||
println!("{}", serde_json::to_string(&total_stats)?);
|
println!("{}", serde_json::to_string(&total_stats)?);
|
||||||
|
|
Loading…
Reference in a new issue