wip: try to implement parallel crawl
This commit is contained in:
parent
4e95487842
commit
b43c6c1570
2 changed files with 62 additions and 36 deletions
96
src/crawl.rs
96
src/crawl.rs
|
@ -3,52 +3,74 @@ use crate::node_info::NodeInfo;
|
|||
use crate::REQUEST_TIMEOUT;
|
||||
use anyhow::anyhow;
|
||||
use anyhow::Error;
|
||||
use futures::try_join;
|
||||
use futures::future::join_all;
|
||||
use futures::stream::FuturesUnordered;
|
||||
use futures::{stream, try_join, StreamExt, TryStreamExt};
|
||||
use reqwest::Client;
|
||||
use serde::Serialize;
|
||||
use std::collections::VecDeque;
|
||||
use std::future::Future;
|
||||
|
||||
pub async fn crawl(
|
||||
start_instances: Vec<String>,
|
||||
exclude: Vec<String>,
|
||||
max_depth: i32,
|
||||
) -> Result<(Vec<InstanceDetails>, i32), Error> {
|
||||
let mut pending_instances: VecDeque<CrawlInstance> = start_instances
|
||||
let mut pending_instances: VecDeque<CrawlInstanceTask> = start_instances
|
||||
.iter()
|
||||
.map(|s| CrawlInstance::new(s.to_string(), 0))
|
||||
.map(|s| CrawlInstanceTask::new(s.to_string(), 0))
|
||||
.collect();
|
||||
let mut crawled_instances = vec![];
|
||||
let mut instance_details = vec![];
|
||||
let mut failed_instances = 0;
|
||||
while let Some(current_instance) = pending_instances.pop_back() {
|
||||
crawled_instances.push(current_instance.domain.clone());
|
||||
if current_instance.depth > max_depth || exclude.contains(¤t_instance.domain) {
|
||||
continue;
|
||||
}
|
||||
match fetch_instance_details(¤t_instance.domain).await {
|
||||
Ok(details) => {
|
||||
instance_details.push(details.to_owned());
|
||||
for i in details.linked_instances {
|
||||
let is_in_crawled = crawled_instances.contains(&i);
|
||||
let is_in_pending = pending_instances.iter().any(|p| p.domain == i);
|
||||
if !is_in_crawled && !is_in_pending {
|
||||
let ci = CrawlInstance::new(i, current_instance.depth + 1);
|
||||
pending_instances.push_back(ci);
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
failed_instances += 1;
|
||||
eprintln!("Failed to crawl {}: {}", current_instance.domain, e)
|
||||
}
|
||||
}
|
||||
}
|
||||
//let mut failed_instances = 0;
|
||||
let mut futures = stream::iter(pending_instances)
|
||||
.then(|instance: CrawlInstanceTask| async {
|
||||
crawled_instances.push(instance.domain.clone());
|
||||
crawl_instance(instance, exclude.clone(), max_depth).await?
|
||||
})
|
||||
.flat_map(|(instance_details, depth)| {
|
||||
let futures = instance_details
|
||||
.linked_instances
|
||||
.iter()
|
||||
.map(|i| {
|
||||
crawled_instances.push(i.clone());
|
||||
crawl_instance(
|
||||
CrawlInstanceTask::new(i.clone(), depth),
|
||||
exclude.clone(),
|
||||
max_depth,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
stream::iter(futures)
|
||||
})
|
||||
.collect::<FuturesUnordered<dyn Future<Output = Result<(InstanceDetails, i32), Error>>>>()
|
||||
.await;
|
||||
|
||||
todo!()
|
||||
/*
|
||||
let mut crawl_result: Vec<InstanceDetails> = todo!();
|
||||
|
||||
// Sort by active monthly users descending
|
||||
instance_details.sort_by_key(|i| i.users_active_month);
|
||||
instance_details.reverse();
|
||||
crawl_result.sort_by_key(|i| i.users_active_month);
|
||||
crawl_result.reverse();
|
||||
|
||||
Ok((instance_details, failed_instances))
|
||||
Ok((crawl_result, failed_instances))
|
||||
*/
|
||||
}
|
||||
|
||||
async fn crawl_instance(
|
||||
current_instance: CrawlInstanceTask,
|
||||
exclude: Vec<String>,
|
||||
max_depth: i32,
|
||||
) -> Result<(InstanceDetails, i32), Error> {
|
||||
if current_instance.depth > max_depth || exclude.contains(¤t_instance.domain) {
|
||||
return Err(anyhow!("max depth reached"));
|
||||
}
|
||||
Ok((
|
||||
fetch_instance_details(¤t_instance.domain).await?,
|
||||
current_instance.depth + 1,
|
||||
))
|
||||
}
|
||||
|
||||
#[derive(Serialize, Clone)]
|
||||
|
@ -70,14 +92,14 @@ pub struct InstanceDetails {
|
|||
pub linked_instances: Vec<String>,
|
||||
}
|
||||
|
||||
struct CrawlInstance {
|
||||
struct CrawlInstanceTask {
|
||||
domain: String,
|
||||
depth: i32,
|
||||
}
|
||||
|
||||
impl CrawlInstance {
|
||||
pub fn new(domain: String, depth: i32) -> CrawlInstance {
|
||||
CrawlInstance { domain, depth }
|
||||
impl CrawlInstanceTask {
|
||||
pub fn new(domain: String, depth: i32) -> CrawlInstanceTask {
|
||||
CrawlInstanceTask { domain, depth }
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -136,7 +158,11 @@ async fn fetch_instance_details(domain: &str) -> Result<InstanceDetails, Error>
|
|||
users_active_month: node_info.usage.users.active_month,
|
||||
open_registrations: node_info.open_registrations,
|
||||
linked_instances_count: linked_instances.len() as i32,
|
||||
require_application: site_info.site_view.site.require_application.unwrap_or(false),
|
||||
require_application: site_info
|
||||
.site_view
|
||||
.site
|
||||
.require_application
|
||||
.unwrap_or(false),
|
||||
linked_instances,
|
||||
})
|
||||
}
|
||||
|
|
|
@ -24,5 +24,5 @@ pub struct Site {
|
|||
pub name: String,
|
||||
pub icon: Option<String>,
|
||||
pub description: Option<String>,
|
||||
pub require_application: Option<bool>
|
||||
pub require_application: Option<bool>,
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue