wip: try to implement parallel crawl

This commit is contained in:
Felix Ableitner 2022-04-01 00:31:52 +02:00
parent 9546936ddf
commit 1bd510fbb8

View file

@ -3,52 +3,74 @@ use crate::node_info::NodeInfo;
use crate::REQUEST_TIMEOUT; use crate::REQUEST_TIMEOUT;
use anyhow::anyhow; use anyhow::anyhow;
use anyhow::Error; use anyhow::Error;
use futures::try_join; use futures::future::join_all;
use futures::stream::FuturesUnordered;
use futures::{stream, try_join, StreamExt, TryStreamExt};
use reqwest::Client; use reqwest::Client;
use serde::Serialize; use serde::Serialize;
use std::collections::VecDeque; use std::collections::VecDeque;
use std::future::Future;
pub async fn crawl( pub async fn crawl(
start_instances: Vec<String>, start_instances: Vec<String>,
exclude: Vec<String>, exclude: Vec<String>,
max_depth: i32, max_depth: i32,
) -> Result<(Vec<InstanceDetails>, i32), Error> { ) -> Result<(Vec<InstanceDetails>, i32), Error> {
let mut pending_instances: VecDeque<CrawlInstance> = start_instances let mut pending_instances: VecDeque<CrawlInstanceTask> = start_instances
.iter() .iter()
.map(|s| CrawlInstance::new(s.to_string(), 0)) .map(|s| CrawlInstanceTask::new(s.to_string(), 0))
.collect(); .collect();
let mut crawled_instances = vec![]; let mut crawled_instances = vec![];
let mut instance_details = vec![]; let mut instance_details = vec![];
let mut failed_instances = 0; //let mut failed_instances = 0;
while let Some(current_instance) = pending_instances.pop_back() { let mut futures = stream::iter(pending_instances)
crawled_instances.push(current_instance.domain.clone()); .then(|instance: CrawlInstanceTask| async {
if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) { crawled_instances.push(instance.domain.clone());
continue; crawl_instance(instance, exclude.clone(), max_depth).await?
} })
match fetch_instance_details(&current_instance.domain).await { .flat_map(|(instance_details, depth)| {
Ok(details) => { let futures = instance_details
instance_details.push(details.to_owned()); .linked_instances
for i in details.linked_instances { .iter()
let is_in_crawled = crawled_instances.contains(&i); .map(|i| {
let is_in_pending = pending_instances.iter().any(|p| p.domain == i); crawled_instances.push(i.clone());
if !is_in_crawled && !is_in_pending { crawl_instance(
let ci = CrawlInstance::new(i, current_instance.depth + 1); CrawlInstanceTask::new(i.clone(), depth),
pending_instances.push_back(ci); exclude.clone(),
} max_depth,
} )
} })
Err(e) => { .collect();
failed_instances += 1;
eprintln!("Failed to crawl {}: {}", current_instance.domain, e) stream::iter(futures)
} })
} .collect::<FuturesUnordered<dyn Future<Output = Result<(InstanceDetails, i32), Error>>>>()
} .await;
todo!()
/*
let mut crawl_result: Vec<InstanceDetails> = todo!();
// Sort by active monthly users descending // Sort by active monthly users descending
instance_details.sort_by_key(|i| i.users_active_month); crawl_result.sort_by_key(|i| i.users_active_month);
instance_details.reverse(); crawl_result.reverse();
Ok((instance_details, failed_instances)) Ok((crawl_result, failed_instances))
*/
}
async fn crawl_instance(
current_instance: CrawlInstanceTask,
exclude: Vec<String>,
max_depth: i32,
) -> Result<(InstanceDetails, i32), Error> {
if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) {
return Err(anyhow!("max depth reached"));
}
Ok((
fetch_instance_details(&current_instance.domain).await?,
current_instance.depth + 1,
))
} }
#[derive(Serialize, Clone)] #[derive(Serialize, Clone)]
@ -70,14 +92,14 @@ pub struct InstanceDetails {
pub linked_instances: Vec<String>, pub linked_instances: Vec<String>,
} }
struct CrawlInstance { struct CrawlInstanceTask {
domain: String, domain: String,
depth: i32, depth: i32,
} }
impl CrawlInstance { impl CrawlInstanceTask {
pub fn new(domain: String, depth: i32) -> CrawlInstance { pub fn new(domain: String, depth: i32) -> CrawlInstanceTask {
CrawlInstance { domain, depth } CrawlInstanceTask { domain, depth }
} }
} }