some progress

This commit is contained in:
Felix Ableitner 2022-04-05 14:31:12 +02:00
parent 1bd510fbb8
commit 8224b70405

View file

@ -3,53 +3,58 @@ use crate::node_info::NodeInfo;
use crate::REQUEST_TIMEOUT; use crate::REQUEST_TIMEOUT;
use anyhow::anyhow; use anyhow::anyhow;
use anyhow::Error; use anyhow::Error;
use futures::executor::block_on_stream;
use futures::future::join_all; use futures::future::join_all;
use futures::stream::FuturesUnordered; use futures::stream::FuturesUnordered;
use futures::{stream, try_join, StreamExt, TryStreamExt}; use futures::{future, stream, try_join, StreamExt, TryStreamExt};
use reqwest::Client; use reqwest::Client;
use serde::Serialize; use serde::Serialize;
use std::cmp::max;
use std::collections::VecDeque; use std::collections::VecDeque;
use std::future::Future; use std::future::Future;
use std::sync::{Arc, Mutex};
pub async fn crawl( pub async fn crawl(
start_instances: Vec<String>, start_instances: Vec<String>,
exclude: Vec<String>, exclude: Vec<String>,
max_depth: i32, max_depth: i32,
) -> Result<(Vec<InstanceDetails>, i32), Error> { ) -> Result<(Vec<InstanceDetails>, i32), Error> {
let exclude = Arc::new(exclude);
let mut pending_instances: VecDeque<CrawlInstanceTask> = start_instances let mut pending_instances: VecDeque<CrawlInstanceTask> = start_instances
.iter() .iter()
.map(|s| CrawlInstanceTask::new(s.to_string(), 0)) .map(|s| CrawlInstanceTask::new(s.to_string(), 0, exclude.clone()))
.collect(); .collect();
let mut crawled_instances = vec![]; let mut crawled_instances = Mutex::new(vec![]);
let mut instance_details = vec![]; //let mut instance_details = vec![];
//let mut failed_instances = 0; //let mut failed_instances = 0;
let mut futures = stream::iter(pending_instances)
.then(|instance: CrawlInstanceTask| async { let stream = Box::pin(
crawled_instances.push(instance.domain.clone()); stream::iter(pending_instances)
crawl_instance(instance, exclude.clone(), max_depth).await? .then(|task: CrawlInstanceTask| async {
crawled_instances.lock().unwrap().push(task.domain.clone());
crawl_instance(task, max_depth).await.unwrap()
}) })
.flat_map(|(instance_details, depth)| { .flat_map(|(instance_details, task)| {
let futures = instance_details let futures = instance_details.linked_instances.iter().map(|i| {
.linked_instances crawled_instances.lock().unwrap().push(i.clone());
.iter()
.map(|i| {
crawled_instances.push(i.clone());
crawl_instance( crawl_instance(
CrawlInstanceTask::new(i.clone(), depth), CrawlInstanceTask::new(i.clone(), task.depth + 1, task.exclude.clone()),
exclude.clone(),
max_depth, max_depth,
) )
}) });
.collect();
stream::iter(futures) stream::iter(futures)
}) }),
.collect::<FuturesUnordered<dyn Future<Output = Result<(InstanceDetails, i32), Error>>>>() );
let crawl_result: Vec<Result<InstanceDetails, Error>> = stream
.buffer_unordered(10)
.map_ok(|(details, _)| details)
.collect()
.await; .await;
todo!() todo!()
/* /*
let mut crawl_result: Vec<InstanceDetails> = todo!();
// Sort by active monthly users descending // Sort by active monthly users descending
crawl_result.sort_by_key(|i| i.users_active_month); crawl_result.sort_by_key(|i| i.users_active_month);
@ -60,17 +65,13 @@ pub async fn crawl(
} }
async fn crawl_instance( async fn crawl_instance(
current_instance: CrawlInstanceTask, task: CrawlInstanceTask,
exclude: Vec<String>,
max_depth: i32, max_depth: i32,
) -> Result<(InstanceDetails, i32), Error> { ) -> Result<(InstanceDetails, CrawlInstanceTask), Error> {
if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) { if task.depth > max_depth || task.exclude.contains(&task.domain) {
return Err(anyhow!("max depth reached")); return Err(anyhow!("max depth reached"));
} }
Ok(( Ok((fetch_instance_details(&task.domain).await?, task))
fetch_instance_details(&current_instance.domain).await?,
current_instance.depth + 1,
))
} }
#[derive(Serialize, Clone)] #[derive(Serialize, Clone)]
@ -95,11 +96,16 @@ pub struct InstanceDetails {
/// A single unit of crawl work: one instance to fetch, plus the context needed to decide
/// whether it should be fetched at all.
struct CrawlInstanceTask {
    /// Domain of the instance to crawl.
    domain: String,
    /// Distance from the start instances; start instances are created with depth 0.
    depth: i32,
    /// Domains that must not be crawled. Shared through an `Arc` so every task can carry
    /// the list without copying it.
    exclude: Arc<Vec<String>>,
}

impl CrawlInstanceTask {
    /// Creates a task for `domain` at the given `depth`, sharing the `exclude` list.
    pub fn new(domain: String, depth: i32, exclude: Arc<Vec<String>>) -> Self {
        Self {
            domain,
            depth,
            exclude,
        }
    }
}