some progress
This commit is contained in:
parent
1bd510fbb8
commit
8224b70405
1 changed file with 39 additions and 33 deletions
72
src/crawl.rs
72
src/crawl.rs
|
@ -3,53 +3,58 @@ use crate::node_info::NodeInfo;
|
||||||
use crate::REQUEST_TIMEOUT;
|
use crate::REQUEST_TIMEOUT;
|
||||||
use anyhow::anyhow;
|
use anyhow::anyhow;
|
||||||
use anyhow::Error;
|
use anyhow::Error;
|
||||||
|
use futures::executor::block_on_stream;
|
||||||
use futures::future::join_all;
|
use futures::future::join_all;
|
||||||
use futures::stream::FuturesUnordered;
|
use futures::stream::FuturesUnordered;
|
||||||
use futures::{stream, try_join, StreamExt, TryStreamExt};
|
use futures::{future, stream, try_join, StreamExt, TryStreamExt};
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use serde::Serialize;
|
use serde::Serialize;
|
||||||
|
use std::cmp::max;
|
||||||
use std::collections::VecDeque;
|
use std::collections::VecDeque;
|
||||||
use std::future::Future;
|
use std::future::Future;
|
||||||
|
use std::sync::{Arc, Mutex};
|
||||||
|
|
||||||
pub async fn crawl(
|
pub async fn crawl(
|
||||||
start_instances: Vec<String>,
|
start_instances: Vec<String>,
|
||||||
exclude: Vec<String>,
|
exclude: Vec<String>,
|
||||||
max_depth: i32,
|
max_depth: i32,
|
||||||
) -> Result<(Vec<InstanceDetails>, i32), Error> {
|
) -> Result<(Vec<InstanceDetails>, i32), Error> {
|
||||||
|
let exclude = Arc::new(exclude);
|
||||||
let mut pending_instances: VecDeque<CrawlInstanceTask> = start_instances
|
let mut pending_instances: VecDeque<CrawlInstanceTask> = start_instances
|
||||||
.iter()
|
.iter()
|
||||||
.map(|s| CrawlInstanceTask::new(s.to_string(), 0))
|
.map(|s| CrawlInstanceTask::new(s.to_string(), 0, exclude.clone()))
|
||||||
.collect();
|
.collect();
|
||||||
let mut crawled_instances = vec![];
|
let mut crawled_instances = Mutex::new(vec![]);
|
||||||
let mut instance_details = vec![];
|
//let mut instance_details = vec![];
|
||||||
//let mut failed_instances = 0;
|
//let mut failed_instances = 0;
|
||||||
let mut futures = stream::iter(pending_instances)
|
|
||||||
.then(|instance: CrawlInstanceTask| async {
|
let stream = Box::pin(
|
||||||
crawled_instances.push(instance.domain.clone());
|
stream::iter(pending_instances)
|
||||||
crawl_instance(instance, exclude.clone(), max_depth).await?
|
.then(|task: CrawlInstanceTask| async {
|
||||||
})
|
crawled_instances.lock().unwrap().push(task.domain.clone());
|
||||||
.flat_map(|(instance_details, depth)| {
|
crawl_instance(task, max_depth).await.unwrap()
|
||||||
let futures = instance_details
|
})
|
||||||
.linked_instances
|
.flat_map(|(instance_details, task)| {
|
||||||
.iter()
|
let futures = instance_details.linked_instances.iter().map(|i| {
|
||||||
.map(|i| {
|
crawled_instances.lock().unwrap().push(i.clone());
|
||||||
crawled_instances.push(i.clone());
|
|
||||||
crawl_instance(
|
crawl_instance(
|
||||||
CrawlInstanceTask::new(i.clone(), depth),
|
CrawlInstanceTask::new(i.clone(), task.depth + 1, task.exclude.clone()),
|
||||||
exclude.clone(),
|
|
||||||
max_depth,
|
max_depth,
|
||||||
)
|
)
|
||||||
})
|
});
|
||||||
.collect();
|
|
||||||
|
|
||||||
stream::iter(futures)
|
stream::iter(futures)
|
||||||
})
|
}),
|
||||||
.collect::<FuturesUnordered<dyn Future<Output = Result<(InstanceDetails, i32), Error>>>>()
|
);
|
||||||
|
|
||||||
|
let crawl_result: Vec<Result<InstanceDetails, Error>> = stream
|
||||||
|
.buffer_unordered(10)
|
||||||
|
.map_ok(|(details, _)| details)
|
||||||
|
.collect()
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
todo!()
|
todo!()
|
||||||
/*
|
/*
|
||||||
let mut crawl_result: Vec<InstanceDetails> = todo!();
|
|
||||||
|
|
||||||
// Sort by active monthly users descending
|
// Sort by active monthly users descending
|
||||||
crawl_result.sort_by_key(|i| i.users_active_month);
|
crawl_result.sort_by_key(|i| i.users_active_month);
|
||||||
|
@ -60,17 +65,13 @@ pub async fn crawl(
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn crawl_instance(
|
async fn crawl_instance(
|
||||||
current_instance: CrawlInstanceTask,
|
task: CrawlInstanceTask,
|
||||||
exclude: Vec<String>,
|
|
||||||
max_depth: i32,
|
max_depth: i32,
|
||||||
) -> Result<(InstanceDetails, i32), Error> {
|
) -> Result<(InstanceDetails, CrawlInstanceTask), Error> {
|
||||||
if current_instance.depth > max_depth || exclude.contains(&current_instance.domain) {
|
if task.depth > max_depth || task.exclude.contains(&task.domain) {
|
||||||
return Err(anyhow!("max depth reached"));
|
return Err(anyhow!("max depth reached"));
|
||||||
}
|
}
|
||||||
Ok((
|
Ok((fetch_instance_details(&task.domain).await?, task))
|
||||||
fetch_instance_details(&current_instance.domain).await?,
|
|
||||||
current_instance.depth + 1,
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Clone)]
|
#[derive(Serialize, Clone)]
|
||||||
|
@ -95,11 +96,16 @@ pub struct InstanceDetails {
|
||||||
struct CrawlInstanceTask {
|
struct CrawlInstanceTask {
|
||||||
domain: String,
|
domain: String,
|
||||||
depth: i32,
|
depth: i32,
|
||||||
|
exclude: Arc<Vec<String>>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CrawlInstanceTask {
|
impl CrawlInstanceTask {
|
||||||
pub fn new(domain: String, depth: i32) -> CrawlInstanceTask {
|
pub fn new(domain: String, depth: i32, exclude: Arc<Vec<String>>) -> CrawlInstanceTask {
|
||||||
CrawlInstanceTask { domain, depth }
|
CrawlInstanceTask {
|
||||||
|
domain,
|
||||||
|
depth,
|
||||||
|
exclude,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue