-
Notifications
You must be signed in to change notification settings - Fork 128
/
Copy pathadvanced_configuration.rs
70 lines (58 loc) · 2.09 KB
/
advanced_configuration.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
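//! Crawls several websites concurrently with one shared, customised `Configuration`.
//!
//! Run with: `cargo run --example advanced_configuration`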
extern crate spider;
use spider::{configuration::Configuration, tokio, website::Website};
use std::{io::Error, time::Instant};
/// Number of seed URLs in [`CRAWL_LIST`]; also used to pre-size the task-handle vector.
const CAPACITY: usize = 5;

/// Seed URLs, each crawled by its own task.
const CRAWL_LIST: [&str; CAPACITY] = [
    "https://rsseau.fr/en",
    "https://choosealicense.com",
    "https://jeffmendez.com",
    "https://spider-rs.github.io/spider-nodejs/",
    "https://spider-rs.github.io/spider-py/",
];
/// Crawls every URL in `CRAWL_LIST` concurrently using one shared configuration.
///
/// Each site gets its own spawned Tokio task that crawls the site, prints every
/// visited link, and then prints the elapsed crawl time and the number of pages
/// visited. A site whose `Website` fails to build is reported on stderr and
/// skipped; a task that fails to join (e.g. because it panicked) is reported on
/// stderr as well instead of being silently ignored.
///
/// # Errors
///
/// The signature allows a `std::io::Error`, but the body currently always
/// returns `Ok(())`.
#[tokio::main]
async fn main() -> Result<(), Error> {
    // One configuration is built up front and cloned into each website below.
    let config = Configuration::new()
        .with_user_agent(Some("SpiderBot"))
        .with_blacklist_url(Some(Vec::from(["https://rsseau.fr/resume".into()])))
        .with_subdomains(false)
        .with_tld(false)
        .with_redirect_limit(3)
        .with_respect_robots_txt(true)
        .with_external_domains(Some(
            Vec::from(["http://loto.rsseau.fr/"].map(|d| d.to_string())).into_iter(),
        ))
        .build();

    // Keep the URL next to its handle so a failed task can be attributed later.
    let mut handles = Vec::with_capacity(CAPACITY);

    for website_url in CRAWL_LIST {
        match Website::new(website_url)
            .with_config(config.clone())
            .build()
        {
            Ok(mut website) => {
                let handle = tokio::spawn(async move {
                    println!("Starting Crawl - {:?}", website.get_url().inner());

                    // Time only the crawl itself, not the link printing below.
                    let start = Instant::now();
                    website.crawl().await;
                    let duration = start.elapsed();

                    let links = website.get_all_links_visited().await;

                    for link in links.iter() {
                        println!("- {:?}", link.as_ref());
                    }

                    println!(
                        "{:?} - Time elapsed in website.crawl() is: {:?} for total pages: {:?}",
                        website.get_url().inner(),
                        duration,
                        links.len()
                    );
                });

                handles.push((website_url, handle));
            }
            // Report on stderr and name the URL so the failing site is identifiable.
            Err(e) => eprintln!("Failed to build website for {:?}: {:?}", website_url, e),
        }
    }

    // Wait for every crawl; surface join errors rather than discarding them.
    for (website_url, handle) in handles {
        if let Err(e) = handle.await {
            eprintln!("Crawl task for {:?} failed: {:?}", website_url, e);
        }
    }

    Ok(())
}