spider_util/
util.rs

1//! Small utility helpers shared across the workspace.
2
3use psl::{List, Psl};
4use std::fs;
5use std::path::Path;
6use url::Url;
7
8use crate::error::SpiderError;
9use crate::request::Request;
10
11/// Checks if two URLs belong to the same site.
12pub fn is_same_site(a: &Url, b: &Url) -> bool {
13    a.host_str().and_then(|h| List.domain(h.as_bytes()))
14        == b.host_str().and_then(|h| List.domain(h.as_bytes()))
15}
16
17/// Normalizes the origin of a request's URL.
18pub fn normalize_origin(request: &Request) -> String {
19    let url = &request.url;
20    let scheme = url.scheme();
21    let host = url.host_str().unwrap_or("");
22    let port = url.port_or_known_default().unwrap_or(0);
23
24    format!("{scheme}://{host}:{port}")
25}
26
27/// Validates that the parent directory of a given file path exists, creating it if necessary.
28///
29/// # Errors
30///
31/// Returns an error if the parent directory cannot be created.
32pub fn validate_output_dir(file_path: impl AsRef<Path>) -> Result<(), SpiderError> {
33    let Some(parent_dir) = file_path.as_ref().parent() else {
34        return Ok(());
35    };
36
37    if !parent_dir.as_os_str().is_empty() && !parent_dir.exists() {
38        fs::create_dir_all(parent_dir)?;
39    }
40
41    Ok(())
42}
43
44/// Creates a directory and all of its parent components if they are missing.
45///
46/// # Errors
47///
48/// Returns an error if the directory cannot be created.
49pub fn create_dir(dir_path: impl AsRef<Path>) -> Result<(), SpiderError> {
50    fs::create_dir_all(dir_path)?;
51    Ok(())
52}