spider_middleware/
robots.rs

1//! `robots.txt` middleware.
2//!
3//! [`RobotsTxtMiddleware`] fetches and caches `robots.txt` per origin and blocks
4//! requests that are disallowed for the outgoing user agent.
5
6use std::sync::Arc;
7use std::time::Duration;
8
9use async_trait::async_trait;
10use http::header::USER_AGENT;
11use log::{debug, info, warn};
12use moka::future::Cache;
13use robotstxt::DefaultMatcher;
14
15use crate::middleware::{Middleware, MiddlewareAction};
16use spider_util::error::SpiderError;
17use spider_util::http_client::HttpClient;
18use spider_util::request::Request;
19
/// Middleware that enforces `robots.txt` rules before download.
///
/// One `robots.txt` body is fetched and cached per origin; a request whose
/// URL is disallowed for its outgoing user agent is rejected.
#[derive(Debug)]
pub struct RobotsTxtMiddleware {
    /// How long a fetched `robots.txt` body stays in the cache.
    cache_ttl: Duration,
    /// Maximum number of origins whose `robots.txt` is cached at once.
    cache_capacity: u64,
    /// Timeout applied to each `robots.txt` fetch.
    request_timeout: Duration,
    /// Per-origin cache: origin serialization -> `robots.txt` body.
    cache: Cache<String, Arc<String>>,
}
28
29impl Default for RobotsTxtMiddleware {
30    fn default() -> Self {
31        let cache_ttl = Duration::from_secs(60 * 60 * 24);
32        let cache_capacity = 10_000;
33        let cache = Cache::builder()
34            .time_to_live(cache_ttl)
35            .max_capacity(cache_capacity)
36            .build();
37
38        let middleware = Self {
39            cache_ttl,
40            cache_capacity,
41            request_timeout: Duration::from_secs(5),
42            cache,
43        };
44        info!(
45            "Initializing RobotsTxtMiddleware with config: {:?}",
46            middleware
47        );
48        middleware
49    }
50}
51
52impl RobotsTxtMiddleware {
53    /// Creates a new `RobotsTxtMiddleware` with default settings.
54    pub fn new() -> Self {
55        Self::default()
56    }
57
58    /// Set the time-to-live for the cache.
59    pub fn cache_ttl(mut self, cache_ttl: Duration) -> Self {
60        self.cache_ttl = cache_ttl;
61        self.rebuild_cache();
62        self
63    }
64
65    /// Set the max capacity for the cache.
66    pub fn cache_capacity(mut self, cache_capacity: u64) -> Self {
67        self.cache_capacity = cache_capacity;
68        self.rebuild_cache();
69        self
70    }
71
72    /// Set the timeout for fetching robots.txt files.
73    pub fn request_timeout(mut self, request_timeout: Duration) -> Self {
74        self.request_timeout = request_timeout;
75        self
76    }
77
78    /// Rebuilds the cache with the current settings.
79    fn rebuild_cache(&mut self) {
80        self.cache = Cache::builder()
81            .time_to_live(self.cache_ttl)
82            .max_capacity(self.cache_capacity)
83            .build();
84    }
85
86    async fn fetch_robots_content<C: HttpClient>(&self, client: &C, origin: &str) -> Arc<String> {
87        let robots_url = format!("{}/robots.txt", origin);
88        debug!("Fetching robots.txt from: {}", robots_url);
89
90        let permissive = || Arc::new(String::new());
91
92        match client.get_text(&robots_url, self.request_timeout).await {
93            Ok((status, body)) if status.is_success() => match String::from_utf8(body.into()) {
94                Ok(text) => Arc::new(text),
95                Err(e) => {
96                    warn!("Failed to read robots.txt {}: {}", robots_url, e);
97                    permissive()
98                }
99            },
100            Ok((status, _)) => {
101                debug!(
102                    "robots.txt {} returned {} — allowing all",
103                    robots_url, status
104                );
105                permissive()
106            }
107            Err(e) => {
108                warn!("Failed to fetch robots.txt {}: {}", robots_url, e);
109                permissive()
110            }
111        }
112    }
113}
114
115#[async_trait]
116impl<C: HttpClient> Middleware<C> for RobotsTxtMiddleware {
117    fn name(&self) -> &str {
118        "RobotsTxtMiddleware"
119    }
120
121    async fn process_request(
122        &self,
123        client: &C,
124        request: Request,
125    ) -> Result<MiddlewareAction<Request>, SpiderError> {
126        let url = request.url.clone();
127        let origin = match url.origin().unicode_serialization() {
128            s if s == "null" => return Ok(MiddlewareAction::Continue(request)),
129            s => s,
130        };
131
132        let robots_body = match self.cache.get(&origin).await {
133            Some(body) => body,
134            None => {
135                let body = self.fetch_robots_content(client, &origin).await;
136                self.cache.insert(origin.clone(), body.clone()).await;
137                body
138            }
139        };
140
141        if let Some(user_agent) = request.headers.get(USER_AGENT) {
142            let ua = user_agent
143                .to_str()
144                .map_err(|e| SpiderError::HeaderValueError(e.to_string()))?;
145
146            let mut matcher = DefaultMatcher::default();
147            if matcher.one_agent_allowed_by_robots(robots_body.as_str(), ua, url.as_str()) {
148                return Ok(MiddlewareAction::Continue(request));
149            }
150        }
151
152        debug!("Blocked by robots.txt: {}", url);
153        Err(SpiderError::BlockedByRobotsTxt)
154    }
155}