// spider_middleware/robots.rs
use std::sync::Arc;
use std::time::Duration;

use async_trait::async_trait;
use http::header::USER_AGENT;
use log::{debug, info, warn};
use moka::future::Cache;
use robotstxt::DefaultMatcher;

use crate::middleware::{Middleware, MiddlewareAction};
use spider_util::error::SpiderError;
use spider_util::http_client::HttpClient;
use spider_util::request::Request;
/// Middleware that filters outgoing requests against each origin's
/// robots.txt, caching fetched robots.txt bodies per origin.
#[derive(Debug)]
pub struct RobotsTxtMiddleware {
    /// How long a fetched robots.txt body stays cached before re-fetching.
    cache_ttl: Duration,
    /// Maximum number of origins whose robots.txt is cached at once.
    cache_capacity: u64,
    /// Timeout applied to each individual robots.txt fetch.
    request_timeout: Duration,
    /// Per-origin cache of robots.txt bodies; an empty body means
    /// "allow everything" (used when the fetch fails or is non-2xx).
    cache: Cache<String, Arc<String>>,
}
28
29impl Default for RobotsTxtMiddleware {
30 fn default() -> Self {
31 let cache_ttl = Duration::from_secs(60 * 60 * 24);
32 let cache_capacity = 10_000;
33 let cache = Cache::builder()
34 .time_to_live(cache_ttl)
35 .max_capacity(cache_capacity)
36 .build();
37
38 let middleware = Self {
39 cache_ttl,
40 cache_capacity,
41 request_timeout: Duration::from_secs(5),
42 cache,
43 };
44 info!(
45 "Initializing RobotsTxtMiddleware with config: {:?}",
46 middleware
47 );
48 middleware
49 }
50}
51
52impl RobotsTxtMiddleware {
53 pub fn new() -> Self {
55 Self::default()
56 }
57
58 pub fn cache_ttl(mut self, cache_ttl: Duration) -> Self {
60 self.cache_ttl = cache_ttl;
61 self.rebuild_cache();
62 self
63 }
64
65 pub fn cache_capacity(mut self, cache_capacity: u64) -> Self {
67 self.cache_capacity = cache_capacity;
68 self.rebuild_cache();
69 self
70 }
71
72 pub fn request_timeout(mut self, request_timeout: Duration) -> Self {
74 self.request_timeout = request_timeout;
75 self
76 }
77
78 fn rebuild_cache(&mut self) {
80 self.cache = Cache::builder()
81 .time_to_live(self.cache_ttl)
82 .max_capacity(self.cache_capacity)
83 .build();
84 }
85
86 async fn fetch_robots_content<C: HttpClient>(&self, client: &C, origin: &str) -> Arc<String> {
87 let robots_url = format!("{}/robots.txt", origin);
88 debug!("Fetching robots.txt from: {}", robots_url);
89
90 let permissive = || Arc::new(String::new());
91
92 match client.get_text(&robots_url, self.request_timeout).await {
93 Ok((status, body)) if status.is_success() => match String::from_utf8(body.into()) {
94 Ok(text) => Arc::new(text),
95 Err(e) => {
96 warn!("Failed to read robots.txt {}: {}", robots_url, e);
97 permissive()
98 }
99 },
100 Ok((status, _)) => {
101 debug!(
102 "robots.txt {} returned {} — allowing all",
103 robots_url, status
104 );
105 permissive()
106 }
107 Err(e) => {
108 warn!("Failed to fetch robots.txt {}: {}", robots_url, e);
109 permissive()
110 }
111 }
112 }
113}
114
115#[async_trait]
116impl<C: HttpClient> Middleware<C> for RobotsTxtMiddleware {
117 fn name(&self) -> &str {
118 "RobotsTxtMiddleware"
119 }
120
121 async fn process_request(
122 &self,
123 client: &C,
124 request: Request,
125 ) -> Result<MiddlewareAction<Request>, SpiderError> {
126 let url = request.url.clone();
127 let origin = match url.origin().unicode_serialization() {
128 s if s == "null" => return Ok(MiddlewareAction::Continue(request)),
129 s => s,
130 };
131
132 let robots_body = match self.cache.get(&origin).await {
133 Some(body) => body,
134 None => {
135 let body = self.fetch_robots_content(client, &origin).await;
136 self.cache.insert(origin.clone(), body.clone()).await;
137 body
138 }
139 };
140
141 if let Some(user_agent) = request.headers.get(USER_AGENT) {
142 let ua = user_agent
143 .to_str()
144 .map_err(|e| SpiderError::HeaderValueError(e.to_string()))?;
145
146 let mut matcher = DefaultMatcher::default();
147 if matcher.one_agent_allowed_by_robots(robots_body.as_str(), ua, url.as_str()) {
148 return Ok(MiddlewareAction::Continue(request));
149 }
150 }
151
152 debug!("Blocked by robots.txt: {}", url);
153 Err(SpiderError::BlockedByRobotsTxt)
154 }
155}