// spider_util/response.rs
1//! Response types and response-side helpers.
2//!
3//! [`Response`] wraps the downloaded body together with the final URL, status,
4//! headers, and request metadata. It also provides convenience methods for
5//! Scrapy-like CSS extraction, parsing JSON, and extracting links.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_util::response::Response;
11//! use reqwest::StatusCode;
12//! use bytes::Bytes;
13//! use url::Url;
14//!
15//! // Create a response (typically done internally by the downloader)
16//! let response = Response {
17//! url: Url::parse("https://example.com").unwrap(),
18//! status: StatusCode::OK,
19//! headers: http::header::HeaderMap::new(),
20//! body: Bytes::from("<html><body>Hello</body></html>"),
21//! request_url: Url::parse("https://example.com").unwrap(),
22//! request_priority: 0,
23//! meta: None,
24//! cached: false,
25//! };
26//!
27//! // Extract text with the builtin selector API
28//! let heading = response.css("h1::text").unwrap().get();
29//!
30//! // Extract links from the response
31//! let links = response.links();
32//! ```
33//!
34//! In the crawler lifecycle, a [`Response`] is produced by the downloader,
35//! optionally rewritten by middleware, and then handed to
36//! [`Spider::parse`](spider_core::Spider::parse).
37
38use crate::error::SpiderError;
39use crate::request::Request;
40use crate::selector::{SelectorList, get_cached_selector};
41use crate::util;
42use dashmap::{DashMap, DashSet};
43use linkify::{LinkFinder, LinkKind};
44use reqwest::StatusCode;
45use scraper::{ElementRef, Html};
46use seahash::SeaHasher;
47use serde::de::DeserializeOwned;
48use serde::{Deserialize, Serialize};
49use serde_json;
50use std::cell::RefCell;
51use std::collections::HashMap;
52use std::hash::{Hash, Hasher};
53use std::{str::Utf8Error, str::from_utf8, sync::Arc};
54use url::Url;
55
thread_local! {
    // Per-thread memo of parsed HTML documents, keyed by the hash produced
    // by `Response::html_cache_key` (final URL + request URL + body).
    // Populated and read by `Response::cached_html` so repeated selector /
    // link-extraction calls on the same response parse the body only once.
    static HTML_CACHE: RefCell<HashMap<u64, Arc<Html>>> = RefCell::new(HashMap::new());
}
59
/// Meta key under which the runtime records the name of the discovery rule
/// that led to a response; read by [`Response::discovery_rule_name`].
const DISCOVERY_RULE_META_KEY: &str = "__discovery_rule";
61
/// Classification for links discovered in a response.
///
/// The variant is inferred from the HTML element that carried the URL (see
/// `infer_link_type`) unless a [`LinkSource`] pins an explicit type.
///
/// ## Variants
///
/// - `Page`: Links to other web pages (typically `<a>` tags)
/// - `Script`: Links to JavaScript files (`<script>` tags)
/// - `Stylesheet`: Links to CSS stylesheets (`<link rel="stylesheet">`)
/// - `Image`: Links to images (`<img>` tags)
/// - `Media`: Links to audio/video files (`<audio>`, `<video>`, `<source>`)
/// - `Other`: Any other type of resource with a custom identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to another type of resource; the payload is the tag name or
    /// the `<link>` element's `rel` value that produced it.
    Other(String),
}
87
/// A link discovered while extracting URLs from a response.
///
/// Equality and hashing cover both fields, so the same URL discovered as,
/// say, both a `Page` and an `Image` produces two distinct set entries.
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::{Link, LinkType};
/// use url::Url;
///
/// let link = Link {
///     url: Url::parse("https://example.com/page").unwrap(),
///     link_type: LinkType::Page,
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Link {
    /// The URL of the discovered link.
    pub url: Url,
    /// The type of the discovered link.
    pub link_type: LinkType,
}
108
/// One selector/attribute pair used during link extraction.
///
/// This is useful when the default HTML link sources are not enough for the
/// target site and you need to teach the extractor about custom attributes.
///
/// NOTE: `LinkExtractOptions::with_allowed_attributes` stores its entries
/// lowercased and the extractor compares them for exact equality with this
/// `attribute`, so prefer lowercase attribute names here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkSource {
    /// CSS selector used to find candidate elements.
    pub selector: String,
    /// Attribute name that contains the URL.
    pub attribute: String,
    /// Optional fixed link type for matches from this source; `None` means
    /// the type is inferred from the matched element's tag.
    pub link_type: Option<LinkType>,
}
122
123impl LinkSource {
124 /// Creates a new source definition.
125 pub fn new(selector: impl Into<String>, attribute: impl Into<String>) -> Self {
126 Self {
127 selector: selector.into(),
128 attribute: attribute.into(),
129 link_type: None,
130 }
131 }
132
133 /// Overrides the inferred link type for this source.
134 pub fn with_link_type(mut self, link_type: LinkType) -> Self {
135 self.link_type = Some(link_type);
136 self
137 }
138}
139
/// Options that control link extraction from a [`Response`].
///
/// The defaults are intentionally conservative for crawler use: same-site
/// filtering is enabled, text links are included, and common HTML elements are
/// scanned for navigable URLs.
///
/// List-valued filters are inactive while empty; `Option`-valued allow-lists
/// are inactive while `None`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkExtractOptions {
    /// Restrict discovered links to the same registered domain.
    pub same_site_only: bool,
    /// Include URLs found in text content.
    pub include_text_links: bool,
    /// HTML sources used to discover attribute-based links.
    pub sources: Vec<LinkSource>,
    /// Allow-list of link types to include (`None` = all types allowed).
    pub allowed_link_types: Option<Vec<LinkType>>,
    /// Deny-list of link types to exclude (empty = nothing excluded).
    pub denied_link_types: Vec<LinkType>,
    /// Allow-list of glob-style URL patterns (`*` and `?` supported); empty = allow all.
    pub allow_patterns: Vec<String>,
    /// Deny-list of glob-style URL patterns (`*` and `?` supported).
    pub deny_patterns: Vec<String>,
    /// Allow-list of domains or registered-domain suffixes; empty = allow all.
    pub allow_domains: Vec<String>,
    /// Deny-list of domains or registered-domain suffixes.
    pub deny_domains: Vec<String>,
    /// Allow-list of URL path prefixes; empty = allow all.
    pub allow_path_prefixes: Vec<String>,
    /// Deny-list of URL path prefixes.
    pub deny_path_prefixes: Vec<String>,
    /// Allow-list of HTML tag names used for attribute extraction (`None` = all tags).
    pub allowed_tags: Option<Vec<String>>,
    /// Allow-list of attribute names used for attribute extraction (`None` = all attributes).
    pub allowed_attributes: Option<Vec<String>>,
}
174
175impl Default for LinkExtractOptions {
176 fn default() -> Self {
177 Self {
178 same_site_only: true,
179 include_text_links: true,
180 sources: default_link_sources(),
181 allowed_link_types: None,
182 denied_link_types: Vec::new(),
183 allow_patterns: Vec::new(),
184 deny_patterns: Vec::new(),
185 allow_domains: Vec::new(),
186 deny_domains: Vec::new(),
187 allow_path_prefixes: Vec::new(),
188 deny_path_prefixes: Vec::new(),
189 allowed_tags: None,
190 allowed_attributes: None,
191 }
192 }
193}
194
195impl LinkExtractOptions {
196 /// Sets whether only same-site URLs should be returned.
197 pub fn same_site_only(mut self, same_site_only: bool) -> Self {
198 self.same_site_only = same_site_only;
199 self
200 }
201
202 /// Sets whether URLs found in text content should be returned.
203 pub fn include_text_links(mut self, include_text_links: bool) -> Self {
204 self.include_text_links = include_text_links;
205 self
206 }
207
208 /// Replaces the configured HTML extraction sources.
209 pub fn with_sources(mut self, sources: impl IntoIterator<Item = LinkSource>) -> Self {
210 self.sources = sources.into_iter().collect();
211 self
212 }
213
214 /// Adds an HTML extraction source.
215 pub fn add_source(mut self, source: LinkSource) -> Self {
216 self.sources.push(source);
217 self
218 }
219
220 /// Restricts extraction to the provided link types.
221 pub fn with_allowed_link_types(
222 mut self,
223 allowed_link_types: impl IntoIterator<Item = LinkType>,
224 ) -> Self {
225 self.allowed_link_types = Some(allowed_link_types.into_iter().collect());
226 self
227 }
228
229 /// Adds link types that should be excluded even if discovered.
230 pub fn with_denied_link_types(
231 mut self,
232 denied_link_types: impl IntoIterator<Item = LinkType>,
233 ) -> Self {
234 self.denied_link_types = denied_link_types.into_iter().collect();
235 self
236 }
237
238 /// Adds a glob-style allow pattern that URLs must match.
239 pub fn allow_pattern(mut self, pattern: impl Into<String>) -> Self {
240 self.allow_patterns.push(pattern.into());
241 self
242 }
243
244 /// Replaces the glob-style allow patterns.
245 pub fn with_allow_patterns(
246 mut self,
247 patterns: impl IntoIterator<Item = impl Into<String>>,
248 ) -> Self {
249 self.allow_patterns = patterns.into_iter().map(Into::into).collect();
250 self
251 }
252
253 /// Adds a glob-style deny pattern that excludes matching URLs.
254 pub fn deny_pattern(mut self, pattern: impl Into<String>) -> Self {
255 self.deny_patterns.push(pattern.into());
256 self
257 }
258
259 /// Replaces the glob-style deny patterns.
260 pub fn with_deny_patterns(
261 mut self,
262 patterns: impl IntoIterator<Item = impl Into<String>>,
263 ) -> Self {
264 self.deny_patterns = patterns.into_iter().map(Into::into).collect();
265 self
266 }
267
268 /// Adds a domain or registered-domain suffix to allow.
269 pub fn allow_domain(mut self, domain: impl Into<String>) -> Self {
270 self.allow_domains.push(normalize_domain_filter(domain));
271 self
272 }
273
274 /// Replaces the allowed domains.
275 pub fn with_allow_domains(
276 mut self,
277 domains: impl IntoIterator<Item = impl Into<String>>,
278 ) -> Self {
279 self.allow_domains = domains.into_iter().map(normalize_domain_filter).collect();
280 self
281 }
282
283 /// Adds a domain or registered-domain suffix to deny.
284 pub fn deny_domain(mut self, domain: impl Into<String>) -> Self {
285 self.deny_domains.push(normalize_domain_filter(domain));
286 self
287 }
288
289 /// Replaces the denied domains.
290 pub fn with_deny_domains(
291 mut self,
292 domains: impl IntoIterator<Item = impl Into<String>>,
293 ) -> Self {
294 self.deny_domains = domains.into_iter().map(normalize_domain_filter).collect();
295 self
296 }
297
298 /// Adds a URL path prefix that links must match.
299 pub fn allow_path_prefix(mut self, prefix: impl Into<String>) -> Self {
300 self.allow_path_prefixes.push(normalize_path_prefix(prefix));
301 self
302 }
303
304 /// Replaces the allowed URL path prefixes.
305 pub fn with_allow_path_prefixes(
306 mut self,
307 prefixes: impl IntoIterator<Item = impl Into<String>>,
308 ) -> Self {
309 self.allow_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
310 self
311 }
312
313 /// Adds a URL path prefix that should be excluded.
314 pub fn deny_path_prefix(mut self, prefix: impl Into<String>) -> Self {
315 self.deny_path_prefixes.push(normalize_path_prefix(prefix));
316 self
317 }
318
319 /// Replaces the denied URL path prefixes.
320 pub fn with_deny_path_prefixes(
321 mut self,
322 prefixes: impl IntoIterator<Item = impl Into<String>>,
323 ) -> Self {
324 self.deny_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
325 self
326 }
327
328 /// Restricts attribute-based extraction to specific HTML tag names.
329 pub fn with_allowed_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
330 self.allowed_tags = Some(
331 tags.into_iter()
332 .map(Into::into)
333 .map(|tag: String| tag.to_ascii_lowercase())
334 .collect(),
335 );
336 self
337 }
338
339 /// Restricts attribute-based extraction to specific attribute names.
340 pub fn with_allowed_attributes(
341 mut self,
342 attributes: impl IntoIterator<Item = impl Into<String>>,
343 ) -> Self {
344 self.allowed_attributes = Some(
345 attributes
346 .into_iter()
347 .map(Into::into)
348 .map(|attr: String| attr.to_ascii_lowercase())
349 .collect(),
350 );
351 self
352 }
353}
354
/// Structured page metadata extracted from an HTML response.
///
/// Populated by [`Response::page_metadata`]; for `title`, `description`, and
/// `canonical_url` the first matching element in document order wins.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageMetadata {
    /// Contents of the `<title>` element (trimmed; `None` when empty).
    pub title: Option<String>,
    /// Contents of `<meta name="description">`.
    pub description: Option<String>,
    /// Canonical URL from `<link rel="canonical">`, resolved against the
    /// response URL.
    pub canonical_url: Option<Url>,
    /// Open Graph metadata; keys keep their full `og:` prefix, e.g.
    /// `og:title` or `og:image`.
    pub open_graph: HashMap<String, String>,
    /// Feed URLs discovered from alternate RSS/Atom link tags (deduplicated).
    pub feed_urls: Vec<Url>,
}
369
370impl PageMetadata {
371 /// Returns `true` when no metadata fields were extracted.
372 pub fn is_empty(&self) -> bool {
373 self.title.is_none()
374 && self.description.is_none()
375 && self.canonical_url.is_none()
376 && self.open_graph.is_empty()
377 && self.feed_urls.is_empty()
378 }
379}
380
/// Represents an HTTP response received from a server.
///
/// [`Response`] contains all information about an HTTP response, including
/// the final URL (after redirects), status code, headers, body content,
/// and metadata carried over from the original request.
///
/// The type is designed for parse-time ergonomics:
/// - [`Response::css`] exposes the recommended Scrapy-like selector API
/// - [`Response::json`] deserializes JSON payloads
/// - [`Response::links`] and related helpers extract follow-up links
/// - [`Response::request_from_response`] reconstructs the originating request context
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::Response;
/// use reqwest::StatusCode;
/// use bytes::Bytes;
/// use url::Url;
///
/// let response = Response {
///     url: Url::parse("https://example.com").unwrap(),
///     status: StatusCode::OK,
///     headers: http::header::HeaderMap::new(),
///     body: Bytes::from("<html><body>Hello</body></html>"),
///     request_url: Url::parse("https://example.com").unwrap(),
///     request_priority: 0,
///     meta: None,
///     cached: false,
/// };
///
/// // Extract text using the builtin selector API
/// let title = response.css("title::text").ok().and_then(|list| list.get());
/// ```
#[derive(Debug)]
pub struct Response {
    /// The final URL of the response after any redirects.
    pub url: Url,
    /// The HTTP status code of the response.
    pub status: StatusCode,
    /// The headers of the response.
    pub headers: http::header::HeaderMap,
    /// The body of the response.
    pub body: bytes::Bytes,
    /// The original URL of the request that led to this response.
    pub request_url: Url,
    /// The scheduling priority of the original request.
    pub request_priority: i32,
    /// Metadata associated with the response, carried over from the request.
    /// Uses Option to allow lazy initialization.
    pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
    /// Indicates if the response was served from a cache.
    pub cached: bool,
}
434
435impl Response {
436 /// Creates a new response with an empty HTML cache.
437 ///
438 /// Most application code receives responses from the runtime rather than
439 /// constructing them directly. This constructor is mainly useful for custom
440 /// downloaders and lower-level integrations.
441 pub fn new(
442 url: Url,
443 status: StatusCode,
444 headers: http::header::HeaderMap,
445 body: bytes::Bytes,
446 request_url: Url,
447 ) -> Self {
448 Self {
449 url,
450 status,
451 headers,
452 body,
453 request_url,
454 request_priority: 0,
455 meta: None,
456 cached: false,
457 }
458 }
459
460 /// Reconstructs the original [`Request`] that led to this response.
461 ///
462 /// This method creates a new [`Request`] with the same URL and metadata
463 /// as the request that produced this response. Useful for retry scenarios
464 /// or when you need to re-request the same resource.
465 ///
466 /// ## Example
467 ///
468 /// ```rust,ignore
469 /// # use spider_util::response::Response;
470 /// # use reqwest::StatusCode;
471 /// # use bytes::Bytes;
472 /// # use url::Url;
473 /// # let response = Response {
474 /// # url: Url::parse("https://example.com").unwrap(),
475 /// # status: StatusCode::OK,
476 /// # headers: http::header::HeaderMap::new(),
477 /// # body: Bytes::from("hello"),
478 /// # request_url: Url::parse("https://example.com").unwrap(),
479 /// # request_priority: 0,
480 /// # meta: None,
481 /// # cached: false,
482 /// # };
483 /// let original_request = response.request_from_response();
484 /// ```
485 pub fn request_from_response(&self) -> Request {
486 let mut request =
487 Request::new(self.request_url.clone()).with_priority(self.request_priority);
488 request.set_meta_from_option(self.meta.clone());
489 request
490 }
491
492 /// Returns a cloned metadata value by key.
493 pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
494 self.meta
495 .as_ref()
496 .and_then(|m| m.get(key).map(|entry| entry.value().clone()))
497 }
498
499 /// Deserializes a metadata value into the requested type.
500 pub fn meta_value<T>(&self, key: &str) -> Result<Option<T>, serde_json::Error>
501 where
502 T: DeserializeOwned,
503 {
504 self.get_meta(key).map(serde_json::from_value).transpose()
505 }
506
507 /// Returns the runtime discovery rule name attached to this response, if any.
508 pub fn discovery_rule_name(&self) -> Option<String> {
509 self.get_meta(DISCOVERY_RULE_META_KEY)
510 .and_then(|value| value.as_str().map(ToOwned::to_owned))
511 }
512
513 /// Returns `true` when the response was reached through the named discovery rule.
514 pub fn matches_discovery_rule(&self, rule_name: &str) -> bool {
515 self.discovery_rule_name().as_deref() == Some(rule_name)
516 }
517
518 /// Inserts a metadata value, lazily allocating the map if needed.
519 pub fn insert_meta(&mut self, key: impl Into<String>, value: serde_json::Value) {
520 self.meta
521 .get_or_insert_with(|| Arc::new(DashMap::new()))
522 .insert(key.into(), value);
523 }
524
525 /// Returns a clone of the internal metadata map, if present.
526 pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
527 self.meta.clone()
528 }
529
    /// Deserializes the response body as JSON.
    ///
    /// # Type Parameters
    ///
    /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
    ///
    /// # Errors
    ///
    /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
    /// or if it cannot be deserialized into type `T`.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # use serde::Deserialize;
    /// # #[derive(Deserialize)]
    /// # struct Data { value: String }
    /// # let response = Response {
    /// #     url: Url::parse("https://api.example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"{"value": "test"}"#),
    /// #     request_url: Url::parse("https://api.example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let data: Data = response.json()?;
    /// # Ok::<(), serde_json::Error>(())
    /// ```
    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
        serde_json::from_slice(&self.body)
    }
566
567 /// Applies a builtin CSS selector to the response body using a Scrapy-like API.
568 ///
569 /// Supports standard CSS selectors plus terminal extraction suffixes:
570 /// - `::text`
571 /// - `::attr(name)`
572 ///
573 /// ## Example
574 ///
575 /// ```rust,ignore
576 /// # use spider_util::response::Response;
577 /// # use reqwest::StatusCode;
578 /// # use bytes::Bytes;
579 /// # use url::Url;
580 /// # let response = Response {
581 /// # url: Url::parse("https://example.com").unwrap(),
582 /// # status: StatusCode::OK,
583 /// # headers: http::header::HeaderMap::new(),
584 /// # body: Bytes::from(r#"<html><body><h1>Hello</h1><a href="/next">Next</a></body></html>"#),
585 /// # request_url: Url::parse("https://example.com").unwrap(),
586 /// # request_priority: 0,
587 /// # meta: None,
588 /// # cached: false,
589 /// # };
590 /// let heading = response.css("h1::text")?.get().unwrap_or_default();
591 /// let next_href = response.css("a::attr(href)")?.get();
592 /// # Ok::<(), crate::error::SpiderError>(())
593 /// ```
594 ///
595 /// # Errors
596 ///
597 /// Returns [`SpiderError::Utf8Error`] when the body is not valid UTF-8 and
598 /// [`SpiderError::HtmlParseError`] when the selector is invalid.
599 pub fn css(&self, query: &str) -> Result<SelectorList, SpiderError> {
600 let body = Arc::<str>::from(self.text()?);
601 SelectorList::from_document_query(body, self.html_cache_key(), query)
602 }
603
    /// Returns the response body as UTF-8 text (borrowed, zero-copy).
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] when the body is not valid UTF-8 (e.g. a
    /// binary payload); no lossy conversion is attempted.
    pub fn text(&self) -> Result<&str, Utf8Error> {
        from_utf8(&self.body)
    }
608
    /// Extracts structured page metadata from HTML responses.
    ///
    /// Collects the `<title>` text, the first `<meta name="description">`,
    /// Open Graph `og:*` properties, the canonical URL, and alternate
    /// RSS/Atom feed URLs. Relative URLs are resolved against the final
    /// response URL.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] when the body is not valid UTF-8.
    pub fn page_metadata(&self) -> Result<PageMetadata, Utf8Error> {
        let html = self.cached_html()?;
        let mut metadata = PageMetadata::default();

        // <title>: first occurrence wins; whitespace-only titles are dropped.
        if let Some(selector) = get_cached_selector("title") {
            metadata.title = html
                .select(&selector)
                .next()
                .map(|node| node.text().collect::<String>().trim().to_string())
                .filter(|value| !value.is_empty());
        }

        if let Some(selector) = get_cached_selector("meta[name], meta[property], meta[content]") {
            for element in html.select(&selector) {
                // Every meta we care about carries its payload in `content`.
                let Some(content) = element.value().attr("content") else {
                    continue;
                };
                let content = content.trim();
                if content.is_empty() {
                    continue;
                }

                // First non-empty <meta name="description"> wins.
                if let Some(name) = element.value().attr("name")
                    && name.eq_ignore_ascii_case("description")
                    && metadata.description.is_none()
                {
                    metadata.description = Some(content.to_string());
                }

                // Open Graph: keep the first value seen per `og:*` property;
                // keys retain their full property name (prefix included).
                if let Some(property) = element.value().attr("property")
                    && property.len() >= 3
                    && property[..3].eq_ignore_ascii_case("og:")
                {
                    metadata
                        .open_graph
                        .entry(property.to_string())
                        .or_insert_with(|| content.to_string());
                }
            }
        }

        if let Some(selector) = get_cached_selector("link[href]") {
            for element in html.select(&selector) {
                let Some(href) = element.value().attr("href") else {
                    continue;
                };
                // `rel` may hold several space-separated tokens.
                let rel = element.value().attr("rel").unwrap_or_default();

                // <link rel="canonical">: first resolvable href wins.
                if rel
                    .split_ascii_whitespace()
                    .any(|token| token.eq_ignore_ascii_case("canonical"))
                    && metadata.canonical_url.is_none()
                    && let Ok(url) = self.url.join(href)
                {
                    metadata.canonical_url = Some(url);
                }

                // <link rel="alternate" type="application/rss+xml"> (and the
                // other XML media types below) advertise syndication feeds.
                let is_alternate = rel
                    .split_ascii_whitespace()
                    .any(|token| token.eq_ignore_ascii_case("alternate"));
                let ty = element.value().attr("type").unwrap_or_default();
                let is_feed = ty.eq_ignore_ascii_case("application/rss+xml")
                    || ty.eq_ignore_ascii_case("application/atom+xml")
                    || ty.eq_ignore_ascii_case("application/xml")
                    || ty.eq_ignore_ascii_case("text/xml");

                if is_alternate
                    && is_feed
                    && let Ok(url) = self.url.join(href)
                    && !metadata.feed_urls.contains(&url)
                {
                    metadata.feed_urls.push(url);
                }
            }
        }

        Ok(metadata)
    }
688
689 /// Returns a customizable iterator of links discovered in the response body.
690 ///
691 /// Unlike [`Response::links`], this method does not deduplicate results.
692 /// Callers that need uniqueness can collect into a set or use [`Response::links`].
693 ///
694 /// ## Example
695 ///
696 /// ```rust,ignore
697 /// # use spider_util::response::{LinkExtractOptions, Response};
698 /// # use reqwest::StatusCode;
699 /// # use bytes::Bytes;
700 /// # use url::Url;
701 /// # let response = Response {
702 /// # url: Url::parse("https://example.com").unwrap(),
703 /// # status: StatusCode::OK,
704 /// # headers: http::header::HeaderMap::new(),
705 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
706 /// # request_url: Url::parse("https://example.com").unwrap(),
707 /// # meta: None,
708 /// # cached: false,
709 /// # };
710 /// let links: Vec<_> = response
711 /// .links_iter(LinkExtractOptions::default())
712 /// .collect();
713 /// assert!(!links.is_empty());
714 /// ```
715 pub fn links_iter(&self, options: LinkExtractOptions) -> impl Iterator<Item = Link> {
716 self.parse_links(options).unwrap_or_default().into_iter()
717 }
718
719 /// Extracts all unique, same-site links from the response body.
720 ///
721 /// This method discovers links from:
722 /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
723 /// - URLs found in text content (using link detection)
724 ///
725 /// Only links pointing to the same site (same registered domain) are included.
726 ///
727 /// ## Returns
728 ///
729 /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
730 ///
731 /// ## Example
732 ///
733 /// ```rust,ignore
734 /// # use spider_util::response::Response;
735 /// # use reqwest::StatusCode;
736 /// # use bytes::Bytes;
737 /// # use url::Url;
738 /// # let response = Response {
739 /// # url: Url::parse("https://example.com").unwrap(),
740 /// # status: StatusCode::OK,
741 /// # headers: http::header::HeaderMap::new(),
742 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
743 /// # request_url: Url::parse("https://example.com").unwrap(),
744 /// # meta: None,
745 /// # cached: false,
746 /// # };
747 /// let links = response.links();
748 /// for link in links.iter() {
749 /// println!("Found {:?} link: {}", link.link_type, link.url);
750 /// }
751 /// ```
752 pub fn links(&self) -> DashSet<Link> {
753 let links = DashSet::new();
754
755 for link in self.links_iter(LinkExtractOptions::default()) {
756 links.insert(link);
757 }
758
759 links
760 }
761
    /// Parses the body once and gathers links from HTML attributes and,
    /// optionally, from free text.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] when the body is not valid UTF-8.
    fn parse_links(&self, options: LinkExtractOptions) -> Result<Vec<Link>, Utf8Error> {
        let html = self.cached_html()?;
        let mut links = Vec::new();

        // URLs carried by element attributes (href/src and custom sources).
        self.collect_attribute_links(&html, &options, &mut links);

        // URLs embedded in plain text nodes, when enabled.
        if options.include_text_links {
            self.collect_text_links(&html, &options, &mut links);
        }

        Ok(links)
    }
774
    /// Collects links from the configured `(selector, attribute)` sources,
    /// appending accepted results to `links` in source order.
    ///
    /// URL-level filtering (same-site, patterns, domains, paths, types)
    /// happens in [`Response::build_link`]; this method only applies the
    /// tag/attribute allow-lists.
    fn collect_attribute_links(
        &self,
        html: &Html,
        options: &LinkExtractOptions,
        links: &mut Vec<Link>,
    ) {
        for source in &options.sources {
            // Skip sources whose attribute is not on the allow-list; a
            // missing allow-list (`None`) permits every attribute.
            if !options
                .allowed_attributes
                .as_ref()
                .is_none_or(|allowed| allowed.iter().any(|attr| attr == &source.attribute))
            {
                continue;
            }

            // Selectors are parsed lazily and cached; unparsable ones are skipped.
            let Some(selector) = get_cached_selector(&source.selector) else {
                continue;
            };

            for element in html.select(&selector) {
                let tag_name = element.value().name();
                // Same allow-list convention as attributes: `None` permits all tags.
                if !options
                    .allowed_tags
                    .as_ref()
                    .is_none_or(|allowed| allowed.iter().any(|tag| tag == tag_name))
                {
                    continue;
                }

                let Some(attr_value) = element.value().attr(&source.attribute) else {
                    continue;
                };

                // An explicit per-source link type wins over tag-based inference.
                let link_type = source
                    .link_type
                    .clone()
                    .unwrap_or_else(|| infer_link_type(&element));

                if let Some(link) = self.build_link(attr_value, link_type, options) {
                    links.push(link);
                }
            }
        }
    }
819
820 fn collect_text_links(&self, html: &Html, options: &LinkExtractOptions, links: &mut Vec<Link>) {
821 let finder = LinkFinder::new();
822
823 for text_node in html.tree.values().filter_map(|node| node.as_text()) {
824 for link in finder.links(text_node) {
825 if link.kind() != &LinkKind::Url {
826 continue;
827 }
828
829 if let Some(link) = self.build_link(link.as_str(), LinkType::Page, options) {
830 links.push(link);
831 }
832 }
833 }
834 }
835
    /// Resolves `raw_url` against the response URL and applies every
    /// configured filter, returning the accepted [`Link`] or `None`.
    ///
    /// Filters run in order: same-site, link-type allow/deny, glob-pattern
    /// allow/deny, domain allow/deny, path-prefix allow/deny. Allow-lists
    /// only restrict when non-empty (or `Some`); deny-lists always apply.
    fn build_link(
        &self,
        raw_url: &str,
        link_type: LinkType,
        options: &LinkExtractOptions,
    ) -> Option<Link> {
        // URLs that cannot be resolved are silently dropped.
        let url = self.url.join(raw_url).ok()?;

        if options.same_site_only && !util::is_same_site(&url, &self.url) {
            return None;
        }

        // `None` allow-list means "all link types allowed".
        if !options
            .allowed_link_types
            .as_ref()
            .is_none_or(|allowed| allowed.contains(&link_type))
        {
            return None;
        }

        if options.denied_link_types.contains(&link_type) {
            return None;
        }

        // Glob patterns are matched against the full absolute URL string.
        let absolute_url = url.as_str();
        if !options.allow_patterns.is_empty()
            && !options
                .allow_patterns
                .iter()
                .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        if options
            .deny_patterns
            .iter()
            .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        // Host-less URLs (e.g. `mailto:`) are compared as an empty host here.
        let host = url.host_str().unwrap_or_default();
        if !options.allow_domains.is_empty()
            && !options
                .allow_domains
                .iter()
                .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        if options
            .deny_domains
            .iter()
            .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        let path = url.path();
        if !options.allow_path_prefixes.is_empty()
            && !options
                .allow_path_prefixes
                .iter()
                .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        if options
            .deny_path_prefixes
            .iter()
            .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        Some(Link { url, link_type })
    }
916
917 fn html_cache_key(&self) -> u64 {
918 let mut hasher = SeaHasher::new();
919 self.url.as_str().hash(&mut hasher);
920 self.request_url.as_str().hash(&mut hasher);
921 self.body.hash(&mut hasher);
922 hasher.finish()
923 }
924
925 fn cached_html(&self) -> Result<Arc<Html>, Utf8Error> {
926 let cache_key = self.html_cache_key();
927
928 HTML_CACHE.with(|cache| {
929 if let Some(html) = cache.borrow().get(&cache_key).cloned() {
930 return Ok(html);
931 }
932
933 let body_str = from_utf8(&self.body)?;
934 let html = Arc::new(Html::parse_document(body_str));
935 cache.borrow_mut().insert(cache_key, html.clone());
936 Ok(html)
937 })
938 }
939}
940
// NOTE(review): every field here is `Clone`, so `#[derive(Clone)]` on
// `Response` would replace this hand-written impl — confirm nothing depends
// on the manual impl before switching.
impl Clone for Response {
    fn clone(&self) -> Self {
        Response {
            url: self.url.clone(),
            status: self.status,
            headers: self.headers.clone(),
            // `Bytes` clones are shallow and `meta` is an `Arc`, so neither
            // copies the underlying data.
            body: self.body.clone(),
            request_url: self.request_url.clone(),
            request_priority: self.request_priority,
            meta: self.meta.clone(),
            cached: self.cached,
        }
    }
}
955
956fn default_link_sources() -> Vec<LinkSource> {
957 vec![
958 LinkSource::new("a[href]", "href"),
959 LinkSource::new("link[href]", "href"),
960 LinkSource::new("script[src]", "src"),
961 LinkSource::new("img[src]", "src"),
962 LinkSource::new("audio[src]", "src"),
963 LinkSource::new("video[src]", "src"),
964 LinkSource::new("source[src]", "src"),
965 ]
966}
967
968fn infer_link_type(element: &ElementRef<'_>) -> LinkType {
969 match element.value().name() {
970 "a" => LinkType::Page,
971 "link" => {
972 if let Some(rel) = element.value().attr("rel") {
973 if rel.eq_ignore_ascii_case("stylesheet") {
974 LinkType::Stylesheet
975 } else {
976 LinkType::Other(rel.to_string())
977 }
978 } else {
979 LinkType::Other("link".to_string())
980 }
981 }
982 "script" => LinkType::Script,
983 "img" => LinkType::Image,
984 "audio" | "video" | "source" => LinkType::Media,
985 _ => LinkType::Other(element.value().name().to_string()),
986 }
987}
988
/// Normalizes a user-supplied domain filter: trims surrounding whitespace,
/// drops any leading dots, and lowercases for case-insensitive matching.
fn normalize_domain_filter(domain: impl Into<String>) -> String {
    let owned = domain.into();
    let stripped = owned.trim().trim_start_matches('.');
    stripped.to_ascii_lowercase()
}
996
/// Normalizes a path-prefix filter so it always starts with `/`.
///
/// Empty (or whitespace-only) and `/` inputs collapse to `/`; other values
/// keep their text with a leading slash prepended when missing.
fn normalize_path_prefix(prefix: impl Into<String>) -> String {
    let raw = prefix.into();
    match raw.trim() {
        "" | "/" => String::from("/"),
        p if p.starts_with('/') => p.to_string(),
        p => format!("/{p}"),
    }
}
1008
/// Returns `true` when `host` equals `filter` or is a subdomain of it,
/// comparing ASCII case-insensitively.
fn domain_matches(host: &str, filter: &str) -> bool {
    let host = host.to_ascii_lowercase();
    let filter = filter.to_ascii_lowercase();
    if host == filter {
        return true;
    }
    // Subdomain check: the host must end with ".<filter>".
    host.strip_suffix(filter.as_str())
        .is_some_and(|rest| rest.ends_with('.'))
}
1014
/// Byte-wise glob match supporting `*` (any run of bytes, including empty)
/// and `?` (exactly one byte).
///
/// Classic greedy two-pointer algorithm with single-star backtracking.
/// Comparison is on raw bytes, so `?` consumes one *byte*, not one Unicode
/// character — fine for ASCII URLs, worth remembering for multi-byte input.
fn glob_matches(pattern: &str, input: &str) -> bool {
    let pat = pattern.as_bytes();
    let text = input.as_bytes();
    let (mut pi, mut ti) = (0usize, 0usize);
    // Index of the most recent '*' in `pat`, plus the text position that
    // star is currently assumed to have consumed up to.
    let mut star: Option<usize> = None;
    let mut star_text = 0usize;

    while ti < text.len() {
        let literal_match = pi < pat.len() && (pat[pi] == b'?' || pat[pi] == text[ti]);
        if literal_match {
            pi += 1;
            ti += 1;
        } else if pi < pat.len() && pat[pi] == b'*' {
            // Tentatively let '*' match the empty string; remember where to
            // backtrack if the remainder of the pattern later fails.
            star = Some(pi);
            star_text = ti;
            pi += 1;
        } else if let Some(star_pi) = star {
            // Backtrack: extend the last '*' by one more byte and retry.
            star_text += 1;
            pi = star_pi + 1;
            ti = star_text;
        } else {
            return false;
        }
    }

    // Any trailing '*'s may match the empty string.
    while pi < pat.len() && pat[pi] == b'*' {
        pi += 1;
    }

    pi == pat.len()
}