// spider_util/response.rs
1//! Response types and response-side helpers.
2//!
3//! [`Response`] wraps the downloaded body together with the final URL, status,
4//! headers, and request metadata. It also provides convenience methods for
5//! Scrapy-like CSS extraction, parsing JSON, and extracting links.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_util::response::Response;
11//! use reqwest::StatusCode;
12//! use bytes::Bytes;
13//! use url::Url;
14//!
15//! // Create a response (typically done internally by the downloader)
16//! let response = Response {
17//! url: Url::parse("https://example.com").unwrap(),
18//! status: StatusCode::OK,
19//! headers: http::header::HeaderMap::new(),
20//! body: Bytes::from("<html><body>Hello</body></html>"),
21//! request_url: Url::parse("https://example.com").unwrap(),
22//! request_priority: 0,
23//! meta: None,
24//! cached: false,
25//! };
26//!
27//! // Extract text with the builtin selector API
28//! let heading = response.css("h1::text").unwrap().get();
29//!
30//! // Extract links from the response
31//! let links = response.links();
32//! ```
33//!
34//! In the crawler lifecycle, a [`Response`] is produced by the downloader,
35//! optionally rewritten by middleware, and then handed to
36//! [`Spider::parse`](spider_core::Spider::parse).
37
38use crate::error::SpiderError;
39use crate::request::Request;
40use crate::selector::{SelectorList, get_cached_selector};
41use crate::util;
42use dashmap::{DashMap, DashSet};
43use linkify::{LinkFinder, LinkKind};
44use reqwest::StatusCode;
45use scraper::{ElementRef, Html};
46use seahash::SeaHasher;
47use serde::de::DeserializeOwned;
48use serde::{Deserialize, Serialize};
49use serde_json;
50use std::cell::RefCell;
51use std::collections::HashMap;
52use std::hash::{Hash, Hasher};
53use std::{str::Utf8Error, str::from_utf8, sync::Arc};
54use url::Url;
55
thread_local! {
    // Per-thread memo of parsed HTML documents, keyed by the hash produced
    // by `Response::html_cache_key` (final URL + request URL + body).
    // Populated and read by `Response::cached_html` so repeated selector /
    // link-extraction calls on the same response parse the body only once.
    static HTML_CACHE: RefCell<HashMap<u64, Arc<Html>>> = RefCell::new(HashMap::new());
}
59
/// Meta key under which the runtime records the name of the discovery rule
/// that led to a response; read by [`Response::discovery_rule_name`].
const DISCOVERY_RULE_META_KEY: &str = "__discovery_rule";
61
/// Classification for links discovered in a response.
///
/// The variant is inferred from the HTML element that carried the URL (see
/// `infer_link_type`) unless a [`LinkSource`] pins an explicit type.
///
/// ## Variants
///
/// - `Page`: Links to other web pages (typically `<a>` tags)
/// - `Script`: Links to JavaScript files (`<script>` tags)
/// - `Stylesheet`: Links to CSS stylesheets (`<link rel="stylesheet">`)
/// - `Image`: Links to images (`<img>` tags)
/// - `Media`: Links to audio/video files (`<audio>`, `<video>`, `<source>`)
/// - `Other`: Any other type of resource with a custom identifier
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum LinkType {
    /// A link to another web page.
    Page,
    /// A link to a script file.
    Script,
    /// A link to a stylesheet.
    Stylesheet,
    /// A link to an image.
    Image,
    /// A link to a media file (audio/video).
    Media,
    /// A link to another type of resource; the payload is the tag name or
    /// the `<link>` element's `rel` value that produced it.
    Other(String),
}
87
/// A link discovered while extracting URLs from a response.
///
/// Equality and hashing cover both fields, so the same URL discovered as,
/// say, both a `Page` and an `Image` produces two distinct set entries.
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::{Link, LinkType};
/// use url::Url;
///
/// let link = Link {
///     url: Url::parse("https://example.com/page").unwrap(),
///     link_type: LinkType::Page,
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Link {
    /// The URL of the discovered link.
    pub url: Url,
    /// The type of the discovered link.
    pub link_type: LinkType,
}
108
/// One selector/attribute pair used during link extraction.
///
/// This is useful when the default HTML link sources are not enough for the
/// target site and you need to teach the extractor about custom attributes.
///
/// NOTE: `LinkExtractOptions::with_allowed_attributes` stores its entries
/// lowercased and the extractor compares them for exact equality with this
/// `attribute`, so prefer lowercase attribute names here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkSource {
    /// CSS selector used to find candidate elements.
    pub selector: String,
    /// Attribute name that contains the URL.
    pub attribute: String,
    /// Optional fixed link type for matches from this source; `None` means
    /// the type is inferred from the matched element's tag.
    pub link_type: Option<LinkType>,
}
122
123impl LinkSource {
124 /// Creates a new source definition.
125 pub fn new(selector: impl Into<String>, attribute: impl Into<String>) -> Self {
126 Self {
127 selector: selector.into(),
128 attribute: attribute.into(),
129 link_type: None,
130 }
131 }
132
133 /// Overrides the inferred link type for this source.
134 pub fn with_link_type(mut self, link_type: LinkType) -> Self {
135 self.link_type = Some(link_type);
136 self
137 }
138}
139
/// Options that control link extraction from a [`Response`].
///
/// The defaults are intentionally conservative for crawler use: same-site
/// filtering is enabled, text links are included, and common HTML elements are
/// scanned for navigable URLs.
///
/// List-valued filters are inactive while empty; `Option`-valued allow-lists
/// are inactive while `None`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LinkExtractOptions {
    /// Restrict discovered links to the same registered domain.
    pub same_site_only: bool,
    /// Include URLs found in text content.
    pub include_text_links: bool,
    /// HTML sources used to discover attribute-based links.
    pub sources: Vec<LinkSource>,
    /// Allow-list of link types to include (`None` = all types allowed).
    pub allowed_link_types: Option<Vec<LinkType>>,
    /// Deny-list of link types to exclude (empty = nothing excluded).
    pub denied_link_types: Vec<LinkType>,
    /// Allow-list of glob-style URL patterns (`*` and `?` supported); empty = allow all.
    pub allow_patterns: Vec<String>,
    /// Deny-list of glob-style URL patterns (`*` and `?` supported).
    pub deny_patterns: Vec<String>,
    /// Allow-list of domains or registered-domain suffixes; empty = allow all.
    pub allow_domains: Vec<String>,
    /// Deny-list of domains or registered-domain suffixes.
    pub deny_domains: Vec<String>,
    /// Allow-list of URL path prefixes; empty = allow all.
    pub allow_path_prefixes: Vec<String>,
    /// Deny-list of URL path prefixes.
    pub deny_path_prefixes: Vec<String>,
    /// Allow-list of HTML tag names used for attribute extraction (`None` = all tags).
    pub allowed_tags: Option<Vec<String>>,
    /// Allow-list of attribute names used for attribute extraction (`None` = all attributes).
    pub allowed_attributes: Option<Vec<String>>,
}
174
175impl Default for LinkExtractOptions {
176 fn default() -> Self {
177 Self {
178 same_site_only: true,
179 include_text_links: true,
180 sources: default_link_sources(),
181 allowed_link_types: None,
182 denied_link_types: Vec::new(),
183 allow_patterns: Vec::new(),
184 deny_patterns: Vec::new(),
185 allow_domains: Vec::new(),
186 deny_domains: Vec::new(),
187 allow_path_prefixes: Vec::new(),
188 deny_path_prefixes: Vec::new(),
189 allowed_tags: None,
190 allowed_attributes: None,
191 }
192 }
193}
194
195impl LinkExtractOptions {
196 /// Sets whether only same-site URLs should be returned.
197 pub fn same_site_only(mut self, same_site_only: bool) -> Self {
198 self.same_site_only = same_site_only;
199 self
200 }
201
202 /// Sets whether URLs found in text content should be returned.
203 pub fn include_text_links(mut self, include_text_links: bool) -> Self {
204 self.include_text_links = include_text_links;
205 self
206 }
207
208 /// Replaces the configured HTML extraction sources.
209 pub fn with_sources(mut self, sources: impl IntoIterator<Item = LinkSource>) -> Self {
210 self.sources = sources.into_iter().collect();
211 self
212 }
213
214 /// Adds an HTML extraction source.
215 pub fn add_source(mut self, source: LinkSource) -> Self {
216 self.sources.push(source);
217 self
218 }
219
220 /// Restricts extraction to the provided link types.
221 pub fn with_allowed_link_types(
222 mut self,
223 allowed_link_types: impl IntoIterator<Item = LinkType>,
224 ) -> Self {
225 self.allowed_link_types = Some(allowed_link_types.into_iter().collect());
226 self
227 }
228
229 /// Adds link types that should be excluded even if discovered.
230 pub fn with_denied_link_types(
231 mut self,
232 denied_link_types: impl IntoIterator<Item = LinkType>,
233 ) -> Self {
234 self.denied_link_types = denied_link_types.into_iter().collect();
235 self
236 }
237
238 /// Adds a glob-style allow pattern that URLs must match.
239 pub fn allow_pattern(mut self, pattern: impl Into<String>) -> Self {
240 self.allow_patterns.push(pattern.into());
241 self
242 }
243
244 /// Replaces the glob-style allow patterns.
245 pub fn with_allow_patterns(
246 mut self,
247 patterns: impl IntoIterator<Item = impl Into<String>>,
248 ) -> Self {
249 self.allow_patterns = patterns.into_iter().map(Into::into).collect();
250 self
251 }
252
253 /// Adds a glob-style deny pattern that excludes matching URLs.
254 pub fn deny_pattern(mut self, pattern: impl Into<String>) -> Self {
255 self.deny_patterns.push(pattern.into());
256 self
257 }
258
259 /// Replaces the glob-style deny patterns.
260 pub fn with_deny_patterns(
261 mut self,
262 patterns: impl IntoIterator<Item = impl Into<String>>,
263 ) -> Self {
264 self.deny_patterns = patterns.into_iter().map(Into::into).collect();
265 self
266 }
267
268 /// Adds a domain or registered-domain suffix to allow.
269 pub fn allow_domain(mut self, domain: impl Into<String>) -> Self {
270 self.allow_domains.push(normalize_domain_filter(domain));
271 self
272 }
273
274 /// Replaces the allowed domains.
275 pub fn with_allow_domains(
276 mut self,
277 domains: impl IntoIterator<Item = impl Into<String>>,
278 ) -> Self {
279 self.allow_domains = domains.into_iter().map(normalize_domain_filter).collect();
280 self
281 }
282
283 /// Adds a domain or registered-domain suffix to deny.
284 pub fn deny_domain(mut self, domain: impl Into<String>) -> Self {
285 self.deny_domains.push(normalize_domain_filter(domain));
286 self
287 }
288
289 /// Replaces the denied domains.
290 pub fn with_deny_domains(
291 mut self,
292 domains: impl IntoIterator<Item = impl Into<String>>,
293 ) -> Self {
294 self.deny_domains = domains.into_iter().map(normalize_domain_filter).collect();
295 self
296 }
297
298 /// Adds a URL path prefix that links must match.
299 pub fn allow_path_prefix(mut self, prefix: impl Into<String>) -> Self {
300 self.allow_path_prefixes.push(normalize_path_prefix(prefix));
301 self
302 }
303
304 /// Replaces the allowed URL path prefixes.
305 pub fn with_allow_path_prefixes(
306 mut self,
307 prefixes: impl IntoIterator<Item = impl Into<String>>,
308 ) -> Self {
309 self.allow_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
310 self
311 }
312
313 /// Adds a URL path prefix that should be excluded.
314 pub fn deny_path_prefix(mut self, prefix: impl Into<String>) -> Self {
315 self.deny_path_prefixes.push(normalize_path_prefix(prefix));
316 self
317 }
318
319 /// Replaces the denied URL path prefixes.
320 pub fn with_deny_path_prefixes(
321 mut self,
322 prefixes: impl IntoIterator<Item = impl Into<String>>,
323 ) -> Self {
324 self.deny_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
325 self
326 }
327
328 /// Restricts attribute-based extraction to specific HTML tag names.
329 pub fn with_allowed_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
330 self.allowed_tags = Some(
331 tags.into_iter()
332 .map(Into::into)
333 .map(|tag: String| tag.to_ascii_lowercase())
334 .collect(),
335 );
336 self
337 }
338
339 /// Restricts attribute-based extraction to specific attribute names.
340 pub fn with_allowed_attributes(
341 mut self,
342 attributes: impl IntoIterator<Item = impl Into<String>>,
343 ) -> Self {
344 self.allowed_attributes = Some(
345 attributes
346 .into_iter()
347 .map(Into::into)
348 .map(|attr: String| attr.to_ascii_lowercase())
349 .collect(),
350 );
351 self
352 }
353}
354
/// Structured page metadata extracted from an HTML response.
///
/// Populated by [`Response::page_metadata`]; for `title`, `description`, and
/// `canonical_url` the first matching element in document order wins.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct PageMetadata {
    /// Contents of the `<title>` element (trimmed; `None` when empty).
    pub title: Option<String>,
    /// Contents of `<meta name="description">`.
    pub description: Option<String>,
    /// Canonical URL from `<link rel="canonical">`, resolved against the
    /// response URL.
    pub canonical_url: Option<Url>,
    /// Open Graph metadata; keys keep their full `og:` prefix, e.g.
    /// `og:title` or `og:image`.
    pub open_graph: HashMap<String, String>,
    /// Feed URLs discovered from alternate RSS/Atom link tags (deduplicated).
    pub feed_urls: Vec<Url>,
}
369
370impl PageMetadata {
371 /// Returns `true` when no metadata fields were extracted.
372 pub fn is_empty(&self) -> bool {
373 self.title.is_none()
374 && self.description.is_none()
375 && self.canonical_url.is_none()
376 && self.open_graph.is_empty()
377 && self.feed_urls.is_empty()
378 }
379}
380
/// Represents an HTTP response received from a server.
///
/// [`Response`] contains all information about an HTTP response, including
/// the final URL (after redirects), status code, headers, body content,
/// and metadata carried over from the original request.
///
/// The type is designed for parse-time ergonomics:
/// - [`Response::css`] exposes the recommended Scrapy-like selector API
/// - [`Response::json`] deserializes JSON payloads
/// - [`Response::links`] and related helpers extract follow-up links
/// - [`Response::request_from_response`] reconstructs the originating request context
///
/// ## Example
///
/// ```rust,ignore
/// use spider_util::response::Response;
/// use reqwest::StatusCode;
/// use bytes::Bytes;
/// use url::Url;
///
/// let response = Response {
///     url: Url::parse("https://example.com").unwrap(),
///     status: StatusCode::OK,
///     headers: http::header::HeaderMap::new(),
///     body: Bytes::from("<html><body>Hello</body></html>"),
///     request_url: Url::parse("https://example.com").unwrap(),
///     request_priority: 0,
///     meta: None,
///     cached: false,
/// };
///
/// // Extract text using the builtin selector API
/// let title = response.css("title::text").ok().and_then(|list| list.get());
/// ```
#[derive(Debug)]
pub struct Response {
    /// The final URL of the response after any redirects.
    pub url: Url,
    /// The HTTP status code of the response.
    pub status: StatusCode,
    /// The headers of the response.
    pub headers: http::header::HeaderMap,
    /// The body of the response.
    pub body: bytes::Bytes,
    /// The original URL of the request that led to this response.
    pub request_url: Url,
    /// The scheduling priority of the original request.
    pub request_priority: i32,
    /// Metadata associated with the response, carried over from the request.
    /// Uses Option to allow lazy initialization.
    pub meta: Option<Arc<DashMap<String, serde_json::Value>>>,
    /// Indicates if the response was served from a cache.
    pub cached: bool,
}
434
435impl Response {
436 /// Creates a new response with an empty HTML cache.
437 ///
438 /// Most application code receives responses from the runtime rather than
439 /// constructing them directly. This constructor is mainly useful for custom
440 /// downloaders and lower-level integrations.
441 pub fn new(
442 url: Url,
443 status: StatusCode,
444 headers: http::header::HeaderMap,
445 body: bytes::Bytes,
446 request_url: Url,
447 ) -> Self {
448 Self {
449 url,
450 status,
451 headers,
452 body,
453 request_url,
454 request_priority: 0,
455 meta: None,
456 cached: false,
457 }
458 }
459
460 /// Reconstructs the original [`Request`] that led to this response.
461 ///
462 /// This method creates a new [`Request`] with the same URL and metadata
463 /// as the request that produced this response. Useful for retry scenarios
464 /// or when you need to re-request the same resource.
465 ///
466 /// ## Example
467 ///
468 /// ```rust,ignore
469 /// # use spider_util::response::Response;
470 /// # use reqwest::StatusCode;
471 /// # use bytes::Bytes;
472 /// # use url::Url;
473 /// # let response = Response {
474 /// # url: Url::parse("https://example.com").unwrap(),
475 /// # status: StatusCode::OK,
476 /// # headers: http::header::HeaderMap::new(),
477 /// # body: Bytes::from("hello"),
478 /// # request_url: Url::parse("https://example.com").unwrap(),
479 /// # request_priority: 0,
480 /// # meta: None,
481 /// # cached: false,
482 /// # };
483 /// let original_request = response.request_from_response();
484 /// ```
485 pub fn request_from_response(&self) -> Request {
486 let mut request =
487 Request::new(self.request_url.clone()).with_priority(self.request_priority);
488 request.set_meta_from_option(self.meta.clone());
489 request
490 }
491
492 /// Returns a cloned metadata value by key.
493 pub fn get_meta(&self, key: &str) -> Option<serde_json::Value> {
494 self.meta
495 .as_ref()
496 .and_then(|m| m.get(key).map(|entry| entry.value().clone()))
497 }
498
499 /// Deserializes a metadata value into the requested type.
500 pub fn meta_value<T>(&self, key: &str) -> Result<Option<T>, serde_json::Error>
501 where
502 T: DeserializeOwned,
503 {
504 self.get_meta(key).map(serde_json::from_value).transpose()
505 }
506
507 /// Returns the runtime discovery rule name attached to this response, if any.
508 pub fn discovery_rule_name(&self) -> Option<String> {
509 self.get_meta(DISCOVERY_RULE_META_KEY)
510 .and_then(|value| value.as_str().map(ToOwned::to_owned))
511 }
512
513 /// Returns `true` when the response was reached through the named discovery rule.
514 pub fn matches_discovery_rule(&self, rule_name: &str) -> bool {
515 self.discovery_rule_name().as_deref() == Some(rule_name)
516 }
517
518 /// Inserts a metadata value, lazily allocating the map if needed.
519 pub fn insert_meta(&mut self, key: impl Into<String>, value: serde_json::Value) {
520 self.meta
521 .get_or_insert_with(|| Arc::new(DashMap::new()))
522 .insert(key.into(), value);
523 }
524
525 /// Returns a clone of the internal metadata map, if present.
526 pub fn clone_meta(&self) -> Option<Arc<DashMap<String, serde_json::Value>>> {
527 self.meta.clone()
528 }
529
    /// Deserializes the response body as JSON.
    ///
    /// # Type Parameters
    ///
    /// - `T`: The target type to deserialize into (must implement `DeserializeOwned`)
    ///
    /// # Errors
    ///
    /// Returns a [`serde_json::Error`] if the body cannot be parsed as JSON
    /// or if it cannot be deserialized into type `T`.
    ///
    /// ## Example
    ///
    /// ```rust,ignore
    /// # use spider_util::response::Response;
    /// # use reqwest::StatusCode;
    /// # use bytes::Bytes;
    /// # use url::Url;
    /// # use serde::Deserialize;
    /// # #[derive(Deserialize)]
    /// # struct Data { value: String }
    /// # let response = Response {
    /// #     url: Url::parse("https://api.example.com").unwrap(),
    /// #     status: StatusCode::OK,
    /// #     headers: http::header::HeaderMap::new(),
    /// #     body: Bytes::from(r#"{"value": "test"}"#),
    /// #     request_url: Url::parse("https://api.example.com").unwrap(),
    /// #     request_priority: 0,
    /// #     meta: None,
    /// #     cached: false,
    /// # };
    /// let data: Data = response.json()?;
    /// # Ok::<(), serde_json::Error>(())
    /// ```
    pub fn json<T: DeserializeOwned>(&self) -> Result<T, serde_json::Error> {
        serde_json::from_slice(&self.body)
    }
566
567 /// Applies a builtin CSS selector to the response body using a Scrapy-like API.
568 ///
569 /// Supports standard CSS selectors plus terminal extraction suffixes:
570 /// - `::text`
571 /// - `::attr(name)`
572 ///
573 /// ## Example
574 ///
575 /// ```rust,ignore
576 /// # use spider_util::response::Response;
577 /// # use reqwest::StatusCode;
578 /// # use bytes::Bytes;
579 /// # use url::Url;
580 /// # let response = Response {
581 /// # url: Url::parse("https://example.com").unwrap(),
582 /// # status: StatusCode::OK,
583 /// # headers: http::header::HeaderMap::new(),
584 /// # body: Bytes::from(r#"<html><body><h1>Hello</h1><a href="/next">Next</a></body></html>"#),
585 /// # request_url: Url::parse("https://example.com").unwrap(),
586 /// # request_priority: 0,
587 /// # meta: None,
588 /// # cached: false,
589 /// # };
590 /// let heading = response.css("h1::text")?.get().unwrap_or_default();
591 /// let next_href = response.css("a::attr(href)")?.get();
592 /// # Ok::<(), crate::error::SpiderError>(())
593 /// ```
594 ///
595 /// # Errors
596 ///
597 /// Returns [`SpiderError::Utf8Error`] when the body is not valid UTF-8 and
598 /// [`SpiderError::HtmlParseError`] when the selector is invalid.
599 pub fn css(&self, query: &str) -> Result<SelectorList, SpiderError> {
600 let body = Arc::<str>::from(self.text()?);
601 SelectorList::from_document_query(body, self.html_cache_key(), query)
602 }
603
    /// Returns the response body as UTF-8 text (borrowed, zero-copy).
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] when the body is not valid UTF-8 (e.g. a
    /// binary payload); no lossy conversion is attempted.
    pub fn text(&self) -> Result<&str, Utf8Error> {
        from_utf8(&self.body)
    }
608
    /// Extracts structured page metadata from HTML responses.
    ///
    /// Collects the `<title>` text, the first `<meta name="description">`,
    /// Open Graph `og:*` properties, the canonical URL, and alternate
    /// RSS/Atom feed URLs. Relative URLs are resolved against the final
    /// response URL.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] when the body is not valid UTF-8.
    pub fn page_metadata(&self) -> Result<PageMetadata, Utf8Error> {
        let html = self.cached_html()?;
        let mut metadata = PageMetadata::default();

        // <title>: first occurrence wins; whitespace-only titles are dropped.
        if let Some(selector) = get_cached_selector("title") {
            metadata.title = html
                .select(&selector)
                .next()
                .map(|node| node.text().collect::<String>().trim().to_string())
                .filter(|value| !value.is_empty());
        }

        if let Some(selector) = get_cached_selector("meta[name], meta[property], meta[content]") {
            for element in html.select(&selector) {
                // Every meta we care about carries its payload in `content`.
                let Some(content) = element.value().attr("content") else {
                    continue;
                };
                let content = content.trim();
                if content.is_empty() {
                    continue;
                }

                // First non-empty <meta name="description"> wins.
                if let Some(name) = element.value().attr("name")
                    && name.eq_ignore_ascii_case("description")
                    && metadata.description.is_none()
                {
                    metadata.description = Some(content.to_string());
                }

                // Open Graph: keep the first value seen per `og:*` property;
                // keys retain their full property name (prefix included).
                if let Some(property) = element.value().attr("property")
                    && property.len() >= 3
                    && property[..3].eq_ignore_ascii_case("og:")
                {
                    metadata
                        .open_graph
                        .entry(property.to_string())
                        .or_insert_with(|| content.to_string());
                }
            }
        }

        if let Some(selector) = get_cached_selector("link[href]") {
            for element in html.select(&selector) {
                let Some(href) = element.value().attr("href") else {
                    continue;
                };
                // `rel` may hold several space-separated tokens.
                let rel = element.value().attr("rel").unwrap_or_default();

                // <link rel="canonical">: first resolvable href wins.
                if rel
                    .split_ascii_whitespace()
                    .any(|token| token.eq_ignore_ascii_case("canonical"))
                    && metadata.canonical_url.is_none()
                    && let Ok(url) = self.url.join(href)
                {
                    metadata.canonical_url = Some(url);
                }

                // <link rel="alternate" type="application/rss+xml"> (and the
                // other XML media types below) advertise syndication feeds.
                let is_alternate = rel
                    .split_ascii_whitespace()
                    .any(|token| token.eq_ignore_ascii_case("alternate"));
                let ty = element.value().attr("type").unwrap_or_default();
                let is_feed = ty.eq_ignore_ascii_case("application/rss+xml")
                    || ty.eq_ignore_ascii_case("application/atom+xml")
                    || ty.eq_ignore_ascii_case("application/xml")
                    || ty.eq_ignore_ascii_case("text/xml");

                if is_alternate
                    && is_feed
                    && let Ok(url) = self.url.join(href)
                    && !metadata.feed_urls.contains(&url)
                {
                    metadata.feed_urls.push(url);
                }
            }
        }

        Ok(metadata)
    }
688
689 /// Returns a customizable iterator of links discovered in the response body.
690 ///
691 /// Unlike [`Response::links`], this method does not deduplicate results.
692 /// Callers that need uniqueness can collect into a set or use [`Response::links`].
693 ///
694 /// ## Example
695 ///
696 /// ```rust,ignore
697 /// # use spider_util::response::{LinkExtractOptions, Response};
698 /// # use reqwest::StatusCode;
699 /// # use bytes::Bytes;
700 /// # use url::Url;
701 /// # let response = Response {
702 /// # url: Url::parse("https://example.com").unwrap(),
703 /// # status: StatusCode::OK,
704 /// # headers: http::header::HeaderMap::new(),
705 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
706 /// # request_url: Url::parse("https://example.com").unwrap(),
707 /// # meta: None,
708 /// # cached: false,
709 /// # };
710 /// let links: Vec<_> = response
711 /// .links_iter(LinkExtractOptions::default())
712 /// .collect();
713 /// assert!(!links.is_empty());
714 /// ```
715 pub fn links_iter(&self, options: LinkExtractOptions) -> impl Iterator<Item = Link> {
716 self.parse_links(options).unwrap_or_default().into_iter()
717 }
718
719 /// Extracts all unique, same-site links from the response body.
720 ///
721 /// This method discovers links from:
722 /// - HTML elements with `href` or `src` attributes (`<a>`, `<link>`, `<script>`, `<img>`, etc.)
723 /// - URLs found in text content (using link detection)
724 ///
725 /// Only links pointing to the same site (same registered domain) are included.
726 ///
727 /// ## Returns
728 ///
729 /// A [`DashSet`] of [`Link`] objects containing the URL and link type.
730 ///
731 /// ## Example
732 ///
733 /// ```rust,ignore
734 /// # use spider_util::response::Response;
735 /// # use reqwest::StatusCode;
736 /// # use bytes::Bytes;
737 /// # use url::Url;
738 /// # let response = Response {
739 /// # url: Url::parse("https://example.com").unwrap(),
740 /// # status: StatusCode::OK,
741 /// # headers: http::header::HeaderMap::new(),
742 /// # body: Bytes::from(r#"<html><body><a href="/page">Link</a></body></html>"#),
743 /// # request_url: Url::parse("https://example.com").unwrap(),
744 /// # meta: None,
745 /// # cached: false,
746 /// # };
747 /// let links = response.links();
748 /// for link in links.iter() {
749 /// println!("Found {:?} link: {}", link.link_type, link.url);
750 /// }
751 /// ```
752 pub fn links(&self) -> DashSet<Link> {
753 let links = DashSet::new();
754
755 for link in self.links_iter(LinkExtractOptions::default()) {
756 links.insert(link);
757 }
758
759 links
760 }
761
    /// Parses the body once and gathers links from HTML attributes and,
    /// optionally, from free text.
    ///
    /// # Errors
    ///
    /// Returns a [`Utf8Error`] when the body is not valid UTF-8.
    fn parse_links(&self, options: LinkExtractOptions) -> Result<Vec<Link>, Utf8Error> {
        let html = self.cached_html()?;
        let mut links = Vec::new();

        // URLs carried by element attributes (href/src and custom sources).
        self.collect_attribute_links(&html, &options, &mut links);

        // URLs embedded in plain text nodes, when enabled.
        if options.include_text_links {
            self.collect_text_links(&html, &options, &mut links);
        }

        Ok(links)
    }
774
    /// Collects links from the configured `(selector, attribute)` sources,
    /// appending accepted results to `links` in source order.
    ///
    /// URL-level filtering (same-site, patterns, domains, paths, types)
    /// happens in [`Response::build_link`]; this method only applies the
    /// tag/attribute allow-lists.
    fn collect_attribute_links(
        &self,
        html: &Html,
        options: &LinkExtractOptions,
        links: &mut Vec<Link>,
    ) {
        for source in &options.sources {
            // Skip sources whose attribute is not on the allow-list; a
            // missing allow-list (`None`) permits every attribute.
            if !options
                .allowed_attributes
                .as_ref()
                .is_none_or(|allowed| allowed.iter().any(|attr| attr == &source.attribute))
            {
                continue;
            }

            // Selectors are parsed lazily and cached; unparsable ones are skipped.
            let Some(selector) = get_cached_selector(&source.selector) else {
                continue;
            };

            for element in html.select(&selector) {
                let tag_name = element.value().name();
                // Same allow-list convention as attributes: `None` permits all tags.
                if !options
                    .allowed_tags
                    .as_ref()
                    .is_none_or(|allowed| allowed.iter().any(|tag| tag == tag_name))
                {
                    continue;
                }

                let Some(attr_value) = element.value().attr(&source.attribute) else {
                    continue;
                };

                // An explicit per-source link type wins over tag-based inference.
                let link_type = source
                    .link_type
                    .clone()
                    .unwrap_or_else(|| infer_link_type(&element));

                if let Some(link) = self.build_link(attr_value, link_type, options) {
                    links.push(link);
                }
            }
        }
    }
819
820 fn collect_text_links(&self, html: &Html, options: &LinkExtractOptions, links: &mut Vec<Link>) {
821 let finder = LinkFinder::new();
822
823 for text_node in html.tree.values().filter_map(|node| node.as_text()) {
824 for link in finder.links(text_node) {
825 if link.kind() != &LinkKind::Url {
826 continue;
827 }
828
829 if let Some(link) = self.build_link(link.as_str(), LinkType::Page, options) {
830 links.push(link);
831 }
832 }
833 }
834 }
835
    /// Resolves `raw_url` against the response URL and applies every
    /// configured filter, returning the accepted [`Link`] or `None`.
    ///
    /// Filters run in order: same-site, link-type allow/deny, glob-pattern
    /// allow/deny, domain allow/deny, path-prefix allow/deny. Allow-lists
    /// only restrict when non-empty (or `Some`); deny-lists always apply.
    fn build_link(
        &self,
        raw_url: &str,
        link_type: LinkType,
        options: &LinkExtractOptions,
    ) -> Option<Link> {
        // URLs that cannot be resolved are silently dropped.
        let url = self.url.join(raw_url).ok()?;

        if options.same_site_only && !util::is_same_site(&url, &self.url) {
            return None;
        }

        // `None` allow-list means "all link types allowed".
        if !options
            .allowed_link_types
            .as_ref()
            .is_none_or(|allowed| allowed.contains(&link_type))
        {
            return None;
        }

        if options.denied_link_types.contains(&link_type) {
            return None;
        }

        // Glob patterns are matched against the full absolute URL string.
        let absolute_url = url.as_str();
        if !options.allow_patterns.is_empty()
            && !options
                .allow_patterns
                .iter()
                .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        if options
            .deny_patterns
            .iter()
            .any(|pattern| glob_matches(pattern, absolute_url))
        {
            return None;
        }

        // Host-less URLs (e.g. `mailto:`) are compared as an empty host here.
        let host = url.host_str().unwrap_or_default();
        if !options.allow_domains.is_empty()
            && !options
                .allow_domains
                .iter()
                .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        if options
            .deny_domains
            .iter()
            .any(|domain| domain_matches(host, domain))
        {
            return None;
        }

        let path = url.path();
        if !options.allow_path_prefixes.is_empty()
            && !options
                .allow_path_prefixes
                .iter()
                .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        if options
            .deny_path_prefixes
            .iter()
            .any(|prefix| path.starts_with(prefix))
        {
            return None;
        }

        Some(Link { url, link_type })
    }
916
917 fn html_cache_key(&self) -> u64 {
918 let mut hasher = SeaHasher::new();
919 self.url.as_str().hash(&mut hasher);
920 self.request_url.as_str().hash(&mut hasher);
921 self.body.hash(&mut hasher);
922 hasher.finish()
923 }
924
925 fn cached_html(&self) -> Result<Arc<Html>, Utf8Error> {
926 let cache_key = self.html_cache_key();
927
928 HTML_CACHE.with(|cache| {
929 if let Some(html) = cache.borrow().get(&cache_key).cloned() {
930 return Ok(html);
931 }
932
933 let body_str = from_utf8(&self.body)?;
934 let html = Arc::new(Html::parse_document(body_str));
935 cache.borrow_mut().insert(cache_key, html.clone());
936 Ok(html)
937 })
938 }
939}
940
// NOTE(review): every field here is `Clone`, so `#[derive(Clone)]` on
// `Response` would replace this hand-written impl — confirm nothing depends
// on the manual impl before switching.
impl Clone for Response {
    fn clone(&self) -> Self {
        Response {
            url: self.url.clone(),
            status: self.status,
            headers: self.headers.clone(),
            // `Bytes` clones are shallow and `meta` is an `Arc`, so neither
            // copies the underlying data.
            body: self.body.clone(),
            request_url: self.request_url.clone(),
            request_priority: self.request_priority,
            meta: self.meta.clone(),
            cached: self.cached,
        }
    }
}
955
956fn default_link_sources() -> Vec<LinkSource> {
957 vec![
958 LinkSource::new("a[href]", "href"),
959 LinkSource::new("link[href]", "href"),
960 LinkSource::new("script[src]", "src"),
961 LinkSource::new("img[src]", "src"),
962 LinkSource::new("audio[src]", "src"),
963 LinkSource::new("video[src]", "src"),
964 LinkSource::new("source[src]", "src"),
965 ]
966}
967
968fn infer_link_type(element: &ElementRef<'_>) -> LinkType {
969 match element.value().name() {
970 "a" => LinkType::Page,
971 "link" => {
972 if let Some(rel) = element.value().attr("rel") {
973 if rel.eq_ignore_ascii_case("stylesheet") {
974 LinkType::Stylesheet
975 } else {
976 LinkType::Other(rel.to_string())
977 }
978 } else {
979 LinkType::Other("link".to_string())
980 }
981 }
982 "script" => LinkType::Script,
983 "img" => LinkType::Image,
984 "audio" | "video" | "source" => LinkType::Media,
985 _ => LinkType::Other(element.value().name().to_string()),
986 }
987}
988
/// Normalizes a user-supplied domain filter: trims surrounding whitespace,
/// drops any leading dots, and lowercases for case-insensitive matching.
fn normalize_domain_filter(domain: impl Into<String>) -> String {
    let owned = domain.into();
    let stripped = owned.trim().trim_start_matches('.');
    stripped.to_ascii_lowercase()
}
996
/// Normalizes a path-prefix filter so it always starts with `/`.
///
/// Empty (or whitespace-only) and `/` inputs collapse to `/`; other values
/// keep their text with a leading slash prepended when missing.
fn normalize_path_prefix(prefix: impl Into<String>) -> String {
    let raw = prefix.into();
    match raw.trim() {
        "" | "/" => String::from("/"),
        p if p.starts_with('/') => p.to_string(),
        p => format!("/{p}"),
    }
}
1008
/// Returns `true` when `host` equals `filter` or is a subdomain of it,
/// comparing ASCII case-insensitively.
fn domain_matches(host: &str, filter: &str) -> bool {
    let host = host.to_ascii_lowercase();
    let filter = filter.to_ascii_lowercase();
    if host == filter {
        return true;
    }
    // Subdomain check: the host must end with ".<filter>".
    host.strip_suffix(filter.as_str())
        .is_some_and(|rest| rest.ends_with('.'))
}
1014
/// Byte-wise glob match supporting `*` (any run of bytes, including empty)
/// and `?` (exactly one byte).
///
/// Classic greedy two-pointer algorithm with single-star backtracking.
/// Comparison is on raw bytes, so `?` consumes one *byte*, not one Unicode
/// character — fine for ASCII URLs, worth remembering for multi-byte input.
fn glob_matches(pattern: &str, input: &str) -> bool {
    let pat = pattern.as_bytes();
    let text = input.as_bytes();
    let (mut pi, mut ti) = (0usize, 0usize);
    // Index of the most recent '*' in `pat`, plus the text position that
    // star is currently assumed to have consumed up to.
    let mut star: Option<usize> = None;
    let mut star_text = 0usize;

    while ti < text.len() {
        let literal_match = pi < pat.len() && (pat[pi] == b'?' || pat[pi] == text[ti]);
        if literal_match {
            pi += 1;
            ti += 1;
        } else if pi < pat.len() && pat[pi] == b'*' {
            // Tentatively let '*' match the empty string; remember where to
            // backtrack if the remainder of the pattern later fails.
            star = Some(pi);
            star_text = ti;
            pi += 1;
        } else if let Some(star_pi) = star {
            // Backtrack: extend the last '*' by one more byte and retry.
            star_text += 1;
            pi = star_pi + 1;
            ti = star_text;
        } else {
            return false;
        }
    }

    // Any trailing '*'s may match the empty string.
    while pi < pat.len() && pat[pi] == b'*' {
        pi += 1;
    }

    pi == pat.len()
}