spider_util/
item.rs

1//! Item traits and parse-time output helpers.
2//!
3//! [`ParseOutput`] is the async sink carried by a spider's parse context.
4//! Spiders typically use it indirectly through `ParseContext` helpers such as
5//! `cx.add_item(...)` and `cx.add_request(...)`, while the runtime uses it to
6//! stream scraped items and follow-up requests as they are discovered.
7//!
8//! ## Example
9//!
10//! ```rust,ignore
11//! use spider_util::item::{ScrapedItem, ParseOutput};
12//!
13//! #[spider_macro::scraped_item]
14//! struct Article {
15//!     title: String,
16//!     content: String,
17//! }
18//!
19//! // In your spider's parse method:
20//! // output.add_item(Article { title: "...", content: "..." }).await?;
21//! // output.add_request(request).await?;
22//! ```
23//!
24//! `ParseOutput` intentionally hides the runtime transport details. The
25//! crawler can backpressure parsing internally while spider code continues to
26//! use familiar `add_*` methods.
27
28use crate::request::Request;
29use async_trait::async_trait;
30use serde_json::Value;
31use std::any::Any;
32use std::fmt::Debug;
33use std::sync::Arc;
34
35use crate::error::SpiderError;
36
37/// Stable field kinds used by typed item schema metadata.
38#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
39pub enum FieldValueType {
40    Bool,
41    Integer,
42    Float,
43    String,
44    Json,
45    Sequence,
46    Map,
47    Unknown,
48}
49
50/// Static schema metadata for a single item field.
51#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
52pub struct ItemFieldSchema {
53    pub name: String,
54    pub rust_type: String,
55    pub value_type: FieldValueType,
56    pub nullable: bool,
57}
58
59/// Static schema metadata for a scraped item type.
60#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
61pub struct ItemSchema {
62    pub item_name: String,
63    pub version: u32,
64    pub fields: Vec<ItemFieldSchema>,
65}
66
67impl ItemSchema {
68    /// Returns the fields in their declared order.
69    pub fn fields(&self) -> &[ItemFieldSchema] {
70        &self.fields
71    }
72}
73
74/// Trait for typed item definitions that can expose static schema metadata.
75pub trait TypedItemSchema {
76    /// Returns the typed schema for the item.
77    fn schema() -> ItemSchema;
78
79    /// Returns the schema version used by the item.
80    fn schema_version() -> u32 {
81        1
82    }
83}
84
85#[async_trait]
86#[doc(hidden)]
87pub trait ParseSink<I>: Send + Sync + 'static {
88    async fn add_item(&self, item: I) -> Result<(), SpiderError>;
89    async fn add_request(&self, request: Request) -> Result<(), SpiderError>;
90}
91
92/// Async output sink passed into a spider's `parse` method.
93pub struct ParseOutput<I> {
94    sink: Arc<dyn ParseSink<I>>,
95}
96
97impl<I: 'static> ParseOutput<I> {
98    #[doc(hidden)]
99    pub fn from_sink(sink: Arc<dyn ParseSink<I>>) -> Self {
100        Self { sink }
101    }
102
103    /// Emits a scraped item into the runtime.
104    ///
105    /// Use this when the current page produced one structured result that
106    /// should continue through the configured pipeline chain. This call is
107    /// async so the runtime can apply backpressure when downstream work is
108    /// saturated.
109    pub async fn add_item(&self, item: I) -> Result<(), SpiderError> {
110        self.sink.add_item(item).await
111    }
112
113    /// Emits a new request to be crawled.
114    ///
115    /// Requests emitted here are forwarded into the scheduler path.
116    pub async fn add_request(&self, request: Request) -> Result<(), SpiderError> {
117        self.sink.add_request(request).await
118    }
119
120    /// Emits multiple scraped items into the runtime.
121    pub async fn add_items(&self, items: impl IntoIterator<Item = I>) -> Result<(), SpiderError> {
122        for item in items {
123            self.add_item(item).await?;
124        }
125        Ok(())
126    }
127
128    /// Emits multiple new requests to be crawled.
129    pub async fn add_requests(
130        &self,
131        requests: impl IntoIterator<Item = Request>,
132    ) -> Result<(), SpiderError> {
133        for request in requests {
134            self.add_request(request).await?;
135        }
136        Ok(())
137    }
138}
139
140impl<I: 'static> Debug for ParseOutput<I> {
141    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
142        f.debug_struct("ParseOutput").finish_non_exhaustive()
143    }
144}
145
146impl<I> Clone for ParseOutput<I> {
147    fn clone(&self) -> Self {
148        Self {
149            sink: Arc::clone(&self.sink),
150        }
151    }
152}
153
154/// Trait implemented by item types emitted from spiders.
155///
156/// In normal application code you usually do not implement this trait by hand.
157/// Prefer annotating the item struct with `#[scraped_item]`, which wires up the
158/// required serialization and cloning behavior automatically.
159pub trait ScrapedItem: Debug + Send + Sync + Any + 'static {
160    /// Returns the item as a `dyn Any` for downcasting.
161    fn as_any(&self) -> &dyn Any;
162    /// Clones the item into a `Box<dyn ScrapedItem>`.
163    fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync>;
164    /// Converts the item to a `serde_json::Value`.
165    fn to_json_value(&self) -> Value;
166    /// Returns typed schema metadata when the item type exposes it.
167    fn item_schema(&self) -> Option<ItemSchema> {
168        None
169    }
170    /// Returns the schema version used by this item.
171    fn item_schema_version(&self) -> u32 {
172        1
173    }
174}
175
176impl Clone for Box<dyn ScrapedItem + Send + Sync> {
177    fn clone(&self) -> Self {
178        self.box_clone()
179    }
180}