spider_util/item.rs
1//! Item traits and parse-time output helpers.
2//!
3//! [`ParseOutput`] is the async sink carried by a spider's parse context.
4//! Spiders typically use it indirectly through `ParseContext` helpers such as
5//! `cx.add_item(...)` and `cx.add_request(...)`, while the runtime uses it to
6//! stream scraped items and follow-up requests as they are discovered.
7//!
8//! ## Example
9//!
10//! ```rust,ignore
11//! use spider_util::item::{ScrapedItem, ParseOutput};
12//!
13//! #[spider_macro::scraped_item]
14//! struct Article {
15//! title: String,
16//! content: String,
17//! }
18//!
//! // In your spider's parse method, where `output` is a `ParseOutput<Article>`:
//! // output.add_item(Article { title: "...".into(), content: "...".into() }).await?;
//! // output.add_request(request).await?;
22//! ```
23//!
24//! `ParseOutput` intentionally hides the runtime transport details. The
25//! crawler can backpressure parsing internally while spider code continues to
26//! use familiar `add_*` methods.
27
28use crate::request::Request;
29use async_trait::async_trait;
30use serde_json::Value;
31use std::any::Any;
32use std::fmt::Debug;
33use std::sync::Arc;
34
35use crate::error::SpiderError;
36
37/// Stable field kinds used by typed item schema metadata.
38#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
39pub enum FieldValueType {
40 Bool,
41 Integer,
42 Float,
43 String,
44 Json,
45 Sequence,
46 Map,
47 Unknown,
48}
49
/// Static schema metadata for a single item field.
///
/// Plain data: every field is public and the struct can be compared, cloned,
/// and (de)serialized with serde.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct ItemFieldSchema {
    /// Name of the field.
    pub name: String,
    /// Rust type of the field, stored as text.
    /// NOTE(review): presumably the type as written in the item struct; confirm.
    pub rust_type: String,
    /// Coarse value classification for the field.
    pub value_type: FieldValueType,
    /// Whether the field may hold no value.
    /// NOTE(review): presumably set for `Option<T>` fields; confirm against the generator.
    pub nullable: bool,
}
58
/// Static schema metadata for a scraped item type.
///
/// Bundles the item's name, a schema version number, and the per-field
/// metadata entries.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct ItemSchema {
    /// Name of the item type this schema describes.
    pub item_name: String,
    /// Version number of this schema.
    pub version: u32,
    /// Per-field metadata; `ItemSchema::fields()` exposes this as a slice.
    pub fields: Vec<ItemFieldSchema>,
}
66
67impl ItemSchema {
68 /// Returns the fields in their declared order.
69 pub fn fields(&self) -> &[ItemFieldSchema] {
70 &self.fields
71 }
72}
73
/// Trait for typed item definitions that can expose static schema metadata.
///
/// Both methods are associated functions (no `self`), so the schema is a
/// property of the type rather than of a particular value.
pub trait TypedItemSchema {
    /// Returns the typed schema for the item.
    fn schema() -> ItemSchema;

    /// Returns the schema version used by the item.
    ///
    /// The default implementation returns the constant `1`; it does not read
    /// `Self::schema().version`. Implementors whose schema carries a different
    /// version need to override this method to keep the two in agreement.
    fn schema_version() -> u32 {
        1
    }
}
84
/// Receiver that [`ParseOutput`] forwards items and requests to.
///
/// Hidden from rendered documentation. Implementations must be `Send + Sync`
/// and `'static` so they can be stored behind an `Arc<dyn ParseSink<I>>`.
#[async_trait]
#[doc(hidden)]
pub trait ParseSink<I>: Send + Sync + 'static {
    /// Accepts one item of type `I`; an `Err` is returned to the caller as-is.
    async fn add_item(&self, item: I) -> Result<(), SpiderError>;
    /// Accepts one follow-up request; an `Err` is returned to the caller as-is.
    async fn add_request(&self, request: Request) -> Result<(), SpiderError>;
}
91
/// Async output sink passed into a spider's `parse` method.
///
/// A thin handle around a shared [`ParseSink`]. Cloning the handle clones the
/// `Arc`, so every clone forwards to the same underlying sink.
pub struct ParseOutput<I> {
    // Shared, type-erased receiver supplied through `from_sink`.
    sink: Arc<dyn ParseSink<I>>,
}
96
97impl<I: 'static> ParseOutput<I> {
98 #[doc(hidden)]
99 pub fn from_sink(sink: Arc<dyn ParseSink<I>>) -> Self {
100 Self { sink }
101 }
102
103 /// Emits a scraped item into the runtime.
104 ///
105 /// Use this when the current page produced one structured result that
106 /// should continue through the configured pipeline chain. This call is
107 /// async so the runtime can apply backpressure when downstream work is
108 /// saturated.
109 pub async fn add_item(&self, item: I) -> Result<(), SpiderError> {
110 self.sink.add_item(item).await
111 }
112
113 /// Emits a new request to be crawled.
114 ///
115 /// Requests emitted here are forwarded into the scheduler path.
116 pub async fn add_request(&self, request: Request) -> Result<(), SpiderError> {
117 self.sink.add_request(request).await
118 }
119
120 /// Emits multiple scraped items into the runtime.
121 pub async fn add_items(&self, items: impl IntoIterator<Item = I>) -> Result<(), SpiderError> {
122 for item in items {
123 self.add_item(item).await?;
124 }
125 Ok(())
126 }
127
128 /// Emits multiple new requests to be crawled.
129 pub async fn add_requests(
130 &self,
131 requests: impl IntoIterator<Item = Request>,
132 ) -> Result<(), SpiderError> {
133 for request in requests {
134 self.add_request(request).await?;
135 }
136 Ok(())
137 }
138}
139
140impl<I: 'static> Debug for ParseOutput<I> {
141 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
142 f.debug_struct("ParseOutput").finish_non_exhaustive()
143 }
144}
145
146impl<I> Clone for ParseOutput<I> {
147 fn clone(&self) -> Self {
148 Self {
149 sink: Arc::clone(&self.sink),
150 }
151 }
152}
153
/// Trait implemented by item types emitted from spiders.
///
/// In normal application code you usually do not implement this trait by hand.
/// Prefer annotating the item struct with `#[scraped_item]`, which wires up the
/// required serialization and cloning behavior automatically.
///
/// The supertraits make every item debuggable, shareable across threads, and
/// downcastable through [`Any`].
pub trait ScrapedItem: Debug + Send + Sync + Any + 'static {
    /// Returns the item as a `dyn Any` for downcasting.
    fn as_any(&self) -> &dyn Any;
    /// Clones the item into a boxed trait object.
    ///
    /// This is what `Clone for Box<dyn ScrapedItem + Send + Sync>` calls.
    fn box_clone(&self) -> Box<dyn ScrapedItem + Send + Sync>;
    /// Converts the item to a `serde_json::Value`.
    fn to_json_value(&self) -> Value;
    /// Returns typed schema metadata when the item type exposes it.
    ///
    /// The default implementation returns `None`.
    fn item_schema(&self) -> Option<ItemSchema> {
        None
    }
    /// Returns the schema version used by this item.
    ///
    /// The default implementation returns the constant `1` and does not
    /// consult `item_schema()`; override both together when the item's schema
    /// uses a different version.
    fn item_schema_version(&self) -> u32 {
        1
    }
}
175
176impl Clone for Box<dyn ScrapedItem + Send + Sync> {
177 fn clone(&self) -> Self {
178 self.box_clone()
179 }
180}