spider_macro/
lib.rs

1//! # spider-macro
2//!
3//! Procedural macros used by the `spider-*` workspace.
4//!
5//! Right now this crate is intentionally small: it mainly provides
6//! [`scraped_item`], the attribute macro used to turn plain structs into item
7//! types that fit the crawler and pipeline APIs.
8//!
9//! ## Dependencies
10//!
11//! ```toml
12//! [dependencies]
13//! spider-macro = "0.1.13"
14//! spider-util = "0.4.0"
15//! serde = { version = "1.0", features = ["derive"] }
16//! serde_json = "1.0"
17//! ```
18//!
19//! ## Example
20//!
21//! ```rust,ignore
22//! use spider_macro::scraped_item;
23//! use spider_util::item::ScrapedItem;
24//!
25//! #[scraped_item]
26//! struct Article {
27//!     title: String,
28//!     content: String,
29//! }
30//!
31//! // `Article` now implements Serialize, Deserialize, Clone, Debug,
32//! // and the ScrapedItem trait expected by the rest of the workspace.
33//! ```
34
35extern crate proc_macro;
36
37use proc_macro::TokenStream;
38use proc_macro_crate::{FoundCrate, crate_name};
39use quote::quote;
40use syn::{Fields, ItemStruct, Type, parse_macro_input};
41
42/// Attribute macro for defining a scraped item type.
43///
44/// This macro:
45/// 1. Implements `ScrapedItem`
46/// 2. Adds `Serialize` and `Deserialize`
47/// 3. Adds `Clone` and `Debug`
48///
49/// # Dependencies
50///
51/// Your project must include `serde` and `serde_json` as direct dependencies:
52///
53/// ```toml
54/// [dependencies]
55/// spider-util = "0.4.0"
56/// serde = { version = "1.0", features = ["derive"] }
57/// serde_json = "1.0"
58/// ```
59#[proc_macro_attribute]
60pub fn scraped_item(_attr: TokenStream, item: TokenStream) -> TokenStream {
61    let ast = parse_macro_input!(item as ItemStruct);
62    let name = &ast.ident;
63    let item_name = name.to_string();
64    let scraped_item_trait = item_type_tokens("ScrapedItem");
65    let item_field_schema = item_type_tokens("ItemFieldSchema");
66    let item_schema = item_type_tokens("ItemSchema");
67    let typed_item_schema = item_type_tokens("TypedItemSchema");
68    let fields = match &ast.fields {
69        Fields::Named(fields) => fields.named.iter().collect::<Vec<_>>(),
70        _ => {
71            return syn::Error::new_spanned(
72                &ast,
73                "#[scraped_item] only supports structs with named fields",
74            )
75            .to_compile_error()
76            .into();
77        }
78    };
79
80    let schema_fields = fields.iter().map(|field| {
81        let field_ident = field.ident.as_ref().expect("named field");
82        let field_name = field_ident.to_string();
83        let rust_type = quote!(#field.ty).to_string().replace(' ', "");
84        let nullable = is_option_type(&field.ty);
85        let value_type_tokens = field_value_type_tokens(&field.ty);
86
87        quote! {
88            #item_field_schema {
89                name: #field_name.to_string(),
90                rust_type: #rust_type.to_string(),
91                value_type: #value_type_tokens,
92                nullable: #nullable,
93            }
94        }
95    });
96
97    let expanded = quote! {
98        #[derive(
99            ::serde::Serialize,
100            ::serde::Deserialize,
101            Clone,
102            Debug
103        )]
104        #ast
105
106        impl #scraped_item_trait for #name {
107            fn as_any(&self) -> &dyn ::std::any::Any {
108                self
109            }
110
111            fn box_clone(&self) -> Box<dyn #scraped_item_trait + Send + Sync> {
112                Box::new(self.clone())
113            }
114
115            fn to_json_value(&self) -> ::serde_json::Value {
116                match ::serde_json::to_value(self) {
117                    Ok(value) => value,
118                    Err(err) => panic!("failed to serialize ScrapedItem '{}': {}", stringify!(#name), err),
119                }
120            }
121
122            fn item_schema(&self) -> ::std::option::Option<#item_schema> {
123                ::std::option::Option::Some(<Self as #typed_item_schema>::schema())
124            }
125
126            fn item_schema_version(&self) -> u32 {
127                <Self as #typed_item_schema>::schema_version()
128            }
129        }
130
131        impl #typed_item_schema for #name {
132            fn schema() -> #item_schema {
133                #item_schema {
134                    item_name: #item_name.to_string(),
135                    version: Self::schema_version(),
136                    fields: vec![#(#schema_fields),*],
137                }
138            }
139        }
140    };
141
142    TokenStream::from(expanded)
143}
144
145fn item_type_tokens(type_name: &str) -> proc_macro2::TokenStream {
146    let ident = syn::Ident::new(type_name, proc_macro2::Span::call_site());
147
148    match runtime_crate() {
149        RuntimeCrate::SpiderLib(path) => quote!(#path::#ident),
150        RuntimeCrate::SpiderUtil(path) => quote!(#path::item::#ident),
151    }
152}
153
154fn runtime_crate() -> RuntimeCrate {
155    if let Some(path) = facade_crate_tokens("spider-lib", true) {
156        return RuntimeCrate::SpiderLib(path);
157    }
158
159    if let Some(path) = facade_crate_tokens("spider-util", false) {
160        return RuntimeCrate::SpiderUtil(path);
161    }
162
163    RuntimeCrate::SpiderUtil(
164        syn::Error::new(
165            proc_macro2::Span::call_site(),
166            "#[scraped_item] requires either `spider-lib` or `spider-util` as a dependency",
167        )
168        .to_compile_error(),
169    )
170}
171
172fn facade_crate_tokens(crate_key: &str, use_prelude: bool) -> Option<proc_macro2::TokenStream> {
173    let found = crate_name(crate_key).ok()?;
174
175    Some(match found {
176        FoundCrate::Itself => {
177            let crate_name = crate_key.replace('-', "_");
178            let ident = syn::Ident::new(&crate_name, proc_macro2::Span::call_site());
179            if use_prelude {
180                quote!(::#ident::prelude)
181            } else {
182                quote!(::#ident)
183            }
184        }
185        FoundCrate::Name(name) => {
186            let ident = syn::Ident::new(&name, proc_macro2::Span::call_site());
187            if use_prelude {
188                quote!(::#ident::prelude)
189            } else {
190                quote!(::#ident)
191            }
192        }
193    })
194}
195
196enum RuntimeCrate {
197    SpiderLib(proc_macro2::TokenStream),
198    SpiderUtil(proc_macro2::TokenStream),
199}
200
201fn is_option_type(ty: &Type) -> bool {
202    match ty {
203        Type::Path(type_path) => type_path
204            .path
205            .segments
206            .last()
207            .map(|segment| segment.ident == "Option")
208            .unwrap_or(false),
209        _ => false,
210    }
211}
212
213fn field_value_type_tokens(ty: &Type) -> proc_macro2::TokenStream {
214    let field_value_type = item_type_tokens("FieldValueType");
215    let core_ty = unwrap_option_type(ty).unwrap_or(ty);
216
217    match core_ty {
218        Type::Path(type_path) => {
219            let segment = match type_path.path.segments.last() {
220                Some(segment) => segment,
221                None => {
222                    return quote!(#field_value_type::Unknown);
223                }
224            };
225            let ident = segment.ident.to_string();
226            match ident.as_str() {
227                "bool" => quote!(#field_value_type::Bool),
228                "String" | "str" => quote!(#field_value_type::String),
229                "i8" | "i16" | "i32" | "i64" | "i128" | "isize" | "u8" | "u16" | "u32" | "u64"
230                | "u128" | "usize" => quote!(#field_value_type::Integer),
231                "f32" | "f64" => quote!(#field_value_type::Float),
232                "Vec" | "VecDeque" | "HashSet" | "BTreeSet" => {
233                    quote!(#field_value_type::Sequence)
234                }
235                "HashMap" | "BTreeMap" => quote!(#field_value_type::Map),
236                "Value" => quote!(#field_value_type::Json),
237                _ => quote!(#field_value_type::Unknown),
238            }
239        }
240        Type::Array(_) | Type::Slice(_) => quote!(#field_value_type::Sequence),
241        Type::Tuple(_) => quote!(#field_value_type::Sequence),
242        _ => quote!(#field_value_type::Unknown),
243    }
244}
245
246fn unwrap_option_type(ty: &Type) -> Option<&Type> {
247    let Type::Path(type_path) = ty else {
248        return None;
249    };
250    let segment = type_path.path.segments.last()?;
251    if segment.ident != "Option" {
252        return None;
253    }
254
255    let syn::PathArguments::AngleBracketed(args) = &segment.arguments else {
256        return None;
257    };
258    let Some(syn::GenericArgument::Type(inner)) = args.args.first() else {
259        return None;
260    };
261    Some(inner)
262}