Skip to main content

binoc_sdk/
types.rs

1use serde::{Deserialize, Serialize};
2
3use crate::ir::DiffNode;
4
5// ── Artifact types ──────────────────────────────────────────────────
6
/// Which side of a comparison an artifact describes.
///
/// Every variant carries an explicit `serde(rename)`, so values serialize as
/// the lowercase strings `"left"`, `"right"`, and `"pair"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub enum ArtifactSubject {
    /// The artifact describes the left-hand item.
    #[serde(rename = "left")]
    Left,
    /// The artifact describes the right-hand item.
    #[serde(rename = "right")]
    Right,
    /// The artifact is attributed to the pair rather than to a single side
    /// (presumably data derived from both items — confirm with producers).
    #[serde(rename = "pair")]
    Pair,
}
18
/// Identifies an artifact's data format as a structured tuple of
/// (package, name, version).
///
/// - **`package`** — the package that owns and defines this format,
///   resolvable through the language's normal package system
///   (e.g. `"binoc"`, `"binoc-csv"`, `"acme-parquet"`).
/// - **`name`** — the format name within that package
///   (e.g. `"tabular"`, `"relational-schema"`).
/// - **`version`** — a single integer. Bump only for breaking schema
///   changes. Adding optional fields to an existing version is fine
///   and does not require a bump: serde ignores unknown fields by
///   default, and missing fields are filled in when the codec type
///   declares them as `Option` or `#[serde(default)]`.
///
/// Two formats are equal only when all three components match; the type
/// is also `Hash`, so it can be used as a map key.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct ArtifactFormat {
    /// Package that owns and defines the format.
    pub package: String,
    /// Format name within `package`.
    pub name: String,
    /// Schema version; bumped only on breaking changes.
    pub version: u32,
}
38
39impl ArtifactFormat {
40    pub fn new(package: impl Into<String>, name: impl Into<String>, version: u32) -> Self {
41        Self {
42            package: package.into(),
43            name: name.into(),
44            version,
45        }
46    }
47}
48
49impl std::fmt::Display for ArtifactFormat {
50    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51        write!(f, "{}.{}.v{}", self.package, self.name, self.version)
52    }
53}
54
/// Descriptor for a published artifact attached to a node.
///
/// Artifacts are the unified mechanism for both private reuse and
/// cross-plugin composition. A comparator or transformer publishes
/// zero or more artifacts; downstream plugins consume them by format.
///
/// The descriptor itself carries no payload; the bytes are fetched
/// through `DataAccess` using this descriptor.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct ArtifactDescriptor {
    /// Data format of the artifact payload; consumers select artifacts by
    /// comparing this against the format they understand.
    pub format: ArtifactFormat,
    /// Which side of the comparison the artifact describes.
    pub subject: ArtifactSubject,
    /// Identifies whoever published the artifact (presumably the plugin
    /// name — confirm against the publishing code).
    pub producer: String,
    /// Opaque handle managed by the SDK's DataAccess implementation.
    /// Plugins should not create or interpret this value directly.
    pub handle: String,
}
70
71// ── Standard artifact formats ───────────────────────────────────────
72
73/// Standard format for tabular data artifacts.
74///
75/// Any comparator that parses a tabular source format (CSV, TSV, Excel,
76/// Parquet, …) should publish artifacts with this format so that
77/// generic tabular transformers and extractors can consume them without
78/// knowing the source format.
79pub fn tabular_v1() -> ArtifactFormat {
80    ArtifactFormat::new("binoc", "tabular", 1)
81}
82
83// ── Format-neutral data types ───────────────────────────────────────
84
/// Format-neutral tabular data. Produced by CSV, Excel, Parquet comparators;
/// consumed by tabular transformers and extractors.
///
/// This is the codec type for the [`tabular_v1`] artifact format.
/// Serialize with `serde_json::to_vec`, deserialize with `serde_json::from_slice`.
///
/// All cells are stored as strings. Nothing in this type enforces that a
/// row has exactly one cell per header, so consumers should tolerate short
/// rows.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct TabularData {
    /// Column names, in column order.
    pub headers: Vec<String>,
    /// Data rows; each inner vector holds one row's cells in column order.
    pub rows: Vec<Vec<String>>,
}
95
96impl TabularData {
97    pub fn column_index(&self, name: &str) -> Option<usize> {
98        self.headers.iter().position(|h| h == name)
99    }
100
101    pub fn column_values(&self, name: &str) -> Option<Vec<&str>> {
102        let idx = self.column_index(name)?;
103        Some(
104            self.rows
105                .iter()
106                .map(|r| r.get(idx).map(|s| s.as_str()).unwrap_or(""))
107                .collect(),
108        )
109    }
110
111    pub fn to_csv(&self) -> String {
112        let mut out = self.headers.join(",");
113        out.push('\n');
114        for row in &self.rows {
115            out.push_str(&row.join(","));
116            out.push('\n');
117        }
118        out
119    }
120}
121
/// A pair of tabular data (left/right sides of a comparison).
///
/// Either side may be absent, e.g. when only one side published a tabular
/// artifact or the item exists on one side only.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TabularDataPair {
    /// Table from the left side, if available.
    pub left: Option<TabularData>,
    /// Table from the right side, if available.
    pub right: Option<TabularData>,
}
128
129impl TabularDataPair {
130    /// Build a `TabularDataPair` from [`tabular_v1`] artifacts on a node.
131    ///
132    /// Returns `None` if neither left nor right artifact is present.
133    /// This is the standard way for transformers and extractors to obtain
134    /// tabular data without knowing the source format.
135    pub fn from_artifacts(
136        node: &crate::ir::DiffNode,
137        data: &dyn crate::traits::DataAccess,
138    ) -> Option<Self> {
139        let fmt = tabular_v1();
140        let left = node
141            .artifacts
142            .iter()
143            .find(|a| a.format == fmt && a.subject == ArtifactSubject::Left)
144            .and_then(|desc| data.get_artifact(desc).ok()?)
145            .and_then(|bytes| serde_json::from_slice(&bytes).ok());
146        let right = node
147            .artifacts
148            .iter()
149            .find(|a| a.format == fmt && a.subject == ArtifactSubject::Right)
150            .and_then(|desc| data.get_artifact(desc).ok()?)
151            .and_then(|bytes| serde_json::from_slice(&bytes).ok());
152        if left.is_none() && right.is_none() {
153            return None;
154        }
155        Some(Self { left, right })
156    }
157}
158
159// ── Tabular extraction ──────────────────────────────────────────────
160
161/// Shared extraction logic for tabular data.
162///
163/// Given a `TabularDataPair` and an aspect name, produces the
164/// corresponding `ExtractResult`. This is format-neutral — any
165/// comparator or transformer that works with tabular artifacts can
166/// delegate extraction here.
167pub fn tabular_extract(
168    pair: &TabularDataPair,
169    _node: &DiffNode,
170    aspect: &str,
171) -> Option<ExtractResult> {
172    match aspect {
173        "rows_added" => {
174            let right = pair.right.as_ref()?;
175            let left_len = pair.left.as_ref().map_or(0, |l| l.rows.len());
176            if left_len >= right.rows.len() {
177                return Some(ExtractResult::Text("No rows added.\n".into()));
178            }
179            let added = TabularData {
180                headers: right.headers.clone(),
181                rows: right.rows[left_len..].to_vec(),
182            };
183            Some(ExtractResult::Text(added.to_csv()))
184        }
185        "rows_removed" => {
186            let left = pair.left.as_ref()?;
187            let right_len = pair.right.as_ref().map_or(0, |r| r.rows.len());
188            if right_len >= left.rows.len() {
189                return Some(ExtractResult::Text("No rows removed.\n".into()));
190            }
191            let removed = TabularData {
192                headers: left.headers.clone(),
193                rows: left.rows[right_len..].to_vec(),
194            };
195            Some(ExtractResult::Text(removed.to_csv()))
196        }
197        "cells_changed" => {
198            let left = pair.left.as_ref()?;
199            let right = pair.right.as_ref()?;
200            let common_cols = tabular_columns_in_common(left, right);
201            let min_rows = left.rows.len().min(right.rows.len());
202
203            let mut out = String::from("row,column,old_value,new_value\n");
204            for i in 0..min_rows {
205                for col in &common_cols {
206                    let li = left.column_index(col)?;
207                    let ri = right.column_index(col)?;
208                    let lv = left.rows[i].get(li).map(|s| s.as_str()).unwrap_or("");
209                    let rv = right.rows[i].get(ri).map(|s| s.as_str()).unwrap_or("");
210                    if lv != rv {
211                        out.push_str(&format!("{i},{col},{lv},{rv}\n"));
212                    }
213                }
214            }
215            Some(ExtractResult::Text(out))
216        }
217        "columns_added" => {
218            let left = pair.left.as_ref()?;
219            let right = pair.right.as_ref()?;
220            let left_set: std::collections::BTreeSet<&str> =
221                left.headers.iter().map(|s| s.as_str()).collect();
222            let added: Vec<&str> = right
223                .headers
224                .iter()
225                .filter(|h| !left_set.contains(h.as_str()))
226                .map(|h| h.as_str())
227                .collect();
228            if added.is_empty() {
229                return Some(ExtractResult::Text("No columns added.\n".into()));
230            }
231            let mut out = String::new();
232            for col in &added {
233                out.push_str(&format!("{col}\n"));
234                if let Some(vals) = right.column_values(col) {
235                    for val in vals {
236                        out.push_str(&format!("  {val}\n"));
237                    }
238                }
239            }
240            Some(ExtractResult::Text(out))
241        }
242        "columns_removed" => {
243            let left = pair.left.as_ref()?;
244            let right = pair.right.as_ref()?;
245            let right_set: std::collections::BTreeSet<&str> =
246                right.headers.iter().map(|s| s.as_str()).collect();
247            let removed: Vec<&str> = left
248                .headers
249                .iter()
250                .filter(|h| !right_set.contains(h.as_str()))
251                .map(|h| h.as_str())
252                .collect();
253            if removed.is_empty() {
254                return Some(ExtractResult::Text("No columns removed.\n".into()));
255            }
256            let mut out = String::new();
257            for col in &removed {
258                out.push_str(&format!("{col}\n"));
259                if let Some(vals) = left.column_values(col) {
260                    for val in vals {
261                        out.push_str(&format!("  {val}\n"));
262                    }
263                }
264            }
265            Some(ExtractResult::Text(out))
266        }
267        "content" | "full" => {
268            let mut out = String::new();
269            if let Some(left) = &pair.left {
270                out.push_str("--- left\n");
271                out.push_str(&left.to_csv());
272            }
273            if let Some(right) = &pair.right {
274                out.push_str("+++ right\n");
275                out.push_str(&right.to_csv());
276            }
277            Some(ExtractResult::Text(out))
278        }
279        _ => None,
280    }
281}
282
283fn tabular_columns_in_common(left: &TabularData, right: &TabularData) -> Vec<String> {
284    let left_set: std::collections::BTreeSet<&str> =
285        left.headers.iter().map(|s| s.as_str()).collect();
286    right
287        .headers
288        .iter()
289        .filter(|h| left_set.contains(h.as_str()))
290        .cloned()
291        .collect()
292}
293
294// ── Item types ──────────────────────────────────────────────────────
295
/// Metadata-only view of one side of a comparison. Carries logical identity
/// and content metadata but NOT a filesystem path — data access goes through
/// `DataAccess`.
///
/// # Metadata invariants
///
/// `content_hash`, `size`, and `media_type` are **opportunistic hints**.
/// Producers (expanding comparators like directory/zip, or data backends)
/// populate them when doing so is cheap — typically as a byproduct of work
/// they were already performing. Consumers **must not assume presence**, but
/// **may trust presence**: when a field is set, the value accurately reflects
/// the current bytes. Use [`ItemRef::resolve_hash`] / [`ItemRef::resolve_size`]
/// to obtain a value with a transparent fall-back read.
///
/// This keeps fast paths (directory-only listings, short-circuit identical
/// detection) cheap while letting consumers that need a value — most notably
/// the move detector, which correlates leaves across container boundaries —
/// hydrate on demand.
///
/// # Serialization
///
/// The three optional hints are omitted from the output when `None` and
/// default to `None` when absent from the input; `handle` defaults to the
/// empty string when absent.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct ItemRef {
    /// Logical (not filesystem) path identifying the item.
    pub logical_path: String,
    /// Whether the item is a directory/container rather than a file.
    pub is_dir: bool,
    /// Content hash hint. The fall-back in [`ItemRef::resolve_hash`] produces
    /// a BLAKE3 hex digest; producers presumably use the same scheme.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub content_hash: Option<String>,
    /// Content length in bytes, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub size: Option<u64>,
    /// Media (MIME) type hint, when known.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub media_type: Option<String>,
    /// Opaque identifier used by DataAccess implementations to locate data.
    /// Plugin authors should not create or interpret this value directly.
    #[serde(default)]
    pub handle: String,
}
330
331impl ItemRef {
332    pub fn extension(&self) -> Option<String> {
333        std::path::Path::new(&self.logical_path)
334            .extension()
335            .map(|e| format!(".{}", e.to_string_lossy().to_lowercase()))
336    }
337
338    /// Return the item's BLAKE3 content hash, computing it from bytes if
339    /// not already cached on this `ItemRef`. Never valid for directories.
340    pub fn resolve_hash(&self, data: &dyn crate::DataAccess) -> crate::BinocResult<String> {
341        if let Some(hash) = &self.content_hash {
342            return Ok(hash.clone());
343        }
344        let bytes = data.read_bytes(self)?;
345        Ok(blake3::hash(&bytes).to_hex().to_string())
346    }
347
348    /// Return the item's byte length, reading from the backend if not already
349    /// cached on this `ItemRef`. Never valid for directories.
350    pub fn resolve_size(&self, data: &dyn crate::DataAccess) -> crate::BinocResult<u64> {
351        if let Some(size) = self.size {
352            return Ok(size);
353        }
354        let bytes = data.read_bytes(self)?;
355        Ok(bytes.len() as u64)
356    }
357}
358
/// A pair of items to compare. Either side may be None (add/remove).
///
/// - both sides present — the item exists on both sides;
/// - only `right` present — the item was added;
/// - only `left` present — the item was removed.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "schema", derive(schemars::JsonSchema))]
pub struct ItemPair {
    /// Left-side item; `None` for an addition.
    pub left: Option<ItemRef>,
    /// Right-side item; `None` for a removal.
    pub right: Option<ItemRef>,
}
366
367impl ItemPair {
368    pub fn both(left: ItemRef, right: ItemRef) -> Self {
369        Self {
370            left: Some(left),
371            right: Some(right),
372        }
373    }
374
375    pub fn added(right: ItemRef) -> Self {
376        Self {
377            left: None,
378            right: Some(right),
379        }
380    }
381
382    pub fn removed(left: ItemRef) -> Self {
383        Self {
384            left: Some(left),
385            right: None,
386        }
387    }
388
389    pub fn logical_path(&self) -> &str {
390        self.right
391            .as_ref()
392            .or(self.left.as_ref())
393            .map(|i| i.logical_path.as_str())
394            .unwrap_or("")
395    }
396
397    pub fn extension(&self) -> Option<String> {
398        self.right
399            .as_ref()
400            .or(self.left.as_ref())
401            .and_then(|i| i.extension())
402    }
403
404    pub fn media_type(&self) -> Option<&str> {
405        self.right
406            .as_ref()
407            .or(self.left.as_ref())
408            .and_then(|i| i.media_type.as_deref())
409    }
410
411    pub fn is_dir(&self) -> bool {
412        self.right.as_ref().is_some_and(|i| i.is_dir)
413            || self.left.as_ref().is_some_and(|i| i.is_dir)
414    }
415
416    pub fn matching_content_hash(&self) -> Option<&str> {
417        match (&self.left, &self.right) {
418            (Some(l), Some(r)) => match (&l.content_hash, &r.content_hash) {
419                (Some(hl), Some(hr)) if hl == hr => Some(hl.as_str()),
420                _ => None,
421            },
422            _ => None,
423        }
424    }
425}
426
427// ── Result enums ────────────────────────────────────────────────────
428
/// Result of a comparator's compare operation.
///
/// Marked `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching, so new variants can be added later.
#[derive(Debug, Serialize, Deserialize)]
#[non_exhaustive]
pub enum CompareResult {
    /// Items are identical — no diff node produced.
    Identical,
    /// Terminal diff — no further expansion needed.
    Leaf(DiffNode),
    /// Container node with children to recursively process.
    /// Carries the container's node and the child pairs to compare next.
    Expand(DiffNode, Vec<ItemPair>),
    /// Comparator cannot handle this item after all — try the next one.
    Skip,
}
442
443/// Result of a transformer's transform operation.
444#[non_exhaustive]
445pub enum TransformResult {
446    /// Node unchanged — zero cost.
447    Unchanged,
448    /// Replace this node with a new one.
449    Replace(Box<DiffNode>),
450    /// Replace this node with multiple sibling nodes.
451    ReplaceMany(Vec<DiffNode>),
452    /// Remove this node entirely.
453    Remove,
454}
455
/// Dispatch filter on node shape for transformer matching.
///
/// Defaults to [`NodeShapeFilter::Any`]. No serde renames are applied, so
/// variants serialize under their Rust names.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum NodeShapeFilter {
    /// Match any node regardless of children.
    #[default]
    Any,
    /// Match only container nodes (those with children).
    Container,
    /// Match only leaf nodes (those without children).
    Leaf,
    /// Match only the tree root. Intended for tree-wide walkers
    /// (correlation detectors, roll-ups) that need to see the entire
    /// changeset at once and do their own traversal. Called exactly
    /// once per diff.
    Root,
}
472
/// Whether a comparator handles files, containers (directories), or both.
///
/// Defaults to [`ItemScope::Files`]. No serde renames are applied, so
/// variants serialize under their Rust names.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
pub enum ItemScope {
    /// Non-directory items only (most comparators).
    #[default]
    Files,
    /// Directories only (directory comparator).
    Containers,
    /// Both files and directories.
    Any,
}
484
/// Result of an extract (on-demand detail retrieval) operation.
///
/// Derives `Debug`, `Clone`, `PartialEq`, and `Eq` so results can be logged,
/// duplicated, and compared directly in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ExtractResult {
    /// Extracted detail as text.
    Text(String),
    /// Extracted detail as raw bytes.
    Binary(Vec<u8>),
}
490
#[cfg(test)]
mod tests {
    use super::*;

    /// An `ItemRef` with only its identity set: no hash, size, or media
    /// type, and an empty handle.
    fn bare_item(logical: &str, is_dir: bool) -> ItemRef {
        ItemRef {
            handle: String::new(),
            media_type: None,
            size: None,
            content_hash: None,
            is_dir,
            logical_path: logical.to_owned(),
        }
    }

    /// A bare file item carrying the given cached content hash.
    fn hashed_item(logical: &str, hash: &str) -> ItemRef {
        let mut item = bare_item(logical, false);
        item.content_hash = Some(hash.to_owned());
        item
    }

    #[test]
    fn item_ref_extension() {
        let ext = bare_item("data.csv", false).extension();
        assert_eq!(ext, Some(String::from(".csv")));
    }

    #[test]
    fn item_ref_extension_none() {
        let ext = bare_item("Makefile", false).extension();
        assert_eq!(ext, None);
    }

    #[test]
    fn item_pair_logical_path_prefers_right() {
        let pair = ItemPair::both(
            bare_item("left.txt", false),
            bare_item("right.txt", false),
        );
        assert_eq!(pair.logical_path(), "right.txt");
    }

    #[test]
    fn item_pair_logical_path_falls_back_to_left() {
        let pair = ItemPair::removed(bare_item("only.txt", false));
        assert_eq!(pair.logical_path(), "only.txt");
    }

    #[test]
    fn item_pair_is_dir() {
        let pair = ItemPair::added(bare_item("sub", true));
        assert!(pair.is_dir());
    }

    #[test]
    fn item_pair_matching_hash() {
        let pair = ItemPair::both(hashed_item("f", "abc"), hashed_item("f", "abc"));
        assert_eq!(pair.matching_content_hash(), Some("abc"));
    }
}
549}