diff --git a/sidemantic-rs/src/config/loader.rs b/sidemantic-rs/src/config/loader.rs index a81af10a..5a6fa6b2 100644 --- a/sidemantic-rs/src/config/loader.rs +++ b/sidemantic-rs/src/config/loader.rs @@ -494,7 +494,7 @@ pub fn load_from_directory_with_metadata(dir: impl AsRef) -> Result { let content = fs::read_to_string(&path).map_err(|e| { @@ -590,39 +590,37 @@ pub fn load_from_directory_with_metadata(dir: impl AsRef) -> Result, incoming: Option) { +fn merge_graph_metadata(acc: &mut Option, incoming: Option) { let Some(incoming) = incoming else { return; }; - let Some(incoming_osi) = incoming.get("osi").and_then(|v| v.as_object()).cloned() else { - return; - }; - - let acc_value = - acc.get_or_insert_with(|| serde_json::json!({ "osi": { "semantic_models": [] } })); - let Some(acc_osi) = acc_value - .as_object_mut() - .and_then(|m| m.get_mut("osi")) - .and_then(serde_json::Value::as_object_mut) - else { - return; - }; - - if let Some(serde_json::Value::Array(incoming_models)) = incoming_osi.get("semantic_models") { - let entry = acc_osi - .entry("semantic_models") - .or_insert_with(|| serde_json::Value::Array(Vec::new())); - if let serde_json::Value::Array(acc_models) = entry { - acc_models.extend(incoming_models.iter().cloned()); - } + match acc { + Some(existing) => deep_merge_json(existing, incoming), + None => *acc = Some(incoming), } +} - for key in ["version", "ontology"] { - if !acc_osi.contains_key(key) { - if let Some(value) = incoming_osi.get(key) { - acc_osi.insert(key.to_string(), value.clone()); +/// Recursively merge `incoming` into `target`: objects merge, arrays append, and +/// scalars keep the existing (first-wins) value. This preserves OSI accumulation +/// (semantic_models arrays append, version/ontology keep first) while also merging +/// non-OSI payloads such as `metadata.snowflake` from Python `export-native` files. +fn deep_merge_json(target: &mut serde_json::Value, incoming: serde_json::Value) { + match (target, incoming) { + (serde_json::Value::Object(target_map), serde_json::Value::Object(incoming_map)) => { + for (key, value) in incoming_map { + match target_map.get_mut(&key) { + Some(existing) => deep_merge_json(existing, value), + None => { + target_map.insert(key, value); + } + } } } + (serde_json::Value::Array(target_arr), serde_json::Value::Array(incoming_arr)) => { + target_arr.extend(incoming_arr); + } + // Scalars (or type mismatches): keep the existing value. + _ => {} } } @@ -1875,6 +1873,65 @@ models: assert!(orders.get_metric("net_revenue").is_some()); } + #[test] + fn test_load_from_directory_merges_non_osi_root_metadata() { + let dir = std::env::temp_dir().join(format!( + "sidemantic-rs-loader-metadata-{}-{}", + std::process::id(), + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_nanos() + )); + fs::create_dir_all(&dir).unwrap(); + // Python `export-native` writes root `metadata.snowflake` (no `osi` key). + fs::write( + dir.join("a.yml"), + r#" +models: + - name: orders + table: orders + primary_key: order_id +metadata: + snowflake: + custom_instructions: Prefer revenue. + verified_queries: + - name: q1 +"#, + ) + .unwrap(); + fs::write( + dir.join("b.yml"), + r#" +models: + - name: customers + table: customers + primary_key: id +metadata: + snowflake: + verified_queries: + - name: q2 +"#, + ) + .unwrap(); + + let loaded = load_from_directory_with_metadata(&dir).unwrap(); + fs::remove_dir_all(&dir).unwrap(); + + let metadata = loaded.graph.metadata().expect("graph metadata preserved"); + let snowflake = &metadata["snowflake"]; + assert_eq!(snowflake["custom_instructions"], "Prefer revenue."); + // verified_queries from both files accumulate. + let names: Vec<&str> = snowflake["verified_queries"] + .as_array() + .unwrap() + .iter() + .map(|entry| entry["name"].as_str().unwrap()) + .collect(); + assert!(names.contains(&"q1")); + assert!(names.contains(&"q2")); + } + #[test] fn test_walkdir_returns_deterministic_lexical_order() { let dir = std::env::temp_dir().join(format!( diff --git a/sidemantic-rs/src/config/schema.rs b/sidemantic-rs/src/config/schema.rs index 4bf882bf..50bfb90a 100644 --- a/sidemantic-rs/src/config/schema.rs +++ b/sidemantic-rs/src/config/schema.rs @@ -33,7 +33,8 @@ pub struct SidemanticConfig { pub sql_metrics: Option, #[serde(default)] pub sql_segments: Option, - /// Graph-level metadata payload (round-trips format-specific state such as OSI). + /// Graph-level metadata payload (round-trips format-specific state such as OSI, + /// and Snowflake Cortex top-level sections from the Python native export). #[serde(default)] pub metadata: Option, } @@ -132,6 +133,15 @@ pub struct DimensionConfig { pub metadata: Option, #[serde(default)] pub meta: Option, + /// Alternative names (e.g. Snowflake Cortex Analyst, Cube). + #[serde(default)] + pub synonyms: Option>, + /// Representative sample values for this dimension. + #[serde(default)] + pub sample_values: Option>, + /// Linked Cortex Search service name (Snowflake Cortex Analyst). + #[serde(default)] + pub cortex_search_service_name: Option, pub format: Option, pub value_format_name: Option, pub parent: Option, @@ -190,6 +200,9 @@ pub struct MetricConfig { pub metadata: Option, #[serde(default)] pub meta: Option, + /// Alternative names (e.g. Snowflake Cortex Analyst, Cube). + #[serde(default)] + pub synonyms: Option>, #[serde(default = "default_public")] pub public: bool, } @@ -1492,6 +1505,61 @@ models: ); } + #[test] + fn test_native_contract_accepts_snowflake_enrichment_fields() { + // Native YAML produced by Python `export-native` after a Snowflake import + // carries root `metadata`, dimension synonyms/sample_values/cortex search, + // and metric synonyms. The Rust native loader must accept (not reject) it. + let yaml = r#" +metadata: + snowflake: + verified_queries: + - name: total revenue + custom_instructions: Prefer revenue. +models: + - name: orders + table: orders + dimensions: + - name: status + type: categorical + synonyms: [state] + sample_values: ["1001", "1002"] + cortex_search_service_name: status_search + metrics: + - name: revenue + agg: sum + sql: amount + synonyms: [total revenue] +"#; + + let config: SidemanticConfig = serde_yaml::from_str(yaml).unwrap(); + + assert_eq!( + config.metadata.as_ref().unwrap()["snowflake"]["custom_instructions"], + "Prefer revenue." + ); + + let dim = &config.models[0].dimensions[0]; + assert_eq!(dim.synonyms.as_deref(), Some(&["state".to_string()][..])); + assert_eq!( + dim.sample_values.as_deref(), + Some(&["1001".to_string(), "1002".to_string()][..]) + ); + assert_eq!( + dim.cortex_search_service_name.as_deref(), + Some("status_search") + ); + + let metric = &config.models[0].metrics[0]; + assert_eq!( + metric.synonyms.as_deref(), + Some(&["total revenue".to_string()][..]) + ); + + // The config must still convert into the internal model without error. + config.into_parts().unwrap(); + } + #[test] fn test_parse_many_to_many_relationship_fields() { let yaml = r#" diff --git a/sidemantic-schema.json b/sidemantic-schema.json index e735fe67..c06d5b43 100644 --- a/sidemantic-schema.json +++ b/sidemantic-schema.json @@ -3,6 +3,19 @@ "Dimension": { "description": "Dimension (attribute) definition.\n\nDimensions are used for grouping and filtering in queries.", "properties": { + "cortex_search_service_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Linked Cortex Search service name (Snowflake Cortex Analyst)", + "title": "Cortex Search Service Name" + }, "dax": { "anyOf": [ { @@ -147,6 +160,22 @@ "title": "Public", "type": "boolean" }, + "sample_values": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Representative sample values for this dimension", + "title": "Sample Values" + }, "sql": { "anyOf": [ { @@ -176,6 +205,22 @@ "description": "Supported granularities for time dimensions", "title": "Supported Granularities" }, + "synonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Alternative names for this dimension", + "title": "Synonyms" + }, "type": { "description": "Dimension type", "enum": [ @@ -818,6 +863,22 @@ "description": "N-step funnel filter expressions (overrides base_event/conversion_event)", "title": "Steps" }, + "synonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Alternative names for this measure/metric", + "title": "Synonyms" + }, "time_offset": { "anyOf": [ { @@ -2025,6 +2086,22 @@ "description": "N-step funnel filter expressions (overrides base_event/conversion_event)", "title": "Steps" }, + "synonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Alternative names for this measure/metric", + "title": "Synonyms" + }, "time_offset": { "anyOf": [ { @@ -2141,6 +2218,19 @@ "Dimension": { "description": "Dimension (attribute) definition.\n\nDimensions are used for grouping and filtering in queries.", "properties": { + "cortex_search_service_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Linked Cortex Search service name (Snowflake Cortex Analyst)", + "title": "Cortex Search Service Name" + }, "dax": { "anyOf": [ { @@ -2285,6 +2375,22 @@ "title": "Public", "type": "boolean" }, + "sample_values": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Representative sample values for this dimension", + "title": "Sample Values" + }, "sql": { "anyOf": [ { @@ -2314,6 +2420,22 @@ "description": "Supported granularities for time dimensions", "title": "Supported Granularities" }, + "synonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Alternative names for this dimension", + "title": "Synonyms" + }, "type": { "description": "Dimension type", "enum": [ @@ -2956,6 +3078,22 @@ "description": "N-step funnel filter expressions (overrides base_event/conversion_event)", "title": "Steps" }, + "synonyms": { + "anyOf": [ + { + "items": { + "type": "string" + }, + "type": "array" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Alternative names for this measure/metric", + "title": "Synonyms" + }, "time_offset": { "anyOf": [ { diff --git a/sidemantic/adapters/sidemantic.py b/sidemantic/adapters/sidemantic.py index 95daaf40..fdcfcbff 100644 --- a/sidemantic/adapters/sidemantic.py +++ b/sidemantic/adapters/sidemantic.py @@ -29,6 +29,7 @@ "models", "metrics", "parameters", + "metadata", "sql_metrics", "sql_segments", } @@ -79,6 +80,9 @@ "label", "metadata", "meta", + "synonyms", + "sample_values", + "cortex_search_service_name", "format", "value_format_name", "parent", @@ -118,6 +122,7 @@ "periods", "retention_granularity", "granularity", + "synonyms", "inner_metrics", "entity_dimensions", "having", @@ -281,6 +286,10 @@ def normalize_sql_frontmatter(frontmatter: dict) -> dict: normalized.pop("connection", None) normalized.pop("models", None) normalized.pop("parameters", None) + # ``metadata`` is a root-only native field (graph-level), so it must not by + # itself make the frontmatter look like a model definition. Graph metadata is + # extracted separately by the caller before this decision. + normalized.pop("metadata", None) return normalized @@ -351,9 +360,13 @@ def parse(self, source: str | Path) -> SemanticGraph: raise ValueError(f"{source_path}: invalid SQL definitions: {exc}") from exc # Parse frontmatter as a model only when it still contains model fields - # after native contract metadata such as `version` is removed. + # after native contract metadata such as `version`/`metadata` is removed. normalized_frontmatter = normalize_sql_frontmatter(frontmatter) if frontmatter else {} if normalized_frontmatter: + # ``metadata`` is a valid model field, so re-attach it when the + # frontmatter is a model so the model keeps its own metadata. + if frontmatter.get("metadata") is not None: + normalized_frontmatter["metadata"] = frontmatter["metadata"] model = self._parse_model(normalized_frontmatter, source_path=source_path) if model: # Add SQL-defined metrics/segments to the model @@ -363,7 +376,12 @@ def parse(self, source: str | Path) -> SemanticGraph: model.pre_aggregations.extend(sql_preaggs) graph.add_model(model) else: - # No frontmatter - treat as graph-level metrics/segments + # No model frontmatter - treat as graph-level metrics/segments. + # Root-only ``metadata`` (e.g. Snowflake Cortex top-level + # sections) is preserved on the graph here. + graph_metadata = frontmatter.get("metadata") if frontmatter else None + if isinstance(graph_metadata, dict): + graph.metadata.update(graph_metadata) for metric in sql_metrics: graph.add_metric(metric) for param in sql_parameters: @@ -387,6 +405,11 @@ def parse(self, source: str | Path) -> SemanticGraph: validate_native_format_version(data) reject_unknown_fields(data, ROOT_FIELDS, "root", source_path=source_path) + # Preserve graph-level metadata (e.g. Snowflake Cortex top-level sections). + graph_metadata = data.get("metadata") + if isinstance(graph_metadata, dict): + graph.metadata.update(graph_metadata) + # Parse models for model_def in data.get("models") or []: model = self._parse_model(model_def, source_path=source_path) @@ -484,6 +507,9 @@ def export(self, graph: SemanticGraph, output_path: str | Path) -> None: if graph.parameters: data["parameters"] = [self._export_parameter(parameter) for parameter in graph.parameters.values()] + if graph.metadata: + data["metadata"] = graph.metadata + output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, "w") as f: @@ -561,6 +587,9 @@ def _parse_model(self, model_def: dict, *, source_path: Path | None = None) -> M parent=dim_def.get("parent"), metadata=dim_def.get("metadata"), meta=dim_def.get("meta"), + synonyms=dim_def.get("synonyms"), + sample_values=dim_def.get("sample_values"), + cortex_search_service_name=dim_def.get("cortex_search_service_name"), window=dim_def.get("window"), ) dimensions.append(dimension) @@ -781,6 +810,7 @@ def _parse_metric( "value_format_name", "drill_fields", "non_additive_dimension", + "synonyms", "meta", "public", ]: @@ -928,6 +958,12 @@ def _export_model(self, model: Model) -> dict: dim_def["metadata"] = dim.metadata if dim.meta: dim_def["meta"] = dim.meta + if dim.synonyms: + dim_def["synonyms"] = dim.synonyms + if dim.sample_values: + dim_def["sample_values"] = dim.sample_values + if dim.cortex_search_service_name: + dim_def["cortex_search_service_name"] = dim.cortex_search_service_name if dim.format: dim_def["format"] = dim.format if dim.value_format_name: @@ -966,6 +1002,8 @@ def _export_model(self, model: Model) -> dict: measure_def["metadata"] = measure.metadata if measure.meta: measure_def["meta"] = measure.meta + if measure.synonyms: + measure_def["synonyms"] = measure.synonyms if not measure.public: measure_def["public"] = measure.public if measure.format: @@ -1089,6 +1127,8 @@ def _export_metric(self, measure: Metric, graph) -> dict: result["metadata"] = measure.metadata if measure.meta: result["meta"] = measure.meta + if measure.synonyms: + result["synonyms"] = measure.synonyms if not measure.public: result["public"] = measure.public diff --git a/sidemantic/adapters/snowflake.py b/sidemantic/adapters/snowflake.py index a34ba470..f339c1a5 100644 --- a/sidemantic/adapters/snowflake.py +++ b/sidemantic/adapters/snowflake.py @@ -111,12 +111,19 @@ class SnowflakeAdapter(BaseAdapter): - tables -> Models - dimensions -> Dimensions (categorical) - time_dimensions -> Dimensions (time) - - facts -> Metrics (with default_aggregation) + - facts (a.k.a. legacy `measures`) -> Metrics (with default_aggregation) - metrics -> Metrics (derived, table-scoped aggregations) - relationships -> Relationships - filters -> Segments - Reference: https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-analyst/semantic-model-spec + Also imports newer Cortex Analyst spec features: + - `synonyms` on dimensions/facts/measures/metrics + - `sample_values` and `cortex_search_service` / `cortex_search_service_name` on dimensions + - top-level `verified_queries`, `custom_instructions`, `module_custom_instructions` + - per-field keys preserved in metadata: access_modifier, is_enum, unique, labels, + tags, non_additive_dimensions, using_relationships + + Reference: https://docs.snowflake.com/en/user-guide/views-semantic/semantic-view-yaml-spec """ def parse(self, source: str | Path) -> SemanticGraph: @@ -131,24 +138,109 @@ def parse(self, source: str | Path) -> SemanticGraph: graph = SemanticGraph() source_path = Path(source) + # Top-level metrics and relationships are resolved after every file's tables + # are loaded, so a metric or relationship referencing a table defined in a + # later file still resolves regardless of directory traversal order. + deferred_metrics: list[dict] = [] + deferred_relationships: list[dict] = [] + if source_path.is_dir(): # Parse all YAML files in directory for yaml_file in source_path.rglob("*.yml"): - self._parse_file(yaml_file, graph) + self._parse_file(yaml_file, graph, deferred_metrics, deferred_relationships) for yaml_file in source_path.rglob("*.yaml"): - self._parse_file(yaml_file, graph) + self._parse_file(yaml_file, graph, deferred_metrics, deferred_relationships) else: # Parse single file - self._parse_file(source_path, graph) + self._parse_file(source_path, graph, deferred_metrics, deferred_relationships) + + self._apply_relationships(deferred_relationships, graph) + self._apply_top_level_metrics(deferred_metrics, graph) + + # For a directory parse every file is seen here, so resolve pending metrics + # against the loaded tables; anything still unresolved (table truly absent) + # falls back to a graph-level metric so it is not dropped. For a single-file + # parse the pending list is left intact so the directory loader can resolve + # it across files. + pending = getattr(graph, "_pending_table_metrics", None) + if pending and source_path.is_dir(): + self.resolve_pending_table_metrics(graph.models, pending) + for _table_name, metric in pending: + graph.metrics.setdefault(metric.name, metric) + pending.clear() return graph - def _parse_file(self, file_path: Path, graph: SemanticGraph) -> None: + @staticmethod + def resolve_pending_table_metrics(models: dict, pending_metrics: list) -> None: + """Attach pending metrics that reference a now-loaded table. + + Multi-file CLI loads parse each Snowflake file separately, so a top-level + metric with ``table: orders`` defined before the file that declares + ``orders`` is collected as a ``(table_name, Metric)`` pending entry. Once + every file's models are loaded, attach each to its table and re-qualify its + expression with the ``{model}`` placeholder. Pending entries are a list (not + a name-keyed map) so same-named scoped metrics on different tables do not + overwrite one another. Unresolved entries are left in place. + """ + remaining = [] + for table_name, metric in pending_metrics: + model = models.get(table_name) + if model is None: + remaining.append((table_name, metric)) + continue + if metric.type == "derived" and metric.sql: + metric.sql = _qualify_columns(metric.sql) + model.metrics.append(metric) + pending_metrics[:] = remaining + + def _apply_top_level_metrics(self, metric_defs: list[dict], graph: SemanticGraph) -> None: + """Attach collected top-level metrics once all tables are loaded.""" + for metric_def in metric_defs: + table_name = metric_def.get("table") + if table_name and table_name in graph.models: + # Table-scoped: bare column refs are local to the table, so qualify + # complex expressions with the {model} placeholder. + metric = self._parse_metric(metric_def) + if metric is None: + continue + graph.models[table_name].metrics.append(metric) + elif table_name: + # The referenced table is not in this graph (multi-file CLI load + # parses each file separately). Hold the metric in a table-qualified + # pending list so the directory loader can attach it once that table + # is loaded, without colliding on metric name. + metric = self._parse_metric(metric_def, qualify=False) + if metric is None: + continue + if not hasattr(graph, "_pending_table_metrics"): + graph._pending_table_metrics = [] + graph._pending_table_metrics.append((table_name, metric)) + else: + # Graph-level metric: expressions reference other fields as + # `model.field` (already qualified), so leave them untouched + # instead of corrupting them with the {model} placeholder. + metric = self._parse_metric(metric_def, qualify=False) + if metric is None: + continue + graph.metrics[metric.name] = metric + + def _parse_file( + self, + file_path: Path, + graph: SemanticGraph, + deferred_metrics: list[dict], + deferred_relationships: list[dict], + ) -> None: """Parse a single Snowflake semantic model YAML file. Args: file_path: Path to YAML file graph: Semantic graph to add models/metrics to + deferred_metrics: Accumulator for top-level metric definitions, resolved + after every file's tables are loaded. + deferred_relationships: Accumulator for top-level relationship + definitions, applied after every file's tables are loaded. """ with open(file_path) as f: data = yaml.safe_load(f) @@ -165,9 +257,17 @@ def _parse_file(self, file_path: Path, graph: SemanticGraph) -> None: if model: graph.add_model(model) - # Parse relationships (defined at semantic model level, not table level) - relationships_def = data.get("relationships") or [] - self._apply_relationships(relationships_def, graph) + # Defer relationships (defined at the semantic-model level) until all files' + # tables are loaded so they resolve regardless of traversal order. + deferred_relationships.extend(data.get("relationships") or []) + + # Defer top-level metrics (semantic-model-scoped metrics referencing tables) + # until all files are parsed, so a metric whose table lives in a later file + # still attaches correctly regardless of traversal order. + deferred_metrics.extend(data.get("metrics") or []) + + # Parse top-level Cortex Analyst sections onto the graph. + self._apply_top_level_sections(data, graph) def _parse_table(self, table_def: dict) -> Model | None: """Parse Snowflake table definition into Model. @@ -212,9 +312,11 @@ def _parse_table(self, table_def: dict) -> Model | None: if dim: dimensions.append(dim) - # Parse facts (row-level measures with default aggregation) + # Parse facts (row-level measures with default aggregation). + # Cortex Analyst's table-level `measures:` key is a legacy alias of `facts:`; + # accept both so current Cortex Analyst files import without silent data loss. metrics = [] - for fact_def in table_def.get("facts") or []: + for fact_def in (table_def.get("facts") or []) + (table_def.get("measures") or []): metric = self._parse_fact(fact_def) if metric: metrics.append(metric) @@ -270,6 +372,11 @@ def _parse_dimension(self, dim_def: dict) -> Dimension | None: type=dim_type, sql=dim_def.get("expr"), description=dim_def.get("description"), + synonyms=dim_def.get("synonyms"), + sample_values=self._sample_values(dim_def), + cortex_search_service_name=self._cortex_search_service_name(dim_def), + metadata=self._dimension_metadata(dim_def), + public=self._public_from_access_modifier(dim_def), ) def _parse_time_dimension(self, dim_def: dict) -> Dimension | None: @@ -291,6 +398,11 @@ def _parse_time_dimension(self, dim_def: dict) -> Dimension | None: sql=dim_def.get("expr"), description=dim_def.get("description"), granularity="day", # Default granularity + synonyms=dim_def.get("synonyms"), + sample_values=self._sample_values(dim_def), + cortex_search_service_name=self._cortex_search_service_name(dim_def), + metadata=self._dimension_metadata(dim_def), + public=self._public_from_access_modifier(dim_def), ) def _parse_fact(self, fact_def: dict) -> Metric | None: @@ -309,7 +421,7 @@ def _parse_fact(self, fact_def: dict) -> Metric | None: return None # Map Snowflake default_aggregation to Sidemantic agg - default_agg = fact_def.get("default_aggregation", "sum").lower() + default_agg = (fact_def.get("default_aggregation") or "sum").lower() agg_mapping = { "sum": "sum", "avg": "avg", @@ -327,15 +439,23 @@ def _parse_fact(self, fact_def: dict) -> Metric | None: agg=agg, sql=fact_def.get("expr"), description=fact_def.get("description"), + synonyms=fact_def.get("synonyms"), + metadata=self._measure_metadata(fact_def), + public=self._public_from_access_modifier(fact_def), ) - def _parse_metric(self, metric_def: dict) -> Metric | None: + def _parse_metric(self, metric_def: dict, qualify: bool = True) -> Metric | None: """Parse Snowflake metric into Sidemantic metric. Metrics in Snowflake are table-scoped aggregations (already contain aggregate functions). Args: metric_def: Metric definition dictionary + qualify: When True (table-scoped metrics), bare column references in + complex/derived expressions are qualified with the {model} + placeholder. When False (graph-level metrics), the expression is + left as-is because it already uses ``model.field`` references that + must not be rewritten. Returns: Metric instance or None @@ -375,18 +495,130 @@ def _parse_metric(self, metric_def: dict) -> Metric | None: agg=agg_func, sql=inner_expr, description=metric_def.get("description"), + synonyms=metric_def.get("synonyms"), + metadata=self._metric_metadata(metric_def), + public=self._public_from_access_modifier(metric_def), ) # Complex expression (multiple aggregations or couldn't parse simple one) - # Mark as derived and qualify column references with {model} placeholder - qualified_expr = _qualify_columns(expr) + # Mark as derived. Table-scoped metrics qualify bare column references with + # the {model} placeholder; graph-level metrics already use `model.field` + # references and must be left untouched. + derived_expr = _qualify_columns(expr) if qualify else expr return Metric( name=name, type="derived", - sql=qualified_expr, + sql=derived_expr, description=metric_def.get("description"), + synonyms=metric_def.get("synonyms"), + metadata=self._metric_metadata(metric_def), + public=self._public_from_access_modifier(metric_def), ) + @staticmethod + def _cortex_search_service_name(dim_def: dict) -> str | None: + """Resolve the linked Cortex Search service name for a dimension. + + Supports both the legacy flat ``cortex_search_service_name`` string and + the newer nested ``cortex_search_service`` object (``{service, ...}``). + """ + flat = dim_def.get("cortex_search_service_name") + if flat: + return flat + nested = dim_def.get("cortex_search_service") + if isinstance(nested, dict): + return nested.get("service") + if isinstance(nested, str): + return nested + return None + + @staticmethod + def _public_from_access_modifier(definition: dict) -> bool: + """Map Snowflake ``access_modifier`` onto Sidemantic visibility. + + Snowflake uses ``private_access`` for hidden helper fields. The original + modifier is still preserved in metadata, but reflect it on ``public`` so + CLI ``info``/catalog and native export treat the field as non-public. + """ + return definition.get("access_modifier") != "private_access" + + @staticmethod + def _sample_values(dim_def: dict) -> list[str] | None: + """Coerce Snowflake ``sample_values`` to strings. + + Snowflake documents ``sample_values`` as raw column values, so numeric or + time dimensions can legally contain unquoted YAML scalars (e.g. + ``sample_values: [1001, 1002]``). ``Dimension.sample_values`` is typed as + ``list[str]``, so coerce any scalar to ``str`` to avoid rejecting valid + Cortex files. + """ + values = dim_def.get("sample_values") + if values is None: + return None + return [str(value) for value in values] + + @staticmethod + def _collect_metadata(definition: dict, keys: tuple[str, ...]) -> dict | None: + """Preserve newer Cortex Analyst per-field keys under a snowflake namespace.""" + extra = {key: definition[key] for key in keys if definition.get(key) is not None} + if not extra: + return None + return {"snowflake": extra} + + def _dimension_metadata(self, dim_def: dict) -> dict | None: + return self._collect_metadata( + dim_def, + ("unique", "is_enum", "access_modifier", "labels", "tags", "cortex_search_service"), + ) + + def _measure_metadata(self, measure_def: dict) -> dict | None: + return self._collect_metadata( + measure_def, + ("access_modifier", "is_enum", "labels", "tags", "non_additive_dimensions"), + ) + + def _metric_metadata(self, metric_def: dict) -> dict | None: + return self._collect_metadata( + metric_def, + ("access_modifier", "labels", "tags", "non_additive_dimensions", "using_relationships"), + ) + + @staticmethod + def _apply_top_level_sections(data: dict, graph: SemanticGraph) -> None: + """Attach top-level Cortex Analyst sections to the graph. + + Cortex Analyst defines several semantic-model-level sections that have no + direct Sidemantic equivalent. We expose them both as direct attributes on + the graph (for ergonomic access) and inside ``graph.metadata`` so they + survive serialization. + """ + verified_queries = data.get("verified_queries") or [] + custom_instructions = data.get("custom_instructions") + module_custom_instructions = data.get("module_custom_instructions") + + # Accumulate verified queries across files in a directory parse. + existing = list(getattr(graph, "verified_queries", []) or []) + existing.extend(verified_queries) + graph.verified_queries = existing + + if custom_instructions is not None: + graph.custom_instructions = custom_instructions + elif not hasattr(graph, "custom_instructions"): + graph.custom_instructions = None + + if module_custom_instructions is not None: + graph.module_custom_instructions = module_custom_instructions + elif not hasattr(graph, "module_custom_instructions"): + graph.module_custom_instructions = None + + snowflake_meta = graph.metadata.setdefault("snowflake", {}) + if existing: + snowflake_meta["verified_queries"] = existing + if graph.custom_instructions is not None: + snowflake_meta["custom_instructions"] = graph.custom_instructions + if graph.module_custom_instructions is not None: + snowflake_meta["module_custom_instructions"] = graph.module_custom_instructions + def _parse_filter(self, filter_def: dict) -> Segment | None: """Parse Snowflake filter into Sidemantic segment. @@ -414,6 +646,66 @@ def _parse_filter(self, filter_def: dict) -> Segment | None: description=filter_def.get("description"), ) + def apply_pending_relationships(self, relationships_def: list, models: dict) -> None: + """Apply relationship definitions collected from separately-parsed files. + + Used by the directory loader after every file's models are loaded (and + before foreign-key inference) so a relationship-only Cortex sidecar attaches + its joins and an explicit join takes precedence over a guessed one. Operates + on the name-keyed ``models`` dict; adjacency is rebuilt later by the loader. + """ + + def _is_duplicate(existing: Relationship) -> bool: + # Same Snowflake relationship: matched by the preserved Snowflake name + # when both carry one, otherwise by target + join columns. Distinct + # named relationships between the same two tables are NOT duplicates and + # must all survive so metrics referencing them via ``using_relationships`` + # keep resolving and alternate joins still round-trip on export. + if existing.name != right_table: + return False + existing_name = existing.metadata.get("snowflake", {}).get("name") if existing.metadata else None + if snowflake_name and existing_name: + return existing_name == snowflake_name + return existing.foreign_key == left_column and existing.primary_key == right_column + + for rel_def in relationships_def: + left_table = rel_def.get("left_table") + right_table = rel_def.get("right_table") + rel_type = rel_def.get("relationship_type", "many_to_one") + + if not left_table or not right_table: + continue + + rel_columns = rel_def.get("relationship_columns") or [] + if not rel_columns: + continue + + first_col = rel_columns[0] + left_column = first_col.get("left_column") + right_column = first_col.get("right_column") + + metadata = None + snowflake_name = rel_def.get("name") + if snowflake_name: + metadata = {"snowflake": {"name": snowflake_name}} + + model = models.get(left_table) + if model is None: + continue + # Skip only an exact duplicate (same Snowflake name, or same join + # columns to the same target); keep distinct alternate joins. + if any(_is_duplicate(r) for r in model.relationships): + continue + model.relationships.append( + Relationship( + name=right_table, + type=rel_type, + foreign_key=left_column, + primary_key=right_column, + metadata=metadata, + ) + ) + def _apply_relationships(self, relationships_def: list, graph: SemanticGraph) -> None: """Apply relationships from semantic model to models in graph. @@ -442,6 +734,15 @@ def _apply_relationships(self, relationships_def: list, graph: SemanticGraph) -> left_column = first_col.get("left_column") right_column = first_col.get("right_column") + # The Snowflake relationship name is referenced by metric + # `using_relationships`; preserve it so those references stay valid + # after export. `Relationship.name` is the related-model identifier and + # cannot hold it, so stash it in adapter metadata instead. + metadata = None + snowflake_name = rel_def.get("name") + if snowflake_name: + metadata = {"snowflake": {"name": snowflake_name}} + # In Snowflake, left_table is the "many" side, right_table is the "one" side # Add relationship to left_table pointing to right_table if left_table in graph.models: @@ -451,10 +752,18 @@ def _apply_relationships(self, relationships_def: list, graph: SemanticGraph) -> type=rel_type, foreign_key=left_column, primary_key=right_column, + metadata=metadata, ) model.relationships.append(relationship) # Rebuild adjacency after adding relationship graph.build_adjacency() + else: + # The left table is not in this graph (a multi-file CLI load parses + # each file separately). Hold the definition so the directory loader + # can apply it once that table is loaded. + if not hasattr(graph, "_pending_relationships"): + graph._pending_relationships = [] + graph._pending_relationships.append(rel_def) def export(self, graph: SemanticGraph, output_path: str | Path) -> None: """Export semantic graph to Snowflake semantic model YAML format. @@ -493,6 +802,47 @@ def export(self, graph: SemanticGraph, output_path: str | Path) -> None: if not semantic_model["relationships"]: del semantic_model["relationships"] + # Export graph-level (top-level) metrics. These have no owning table and + # were parsed from the semantic model's top-level `metrics:` section, so + # they must be serialized back there to survive a parse/export round-trip. + # + # ``graph.metrics`` also contains model-owned metrics that ``add_model()`` + # auto-registers at graph level (``time_comparison``/``conversion``). Those + # are already serialized inside their table and have no valid Snowflake + # top-level representation, so skip any metric that is owned by a model. + # Match by object identity, not name, so a distinct top-level metric that + # merely shares a name with a model-local metric still round-trips. + owned_metric_ids = {id(metric) for model in resolved_models.values() for metric in model.metrics} + top_level_metrics = [] + for name, metric in graph.metrics.items(): + if id(metric) in owned_metric_ids: + continue + metric_def = self._export_metric(metric, top_level=True) + # Skip metric types Snowflake cannot represent (no `expr`) rather than + # emitting an invalid stub that would fail to re-parse. + if "expr" not in metric_def: + continue + top_level_metrics.append(metric_def) + if top_level_metrics: + semantic_model["metrics"] = top_level_metrics + + # Export top-level Cortex Analyst sections if present on the graph. These + # live as dynamic attributes when parsed directly, but only survive a + # native (SidemanticAdapter) round-trip via ``graph.metadata["snowflake"]``, + # so fall back to that when the attributes are absent. + snowflake_meta = graph.metadata.get("snowflake") or {} + verified_queries = getattr(graph, "verified_queries", None) or snowflake_meta.get("verified_queries") + if verified_queries: + semantic_model["verified_queries"] = verified_queries + custom_instructions = getattr(graph, "custom_instructions", None) or snowflake_meta.get("custom_instructions") + if custom_instructions: + semantic_model["custom_instructions"] = custom_instructions + module_custom_instructions = getattr(graph, "module_custom_instructions", None) or snowflake_meta.get( + "module_custom_instructions" + ) + if module_custom_instructions: + semantic_model["module_custom_instructions"] = module_custom_instructions + output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, "w") as f: @@ -554,14 +904,27 @@ def _export_table(self, model: Model) -> dict: facts = [] metrics = [] + # Snowflake table `metrics` carry metric-only keys (e.g. using_relationships, + # non_additive_dimensions). A simple aggregation that carries one of these + # was authored as a metric, so re-export it as a metric (not a fact) to keep + # the original representation across a round-trip. + metric_only_keys = ("using_relationships", "non_additive_dimensions") for metric in model.metrics: - if metric.agg and not metric.type: + snowflake_meta = (metric.metadata or {}).get("snowflake", {}) + has_metric_only_key = any(key in snowflake_meta for key in metric_only_keys) + if metric.agg and not metric.type and not has_metric_only_key: # Simple aggregation -> fact fact = self._export_fact(metric) facts.append(fact) else: - # Complex metric or derived -> metric + # Complex metric or derived -> metric. Snowflake has no + # representation for metric types like time_comparison or + # conversion, so _export_metric() cannot build an `expr` for + # them; skip those rather than emitting an invalid stub that + # would fail to re-parse. metric_def = self._export_metric(metric) + if "expr" not in metric_def: + continue metrics.append(metric_def) if facts: @@ -603,6 +966,8 @@ def _export_dimension(self, dim: Dimension) -> dict: } dim_def["data_type"] = type_mapping.get(dim.type, "TEXT") + self._export_dimension_extras(dim, dim_def) + return dim_def def _export_time_dimension(self, dim: Dimension) -> dict: @@ -624,8 +989,27 @@ def _export_time_dimension(self, dim: Dimension) -> dict: dim_def["data_type"] = "TIMESTAMP" + self._export_dimension_extras(dim, dim_def) + return dim_def + @staticmethod + def _export_dimension_extras(dim: Dimension, dim_def: dict) -> None: + """Attach Cortex Analyst enrichment keys to an exported dimension.""" + if dim.synonyms: + dim_def["synonyms"] = dim.synonyms + if dim.sample_values: + dim_def["sample_values"] = dim.sample_values + if dim.cortex_search_service_name: + dim_def["cortex_search_service_name"] = dim.cortex_search_service_name + snowflake_meta = (dim.metadata or {}).get("snowflake", {}) + for key, value in snowflake_meta.items(): + dim_def.setdefault(key, value) + if not dim.public: + # Override any stale ``public_access`` carried over in metadata so the + # Sidemantic visibility flag wins. + dim_def["access_modifier"] = "private_access" + def _export_fact(self, metric: Metric) -> dict: """Export metric as Snowflake fact. @@ -657,13 +1041,40 @@ def _export_fact(self, metric: Metric) -> dict: fact["data_type"] = "NUMBER" + if metric.synonyms: + fact["synonyms"] = metric.synonyms + snowflake_meta = (metric.metadata or {}).get("snowflake", {}) + for key, value in snowflake_meta.items(): + fact.setdefault(key, value) + if not metric.public: + # Override any stale ``public_access`` carried over in metadata so the + # Sidemantic visibility flag wins. + fact["access_modifier"] = "private_access" + return fact - def _export_metric(self, metric: Metric) -> dict: + @staticmethod + def _strip_model_placeholder(sql: str | None) -> str | None: + """Drop the ``{model}.`` placeholder so Snowflake sees bare column refs. + + Table-scoped metric expressions are parsed with the ``{model}`` placeholder + for table-local columns; Snowflake cannot resolve that token, so it must be + removed when re-exporting these metrics to Snowflake. + """ + if sql is None: + return None + return sql.replace("{model}.", "").replace("{model}", "") + + def _export_metric(self, metric: Metric, *, top_level: bool = False) -> dict: """Export metric to Snowflake metric format. Args: metric: Metric to export + top_level: When True the metric is a graph-level (view) metric whose + references already use ``model.field`` qualifiers that Snowflake + needs to resolve cross-table references, so they are preserved. + When False the metric is table-scoped and ``{model}`` placeholders + are stripped to bare column references. Returns: Metric definition dictionary @@ -675,21 +1086,38 @@ def _export_metric(self, metric: Metric) -> dict: # Build expression based on metric type if metric.type == "ratio" and metric.numerator and metric.denominator: - # Extract measure names from qualified references - num = metric.numerator.split(".")[-1] if "." in metric.numerator else metric.numerator - denom = metric.denominator.split(".")[-1] if "." in metric.denominator else metric.denominator + if top_level: + # Graph-level metric: keep qualified references so Snowflake can + # resolve cross-table members (e.g. ``orders.revenue``). + num = metric.numerator + denom = metric.denominator + else: + # Table-scoped metric: Snowflake expressions use bare column names. + num = metric.numerator.split(".")[-1] if "." in metric.numerator else metric.numerator + denom = metric.denominator.split(".")[-1] if "." in metric.denominator else metric.denominator metric_def["expr"] = f"{num} / NULLIF({denom}, 0)" elif metric.type == "derived" and metric.sql: - metric_def["expr"] = metric.sql + metric_def["expr"] = metric.sql if top_level else self._strip_model_placeholder(metric.sql) elif metric.agg and metric.sql: # Simple aggregation - wrap in aggregate function agg_func = metric.agg.upper() + sql = metric.sql if top_level else self._strip_model_placeholder(metric.sql) if agg_func == "COUNT_DISTINCT": - metric_def["expr"] = f"COUNT(DISTINCT {metric.sql})" + metric_def["expr"] = f"COUNT(DISTINCT {sql})" else: - metric_def["expr"] = f"{agg_func}({metric.sql})" + metric_def["expr"] = f"{agg_func}({sql})" elif metric.sql: - metric_def["expr"] = metric.sql + metric_def["expr"] = metric.sql if top_level else self._strip_model_placeholder(metric.sql) + + if metric.synonyms: + metric_def["synonyms"] = metric.synonyms + snowflake_meta = (metric.metadata or {}).get("snowflake", {}) + for key, value in snowflake_meta.items(): + metric_def.setdefault(key, value) + if not metric.public: + # Override any stale ``public_access`` carried over in metadata so the + # Sidemantic visibility flag wins. + metric_def["access_modifier"] = "private_access" return metric_def @@ -735,4 +1163,10 @@ def _export_relationship(self, model: Model, rel: Relationship) -> dict: "join_type": "left_outer", } + # Preserve the original Snowflake relationship name so metric + # `using_relationships` references resolve after a round-trip. + snowflake_name = (rel.metadata or {}).get("snowflake", {}).get("name") + if snowflake_name: + rel_def = {"name": snowflake_name, **rel_def} + return rel_def diff --git a/sidemantic/core/dimension.py b/sidemantic/core/dimension.py index 92d4b582..d91a8fba 100644 --- a/sidemantic/core/dimension.py +++ b/sidemantic/core/dimension.py @@ -26,6 +26,13 @@ class Dimension(BaseModel): label: str | None = Field(None, description="Display label") metadata: dict[str, Any] | None = Field(None, description="Adapter-specific metadata payload") + # Synonyms / sample values (e.g. Snowflake Cortex Analyst, Cube) + synonyms: list[str] | None = Field(None, description="Alternative names for this dimension") + sample_values: list[str] | None = Field(None, description="Representative sample values for this dimension") + cortex_search_service_name: str | None = Field( + None, description="Linked Cortex Search service name (Snowflake Cortex Analyst)" + ) + # Display formatting format: str | None = Field(None, description="Display format string (e.g., '$#,##0.00', '0.00%')") value_format_name: str | None = Field(None, description="Named format (e.g., 'usd', 'percent', 'decimal_2')") diff --git a/sidemantic/core/metric.py b/sidemantic/core/metric.py index 791a2780..a69200e4 100644 --- a/sidemantic/core/metric.py +++ b/sidemantic/core/metric.py @@ -349,6 +349,9 @@ def validate_type_specific_fields(self): label: str | None = Field(None, description="Display label") metadata: dict[str, Any] | None = Field(None, description="Adapter-specific metadata payload") + # Synonyms (e.g. Snowflake Cortex Analyst measures/metrics, Cube) + synonyms: list[str] | None = Field(None, description="Alternative names for this measure/metric") + # Display formatting format: str | None = Field(None, description="Display format string (e.g., '$#,##0.00', '0.00%')") value_format_name: str | None = Field(None, description="Named format (e.g., 'usd', 'percent', 'decimal_2')") diff --git a/sidemantic/loaders.py b/sidemantic/loaders.py index bb9fcf0b..e6357e8f 100644 --- a/sidemantic/loaders.py +++ b/sidemantic/loaders.py @@ -54,6 +54,11 @@ def load_from_directory(layer: "SemanticLayer", directory: str | Path, *, strict all_models = {} all_metrics = {} all_parameters = {} + # Snowflake table-scoped metrics whose table lives in another file, held as + # (table_name, Metric) pairs so same-named scoped metrics never collide. + all_pending_table_metrics: list = [] + # Snowflake relationship definitions whose tables live in other files. + all_pending_relationships: list = [] import_warnings: list[dict[str, object]] = [] # Check for SML repository (catalog.yml/atscale.yml or object_type files) @@ -184,6 +189,17 @@ def load_from_directory(layer: "SemanticLayer", directory: str | Path, *, strict adapter = SidemanticAdapter() elif _looks_like_native_sidemantic_yaml(yaml_data): adapter = SidemanticAdapter() + elif _yaml_has_top_level_key(yaml_data, "tables") and _contains_yaml_key(yaml_data, "base_table"): + # Snowflake Cortex Semantic Model format. Checked before the generic + # MetricFlow `metrics:` + `type:` heuristic because a Cortex file may + # carry top-level `metrics:` and `data_type:` while `base_table` is a + # Snowflake-only signal MetricFlow never has. + adapter = SnowflakeAdapter() + elif _looks_like_snowflake_metrics_file(yaml_data): + # Cortex top-level metrics split into their own file (table + expr, + # no tables section). Route to Snowflake so the metrics defer and + # attach to tables defined in sibling files. + adapter = SnowflakeAdapter() elif _yaml_has_top_level_key(yaml_data, "metrics") and "type: " in content: adapter = MetricFlowAdapter() elif _is_hex_resource_mapping(yaml_data): @@ -205,9 +221,6 @@ def load_from_directory(layer: "SemanticLayer", directory: str | Path, *, strict ): # ThoughtSpot TML Model object (export_schema_version v2) adapter = ThoughtSpotAdapter() - elif _yaml_has_top_level_key(yaml_data, "tables") and _contains_yaml_key(yaml_data, "base_table"): - # Snowflake Cortex Semantic Model format - adapter = SnowflakeAdapter() elif _looks_like_bsl_yaml(yaml_data): # BSL format uses _.column syntax for expressions adapter = BSLAdapter() @@ -250,6 +263,8 @@ def load_from_directory(layer: "SemanticLayer", directory: str | Path, *, strict all_models.update(graph.models) all_metrics.update(graph.metrics) all_parameters.update(graph.parameters) + all_pending_table_metrics.extend(getattr(graph, "_pending_table_metrics", [])) + all_pending_relationships.extend(getattr(graph, "_pending_relationships", [])) except Exception as e: _append_import_warning( import_warnings, @@ -268,6 +283,16 @@ def load_from_directory(layer: "SemanticLayer", directory: str | Path, *, strict # declared in separate files. _finalize_bsl_join_aliases(all_models) + # Attach Snowflake top-level metrics whose referenced table was defined in a + # different file (each Snowflake file is parsed separately, so the table may + # not have been known when the metric file was parsed). + _resolve_snowflake_pending_table_metrics(all_models, all_metrics, all_pending_table_metrics) + + # Apply Snowflake relationships declared in a separate file before FK inference + # so an explicit Cortex join takes precedence over a guessed one for the same + # table pair. + _apply_snowflake_pending_relationships(all_models, all_pending_relationships) + # Infer cross-model relationships based on naming conventions _infer_relationships(all_models) @@ -484,6 +509,85 @@ def _yaml_has_top_level_key(data: dict, key: str) -> bool: return isinstance(data, dict) and key in data +_SNOWFLAKE_TOP_LEVEL_SECTIONS = ("verified_queries", "custom_instructions", "module_custom_instructions") +# Per-metric keys that only Snowflake Cortex uses (not in the native METRIC_FIELDS). +_SNOWFLAKE_METRIC_KEYS = ( + "table", + "access_modifier", + "labels", + "tags", + "non_additive_dimensions", + "using_relationships", +) + + +def _looks_like_snowflake_relationships(data: dict) -> bool: + """Return True when a file's top-level ``relationships`` are Snowflake-shaped.""" + relationships = data.get("relationships") + if not isinstance(relationships, list) or not relationships: + return False + return all( + isinstance(rel, dict) and "left_table" in rel and "right_table" in rel and "relationship_columns" in rel + for rel in relationships + ) + + +def _looks_like_snowflake_metrics_file(data: dict) -> bool: + """Detect a split Snowflake Cortex sidecar without a ``tables`` section. + + Cortex projects may split top-level ``metrics:``, ``relationships:`` and/or the + Snowflake-only sections (verified_queries / custom instructions) into their own + file. Route such a file to the Snowflake adapter when it carries a Cortex-only + signal: + + - a Snowflake-only top-level section (verified_queries / custom instructions), + even when no ``metrics`` are present (instruction-only sidecar), + - Snowflake-shaped top-level ``relationships`` (relationship-only sidecar), or + - top-level ``metrics`` carrying a Snowflake-only metric key (``table`` or per- + metric ``access_modifier``/``labels``/``tags``/``non_additive_dimensions``/ + ``using_relationships``), or + - a root ``name`` alongside Cortex-shaped ``metrics`` -- a tableless view-metric + sidecar whose only Cortex signal is the root ``name`` the native format rejects. + + Any present metrics must be Cortex-shaped (``expr`` with no MetricFlow + ``type_params``/``measure`` markers). A tableless metrics file with no root + ``name`` and none of these signals is left to native detection. + """ + if not isinstance(data, dict) or "tables" in data: + return False + + metrics = data.get("metrics") + has_snowflake_metric_key = False + has_cortex_metrics = False + if metrics is not None: + if not isinstance(metrics, list) or not metrics: + return False + for metric in metrics: + if not isinstance(metric, dict): + return False + if "expr" not in metric: + return False + if "type_params" in metric or "measure" in metric: + return False + if any(key in metric for key in _SNOWFLAKE_METRIC_KEYS): + has_snowflake_metric_key = True + has_cortex_metrics = True + + has_snowflake_section = any(section in data for section in _SNOWFLAKE_TOP_LEVEL_SECTIONS) + # A tableless Cortex sidecar may carry only a root ``name`` plus view-level + # metrics (no per-metric Snowflake key, no Snowflake sections). The root + # ``name`` is a Cortex semantic-model field the native format rejects, so its + # presence alongside Cortex-shaped metrics is a reliable Snowflake signal -- + # without it the file is dropped by both native and Snowflake detection. + has_snowflake_root_name = has_cortex_metrics and isinstance(data.get("name"), str) + return ( + has_snowflake_metric_key + or has_snowflake_section + or has_snowflake_root_name + or _looks_like_snowflake_relationships(data) + ) + + def _contains_yaml_key(value: object, key: str) -> bool: """Return True when a parsed YAML object contains an exact key anywhere.""" if isinstance(value, dict): @@ -845,12 +949,81 @@ def _merge_import_warnings(graph: object, warnings: list[dict[str, object]]) -> graph.import_warnings = merged +def _resolve_snowflake_pending_table_metrics(all_models: dict, all_metrics: dict, pending: list) -> None: + """Re-attach Snowflake top-level metrics to tables defined in other files.""" + if not pending: + return + from sidemantic.adapters.snowflake import SnowflakeAdapter + + SnowflakeAdapter.resolve_pending_table_metrics(all_models, pending) + # Any metric whose table is still unknown falls back to a graph-level metric + # so it is not silently dropped. + for _table_name, metric in pending: + all_metrics.setdefault(metric.name, metric) + pending.clear() + + +def _apply_snowflake_pending_relationships(all_models: dict, pending: list) -> None: + """Apply Snowflake relationship definitions whose tables live in other files.""" + if not pending: + return + from sidemantic.adapters.snowflake import SnowflakeAdapter + + SnowflakeAdapter().apply_pending_relationships(pending, all_models) + pending.clear() + + +def _deep_merge_metadata(target: dict, source: dict) -> None: + """Recursively merge ``source`` into ``target``. + + Nested dicts are merged, list values are appended (deduplicated by value), + and scalars from ``source`` overwrite. This keeps multi-file payloads such as + Snowflake Cortex ``verified_queries`` from clobbering one another when several + files are loaded from a directory. + """ + for key, value in source.items(): + existing = target.get(key) + if isinstance(existing, dict) and isinstance(value, dict): + _deep_merge_metadata(existing, value) + elif isinstance(existing, list) and isinstance(value, list): + for item in value: + if item not in existing: + existing.append(copy.deepcopy(item)) + else: + target[key] = copy.deepcopy(value) + + def _merge_graph_passthrough_metadata(target_graph: object, source_graph: object) -> None: for name, value in vars(source_graph).items(): if not name.startswith("_tmdl_"): continue setattr(target_graph, name, copy.deepcopy(value)) + # Merge graph-level metadata (e.g. Snowflake Cortex top-level sections) so the + # CLI-first load -> export-native path round-trips them. Deep-merge so multiple + # files in a directory each contribute their sections instead of overwriting. + source_metadata = getattr(source_graph, "metadata", None) + if isinstance(source_metadata, dict) and source_metadata: + target_metadata = getattr(target_graph, "metadata", None) + if not isinstance(target_metadata, dict): + target_metadata = {} + target_graph.metadata = target_metadata + _deep_merge_metadata(target_metadata, source_metadata) + + # Carry over Snowflake dynamic top-level attributes set by the adapter. Lists + # (verified_queries) accumulate across files; scalars take the latest value. + for attr in ("verified_queries", "custom_instructions", "module_custom_instructions"): + value = getattr(source_graph, attr, None) + if not value: + continue + existing = getattr(target_graph, attr, None) + if isinstance(existing, list) and isinstance(value, list): + for item in value: + if item not in existing: + existing.append(copy.deepcopy(item)) + else: + setattr(target_graph, attr, copy.deepcopy(value)) + def _infer_relationships(models: dict) -> None: """Infer relationships between models based on foreign key naming conventions. diff --git a/tests/adapters/sidemantic_adapter/test_parsing.py b/tests/adapters/sidemantic_adapter/test_parsing.py index c3529398..fe9442af 100644 --- a/tests/adapters/sidemantic_adapter/test_parsing.py +++ b/tests/adapters/sidemantic_adapter/test_parsing.py @@ -410,6 +410,42 @@ def test_parse_export_preserves_native_metadata_visibility_and_granularity(tmp_p assert graph2.metrics["revenue_per_order"].public is False +def test_parse_export_preserves_graph_level_metric_synonyms(tmp_path): + """Top-level derived metrics must keep `synonyms` through export round-trips.""" + adapter = SidemanticAdapter() + yaml_path = tmp_path / "orders.yml" + yaml_path.write_text( + """ +version: 1 +models: + - name: orders + table: orders + metrics: + - name: total_revenue + agg: sum + sql: amount + - name: order_count + agg: count +metrics: + - name: revenue_per_order + type: derived + sql: orders.total_revenue / orders.order_count + synonyms: [aov, average order value] +""" + ) + + graph = adapter.parse(yaml_path) + assert graph.metrics["revenue_per_order"].synonyms == ["aov", "average order value"] + + export_path = tmp_path / "exported.yml" + adapter.export(graph, export_path) + exported = yaml.safe_load(export_path.read_text()) + assert exported["metrics"][0]["synonyms"] == ["aov", "average order value"] + + graph2 = adapter.parse(export_path) + assert graph2.metrics["revenue_per_order"].synonyms == ["aov", "average order value"] + + def test_parse_export_preserves_top_level_parameters(tmp_path): adapter = SidemanticAdapter() yaml_path = tmp_path / "orders.yml" @@ -884,6 +920,46 @@ def test_parse_native_sql_version_only_frontmatter_preserves_graph_parameter(tmp assert graph.parameters["status_filter"].type == "string" +def test_parse_native_sql_metadata_only_frontmatter_preserves_graph_definitions(tmp_path): + """Root-only metadata frontmatter must not swallow graph-level SQL metrics/params.""" + adapter = SidemanticAdapter() + sql_path = tmp_path / "metrics.sql" + sql_path.write_text( + """ +--- +version: 1 +metadata: + description: Top-level Cortex sections + owner: analytics +--- + +METRIC ( + name order_count, + agg count +); + +PARAMETER ( + name status_filter, + type string, + default_value 'paid' +); +""" + ) + + graph = adapter.parse(sql_path) + + # No model is created from root-only metadata frontmatter. + assert len(graph.models) == 0 + # Graph-level metric and parameter still load. + assert "order_count" in graph.metrics + assert graph.metrics["order_count"].agg == "count" + assert "status_filter" in graph.parameters + assert graph.parameters["status_filter"].type == "string" + # Root metadata is preserved on the graph. + assert graph.metadata.get("description") == "Top-level Cortex sections" + assert graph.metadata.get("owner") == "analytics" + + def test_parse_native_sql_frontmatter_rejects_unsupported_version(tmp_path): """Test unsupported native SQL frontmatter versions fail early.""" adapter = SidemanticAdapter() @@ -1118,5 +1194,67 @@ def test_dimension_window_in_sql_generation(): assert "LEAD(event) OVER (PARTITION BY person_id ORDER BY timestamp)" in sql +def test_parse_native_yaml_round_trips_cortex_enrichment_fields(tmp_path): + """Cortex enrichment fields (synonyms/sample_values/search service) survive round-trip. + + These fields are populated when importing a Snowflake Cortex model; the native + adapter must both accept them in hand-authored YAML and re-emit them on export + so `sidemantic export-native` does not silently drop them. + """ + adapter = SidemanticAdapter() + yaml_path = tmp_path / "orders.yml" + yaml_path.write_text( + """ +version: 1 +models: + - name: orders + table: orders + primary_key: id + dimensions: + - name: status + type: categorical + synonyms: + - state + - order_status + sample_values: + - delivered + - shipped + cortex_search_service_name: ORDERS_STATUS_SEARCH + metrics: + - name: order_count + agg: count + synonyms: + - num_orders +""" + ) + + graph = adapter.parse(yaml_path) + model = graph.models["orders"] + + status = model.get_dimension("status") + assert status.synonyms == ["state", "order_status"] + assert status.sample_values == ["delivered", "shipped"] + assert status.cortex_search_service_name == "ORDERS_STATUS_SEARCH" + order_count = model.get_metric("order_count") + assert order_count.synonyms == ["num_orders"] + + export_path = tmp_path / "exported.yml" + adapter.export(graph, export_path) + exported = yaml.safe_load(export_path.read_text()) + + exported_dim = exported["models"][0]["dimensions"][0] + assert exported_dim["synonyms"] == ["state", "order_status"] + assert exported_dim["sample_values"] == ["delivered", "shipped"] + assert exported_dim["cortex_search_service_name"] == "ORDERS_STATUS_SEARCH" + exported_metric = exported["models"][0]["metrics"][0] + assert exported_metric["synonyms"] == ["num_orders"] + + # And a full re-parse preserves them. + graph2 = adapter.parse(export_path) + status2 = graph2.models["orders"].get_dimension("status") + assert status2.synonyms == ["state", "order_status"] + assert status2.cortex_search_service_name == "ORDERS_STATUS_SEARCH" + + if __name__ == "__main__": pytest.main([__file__, "-v"]) diff --git a/tests/adapters/snowflake/test_cortex_features.py b/tests/adapters/snowflake/test_cortex_features.py new file mode 100644 index 00000000..61735ccd --- /dev/null +++ b/tests/adapters/snowflake/test_cortex_features.py @@ -0,0 +1,179 @@ +"""Tests for newer Snowflake Cortex Analyst spec features. + +Covers the keys added to the adapter on top of the legacy facts-based format: +- table-level `measures` (legacy alias of `facts`) +- `synonyms` on dimensions/measures/metrics +- `sample_values`, nested `cortex_search_service`, `is_enum`, `unique`, + `access_modifier`, `labels`, `tags` on dimensions +- `non_additive_dimensions` / `using_relationships` preserved in metadata +- top-level `verified_queries`, `custom_instructions`, `module_custom_instructions` +- export round-trip preservation of all of the above +""" + +from pathlib import Path + +import pytest +import yaml + +from sidemantic.adapters.snowflake import SnowflakeAdapter + + +@pytest.fixture +def adapter(): + return SnowflakeAdapter() + + +@pytest.fixture +def fixture_path(): + return Path(__file__).parent.parent.parent / "fixtures" / "snowflake" / "cortex_features.yaml" + + +@pytest.fixture +def graph(adapter, fixture_path): + return adapter.parse(fixture_path) + + +class TestMeasuresAlias: + def test_measures_parsed_as_metrics(self, graph): + model = graph.models["orders"] + names = {m.name for m in model.metrics} + assert "order_total" in names + assert "distinct_orders" in names + + def test_measure_default_aggregation(self, graph): + model = graph.models["orders"] + order_total = model.get_metric("order_total") + assert order_total.agg == "sum" + assert order_total.sql == "total" + + +class TestSynonyms: + def test_dimension_synonyms(self, graph): + model = graph.models["orders"] + status = model.get_dimension("status") + assert status.synonyms == ["state"] + + def test_measure_synonyms(self, graph): + model = graph.models["orders"] + order_total = model.get_metric("order_total") + assert order_total.synonyms == ["revenue"] + + def test_metric_synonyms(self, graph): + model = graph.models["orders"] + distinct_orders = model.get_metric("distinct_orders") + assert distinct_orders.synonyms == ["order count"] + + +class TestDimensionEnrichment: + def test_sample_values(self, graph): + model = graph.models["orders"] + status = model.get_dimension("status") + assert "delivered" in status.sample_values + + def test_nested_cortex_search_service(self, graph): + model = graph.models["orders"] + cust = model.get_dimension("customer_name") + assert cust.cortex_search_service_name == "customer_name_search" + + def test_is_enum_and_modifier_in_metadata(self, graph): + model = graph.models["orders"] + status = model.get_dimension("status") + sf = status.metadata["snowflake"] + assert sf["is_enum"] is True + assert sf["access_modifier"] == "public_access" + assert sf["labels"] == ["Order Status"] + assert sf["tags"] == ["core"] + + def test_non_string_sample_values_coerced(self, adapter): + """Numeric/time sample_values (valid per the Cortex spec) are coerced to str.""" + dim = adapter._parse_dimension( + {"name": "order_id", "data_type": "NUMBER", "expr": "order_id", "sample_values": [1001, 1002]} + ) + assert dim.sample_values == ["1001", "1002"] + + time_dim = adapter._parse_time_dimension( + {"name": "order_ts", "expr": "order_ts", "sample_values": [1700000000, 1700000001]} + ) + assert time_dim.sample_values == ["1700000000", "1700000001"] + + +class TestMeasureMetricMetadata: + def test_non_additive_dimensions_preserved(self, graph): + model = graph.models["orders"] + order_total = model.get_metric("order_total") + sf = order_total.metadata["snowflake"] + assert sf["non_additive_dimensions"][0]["dimension"] == "order_date" + assert sf["access_modifier"] == "public_access" + + def test_using_relationships_preserved(self, graph): + model = graph.models["orders"] + distinct_orders = model.get_metric("distinct_orders") + sf = distinct_orders.metadata["snowflake"] + assert sf["using_relationships"] == ["orders_to_customers"] + + +class TestTopLevelSections: + def test_verified_queries(self, graph): + assert len(graph.verified_queries) == 1 + assert graph.verified_queries[0]["name"] == "total revenue" + + def test_custom_instructions(self, graph): + assert graph.custom_instructions == "Always prefer revenue over total when answering." + + def test_module_custom_instructions(self, graph): + mci = graph.module_custom_instructions + assert mci["sql_generation"] == "Prefer explicit column references." + assert mci["question_categorization"] == "Treat revenue questions as financial." + + +class TestRoundtrip: + def test_roundtrip_preserves_cortex_features(self, adapter, graph, tmp_path): + output = tmp_path / "out.yaml" + adapter.export(graph, output) + + data = yaml.safe_load(output.read_text()) + + # Top-level sections survive export. + assert "verified_queries" in data + assert data["custom_instructions"] == "Always prefer revenue over total when answering." + assert "module_custom_instructions" in data + + # Re-parse and confirm key fields persist. + graph2 = adapter.parse(output) + model = graph2.models["orders"] + + status = model.get_dimension("status") + assert status.synonyms == ["state"] + assert "delivered" in status.sample_values + + cust = model.get_dimension("customer_name") + assert cust.cortex_search_service_name == "customer_name_search" + + order_total = model.get_metric("order_total") + assert order_total.synonyms == ["revenue"] + + assert len(graph2.verified_queries) == 1 + + def test_top_level_sections_survive_native_roundtrip(self, adapter, graph, tmp_path): + """Snowflake -> native (export-native) -> Snowflake preserves top-level sections.""" + from sidemantic.adapters.sidemantic import SidemanticAdapter + + native = SidemanticAdapter() + native_path = tmp_path / "native.yml" + native.export(graph, native_path) + + native_data = yaml.safe_load(native_path.read_text()) + assert native_data["metadata"]["snowflake"]["verified_queries"] + + # Re-parse native YAML into a fresh graph (no dynamic Snowflake attributes). + graph2 = native.parse(native_path) + assert not hasattr(graph2, "verified_queries") + assert graph2.metadata["snowflake"]["verified_queries"] + + # Re-export to Snowflake; top-level sections come back from graph.metadata. + sf_out = tmp_path / "out_snowflake.yaml" + adapter.export(graph2, sf_out) + sf_data = yaml.safe_load(sf_out.read_text()) + assert len(sf_data["verified_queries"]) == 1 + assert sf_data["custom_instructions"] == "Always prefer revenue over total when answering." + assert "module_custom_instructions" in sf_data diff --git a/tests/adapters/snowflake/test_fixtures.py b/tests/adapters/snowflake/test_fixtures.py index 40de1b9d..ce4b087d 100644 --- a/tests/adapters/snowflake/test_fixtures.py +++ b/tests/adapters/snowflake/test_fixtures.py @@ -174,25 +174,21 @@ def test_varchar_dimension_type(self, graph): class TestRevenueTimeseriesMeasures: """Verify measures parsing. - The Cortex Analyst format uses `measures` (not `facts`). The adapter - currently only looks for `facts` and `metrics`, so measures from the - Cortex Analyst format are not imported. These tests are marked xfail - to document the gap. + The Cortex Analyst format uses the table-level `measures` key as a legacy + alias of `facts`. The adapter reads both, so measures from the tutorial + fixture are imported as Sidemantic metrics. """ - @pytest.mark.xfail(reason="Adapter parses 'facts' key, not 'measures' (Cortex Analyst format)") def test_daily_revenue_has_measures(self, graph): model = graph.models["daily_revenue"] metric_names = {m.name for m in model.metrics} assert "daily_revenue" in metric_names - @pytest.mark.xfail(reason="Adapter parses 'facts' key, not 'measures' (Cortex Analyst format)") def test_daily_revenue_measure_count(self, graph): """daily_revenue table defines 5 measures.""" model = graph.models["daily_revenue"] assert len(model.metrics) == 5 - @pytest.mark.xfail(reason="Adapter parses 'facts' key, not 'measures' (Cortex Analyst format)") def test_daily_cogs_measure(self, graph): model = graph.models["daily_revenue"] cogs = model.get_metric("daily_cogs") @@ -200,14 +196,12 @@ def test_daily_cogs_measure(self, graph): assert cogs.agg == "sum" assert cogs.sql == "cogs" - @pytest.mark.xfail(reason="Adapter parses 'facts' key, not 'measures' (Cortex Analyst format)") def test_daily_profit_computed_measure(self, graph): """daily_profit has expr 'revenue - cogs' and no default_aggregation.""" model = graph.models["daily_revenue"] profit = model.get_metric("daily_profit") assert profit is not None - @pytest.mark.xfail(reason="Adapter parses 'facts' key, not 'measures' (Cortex Analyst format)") def test_forecast_error_avg_aggregation(self, graph): """daily_forecast_abs_error has default_aggregation: avg.""" model = graph.models["daily_revenue"] @@ -255,21 +249,14 @@ def test_region_relationship_keys(self, graph): assert region_rel.primary_key == "region_id" -class TestRevenueTimeseriesUnsupportedFeatures: - """Test features present in the fixture that the adapter does not yet handle. +class TestRevenueTimeseriesCortexFeatures: + """Test newer Cortex Analyst features the adapter now imports.""" - These are marked xfail to document what a Cortex Analyst model can contain - that sidemantic does not yet import. - """ - - @pytest.mark.xfail(reason="verified_queries not imported by adapter") def test_verified_queries_imported(self, graph): """The fixture has 2 verified_queries; adapter should expose them.""" - # SemanticGraph has no verified_queries attribute yet assert hasattr(graph, "verified_queries") assert len(graph.verified_queries) == 2 - @pytest.mark.xfail(reason="synonyms on measures not imported by adapter") def test_measure_synonyms(self, graph): """daily_revenue measure has synonyms ['sales', 'income'].""" model = graph.models["daily_revenue"] @@ -277,7 +264,6 @@ def test_measure_synonyms(self, graph): assert hasattr(rev, "synonyms") assert "sales" in rev.synonyms - @pytest.mark.xfail(reason="sample_values on dimensions not imported by adapter") def test_dimension_sample_values(self, graph): """product_line dimension has sample_values.""" model = graph.models["product"] @@ -285,7 +271,6 @@ def test_dimension_sample_values(self, graph): assert hasattr(pl, "sample_values") assert "Electronics" in pl.sample_values - @pytest.mark.xfail(reason="cortex_search_service_name not imported by adapter") def test_cortex_search_service_name(self, graph): """product_dimension table has cortex_search_service_name on product_line.""" model = graph.models["product_dimension"] diff --git a/tests/adapters/snowflake/test_roundtrip.py b/tests/adapters/snowflake/test_roundtrip.py index 9bc3fd00..ce686236 100644 --- a/tests/adapters/snowflake/test_roundtrip.py +++ b/tests/adapters/snowflake/test_roundtrip.py @@ -331,3 +331,478 @@ def test_export_creates_valid_snowflake_yaml(self, adapter, examples_dir, tmp_pa # metrics should have expr for metric in table.get("metrics", []): assert "name" in metric + + +class TestSnowflakeTopLevelMetrics: + """Test parsing/exporting of graph-level (top-level) metrics. + + Snowflake semantic-view metrics that omit ``table`` (or reference a table not + present in the model) become graph-level Sidemantic metrics that reference + other fields with ``model.field`` syntax. + """ + + @pytest.fixture + def top_level_yaml(self, tmp_path): + path = tmp_path / "top_level.yaml" + path.write_text(""" +name: shop +tables: + - name: orders + base_table: + table: orders + primary_key: + columns: + - id + facts: + - name: total_revenue + expr: amount + default_aggregation: sum + - name: order_count + expr: id + default_aggregation: count +metrics: + - name: revenue_per_order + expr: orders.total_revenue / orders.order_count +""") + return path + + def test_top_level_metric_is_not_overqualified(self, adapter, top_level_yaml): + """Graph-level metric expressions must keep model.field references intact.""" + graph = adapter.parse(top_level_yaml) + + assert "revenue_per_order" in graph.metrics + metric = graph.metrics["revenue_per_order"] + assert metric.type == "derived" + # Must NOT be corrupted with the {model} placeholder. + assert "{model}" not in metric.sql + assert metric.sql == "orders.total_revenue / orders.order_count" + + def test_top_level_metric_survives_roundtrip(self, adapter, top_level_yaml, tmp_path): + """Graph-level metrics must be re-exported into the top-level metrics section.""" + graph = adapter.parse(top_level_yaml) + + output_file = tmp_path / "roundtrip.yaml" + adapter.export(graph, output_file) + + with open(output_file) as f: + data = yaml.safe_load(f) + + # Top-level metrics section must be present after export. + assert "metrics" in data + names = {m["name"]: m for m in data["metrics"]} + assert "revenue_per_order" in names + assert names["revenue_per_order"]["expr"] == "orders.total_revenue / orders.order_count" + + # And it must survive a full re-parse without being lost or corrupted. + graph2 = adapter.parse(output_file) + assert "revenue_per_order" in graph2.metrics + assert graph2.metrics["revenue_per_order"].sql == "orders.total_revenue / orders.order_count" + + def test_table_scoped_metric_still_qualified(self, adapter, tmp_path): + """Table-scoped derived metrics must still get the {model} placeholder.""" + path = tmp_path / "scoped.yaml" + path.write_text(""" +name: shop +tables: + - name: orders + base_table: + table: orders + primary_key: + columns: + - id +metrics: + - name: weird_ratio + table: orders + expr: SUM(amount) / COUNT(id) +""") + graph = adapter.parse(path) + + metric = graph.models["orders"].get_metric("weird_ratio") + assert metric is not None + assert metric.type == "derived" + # Bare table-local columns must be qualified with {model}. + assert "{model}.amount" in metric.sql + assert "{model}.id" in metric.sql + + def test_export_skips_auto_registered_model_metrics(self, adapter, tmp_path): + """Model-owned metrics auto-registered at graph level must not leak into top-level metrics. + + ``graph.add_model()`` registers ``time_comparison``/``conversion`` metrics in + ``graph.metrics``. These are already serialized inside their owning table and + have no valid Snowflake top-level representation, so export must skip them. + """ + model = Model( + name="orders", + table="ORDERS", + primary_key="id", + metrics=[ + Metric(name="total_revenue", agg="sum", sql="amount"), + Metric(name="revenue_yoy", type="time_comparison", base_metric="total_revenue", comparison_type="yoy"), + ], + ) + graph = SemanticGraph() + graph.add_model(model) + # Sanity check: the time_comparison metric is auto-registered at graph level. + assert "revenue_yoy" in graph.metrics + + output_file = tmp_path / "export.yaml" + adapter.export(graph, output_file) + + with open(output_file) as f: + data = yaml.safe_load(f) + + # No top-level metrics section should be emitted for model-owned metrics. + assert "metrics" not in data + # The export must still re-parse cleanly. + adapter.parse(output_file) + + def test_export_preserves_top_level_metric_sharing_model_metric_name(self, adapter, tmp_path): + """A distinct top-level metric must survive even if it shares a model-local name. + + The owned-metric skip in export must match by object identity, not name, so + a genuine graph-level metric that merely shares a name with a table-local + metric is not dropped on export. + """ + model = Model( + name="orders", + table="ORDERS", + primary_key="id", + metrics=[Metric(name="summary", agg="sum", sql="amount")], + ) + graph = SemanticGraph() + graph.add_model(model) + # Distinct graph-level derived metric that shares the name "summary". + top_level = Metric(name="summary", type="derived", sql="orders.summary * 2") + graph.metrics["summary"] = top_level + assert graph.metrics["summary"] is not model.metrics[0] + + output_file = tmp_path / "export.yaml" + adapter.export(graph, output_file) + data = yaml.safe_load(output_file.read_text()) + + # The distinct top-level metric is serialized to the top-level metrics block. + assert [m["name"] for m in data.get("metrics", [])] == ["summary"] + assert data["metrics"][0]["expr"] == "orders.summary * 2" + # And the export still re-parses cleanly. + adapter.parse(output_file) + + def test_export_skips_auto_registered_metric_by_identity_not_name(self, adapter, tmp_path): + """Auto-registered model metrics (same object) are still skipped at top level.""" + model = Model( + name="orders", + table="ORDERS", + primary_key="id", + metrics=[ + Metric(name="total_revenue", agg="sum", sql="amount"), + Metric(name="revenue_yoy", type="time_comparison", base_metric="total_revenue", comparison_type="yoy"), + ], + ) + graph = SemanticGraph() + graph.add_model(model) + # The time_comparison metric is the same object registered at graph level. + assert graph.metrics["revenue_yoy"] is model.metrics[1] + + output_file = tmp_path / "export.yaml" + adapter.export(graph, output_file) + data = yaml.safe_load(output_file.read_text()) + + assert "metrics" not in data + adapter.parse(output_file) + + def test_roundtrip_preserves_using_relationships_and_relationship_name(self, adapter, tmp_path): + """A metric `using_relationships` and the named relationship it points to must survive. + + Snowflake relationship `name` is referenced by metric `using_relationships`. + Both the relationship name and the metric reference must round-trip, and the + aggregate metric carrying `using_relationships` must be exported as a metric + (not a fact) so the key is not dropped on re-parse. + """ + source = tmp_path / "rel.yaml" + source.write_text( + """ +name: rel_test +tables: + - name: orders + base_table: {database: db, schema: s, table: orders} + primary_key: {columns: [order_id]} + dimensions: + - {name: order_id, expr: order_id, data_type: number} + - {name: customer_id, expr: customer_id, data_type: number} + metrics: + - name: distinct_orders + expr: COUNT(DISTINCT order_id) + using_relationships: [orders_to_customers] + - name: customers + base_table: {database: db, schema: s, table: customers} + primary_key: {columns: [id]} + dimensions: + - {name: id, expr: id, data_type: number} +relationships: + - name: orders_to_customers + left_table: orders + right_table: customers + relationship_columns: + - {left_column: customer_id, right_column: id} + relationship_type: many_to_one + join_type: left_outer +""" + ) + + graph = adapter.parse(source) + rel = graph.models["orders"].relationships[0] + assert rel.metadata["snowflake"]["name"] == "orders_to_customers" + + output_file = tmp_path / "out.yaml" + adapter.export(graph, output_file) + data = yaml.safe_load(output_file.read_text()) + + # The relationship name is re-emitted so references stay resolvable. + assert [r["name"] for r in data["relationships"]] == ["orders_to_customers"] + + # The aggregate metric carrying using_relationships goes to metrics, not facts. + orders_table = next(t for t in data["tables"] if t["name"] == "orders") + assert "facts" not in orders_table or all(f["name"] != "distinct_orders" for f in orders_table["facts"]) + exported_metric = next(m for m in orders_table["metrics"] if m["name"] == "distinct_orders") + assert exported_metric["using_relationships"] == ["orders_to_customers"] + + # Re-parse preserves both the relationship name and the metric reference. + graph2 = adapter.parse(output_file) + rel2 = graph2.models["orders"].relationships[0] + assert rel2.metadata["snowflake"]["name"] == "orders_to_customers" + metric2 = graph2.models["orders"].get_metric("distinct_orders") + assert metric2.metadata["snowflake"]["using_relationships"] == ["orders_to_customers"] + + def test_roundtrip_aggregate_metric_with_non_additive_dimensions_stays_metric(self, adapter, tmp_path): + """A simple aggregate metric carrying non_additive_dimensions exports as a metric.""" + source = tmp_path / "na.yaml" + source.write_text( + """ +name: na_test +tables: + - name: accounts + base_table: {database: db, schema: s, table: accounts} + primary_key: {columns: [id]} + dimensions: + - {name: id, expr: id, data_type: number} + metrics: + - name: max_balance + expr: MAX(balance) + non_additive_dimensions: + - {table: accounts, dimension: snapshot_date} +""" + ) + + graph = adapter.parse(source) + output_file = tmp_path / "out.yaml" + adapter.export(graph, output_file) + data = yaml.safe_load(output_file.read_text()) + + accounts = next(t for t in data["tables"] if t["name"] == "accounts") + # Routed to metrics, not facts, so the metric-only key keeps it a metric. + assert "facts" not in accounts or all(f["name"] != "max_balance" for f in accounts["facts"]) + exported = next(m for m in accounts["metrics"] if m["name"] == "max_balance") + assert exported["non_additive_dimensions"][0]["dimension"] == "snapshot_date" + + graph2 = adapter.parse(output_file) + metric2 = graph2.models["accounts"].get_metric("max_balance") + assert metric2.metadata["snowflake"]["non_additive_dimensions"][0]["dimension"] == "snapshot_date" + + def test_roundtrip_private_access_modifier_maps_to_public_false(self, adapter, tmp_path): + """access_modifier: private_access marks the field non-public and round-trips.""" + source = tmp_path / "priv.yaml" + source.write_text( + """ +name: priv_test +tables: + - name: orders + base_table: {database: db, schema: s, table: orders} + primary_key: {columns: [id]} + dimensions: + - name: ssn + expr: ssn + data_type: text + access_modifier: private_access + - name: status + expr: status + data_type: text + access_modifier: public_access +""" + ) + + graph = adapter.parse(source) + ssn = graph.models["orders"].get_dimension("ssn") + status = graph.models["orders"].get_dimension("status") + assert ssn.public is False + assert status.public is True + + output_file = tmp_path / "out.yaml" + adapter.export(graph, output_file) + data = yaml.safe_load(output_file.read_text()) + orders = data["tables"][0] + exported_ssn = next(d for d in orders["dimensions"] if d["name"] == "ssn") + assert exported_ssn["access_modifier"] == "private_access" + + graph2 = adapter.parse(output_file) + assert graph2.models["orders"].get_dimension("ssn").public is False + + def test_public_false_overrides_stale_public_access_metadata(self, adapter, tmp_path): + """public=False must win over a public_access modifier carried in metadata. + + A field imported with access_modifier: public_access keeps that value in + metadata. If a user later sets public=False (native YAML/API) and exports + back to Snowflake, the field must become private_access, not stay public. + """ + source = tmp_path / "pub.yaml" + source.write_text( + """ +name: pub_test +tables: + - name: orders + base_table: {database: db, schema: s, table: orders} + primary_key: {columns: [id]} + dimensions: + - name: ssn + expr: ssn + data_type: text + access_modifier: public_access + facts: + - name: amount + expr: amount + data_type: number + access_modifier: public_access + metrics: + - name: total + expr: SUM(amount) + access_modifier: public_access + non_additive_dimensions: + - {table: orders, dimension: snapshot_date} +""" + ) + + graph = adapter.parse(source) + orders = graph.models["orders"] + # Imported public_access is preserved in metadata. + assert orders.get_dimension("ssn").metadata["snowflake"]["access_modifier"] == "public_access" + assert orders.get_metric("amount").metadata["snowflake"]["access_modifier"] == "public_access" + assert orders.get_metric("total").metadata["snowflake"]["access_modifier"] == "public_access" + + # User flips visibility to private via the native API. + orders.get_dimension("ssn").public = False + orders.get_metric("amount").public = False + orders.get_metric("total").public = False + + output_file = tmp_path / "out.yaml" + adapter.export(graph, output_file) + data = yaml.safe_load(output_file.read_text()) + table = data["tables"][0] + exported_dim = next(d for d in table["dimensions"] if d["name"] == "ssn") + exported_fact = next(f for f in table["facts"] if f["name"] == "amount") + exported_metric = next(m for m in table["metrics"] if m["name"] == "total") + assert exported_dim["access_modifier"] == "private_access" + assert exported_fact["access_modifier"] == "private_access" + assert exported_metric["access_modifier"] == "private_access" + + # Re-parsing keeps them non-public. + graph2 = adapter.parse(output_file) + orders2 = graph2.models["orders"] + assert orders2.get_dimension("ssn").public is False + assert orders2.get_metric("amount").public is False + assert orders2.get_metric("total").public is False + + def test_export_strips_model_placeholder_from_table_scoped_metric(self, adapter, tmp_path): + """Table-scoped derived metrics must not leak {model} placeholders into Snowflake.""" + source = tmp_path / "ph.yaml" + source.write_text( + """ +name: ph_test +tables: + - name: orders + base_table: {database: db, schema: s, table: orders} + primary_key: {columns: [id]} + dimensions: + - {name: id, expr: id, data_type: number} + facts: + - {name: amount, expr: amount, data_type: number} +metrics: + - name: avg_order + table: orders + expr: SUM(amount) / COUNT(id) +""" + ) + + graph = adapter.parse(source) + # Internally the table-scoped expression is qualified for queryability. + assert "{model}" in graph.models["orders"].get_metric("avg_order").sql + + output_file = tmp_path / "out.yaml" + adapter.export(graph, output_file) + data = yaml.safe_load(output_file.read_text()) + orders = next(t for t in data["tables"] if t["name"] == "orders") + expr = next(m["expr"] for m in orders["metrics"] if m["name"] == "avg_order") + assert "{model}" not in expr + assert expr == "SUM(amount) / COUNT(id)" + + def test_export_top_level_ratio_keeps_model_qualifiers(self, adapter, tmp_path): + """Graph-level ratio metrics keep model.field qualifiers for cross-table refs.""" + graph = SemanticGraph() + graph.add_model( + Model( + name="orders", + table="ORDERS", + primary_key="id", + metrics=[ + Metric(name="revenue", agg="sum", sql="amount"), + Metric(name="order_count", agg="count"), + ], + ) + ) + graph.add_metric(Metric(name="aov", type="ratio", numerator="orders.revenue", denominator="orders.order_count")) + + output_file = tmp_path / "out.yaml" + adapter.export(graph, output_file) + data = yaml.safe_load(output_file.read_text()) + expr = next(m["expr"] for m in data["metrics"] if m["name"] == "aov") + assert expr == "orders.revenue / NULLIF(orders.order_count, 0)" + + def test_parse_directory_attaches_top_level_metric_regardless_of_file_order(self, adapter, tmp_path): + """A top-level metric must attach to its table even if defined in an earlier file.""" + # rglob visits files in sorted order, so a_metrics is parsed before z_tables. + (tmp_path / "a_metrics.yaml").write_text( + """ +name: a_metrics +metrics: + - name: avg_order + table: orders + expr: SUM(amount) / COUNT(order_id) +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: z_tables +tables: + - name: orders + base_table: {database: db, schema: s, table: orders} + primary_key: {columns: [order_id]} + dimensions: + - {name: order_id, expr: order_id, data_type: number} + facts: + - {name: amount, expr: amount, data_type: number} +""" + ) + + graph = adapter.parse(tmp_path) + + # The metric attaches to its table (not the graph-level branch). + orders = graph.models["orders"] + assert "avg_order" in [m.name for m in orders.metrics] + assert "avg_order" not in graph.metrics + # Table-scoped: complex expression is qualified for queryability. + assert "{model}" in orders.get_metric("avg_order").sql + + # Export drops the placeholder and keeps the metric under the orders table. + output_file = tmp_path / "out.yaml" + adapter.export(graph, output_file) + data = yaml.safe_load(output_file.read_text()) + orders_table = next(t for t in data["tables"] if t["name"] == "orders") + expr = next(m["expr"] for m in orders_table["metrics"] if m["name"] == "avg_order") + assert "{model}" not in expr diff --git a/tests/core/test_directory_loaders.py b/tests/core/test_directory_loaders.py index 7d27a52a..f5ed86e4 100644 --- a/tests/core/test_directory_loaders.py +++ b/tests/core/test_directory_loaders.py @@ -450,3 +450,715 @@ def test_validate_directory_accepts_valid_hex_view_base(tmp_path): report = validate_directory(tmp_path) assert not any("view" in err.lower() for err in report.errors) + + +def test_load_from_directory_attaches_snowflake_metric_to_table_in_another_file(tmp_path): + """A Snowflake top-level metric attaches to its table even if defined in another file.""" + # File A is Snowflake-detected (tables + base_table) and carries a top-level + # metric referencing `orders`, which lives in file B. + (tmp_path / "a_model.yaml").write_text( + """ +name: a_model +tables: + - name: products + base_table: + database: db + schema: s + table: products + primary_key: + columns: [id] + dimensions: + - name: id + expr: id + data_type: number +metrics: + - name: avg_order + table: orders + expr: SUM(amount) / COUNT(order_id) +""" + ) + (tmp_path / "b_model.yaml").write_text( + """ +name: b_model +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + facts: + - name: amount + expr: amount + data_type: number +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + graph = layer.graph + + assert set(graph.models) == {"products", "orders"} + orders = graph.models["orders"] + assert "avg_order" in [m.name for m in orders.metrics] + assert "avg_order" not in graph.metrics + metric = orders.get_metric("avg_order") + # Table-scoped: complex expression re-qualified for queryability. + assert "{model}" in metric.sql + # The internal pending marker is cleaned up after attachment. + assert (metric.metadata or {}).get("snowflake", {}).get("pending_table") is None + + +def test_load_from_directory_detects_instruction_only_snowflake_sidecar(tmp_path): + """A Cortex sidecar with only verified_queries/custom_instructions routes to Snowflake.""" + # No metrics and no tables: only Snowflake-only top-level sections. + (tmp_path / "a_instructions.yaml").write_text( + """ +verified_queries: + - name: total revenue + sql: SELECT SUM(amount) FROM orders +custom_instructions: Prefer revenue. +module_custom_instructions: + sql_generation: Use explicit columns. +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: tm +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + facts: + - name: amount + expr: amount + data_type: number +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + snowflake_meta = layer.graph.metadata.get("snowflake", {}) + + assert snowflake_meta.get("verified_queries") + assert snowflake_meta.get("custom_instructions") == "Prefer revenue." + assert "module_custom_instructions" in snowflake_meta + + +def test_load_from_directory_detects_metric_only_snowflake_file(tmp_path): + """A Cortex file with only top-level metrics (table + expr) is routed to Snowflake.""" + # Metric-only file (no tables section) parsed before the table file. + (tmp_path / "a_metrics.yaml").write_text( + """ +metrics: + - name: avg_order + table: orders + expr: SUM(amount) / COUNT(order_id) +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: tables_model +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + facts: + - name: amount + expr: amount + data_type: number +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + graph = layer.graph + + orders = graph.models["orders"] + assert "avg_order" in [m.name for m in orders.metrics] + assert "avg_order" not in graph.metrics + assert "{model}" in orders.get_metric("avg_order").sql + + +def test_load_from_directory_detects_metric_sidecar_with_snowflake_metric_keys(tmp_path): + """A tableless metrics sidecar carrying Snowflake-only metric keys routes to Snowflake.""" + (tmp_path / "a_metrics.yaml").write_text( + """ +metrics: + - name: global_ratio + expr: orders.revenue / orders.order_count + access_modifier: public_access + labels: [KPI] +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: tm +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + facts: + - name: amount + expr: amount + data_type: number + metrics: + - name: revenue + expr: SUM(amount) + - name: order_count + expr: COUNT(order_id) +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + graph = layer.graph + + assert "global_ratio" in graph.metrics + sf = graph.metrics["global_ratio"].metadata["snowflake"] + assert sf["access_modifier"] == "public_access" + assert sf["labels"] == ["KPI"] + + +def test_load_from_directory_detects_mixed_snowflake_metrics_file(tmp_path): + """A metrics-only Cortex file may mix table-scoped and tableless view metrics.""" + # No tables section; one metric has table (table-scoped), one omits it (graph-level). + (tmp_path / "a_metrics.yaml").write_text( + """ +metrics: + - name: avg_order + table: orders + expr: SUM(amount) / COUNT(order_id) + - name: global_ratio + expr: orders.revenue / orders.order_count +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: tables_model +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + facts: + - name: amount + expr: amount + data_type: number + metrics: + - name: revenue + expr: SUM(amount) + - name: order_count + expr: COUNT(order_id) +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + graph = layer.graph + + # Table-scoped metric attaches to its table; tableless metric stays graph-level. + assert "avg_order" in [m.name for m in graph.models["orders"].metrics] + assert "global_ratio" in graph.metrics + assert "avg_order" not in graph.metrics + + +def test_load_from_directory_detects_relationship_only_snowflake_sidecar(tmp_path): + """A Cortex sidecar with only top-level relationships routes to Snowflake and attaches joins.""" + # Non-standard join columns so foreign-key inference would NOT recreate the join. + (tmp_path / "a_rels.yaml").write_text( + """ +relationships: + - name: orders_to_customers + left_table: orders + right_table: customers + relationship_columns: + - left_column: cust_ref + right_column: cust_pk + relationship_type: many_to_one +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: tm +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + - name: cust_ref + expr: cust_ref + data_type: number + - name: customers + base_table: + database: db + schema: s + table: customers + primary_key: + columns: [cust_pk] + dimensions: + - name: cust_pk + expr: cust_pk + data_type: number +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + graph = layer.graph + + orders = graph.models["orders"] + rel = next(r for r in orders.relationships if r.name == "customers") + assert rel.metadata["snowflake"]["name"] == "orders_to_customers" + assert rel.foreign_key == "cust_ref" + assert graph.find_relationship_path("orders", "customers") + + +def test_load_from_directory_keeps_same_target_snowflake_relationships(tmp_path): + """Two distinct named relationships between the same tables in a split project + must both survive: de-dup is by Snowflake name/columns, not the target table.""" + (tmp_path / "a_rels.yaml").write_text( + """ +relationships: + - name: orders_to_customers_billing + left_table: orders + right_table: customers + relationship_columns: + - left_column: billing_cust_ref + right_column: cust_pk + relationship_type: many_to_one + - name: orders_to_customers_shipping + left_table: orders + right_table: customers + relationship_columns: + - left_column: shipping_cust_ref + right_column: cust_pk + relationship_type: many_to_one +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: tm +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + - name: billing_cust_ref + expr: billing_cust_ref + data_type: number + - name: shipping_cust_ref + expr: shipping_cust_ref + data_type: number + - name: customers + base_table: + database: db + schema: s + table: customers + primary_key: + columns: [cust_pk] + dimensions: + - name: cust_pk + expr: cust_pk + data_type: number +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + orders = layer.graph.models["orders"] + + customer_rels = [r for r in orders.relationships if r.name == "customers"] + assert {r.metadata["snowflake"]["name"] for r in customer_rels} == { + "orders_to_customers_billing", + "orders_to_customers_shipping", + } + assert {r.foreign_key for r in customer_rels} == {"billing_cust_ref", "shipping_cust_ref"} + + +def test_load_from_directory_detects_view_metric_sidecar_with_snowflake_sections(tmp_path): + """A tableless Cortex sidecar with verified_queries routes to Snowflake.""" + # Pure view-level metrics (no table) plus Snowflake-only top-level sections. + (tmp_path / "a_sidecar.yaml").write_text( + """ +metrics: + - name: global_ratio + expr: orders.revenue / orders.order_count +verified_queries: + - name: total revenue + sql: SELECT SUM(amount) FROM orders +custom_instructions: Prefer revenue. +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: tm +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + facts: + - name: amount + expr: amount + data_type: number + metrics: + - name: revenue + expr: SUM(amount) + - name: order_count + expr: COUNT(order_id) +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + graph = layer.graph + + assert "global_ratio" in graph.metrics + snowflake_meta = graph.metadata.get("snowflake", {}) + assert snowflake_meta.get("verified_queries") + assert snowflake_meta.get("custom_instructions") == "Prefer revenue." + + +def test_load_from_directory_detects_named_view_metric_sidecar(tmp_path): + """A Cortex sidecar with a root ``name`` and only tableless view metrics (no + Snowflake-only key or section) still routes to Snowflake, not silently dropped. + + The root ``name`` is the sole Cortex signal: native detection rejects ``name`` + so the file is not native-compatible, and without this routing the view metric + is lost on the CLI load_from_directory / export-native path. + """ + (tmp_path / "a_sidecar.yaml").write_text( + """ +name: view_metrics +metrics: + - name: global_ratio + expr: orders.revenue / orders.order_count +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: tm +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + facts: + - name: amount + expr: amount + data_type: number + metrics: + - name: revenue + expr: SUM(amount) + - name: order_count + expr: COUNT(order_id) +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + + assert "global_ratio" in layer.graph.metrics + + +def test_load_from_directory_explicit_snowflake_relationship_beats_inference(tmp_path): + """An explicit Cortex relationship takes precedence over a guessed foreign key.""" + # orders has customer_id (inferable to customers) AND an explicit Snowflake join. + (tmp_path / "a_rels.yaml").write_text( + """ +relationships: + - name: orders_to_customers + left_table: orders + right_table: customers + relationship_columns: + - left_column: cust_ref + right_column: cust_pk + relationship_type: many_to_one +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: tm +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + - name: customer_id + expr: customer_id + data_type: number + - name: cust_ref + expr: cust_ref + data_type: number + - name: customers + base_table: + database: db + schema: s + table: customers + primary_key: + columns: [cust_pk] + dimensions: + - name: cust_pk + expr: cust_pk + data_type: number +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + orders = layer.graph.models["orders"] + + customer_rels = [r for r in orders.relationships if r.name == "customers"] + assert len(customer_rels) == 1 + assert customer_rels[0].foreign_key == "cust_ref" + assert customer_rels[0].metadata["snowflake"]["name"] == "orders_to_customers" + + +def test_load_from_directory_merges_snowflake_metadata_across_files(tmp_path): + """Multi-file Cortex projects must accumulate top-level sections, not overwrite.""" + (tmp_path / "a.yaml").write_text( + """ +name: a +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [id] + dimensions: + - name: id + expr: id + data_type: number +verified_queries: + - name: q1 + question: x + sql: SELECT 1 +custom_instructions: from A +""" + ) + (tmp_path / "b.yaml").write_text( + """ +name: b +tables: + - name: customers + base_table: + database: db + schema: s + table: customers + primary_key: + columns: [id] + dimensions: + - name: id + expr: id + data_type: number +verified_queries: + - name: q2 + question: y + sql: SELECT 2 +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + graph = layer.graph + + merged = graph.metadata["snowflake"]["verified_queries"] + assert sorted(q["name"] for q in merged) == ["q1", "q2"] + # Dynamic attribute accumulates too. + assert len(getattr(graph, "verified_queries", [])) == 2 + + +def test_load_from_directory_preserves_snowflake_top_level_sections(tmp_path): + """CLI-first load -> export-native must round-trip Snowflake Cortex top-level sections.""" + import yaml + + from sidemantic.adapters.sidemantic import SidemanticAdapter + + (tmp_path / "cortex.yaml").write_text( + """ +name: cortex +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [order_id] + dimensions: + - name: order_id + expr: order_id + data_type: number + measures: + - name: order_total + expr: total + data_type: number + default_aggregation: sum +verified_queries: + - name: total revenue + question: what is the total revenue + sql: "SELECT SUM(total) FROM orders" +custom_instructions: Prefer revenue. +module_custom_instructions: + sql_generation: Use explicit columns. +""" + ) + + layer = SemanticLayer(auto_register=False) + load_from_directory(layer, tmp_path) + graph = layer.graph + + # Top-level sections reach layer.graph (both as metadata and dynamic attrs). + assert graph.metadata["snowflake"]["verified_queries"] + assert graph.metadata["snowflake"]["custom_instructions"] == "Prefer revenue." + assert getattr(graph, "verified_queries", None) + assert getattr(graph, "custom_instructions", None) == "Prefer revenue." + + # export-native emits a root metadata block carrying them. + out = tmp_path / "native.yml" + SidemanticAdapter().export(graph, out) + data = yaml.safe_load(out.read_text()) + assert data["metadata"]["snowflake"]["custom_instructions"] == "Prefer revenue." + + # And a native re-parse keeps them on graph.metadata. + graph2 = SidemanticAdapter().parse(out) + assert graph2.metadata["snowflake"]["verified_queries"] + + +def test_load_from_directory_same_named_scoped_metrics_on_different_tables(tmp_path): + """Same-named table-scoped metrics on different tables must not overwrite each other.""" + # Two metric sidecars each define a metric named "total" for a different table. + (tmp_path / "a_metrics.yaml").write_text( + """ +metrics: + - name: total + table: orders + expr: SUM(amount) +""" + ) + (tmp_path / "b_metrics.yaml").write_text( + """ +metrics: + - name: total + table: customers + expr: SUM(balance) +""" + ) + (tmp_path / "z_tables.yaml").write_text( + """ +name: tm +tables: + - name: orders + base_table: + database: db + schema: s + table: orders + primary_key: + columns: [id] + dimensions: + - name: id + expr: id + data_type: number + facts: + - name: amount + expr: amount + data_type: number + - name: customers + base_table: + database: db + schema: s + table: customers + primary_key: + columns: [id] + dimensions: + - name: id + expr: id + data_type: number + facts: + - name: balance + expr: balance + data_type: number +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmp_path) + graph = layer.graph + + # Both scoped metrics attach to their respective tables. + assert "total" in [m.name for m in graph.models["orders"].metrics] + assert "total" in [m.name for m in graph.models["customers"].metrics] diff --git a/tests/fixtures/snowflake/cortex_features.yaml b/tests/fixtures/snowflake/cortex_features.yaml new file mode 100644 index 00000000..e639b6fe --- /dev/null +++ b/tests/fixtures/snowflake/cortex_features.yaml @@ -0,0 +1,79 @@ +name: cortex_features +description: Cortex Analyst model exercising newer spec keys. +tables: + - name: orders + description: Orders fact table. + base_table: + database: analytics + schema: sales + table: orders + primary_key: + columns: + - order_id + time_dimensions: + - name: order_date + expr: order_date + data_type: timestamp + unique: false + dimensions: + - name: order_id + expr: order_id + data_type: number + - name: status + expr: status + data_type: varchar + synonyms: + - state + is_enum: true + sample_values: + - delivered + - shipped + access_modifier: public_access + labels: + - Order Status + tags: + - core + - name: customer_name + expr: customer_name + data_type: varchar + cortex_search_service: + service: customer_name_search + literal_column: customer_name + database: analytics + schema: sales + measures: + - name: order_total + expr: total + data_type: number + default_aggregation: sum + synonyms: + - revenue + access_modifier: public_access + non_additive_dimensions: + - table: orders + dimension: order_date + labels: + - Order Total + tags: + - finance + metrics: + - name: distinct_orders + expr: COUNT(DISTINCT order_id) + synonyms: + - order count + access_modifier: public_access + using_relationships: + - orders_to_customers + +verified_queries: + - name: total revenue + question: what is the total revenue + sql: "SELECT SUM(total) FROM orders" + verified_at: 1700000000 + verified_by: analyst + +custom_instructions: Always prefer revenue over total when answering. + +module_custom_instructions: + sql_generation: Prefer explicit column references. + question_categorization: Treat revenue questions as financial.