diff --git a/sidemantic/adapters/gooddata.py b/sidemantic/adapters/gooddata.py index 894a9c27..2dd82185 100644 --- a/sidemantic/adapters/gooddata.py +++ b/sidemantic/adapters/gooddata.py @@ -155,7 +155,11 @@ def _parse_cloud_dataset(self, dataset_def: dict[str, Any]) -> Model | None: "table", ) ) - sql = dataset_def.get("sql") + raw_sql = dataset_def.get("sql") + sql, sql_data_source_id = self._coerce_sql(raw_sql) + # Track object-form SQL so an export round-trip can re-emit the SDK + # shape ``{"dataSourceId": ..., "statement": ...}`` instead of a bare string. + sql_is_object = isinstance(raw_sql, dict) if sql: table = None @@ -164,6 +168,7 @@ def _parse_cloud_dataset(self, dataset_def: dict[str, Any]) -> Model | None: attributes = self._as_list(dataset_def.get("attributes")) facts = self._as_list(dataset_def.get("facts")) + aggregated_facts = self._as_list(dataset_def.get("aggregatedFacts") or dataset_def.get("aggregated_facts")) fields = dataset_def.get("fields") if fields: @@ -186,6 +191,11 @@ def _parse_cloud_dataset(self, dataset_def: dict[str, Any]) -> Model | None: if metric: metrics.append(metric) + for agg_fact_def in aggregated_facts: + metric = self._parse_cloud_aggregated_fact(agg_fact_def) + if metric: + metrics.append(metric) + if primary_key and not any(dim.name == primary_key for dim in dimensions): dimensions.append( Dimension( @@ -202,16 +212,29 @@ def _parse_cloud_dataset(self, dataset_def: dict[str, Any]) -> Model | None: if rel: relationships.append(rel) + data_source_table_id = dataset_def.get("dataSourceTableId") or dataset_def.get("data_source_table_id") + data_source_id = dataset_def.get("dataSourceId") or dataset_def.get("data_source_id") + # SQL-backed datasets carry the data source on the sql object; object-form + # dataSourceTableId carries it on the table descriptor. + if not data_source_id: + data_source_id = sql_data_source_id + if not data_source_id and isinstance(data_source_table_id, dict): + data_source_id = data_source_table_id.get("dataSourceId") or data_source_table_id.get("data_source_id") + metadata = { GOODDATA_METADATA_KEY: { "id": dataset_id, "title": dataset_def.get("title"), "description": dataset_def.get("description"), "tags": dataset_def.get("tags"), - "data_source_id": dataset_def.get("dataSourceId") or dataset_def.get("data_source_id"), - "data_source_table_id": dataset_def.get("dataSourceTableId") or dataset_def.get("data_source_table_id"), + "data_source_id": data_source_id, + "sql_is_object": sql_is_object, + "sql_data_source_id": sql_data_source_id if sql_is_object else None, + "data_source_table_id": data_source_table_id, "table_path": dataset_def.get("tablePath") or dataset_def.get("table_path"), "grain": grain_ids, + "workspace_data_filter_columns": dataset_def.get("workspaceDataFilterColumns") + or dataset_def.get("workspace_data_filter_columns"), "extra": self._extract_extra(dataset_def, self._cloud_dataset_keys()), } } @@ -244,9 +267,10 @@ def _parse_cloud_attribute(self, attr_def: dict[str, Any]) -> Dimension | None: source_column = self._get_first(label_def or {}, "sourceColumn", "source_column") or self._get_first( attr_def, "sourceColumn", "source_column" ) - data_type = self._get_first(label_def or {}, "dataType", "data_type") or self._get_first( - attr_def, "dataType", "data_type" - ) + # sourceColumnDataType is the newer name for dataType; read both. + data_type = self._get_first( + label_def or {}, "sourceColumnDataType", "source_column_data_type", "dataType", "data_type" + ) or self._get_first(attr_def, "sourceColumnDataType", "source_column_data_type", "dataType", "data_type") dim_type, granularity = self._map_dimension_type(data_type) @@ -283,7 +307,10 @@ def _parse_cloud_fact(self, fact_def: dict[str, Any]) -> Metric | None: raise GoodDataParseError("Fact is missing an id/identifier") source_column = self._get_first(fact_def, "sourceColumn", "source_column") - data_type = self._get_first(fact_def, "dataType", "data_type") + # sourceColumnDataType is the newer name for dataType; read both. + data_type = self._get_first( + fact_def, "sourceColumnDataType", "source_column_data_type", "dataType", "data_type" + ) agg = self._map_fact_aggregation(fact_def, data_type) metadata = { @@ -308,6 +335,55 @@ def _parse_cloud_fact(self, fact_def: dict[str, Any]) -> Metric | None: metadata=metadata, ) + def _parse_cloud_aggregated_fact(self, agg_def: dict[str, Any]) -> Metric | None: + if "aggregatedFact" in agg_def and isinstance(agg_def["aggregatedFact"], dict): + agg_def = agg_def["aggregatedFact"] + + agg_id = self._extract_identifier(agg_def, keys=("id", "identifier", "name")) + if not agg_id: + raise GoodDataParseError("Aggregated fact is missing an id/identifier") + + source_column = self._get_first(agg_def, "sourceColumn", "source_column") + data_type = self._get_first(agg_def, "sourceColumnDataType", "source_column_data_type", "dataType", "data_type") + + # Aggregate awareness: source fact + SUM/MIN/MAX operation. + source_fact_ref = self._get_first(agg_def, "sourceFactReference", "source_fact_reference") or {} + operation = None + source_fact_id = None + if isinstance(source_fact_ref, dict): + operation = source_fact_ref.get("operation") + reference = source_fact_ref.get("reference") + if isinstance(reference, dict): + source_fact_id = self._extract_identifier(reference, keys=("id", "identifier", "name")) + elif isinstance(reference, str): + source_fact_id = reference + + agg = self._map_fact_aggregation({"aggregation": operation} if operation else {}, data_type) + + metadata = { + GOODDATA_METADATA_KEY: { + "id": agg_id, + "title": agg_def.get("title"), + "description": agg_def.get("description"), + "tags": agg_def.get("tags"), + "source_column": source_column, + "data_type": data_type, + "aggregated_fact": True, + "operation": operation, + "source_fact": source_fact_id, + "extra": self._extract_extra(agg_def, self._cloud_aggregated_fact_keys()), + } + } + + return Metric( + name=agg_id, + agg=agg, + sql=source_column or agg_id, + label=agg_def.get("title"), + description=agg_def.get("description"), + metadata=metadata, + ) + def _parse_cloud_reference(self, ref_def: Any) -> Relationship | None: if isinstance(ref_def, str): target_id = ref_def @@ -325,10 +401,23 @@ def _parse_cloud_reference(self, ref_def: Any) -> Relationship | None: multivalue = self._get_first(ref_def, "multivalue", "multiValue") is True rel_type = "many_to_many" if multivalue else "many_to_one" - source_columns = self._get_first(ref_def, "sourceColumns", "source_columns", "sourceColumn", "source_column") + # Newer exports replace flat sourceColumns with a sources array of + # {column, target: {id, type}, dataType}. Keep flat support for older exports. + sources = self._get_first(ref_def, "sources") + source_objects = None + if isinstance(sources, list) and sources: + source_objects = sources + source_columns = [s.get("column") for s in sources if isinstance(s, dict) and s.get("column")] + else: + source_columns = self._get_first( + ref_def, "sourceColumns", "source_columns", "sourceColumn", "source_column" + ) + foreign_key = None - if isinstance(source_columns, list) and len(source_columns) == 1: - foreign_key = source_columns[0] + if isinstance(source_columns, list) and source_columns: + # Composite keys keep the full column list (Relationship.foreign_key + # accepts list[str]); single-column refs unwrap to a plain string. + foreign_key = source_columns[0] if len(source_columns) == 1 else list(source_columns) elif isinstance(source_columns, str): foreign_key = source_columns @@ -336,6 +425,7 @@ def _parse_cloud_reference(self, ref_def: Any) -> Relationship | None: GOODDATA_METADATA_KEY: { "identifier": target_id, "source_columns": source_columns, + "sources": source_objects, "multivalue": multivalue, "extra": self._extract_extra(ref_def, self._cloud_reference_keys()), } @@ -507,9 +597,9 @@ def _parse_legacy_attribute(self, attr_def: dict[str, Any] | None, label_map: di source_column = self._get_first(label_def or {}, "sourceColumn", "source_column") or self._get_first( attr_def, "sourceColumn", "source_column" ) - data_type = self._get_first(label_def or {}, "dataType", "data_type") or self._get_first( - attr_def, "dataType", "data_type" - ) + data_type = self._get_first( + label_def or {}, "sourceColumnDataType", "source_column_data_type", "dataType", "data_type" + ) or self._get_first(attr_def, "sourceColumnDataType", "source_column_data_type", "dataType", "data_type") dim_type, granularity = self._map_dimension_type(data_type) @@ -545,7 +635,9 @@ def _parse_legacy_fact(self, fact_def: dict[str, Any] | None) -> Metric | None: raise GoodDataParseError("Legacy fact is missing an identifier") source_column = self._get_first(fact_def, "sourceColumn", "source_column") - data_type = self._get_first(fact_def, "dataType", "data_type") + data_type = self._get_first( + fact_def, "sourceColumnDataType", "source_column_data_type", "dataType", "data_type" + ) agg = self._map_fact_aggregation(fact_def, data_type) metadata = { @@ -617,18 +709,35 @@ def _export_dataset(self, model: Model) -> dict[str, Any]: if gd_meta.get("tags"): dataset["tags"] = gd_meta["tags"] - if model.table: + original_table_id = gd_meta.get("data_source_table_id") + if isinstance(original_table_id, dict): + # Preserve the newer object form (dataSourceId/id/type/path) verbatim. + dataset["dataSourceTableId"] = original_table_id + elif model.table: dataset["dataSourceTableId"] = model.table - elif gd_meta.get("data_source_table_id"): - dataset["dataSourceTableId"] = gd_meta["data_source_table_id"] + elif original_table_id: + dataset["dataSourceTableId"] = original_table_id elif gd_meta.get("table_path"): dataset["tablePath"] = gd_meta["table_path"] - if gd_meta.get("data_source_id"): + # SQL-backed datasets parsed from the SDK object form keep the data + # source nested inside the ``sql`` object, so skip the top-level field. + sql_is_object = bool(model.sql) and gd_meta.get("sql_is_object") + + if gd_meta.get("data_source_id") and not sql_is_object: dataset["dataSourceId"] = gd_meta["data_source_id"] if model.sql: - dataset["sql"] = model.sql + sql_data_source_id = gd_meta.get("sql_data_source_id") or gd_meta.get("data_source_id") + if sql_is_object and sql_data_source_id: + # Re-emit the SDK object shape {dataSourceId, statement}. + dataset["sql"] = {"dataSourceId": sql_data_source_id, "statement": model.sql} + else: + dataset["sql"] = model.sql + + wdf_columns = gd_meta.get("workspace_data_filter_columns") + if wdf_columns: + dataset["workspaceDataFilterColumns"] = wdf_columns grain_ids = gd_meta.get("grain") or [model.primary_key] dataset["grain"] = [{"id": grain_id, "type": "attribute"} for grain_id in grain_ids if grain_id] @@ -673,8 +782,31 @@ def _export_dataset(self, model: Model) -> dict[str, Any]: dataset["attributes"] = attributes facts = [] + aggregated_facts = [] for metric in model.metrics: fact_meta = (metric.metadata or {}).get(GOODDATA_METADATA_KEY, {}) + + if fact_meta.get("aggregated_fact"): + agg_def: dict[str, Any] = { + "id": fact_meta.get("id") or metric.name, + "title": metric.label or metric.name, + "sourceColumn": metric.sql or metric.name, + "sourceColumnDataType": fact_meta.get("data_type") or "NUMERIC", + } + if metric.description: + agg_def["description"] = metric.description + if fact_meta.get("tags"): + agg_def["tags"] = fact_meta["tags"] + operation = fact_meta.get("operation") or (metric.agg.upper() if metric.agg else None) + source_fact = fact_meta.get("source_fact") + if operation and source_fact: + agg_def["sourceFactReference"] = { + "operation": operation, + "reference": {"id": source_fact, "type": "fact"}, + } + aggregated_facts.append(agg_def) + continue + fact_def: dict[str, Any] = { "id": fact_meta.get("id") or metric.name, "title": metric.label or metric.name, @@ -696,6 +828,9 @@ def _export_dataset(self, model: Model) -> dict[str, Any]: if facts: dataset["facts"] = facts + if aggregated_facts: + dataset["aggregatedFacts"] = aggregated_facts + references = [] for rel in model.relationships: if rel.type not in ("many_to_one", "many_to_many", "one_to_one"): @@ -707,11 +842,17 @@ def _export_dataset(self, model: Model) -> dict[str, Any]: "multivalue": rel.type == "many_to_many" or rel_meta.get("multivalue") is True, } - source_columns = rel_meta.get("source_columns") - if not source_columns and rel.foreign_key: - source_columns = [rel.foreign_key] - if source_columns: - ref["sourceColumns"] = source_columns + sources = rel_meta.get("sources") + if isinstance(sources, list) and sources: + # Preserve the newer sources array form verbatim. + ref["sources"] = sources + else: + source_columns = rel_meta.get("source_columns") + if not source_columns and rel.foreign_key: + # foreign_key may be a single string or a composite list. + source_columns = list(rel.foreign_key) if isinstance(rel.foreign_key, list) else [rel.foreign_key] + if source_columns: + ref["sourceColumns"] = source_columns references.append(ref) @@ -827,15 +968,41 @@ def _select_label(self, labels: list[dict[str, Any]], default_label_id: str | No return label return labels[0] if labels else None + def _coerce_sql(self, value: Any) -> tuple[str | None, str | None]: + """Return (sql_statement, data_source_id) for legacy string or object SQL. + + SQL-backed datasets use ``{"dataSourceId": ..., "statement": ...}``; + older exports use a plain string. Both forms are supported. + """ + if isinstance(value, dict): + statement = value.get("statement") or value.get("sql") + data_source_id = value.get("dataSourceId") or value.get("data_source_id") + return (statement if isinstance(statement, str) else None), ( + data_source_id if isinstance(data_source_id, str) else None + ) + if isinstance(value, str): + return value, None + return None, None + def _coerce_table_path(self, value: Any) -> str | None: if isinstance(value, list): parts = [str(part) for part in value if part] return ".".join(parts) if parts else None if isinstance(value, dict): + # Newer object form: {dataSourceId, id, type, path: [schema, table]} + path = value.get("path") + if isinstance(path, list): + parts = [str(part) for part in path if part] + if parts: + return ".".join(parts) schema = value.get("schema") table = value.get("table") if schema and table: return f"{schema}.{table}" + # Fall back to a bare id when no path/schema is available. + table_id = value.get("id") + if isinstance(table_id, str): + return table_id if isinstance(value, str): return value return None @@ -950,8 +1117,12 @@ def _cloud_dataset_keys(self) -> set[str]: "primaryKey", "attributes", "facts", + "aggregatedFacts", + "aggregated_facts", "references", "fields", + "workspaceDataFilterColumns", + "workspace_data_filter_columns", } def _cloud_attribute_keys(self) -> set[str]: @@ -966,6 +1137,8 @@ def _cloud_attribute_keys(self) -> set[str]: "source_column", "dataType", "data_type", + "sourceColumnDataType", + "source_column_data_type", "labels", "defaultView", "default_view", @@ -984,6 +1157,28 @@ def _cloud_fact_keys(self) -> set[str]: "source_column", "dataType", "data_type", + "sourceColumnDataType", + "source_column_data_type", + "aggregation", + "type", + } + + def _cloud_aggregated_fact_keys(self) -> set[str]: + return { + "id", + "identifier", + "name", + "title", + "description", + "tags", + "sourceColumn", + "source_column", + "dataType", + "data_type", + "sourceColumnDataType", + "source_column_data_type", + "sourceFactReference", + "source_fact_reference", "aggregation", "type", } @@ -993,10 +1188,13 @@ def _cloud_reference_keys(self) -> set[str]: "identifier", "dataset", "reference", + "sources", "sourceColumns", "source_columns", "sourceColumn", "source_column", + "sourceColumnDataTypes", + "source_column_data_types", "multivalue", "multiValue", } diff --git a/tests/adapters/gooddata/test_parsing.py b/tests/adapters/gooddata/test_parsing.py index 6f48b77f..ffb43bbf 100644 --- a/tests/adapters/gooddata/test_parsing.py +++ b/tests/adapters/gooddata/test_parsing.py @@ -384,8 +384,10 @@ def test_ecommerce_demo_ldm_product_facts(): metric = product.metrics[0] assert metric.name == "rating" assert metric.sql == "rating" - # sourceColumnDataType=NUMERIC but no dataType, so agg inference returns None - assert metric.agg is None + # sourceColumnDataType=NUMERIC is read (newer name for dataType), so a numeric + # fact infers a sum aggregation. + assert metric.agg == "sum" + assert metric.metadata["gooddata"]["data_type"] == "NUMERIC" def test_ecommerce_demo_ldm_relationships_resolve(): @@ -424,26 +426,246 @@ def test_ecommerce_demo_analytics_not_ldm(): def test_sdk_declarative_ldm_dict_sql_field(): - """SDK declarative LDM has dict-style sql field that adapter doesn't yet handle. + """SDK declarative LDM with object-style sql field and aggregatedFacts. Source: gooddata/gooddata-python-sdk (MIT). 6 datasets, star schema, - GEO labels, aggregatedFacts, 1 SQL dataset with dict-style sql. + GEO labels, aggregatedFacts, 1 SQL dataset with object-style sql. The sql field is {"dataSourceId": ..., "statement": ...} instead of a string. """ adapter = GoodDataAdapter() - # Currently raises ValidationError because sql is a dict not a string. - # When adapter gains dict-style sql support, update this test to verify parsing. - with pytest.raises(Exception): - adapter.parse("tests/fixtures/gooddata/sdk_declarative_ldm.json") + graph = adapter.parse("tests/fixtures/gooddata/sdk_declarative_ldm.json") + + # 6 datasets + 1 date instance = 7 models + assert "campaign_channels" in graph.models + assert "campaign_channels_per_category" in graph.models + assert "campaigns" in graph.models + assert "customers" in graph.models + assert "order_lines" in graph.models + assert "products" in graph.models + assert "date" in graph.models + + # Object-style dataSourceTableId -> path joined into a table name. + campaign_channels = graph.models["campaign_channels"] + assert campaign_channels.table == "demo.campaign_channels" + assert campaign_channels.metadata["gooddata"]["data_source_id"] == "demo-test-ds" + # Original object form is preserved in metadata. + assert campaign_channels.metadata["gooddata"]["data_source_table_id"]["path"] == ["demo", "campaign_channels"] + + # sources-array reference yields the join foreign key. + rel = next(r for r in campaign_channels.relationships if r.name == "campaigns") + assert rel.type == "many_to_one" + assert rel.foreign_key == "campaign_id" + + # sourceColumnDataType (renamed from dataType) drives attribute/fact typing. + channel_id = campaign_channels.get_dimension("campaign_channel_id") + assert channel_id.type == "categorical" + budget = campaign_channels.get_metric("budget") + assert budget.agg == "sum" + assert budget.metadata["gooddata"]["data_type"] == "NUMERIC" + + # SQL-backed dataset: object sql {dataSourceId, statement} -> Model.sql statement. + per_category = graph.models["campaign_channels_per_category"] + assert per_category.sql == "SELECT category, SUM(budget) FROM campaign_channels GROUP BY category" + assert per_category.table is None + assert per_category.metadata["gooddata"]["data_source_id"] == "demo-test-ds" + + # aggregatedFacts (aggregate awareness): source fact + SUM operation. + assert len(per_category.metrics) == 1 + budget_agg = per_category.get_metric("budget_agg") + assert budget_agg.agg == "sum" + assert budget_agg.sql == "budget" + agg_meta = budget_agg.metadata["gooddata"] + assert agg_meta["aggregated_fact"] is True + assert agg_meta["operation"] == "SUM" + assert agg_meta["source_fact"] == "budget" def test_sdk_declarative_ldm_with_sql_dataset(): - """SDK LDM with SQL datasets has dict-style sql field. + """SDK LDM with SQL datasets, object sql, sources refs, and WDF columns. Source: gooddata/gooddata-python-sdk (MIT). 7 datasets, newer ref format, - isNullable facts, 2 SQL datasets with dict-style sql. + isNullable facts, 2 SQL datasets with object-style sql. + """ + adapter = GoodDataAdapter() + graph = adapter.parse("tests/fixtures/gooddata/sdk_declarative_ldm_with_sql_dataset.json") + + # SQL-backed dataset with workspaceDataFilterColumns. + sql_ds = graph.models["Customers_sql_dataset_with_WDF"] + assert sql_ds.sql == "SELECT * FROM v_wdf_customers" + assert sql_ds.table is None + assert sql_ds.metadata["gooddata"]["data_source_id"] == "demo-test-ds" + wdf = sql_ds.metadata["gooddata"]["workspace_data_filter_columns"] + assert wdf == [{"dataType": "STRING", "name": "wdf__region"}] + + # Second SQL dataset still references date via a sources-array reference. + dup = graph.models["Order_lines_duplicate_sql_dataset"] + assert dup.sql == "SELECT * FROM order_lines" + date_rel = next(r for r in dup.relationships if r.name == "date") + assert date_rel.foreign_key == "date" + + # Table-backed dataset: object dataSourceTableId + sources refs + WDF columns. + order_lines = graph.models["order_lines"] + assert order_lines.table == "demo.order_lines" + ref_fks = {r.name: r.foreign_key for r in order_lines.relationships} + assert ref_fks == { + "campaigns": "campaign_id", + "customers": "customer_id", + "date": "date", + "products": "product_id", + } + assert order_lines.metadata["gooddata"]["workspace_data_filter_columns"] == [ + {"dataType": "STRING", "name": "wdf__region"}, + {"dataType": "STRING", "name": "wdf__state"}, + ] + + +def test_sdk_declarative_ldm_sql_dataset_export_roundtrip(): + """Object sql, aggregatedFacts, and WDF columns survive an export round-trip.""" + import json as _json + import tempfile as _tempfile + from pathlib import Path as _Path + + adapter = GoodDataAdapter() + graph = adapter.parse("tests/fixtures/gooddata/sdk_declarative_ldm.json") + + with _tempfile.TemporaryDirectory() as tmpdir: + adapter.export(graph, tmpdir) + data = _json.loads((_Path(tmpdir) / "ldm.json").read_text()) + + datasets = {d["id"]: d for d in data["ldm"]["datasets"]} + + # Object dataSourceTableId preserved verbatim. + assert datasets["campaign_channels"]["dataSourceTableId"]["path"] == ["demo", "campaign_channels"] + + # SQL dataset keeps its statement and emits aggregatedFacts. + per_category = datasets["campaign_channels_per_category"] + # Object-form SQL round-trips back to the SDK shape {dataSourceId, statement}, + # not a bare string, and the data source stays nested (no top-level dataSourceId). + assert per_category["sql"] == { + "dataSourceId": "demo-test-ds", + "statement": "SELECT category, SUM(budget) FROM campaign_channels GROUP BY category", + } + assert "dataSourceId" not in per_category + assert "facts" not in per_category or not per_category["facts"] + agg_facts = per_category["aggregatedFacts"] + assert len(agg_facts) == 1 + assert agg_facts[0]["id"] == "budget_agg" + assert agg_facts[0]["sourceFactReference"] == { + "operation": "SUM", + "reference": {"id": "budget", "type": "fact"}, + } + + # Re-parsing the export preserves the aggregated fact + sql. + with _tempfile.TemporaryDirectory() as tmpdir: + adapter.export(graph, tmpdir) + reparsed = adapter.parse(_Path(tmpdir) / "ldm.json") + reparsed_agg = reparsed.models["campaign_channels_per_category"].get_metric("budget_agg") + assert reparsed_agg.agg == "sum" + assert reparsed_agg.metadata["gooddata"]["operation"] == "SUM" + + +def test_sdk_sql_dataset_object_sql_export_roundtrip(): + """SQL-backed datasets re-emit the SDK ``sql`` object shape on export. + + Object-form ``sql`` ({dataSourceId, statement}) must survive a parse/export + round-trip without collapsing to a bare statement string or hoisting the + data source to a top-level ``dataSourceId``. + """ + adapter = GoodDataAdapter() + graph = adapter.parse("tests/fixtures/gooddata/sdk_declarative_ldm_with_sql_dataset.json") + + with tempfile.TemporaryDirectory() as tmpdir: + adapter.export(graph, tmpdir) + data = json.loads((Path(tmpdir) / "ldm.json").read_text()) + + datasets = {d["id"]: d for d in data["ldm"]["datasets"]} + + sql_ds = datasets["Customers_sql_dataset_with_WDF"] + assert sql_ds["sql"] == {"dataSourceId": "demo-test-ds", "statement": "SELECT * FROM v_wdf_customers"} + assert "dataSourceId" not in sql_ds + assert "dataSourceTableId" not in sql_ds + + dup = datasets["Order_lines_duplicate_sql_dataset"] + assert dup["sql"] == {"dataSourceId": "demo-test-ds", "statement": "SELECT * FROM order_lines"} + assert "dataSourceId" not in dup + + # Table-backed cloud datasets still emit a top-level dataSourceId string. + cloud = adapter.parse("tests/fixtures/gooddata/cloud_ldm.json") + with tempfile.TemporaryDirectory() as tmpdir: + adapter.export(cloud, tmpdir) + cloud_data = json.loads((Path(tmpdir) / "ldm.json").read_text()) + cloud_datasets = {d["id"]: d for d in cloud_data["ldm"]["datasets"]} + assert cloud_datasets["customers"]["dataSourceId"] == "demo" + assert "sql" not in cloud_datasets["customers"] + + # Re-parsing the SQL export recovers the statement and nested data source. + with tempfile.TemporaryDirectory() as tmpdir: + adapter.export(graph, tmpdir) + reparsed = adapter.parse(Path(tmpdir) / "ldm.json") + reparsed_sql = reparsed.models["Customers_sql_dataset_with_WDF"] + assert reparsed_sql.sql == "SELECT * FROM v_wdf_customers" + assert reparsed_sql.table is None + assert reparsed_sql.metadata["gooddata"]["data_source_id"] == "demo-test-ds" + + +def test_gooddata_composite_sources_reference_foreign_key(): + """A multi-column ``sources`` reference sets the full composite foreign key. + + Single-column refs unwrap to a plain string; composite refs keep the list so + the SQL planner joins on every column instead of falling back to ``_id``. """ + ldm = { + "ldm": { + "datasets": [ + { + "id": "orders", + "title": "Orders", + "dataSourceTableId": "orders", + "grain": [{"id": "order_id", "type": "attribute"}], + "references": [ + { + "identifier": {"id": "customers", "type": "dataset"}, + "multivalue": False, + "sources": [ + {"column": "region", "target": {"id": "region", "type": "attribute"}}, + {"column": "customer_code", "target": {"id": "customer_code", "type": "attribute"}}, + ], + }, + { + "identifier": {"id": "products", "type": "dataset"}, + "multivalue": False, + "sources": [ + {"column": "product_id", "target": {"id": "product_id", "type": "attribute"}}, + ], + }, + ], + }, + {"id": "customers", "title": "Customers", "dataSourceTableId": "customers"}, + {"id": "products", "title": "Products", "dataSourceTableId": "products"}, + ] + } + } + adapter = GoodDataAdapter() - # Currently raises ValidationError because sql is a dict not a string. - with pytest.raises(Exception): - adapter.parse("tests/fixtures/gooddata/sdk_declarative_ldm_with_sql_dataset.json") + with tempfile.TemporaryDirectory() as tmpdir: + src = Path(tmpdir) / "ldm.json" + src.write_text(json.dumps(ldm)) + graph = adapter.parse(src) + + orders = graph.models["orders"] + customers_rel = next(r for r in orders.relationships if r.name == "customers") + # Composite key: full column list preserved (not collapsed, not defaulted). + assert customers_rel.foreign_key == ["region", "customer_code"] + assert customers_rel.foreign_key_columns == ["region", "customer_code"] + + products_rel = next(r for r in orders.relationships if r.name == "products") + # Single column unwraps to a plain string. + assert products_rel.foreign_key == "product_id" + + # Export preserves the composite sources array verbatim. + with tempfile.TemporaryDirectory() as tmpdir: + adapter.export(graph, tmpdir) + exported = json.loads((Path(tmpdir) / "ldm.json").read_text()) + exported_orders = next(d for d in exported["ldm"]["datasets"] if d["id"] == "orders") + cust_ref = next(r for r in exported_orders["references"] if r["identifier"]["id"] == "customers") + assert [s["column"] for s in cust_ref["sources"]] == ["region", "customer_code"] diff --git a/tests/adapters/test_added_fixture_coverage.py b/tests/adapters/test_added_fixture_coverage.py index 3aa89d3f..3ca510f1 100644 --- a/tests/adapters/test_added_fixture_coverage.py +++ b/tests/adapters/test_added_fixture_coverage.py @@ -8,7 +8,6 @@ import duckdb import pytest import sqlglot -from pydantic import ValidationError from sqlglot import exp from sidemantic.adapters.atscale_sml import AtScaleSMLAdapter @@ -60,6 +59,8 @@ (CubeAdapter, "tests/fixtures/cube/switch_dimension.yml"), (CubeAdapter, "tests/fixtures/cube/visitors_geo_subquery.yaml"), (GoodDataAdapter, "tests/fixtures/gooddata/ecommerce_demo_ldm.json"), + (GoodDataAdapter, "tests/fixtures/gooddata/sdk_declarative_ldm.json"), + (GoodDataAdapter, "tests/fixtures/gooddata/sdk_declarative_ldm_with_sql_dataset.json"), (HexAdapter, "tests/fixtures/hex/employees.yml"), (HexAdapter, "tests/fixtures/hex/inventory.yml"), (HexAdapter, "tests/fixtures/hex/page_views.yml"), @@ -167,16 +168,6 @@ "tests/fixtures/gooddata/sdk_declarative_analytics_model.json", GoodDataParseError, ), - ( - GoodDataAdapter, - "tests/fixtures/gooddata/sdk_declarative_ldm.json", - ValidationError, - ), - ( - GoodDataAdapter, - "tests/fixtures/gooddata/sdk_declarative_ldm_with_sql_dataset.json", - ValidationError, - ), ] ADDED_EXPECTED_EMPTY_GRAPH_FIXTURES = { diff --git a/tests/adapters/test_fixture_functionality_contracts.py b/tests/adapters/test_fixture_functionality_contracts.py index ecde29a4..c7752a89 100644 --- a/tests/adapters/test_fixture_functionality_contracts.py +++ b/tests/adapters/test_fixture_functionality_contracts.py @@ -60,8 +60,6 @@ EXPECTED_PARSE_FAILURES = { "tests/fixtures/gooddata/ecommerce_demo_analytics.json": GoodDataParseError, "tests/fixtures/gooddata/sdk_declarative_analytics_model.json": GoodDataParseError, - "tests/fixtures/gooddata/sdk_declarative_ldm.json": ValidationError, - "tests/fixtures/gooddata/sdk_declarative_ldm_with_sql_dataset.json": ValidationError, "tests/fixtures/metricflow/sub_daily_grain_to_date_hour.yml": ValidationError, "tests/fixtures/metricflow/sub_daily_millisecond.yml": ValidationError, }