From 3ad8bbcb83dbf3fc1eb932806f153ac7a707a4ba Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 13 Jun 2026 11:14:27 -0700 Subject: [PATCH 01/17] feat(thoughtspot): add first-class TML Model object support ThoughtSpot deprecated the Worksheet object (auto-migrated to Models and removed in 10.12.0.cl). The adapter routed the top-level model: key into the worksheet parser, which only reads worksheet keys, so current TML Model files (export_schema_version v2) imported as empty/incorrect models. - Add _parse_model for the TML Model object: reads model_tables: for tables, model-level columns: for fields, and joins nested inside each model_tables entry via with:/on:/type:/cardinality:. - Map join cardinality (MANY_TO_ONE/ONE_TO_ONE/ONE_TO_MANY/MANY_TO_MANY) to sidemantic relationship types; resolve model_tables aliases used by role-playing dimensions. - Keep _parse_worksheet for legacy worksheet content nested under model:. - Extend loader auto-detection so a model + model_tables + columns .tml/.yaml/.yml file is recognized as ThoughtSpot. - Add sales.model.tml and model_alias.model.tml fixtures plus parsing, alias-resolution, loader auto-detect, and legacy-fallback tests. --- sidemantic/adapters/thoughtspot.py | 258 +++++++++++++++++- sidemantic/loaders.py | 7 + tests/adapters/thoughtspot/test_parsing.py | 178 ++++++++++++ .../thoughtspot/model_alias.model.tml | 32 +++ tests/fixtures/thoughtspot/sales.model.tml | 102 +++++++ 5 files changed, 576 insertions(+), 1 deletion(-) create mode 100644 tests/fixtures/thoughtspot/model_alias.model.tml create mode 100644 tests/fixtures/thoughtspot/sales.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index 56d40eb2..af7fa558 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -28,6 +28,13 @@ _TIME_TYPES = {"DATE", "TIME", "DATETIME", "TIMESTAMP"} _BOOL_TYPES = {"BOOL", "BOOLEAN"} +_CARDINALITY_MAP = { + "MANY_TO_ONE": "many_to_one", + "ONE_TO_ONE": "one_to_one", + "ONE_TO_MANY": "one_to_many", + "MANY_TO_MANY": "many_to_many", +} + _AGGREGATION_MAP = { "SUM": "sum", "COUNT": "count", @@ -217,7 +224,14 @@ def _parse_file(self, file_path: Path) -> Model | None: if "worksheet" in data: return self._parse_worksheet(data.get("worksheet"), data) if "model" in data: - return self._parse_worksheet(data.get("model"), data) + model_def = data.get("model") + # TML Model objects (export_schema_version v2) use `model_tables:` and + # model-level `columns:`. Legacy Worksheet content nested under `model:` + # still uses `tables:`/`worksheet_columns:`, so fall back to the + # worksheet parser for back-compat. + if isinstance(model_def, dict) and ("model_tables" in model_def or "columns" in model_def): + return self._parse_model(model_def, data) + return self._parse_worksheet(model_def, data) return None @@ -455,6 +469,248 @@ def _parse_worksheet(self, worksheet_def: dict[str, Any] | None, full_def: dict[ return model + def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any]) -> Model | None: + """Parse a TML Model object (export_schema_version v2). + + Model TML differs from the legacy Worksheet TML: + - tables live under `model_tables:` (vs worksheet `tables:`) + - joins are nested inside each `model_tables` entry under `joins:`, + using `with:`/`on:`/`type:`/`cardinality:` (vs top-level worksheet + `joins:` with `source`/`destination`/`is_one_to_one`) + - fields live under model-level `columns:` (vs `worksheet_columns:`) + """ + if not model_def: + return None + + name = model_def.get("name") + if not name: + return None + + description = model_def.get("description") + model_tables = model_def.get("model_tables") or [] + + table_name_lookup = self._table_name_lookup(model_tables) + + # Each model_tables entry may carry an `alias` used in column_id paths + # and join expressions; map alias -> table name so refs resolve. + alias_lookup: dict[str, str] = {} + for table in model_tables: + table_name = table.get("name") or table.get("id") + alias = table.get("alias") + if alias and table_name: + alias_lookup[alias] = table_name + # Build the path lookup used to resolve column_id/expression table refs. + path_lookup: dict[str, str] = dict(table_name_lookup) + path_lookup.update(alias_lookup) + + # Flatten nested joins (one per model_tables entry) into the same shape + # the worksheet join helpers consume. + flat_joins: list[dict[str, Any]] = [] + for table in model_tables: + source = table.get("name") or table.get("id") + for join_def in table.get("joins") or []: + destination = join_def.get("with") + if not source or not destination: + continue + resolved_dest = alias_lookup.get(destination, table_name_lookup.get(destination, destination)) + # PyYAML (YAML 1.1) parses the bare `on:` key as the boolean True. + on_value = join_def.get("on") + if on_value is None and True in join_def: + on_value = join_def.get(True) + flat_joins.append( + { + "source": source, + "destination": resolved_dest, + "type": join_def.get("type"), + "on": on_value, + "cardinality": join_def.get("cardinality"), + } + ) + + sql, base_table = self._build_join_sql(model_tables, flat_joins, path_lookup, table_name_lookup) + relationships = self._parse_model_relationships(flat_joins, path_lookup, table_name_lookup) + + formulas = model_def.get("formulas") or [] + formula_by_id = {f.get("id"): f for f in formulas if f.get("id")} + formula_by_name = {f.get("name"): f for f in formulas if f.get("name")} + + dimensions: list[Dimension] = [] + metrics: list[Metric] = [] + + for col_def in model_def.get("columns") or []: + col_name = col_def.get("name") + column_id = col_def.get("column_id") + formula_id = col_def.get("formula_id") + if not col_name: + if formula_id and formula_id in formula_by_id: + col_name = formula_by_id[formula_id].get("name") + elif column_id: + col_name = column_id.split("::")[-1] + + if not col_name: + continue + + properties = col_def.get("properties") or {} + column_type = _normalize(properties.get("column_type")) or "ATTRIBUTE" + bucket = _map_bucket(properties.get("default_date_bucket")) + label = col_def.get("custom_name") or col_def.get("display_name") + col_description = col_def.get("description") + format_pattern = properties.get("format_pattern") + + sql_expr = None + if formula_id and formula_id in formula_by_id: + sql_expr = formula_by_id[formula_id].get("expr") + elif formula_id and formula_id in formula_by_name: + sql_expr = formula_by_name[formula_id].get("expr") + elif col_name in formula_by_name: + sql_expr = formula_by_name[col_name].get("expr") + + if not sql_expr and column_id: + if "::" in column_id: + path_id, col_ref = column_id.split("::", 1) + table_name = path_lookup.get(path_id) + if table_name: + sql_expr = f"{table_name}.{col_ref}" + else: + sql_expr = col_ref + else: + sql_expr = column_id + + sql_expr = _convert_tml_expr(sql_expr, path_lookup) + + if column_type == "MEASURE": + agg, unsupported_func = _map_aggregation(properties.get("aggregation")) + metric_sql = sql_expr + if agg: + metric = Metric( + name=col_name, + agg=agg, + sql=metric_sql, + label=label, + description=col_description, + format=format_pattern, + ) + else: + if unsupported_func: + metric_sql = f"{unsupported_func}({metric_sql})" if metric_sql else unsupported_func + metric = Metric( + name=col_name, + type="derived", + sql=metric_sql, + label=label, + description=col_description, + format=format_pattern, + ) + metrics.append(metric) + else: + data_type = col_def.get("data_type") or (col_def.get("db_column_properties") or {}).get("data_type") + dim_type, granularity = _map_dimension_type(data_type, bucket) + dim = Dimension( + name=col_name, + type=dim_type, + sql=sql_expr, + granularity=granularity, + label=label, + description=col_description, + format=format_pattern, + ) + dimensions.append(dim) + + default_time_dimension = None + default_grain = None + for dim in dimensions: + if dim.type == "time": + default_time_dimension = dim.name + default_grain = dim.granularity + break + + primary_key = "id" + if any(d.name.lower() == "id" for d in dimensions): + primary_key = next(d.name for d in dimensions if d.name.lower() == "id") + + model = Model( + name=name, + table=base_table if not sql else None, + sql=sql, + description=description, + primary_key=primary_key, + dimensions=dimensions, + metrics=metrics, + relationships=relationships, + default_time_dimension=default_time_dimension, + default_grain=default_grain, + ) + + if base_table: + setattr(model, "_worksheet_base_table", base_table) + setattr(model, "_source_tml_type", "model") + + return model + + def _parse_model_relationships( + self, + joins: list[dict[str, Any]], + table_path_lookup: dict[str, str] | None = None, + table_name_lookup: dict[str, str] | None = None, + ) -> list[Relationship]: + """Build relationships from flattened model_tables joins. + + Model joins carry `cardinality:` directly (MANY_TO_ONE/ONE_TO_ONE/ + ONE_TO_MANY/MANY_TO_MANY), unlike worksheet joins which only flag + `is_one_to_one`. + """ + relationships: list[Relationship] = [] + + for join_def in joins: + source = join_def.get("source") + destination = join_def.get("destination") + if not source or not destination: + continue + + join_type = _normalize(join_def.get("type")) or "INNER" + on_value = join_def.get("on") + lookup = table_path_lookup or table_name_lookup + left, right = _extract_join_refs(on_value, lookup) + + if table_name_lookup: + source = table_name_lookup.get(source, source) + destination = table_name_lookup.get(destination, destination) + + foreign_key = None + primary_key = None + + if left and right: + left_table, left_col = left + right_table, right_col = right + + if left_table == source and right_table == destination: + foreign_key = left_col + primary_key = right_col + elif left_table == destination and right_table == source: + foreign_key = right_col + primary_key = left_col + else: + if left_table == source: + foreign_key = left_col + if right_table == destination: + primary_key = right_col + + cardinality = _normalize(join_def.get("cardinality")) + rel_type = _CARDINALITY_MAP.get(cardinality or "", "many_to_one") + if join_type in {"RIGHT_OUTER", "FULL_OUTER", "OUTER"} and cardinality not in _CARDINALITY_MAP: + rel_type = "many_to_many" + + relationships.append( + Relationship( + name=destination, + type=rel_type, + foreign_key=foreign_key, + primary_key=primary_key, + ) + ) + + return relationships + def _build_join_sql( self, tables: list[dict[str, Any]], diff --git a/sidemantic/loaders.py b/sidemantic/loaders.py index 40c622b2..735cd0cb 100644 --- a/sidemantic/loaders.py +++ b/sidemantic/loaders.py @@ -184,6 +184,13 @@ def load_from_directory(layer: "SemanticLayer", directory: str | Path, *, strict adapter = ThoughtSpotAdapter() elif _contains_yaml_key(yaml_data, "worksheet") and _contains_yaml_key(yaml_data, "worksheet_columns"): adapter = ThoughtSpotAdapter() + elif ( + _contains_yaml_key(yaml_data, "model") + and _contains_yaml_key(yaml_data, "model_tables") + and _contains_yaml_key(yaml_data, "columns") + ): + # ThoughtSpot TML Model object (export_schema_version v2) + adapter = ThoughtSpotAdapter() elif _yaml_has_top_level_key(yaml_data, "tables") and _contains_yaml_key(yaml_data, "base_table"): # Snowflake Cortex Semantic Model format adapter = SnowflakeAdapter() diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index b01068d6..1236cdd8 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -674,3 +674,181 @@ def test_import_tpch_liveboard_skipped(): adapter = ThoughtSpotAdapter() graph = adapter.parse("tests/fixtures/thoughtspot/tpch_liveboard.liveboard.tml") assert len(graph.models) == 0 + + +# ============================================================================= +# TML MODEL OBJECT (export_schema_version v2) +# ============================================================================= + + +def test_import_thoughtspot_model(): + """Parse a first-class TML Model object (model_tables + columns + nested joins).""" + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/sales.model.tml") + + assert "sales_model" in graph.models + model = graph.models["sales_model"] + + # Nested model_tables joins produce a joined SQL and relationships. + assert model.sql is not None + assert "JOIN customers" in model.sql + assert "JOIN regions" in model.sql + # Composite ON predicate from the regions join is preserved. + assert "country_code" in model.sql + + relationships = {rel.name: rel for rel in model.relationships} + # cardinality: MANY_TO_ONE + assert relationships["customers"].type == "many_to_one" + assert relationships["customers"].foreign_key == "customer_id" + assert relationships["customers"].primary_key == "id" + # cardinality: ONE_TO_ONE + assert relationships["regions"].type == "one_to_one" + assert relationships["regions"].foreign_key == "region_id" + assert relationships["regions"].primary_key == "id" + + # Dimensions: column_id :: resolves to table.column. + order_id = model.get_dimension("order_id") + assert order_id is not None + assert order_id.sql == "sales.id" + assert order_id.description == "Order identifier" + + order_date = model.get_dimension("order_date") + assert order_date is not None + assert order_date.type == "time" + assert order_date.granularity == "day" + assert order_date.label == "Order Date" + + is_active = model.get_dimension("is_active") + assert is_active is not None + assert is_active.type == "boolean" + + # Column from a joined table resolves to that table's name. + customer_name = model.get_dimension("customer_name") + assert customer_name is not None + assert customer_name.sql == "customers.name" + assert customer_name.label == "Customer" + + region_name = model.get_dimension("region_name") + assert region_name is not None + assert region_name.sql == "regions.name" + + # Simple aggregated measure. + gross_revenue = model.get_metric("gross_revenue") + assert gross_revenue is not None + assert gross_revenue.agg == "sum" + assert gross_revenue.format == "$#,##0.00" + assert gross_revenue.sql == "sales.gross_revenue" + + # Formula-backed measures. + net_revenue = model.get_metric("net_revenue") + assert net_revenue is not None + assert net_revenue.agg == "sum" + assert "gross_revenue" in (net_revenue.sql or "") + + avg_order_value = model.get_metric("avg_order_value") + assert avg_order_value is not None + assert avg_order_value.agg == "avg" + assert "/" in (avg_order_value.sql or "") + + # Aggregation coverage. + assert model.get_metric("distinct_customers").agg == "count_distinct" + assert model.get_metric("order_count").agg == "count" + assert model.get_metric("min_order_value").agg == "min" + assert model.get_metric("max_order_value").agg == "max" + assert model.get_metric("median_order_value").agg == "median" + + # Unsupported aggregation falls back to a derived metric. + revenue_stddev = model.get_metric("revenue_stddev") + assert revenue_stddev is not None + assert revenue_stddev.type == "derived" + assert "STDDEV" in (revenue_stddev.sql or "") + + +def test_import_thoughtspot_model_alias(): + """Parse a TML Model with id-based tables and an aliased (role-playing) join.""" + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_alias.model.tml") + + assert "orders_model" in graph.models + model = graph.models["orders_model"] + + # The alias `ship_country` in column_id / join `with` resolves to `countries`. + assert model.sql is not None + assert "JOIN countries" in model.sql + + relationships = {rel.name: rel for rel in model.relationships} + assert "countries" in relationships + assert relationships["countries"].type == "many_to_one" + assert relationships["countries"].foreign_key == "ship_country_id" + assert relationships["countries"].primary_key == "id" + + ship_country = model.get_dimension("ship_country_name") + assert ship_country is not None + assert ship_country.sql == "countries.name" + + amount = model.get_metric("amount") + assert amount is not None + assert amount.agg == "sum" + + +def test_thoughtspot_model_auto_detect_loader(): + """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" + with tempfile.TemporaryDirectory() as tmpdir: + tml_file = Path(tmpdir) / "sales.model.yaml" + tml_file.write_text( + """ +model: + name: sales_model + description: Sales model + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + columns: + - name: order_id + column_id: sales::id + properties: + column_type: ATTRIBUTE + - name: amount + column_id: sales::amount + properties: + column_type: MEASURE + aggregation: SUM +""" + ) + + layer = SemanticLayer() + load_from_directory(layer, tmpdir) + + assert "sales_model" in layer.graph.models + model = layer.graph.models["sales_model"] + assert model.get_metric("amount").agg == "sum" + + +def test_thoughtspot_legacy_model_key_still_worksheet(): + """Legacy worksheet content nested under `model:` still parses as a worksheet.""" + adapter = ThoughtSpotAdapter() + tml_def = { + "model": { + "name": "legacy_sheet", + "tables": [{"name": "orders"}], + "worksheet_columns": [ + { + "name": "amount", + "column_id": "orders::amount", + "properties": {"column_type": "MEASURE", "aggregation": "SUM"}, + }, + ], + } + } + + with tempfile.NamedTemporaryFile(mode="w", suffix=".tml", delete=False) as f: + yaml.safe_dump(tml_def, f, sort_keys=False) + temp_path = Path(f.name) + + try: + graph = adapter.parse(temp_path) + assert "legacy_sheet" in graph.models + model = graph.models["legacy_sheet"] + assert model.get_metric("amount").agg == "sum" + finally: + temp_path.unlink(missing_ok=True) diff --git a/tests/fixtures/thoughtspot/model_alias.model.tml b/tests/fixtures/thoughtspot/model_alias.model.tml new file mode 100644 index 00000000..e6e4d969 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_alias.model.tml @@ -0,0 +1,32 @@ +guid: "model-alias" +export_schema_version: "2" +model: + name: orders_model + description: Model with id-based tables and an aliased role-playing join + model_tables: + - name: orders + id: orders_tbl + fqn: ANALYTICS.PUBLIC.orders + joins: + - with: ship_country + on: "[orders::ship_country_id] = [ship_country::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: countries + id: countries_tbl + fqn: ANALYTICS.PUBLIC.countries + alias: ship_country + columns: + - name: order_id + column_id: orders::id + properties: + column_type: ATTRIBUTE + - name: ship_country_name + column_id: ship_country::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: orders::amount + properties: + column_type: MEASURE + aggregation: SUM diff --git a/tests/fixtures/thoughtspot/sales.model.tml b/tests/fixtures/thoughtspot/sales.model.tml new file mode 100644 index 00000000..5a292e77 --- /dev/null +++ b/tests/fixtures/thoughtspot/sales.model.tml @@ -0,0 +1,102 @@ +guid: "model-001" +export_schema_version: "2" +model: + name: sales_model + description: Sales model (TML v2 export) + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: customers + on: "[sales::customer_id] = [customers::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - with: regions + on: "[sales::region_id] = [regions::id] AND [sales::country_code] = [regions::country_code]" + type: INNER + cardinality: ONE_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + - name: regions + fqn: ANALYTICS.PUBLIC.regions + formulas: + - name: net_revenue + expr: "[sales::gross_revenue] - [sales::discount]" + id: net_rev + - name: avg_order_value + expr: "[sales::gross_revenue] / [sales::order_count]" + id: avg_order + columns: + - name: order_id + description: Order identifier + column_id: sales::id + properties: + column_type: ATTRIBUTE + - name: order_date + column_id: sales::order_date + data_type: DATE + custom_name: Order Date + properties: + column_type: ATTRIBUTE + default_date_bucket: DAILY + format_pattern: "YYYY-MM-DD" + - name: is_active + column_id: sales::is_active + data_type: BOOLEAN + properties: + column_type: ATTRIBUTE + - name: customer_name + column_id: customers::name + custom_name: Customer + properties: + column_type: ATTRIBUTE + - name: region_name + column_id: regions::name + properties: + column_type: ATTRIBUTE + - name: gross_revenue + column_id: sales::gross_revenue + properties: + column_type: MEASURE + aggregation: SUM + format_pattern: "$#,##0.00" + - name: net_revenue + formula_id: net_rev + properties: + column_type: MEASURE + aggregation: SUM + - name: avg_order_value + formula_id: avg_order + properties: + column_type: MEASURE + aggregation: AVERAGE + - name: distinct_customers + column_id: sales::customer_id + properties: + column_type: MEASURE + aggregation: COUNT_DISTINCT + - name: order_count + column_id: sales::id + properties: + column_type: MEASURE + aggregation: COUNT + - name: min_order_value + column_id: sales::gross_revenue + properties: + column_type: MEASURE + aggregation: MIN + - name: max_order_value + column_id: sales::gross_revenue + properties: + column_type: MEASURE + aggregation: MAX + - name: median_order_value + column_id: sales::gross_revenue + properties: + column_type: MEASURE + aggregation: MEDIAN + - name: revenue_stddev + column_id: sales::gross_revenue + properties: + column_type: MEASURE + aggregation: STD_DEVIATION From ede3e1858b15b9338be7b6409302f2fa73a3b038 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sat, 13 Jun 2026 15:33:45 -0700 Subject: [PATCH 02/17] fix(thoughtspot): keep table qualifier for model column_id name refs Model column_id paths reference a table by its name even when the table also has an id (e.g. column_id: orders::amount for a table with id: orders_tbl). _table_name_lookup only mapped id -> name, so those refs missed the lookup and the qualifier was dropped, emitting bare columns like amount/id that are ambiguous in joined model SQL. Map name -> name in addition to id -> name (id mappings take precedence) so column_id qualifiers resolve to table.column. --- sidemantic/adapters/thoughtspot.py | 9 +++++++++ tests/adapters/thoughtspot/test_parsing.py | 7 +++++++ 2 files changed, 16 insertions(+) diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index af7fa558..3751d04f 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -884,6 +884,15 @@ def _table_name_lookup(self, tables: list[dict[str, Any]]) -> dict[str, str]: table_name = table.get("name") or table.get("id") if table_id and table_name: lookup[table_id] = table_name + # Model `column_id`s reference a table by its `name` even when an `id` + # is present (e.g. `column_id: orders::amount` for a table with + # `id: orders_tbl`). Map `name -> name` too so those qualifiers resolve + # instead of being dropped, which would emit ambiguous unqualified + # columns in joined models. `id -> name` mappings take precedence. + for table in tables: + table_name = table.get("name") + if table_name and table_name not in lookup: + lookup[table_name] = table_name return lookup def export(self, graph: SemanticGraph, output_path: str | Path) -> None: diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index 1236cdd8..2c1d323e 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -786,9 +786,16 @@ def test_import_thoughtspot_model_alias(): assert ship_country is not None assert ship_country.sql == "countries.name" + # `column_id` paths that use the table `name` (even when an `id` exists) + # keep their table qualifier instead of collapsing to a bare column. + order_id = model.get_dimension("order_id") + assert order_id is not None + assert order_id.sql == "orders.id" + amount = model.get_metric("amount") assert amount is not None assert amount.agg == "sum" + assert amount.sql == "orders.amount" def test_thoughtspot_model_auto_detect_loader(): From 924af094202cc50bb1cb5ca4097782a6f92310e4 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 07:00:54 -0700 Subject: [PATCH 03/17] fix(thoughtspot): make joined models queryable and preserve role-playing aliases Joined Model TMLs are exported as derived SQL (FROM () AS t) but their dimension/metric SQL kept inner table qualifiers like sales.gross_revenue, which are out of scope in the wrapping subquery and made joined models unqueryable. Rewrite the inner SELECT * into an explicit projection that aliases each table.column to a stable unqualified output name, expose the primary key, and rewrite column SQL to those aliases. Also preserve with: aliases as role identifiers instead of resolving them to the backing table, so role-playing joins (ship_country/bill_country both backed by countries) stay distinct via JOIN countries AS rather than collapsing into a single ambiguous countries relation. --- sidemantic/adapters/thoughtspot.py | 127 ++++++++++++++++-- tests/adapters/thoughtspot/test_parsing.py | 122 ++++++++++++++--- .../thoughtspot/role_playing.model.tml | 41 ++++++ 3 files changed, 263 insertions(+), 27 deletions(-) create mode 100644 tests/fixtures/thoughtspot/role_playing.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index 3751d04f..b3c949ac 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -187,6 +187,82 @@ def _simple_column(sql: str | None, fallback: str | None) -> str | None: return fallback +def _expose_joined_columns( + sql: str | None, + tables: set[str], + dimensions: list[Dimension], + metrics: list[Metric], + base_table: str | None = None, + primary_key: str | None = None, +) -> str | None: + """Rewrite a joined model's derived SQL so its columns are queryable. + + A joined Model TML becomes a derived table (``FROM () AS t``) whose + column expressions still carry inner table qualifiers like ``sales.amount``. + Those qualifiers are out of scope once the join is wrapped in a subquery, so + a normal query (e.g. ``SELECT sales.amount FROM (...) AS t``) fails with + "table sales not found". This replaces the ``SELECT *`` projection with an + explicit list that aliases each referenced ``table.column`` to a stable, + unqualified output name, then rewrites the dimension/metric SQL to use those + aliases so the outer query stays in scope. + + The model's ``primary_key`` is also passed through by the SQL generator as a + bare column, so the base table's primary key is exposed under that name when + no model column already projects it. + """ + if not sql or "SELECT * FROM " not in sql: + return sql + + # Collect every distinct `table.column` referenced by the model's columns + # where `table` is one of the joined tables. Preserve first-seen order so + # the generated projection is deterministic. + projection: dict[tuple[str, str], str] = {} + + def _collect(expr: str | None) -> None: + if not expr: + return + for table, column in _TML_DOT_REF.findall(expr): + if table in tables: + projection.setdefault((table, column), f"{table}__{column}") + + for dim in dimensions: + _collect(dim.sql) + for metric in metrics: + _collect(metric.sql) + + if not projection: + return sql + + def _rewrite(expr: str | None) -> str | None: + if not expr: + return expr + + def _replace(match: re.Match[str]) -> str: + table = match.group(1) + column = match.group(2) + alias = projection.get((table, column)) + return alias if alias else match.group(0) + + return _TML_DOT_REF.sub(_replace, expr) + + for dim in dimensions: + dim.sql = _rewrite(dim.sql) + for metric in metrics: + metric.sql = _rewrite(metric.sql) + + select_parts = [f"{table}.{column} AS {alias}" for (table, column), alias in projection.items()] + + # The SQL generator passes through `model.primary_key` as a bare column when + # querying derived models. Expose `.` under that + # name so the key resolves instead of referencing an out-of-scope column. + aliases = set(projection.values()) + if base_table and primary_key and primary_key not in aliases: + select_parts.append(f"{base_table}.{primary_key} AS {primary_key}") + + select_list = ", ".join(select_parts) + return sql.replace("SELECT * FROM ", f"SELECT {select_list} FROM ", 1) + + class ThoughtSpotAdapter(BaseAdapter): """Adapter for ThoughtSpot TML (YAML) tables and worksheets.""" @@ -492,16 +568,23 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any table_name_lookup = self._table_name_lookup(model_tables) # Each model_tables entry may carry an `alias` used in column_id paths - # and join expressions; map alias -> table name so refs resolve. - alias_lookup: dict[str, str] = {} + # and join expressions. The alias is the role identifier (e.g. + # `ship_country`/`bill_country` both backed by `countries`), so keep the + # alias as the join/relationship/qualifier name and only track the + # underlying table for emitting `JOIN
AS `. Resolving the + # alias away here would collapse distinct role-playing joins into a + # single ambiguous `countries` relation. + alias_to_table: dict[str, str] = {} for table in model_tables: table_name = table.get("name") or table.get("id") alias = table.get("alias") if alias and table_name: - alias_lookup[alias] = table_name + alias_to_table[alias] = table_name # Build the path lookup used to resolve column_id/expression table refs. + # Aliases resolve to themselves so qualifiers stay role-scoped. path_lookup: dict[str, str] = dict(table_name_lookup) - path_lookup.update(alias_lookup) + for alias in alias_to_table: + path_lookup[alias] = alias # Flatten nested joins (one per model_tables entry) into the same shape # the worksheet join helpers consume. @@ -512,7 +595,12 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any destination = join_def.get("with") if not source or not destination: continue - resolved_dest = alias_lookup.get(destination, table_name_lookup.get(destination, destination)) + # Keep an aliased destination as-is (the role name); only resolve + # non-aliased ids to their table name. + if destination in alias_to_table: + resolved_dest = destination + else: + resolved_dest = table_name_lookup.get(destination, destination) # PyYAML (YAML 1.1) parses the bare `on:` key as the boolean True. on_value = join_def.get("on") if on_value is None and True in join_def: @@ -527,7 +615,7 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any } ) - sql, base_table = self._build_join_sql(model_tables, flat_joins, path_lookup, table_name_lookup) + sql, base_table = self._build_join_sql(model_tables, flat_joins, path_lookup, table_name_lookup, alias_to_table) relationships = self._parse_model_relationships(flat_joins, path_lookup, table_name_lookup) formulas = model_def.get("formulas") or [] @@ -616,6 +704,21 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any ) dimensions.append(dim) + primary_key = "id" + if any(d.name.lower() == "id" for d in dimensions): + primary_key = next(d.name for d in dimensions if d.name.lower() == "id") + + # A joined model is exported as derived SQL (FROM () AS t); rewrite + # its `SELECT *` into explicit aliased columns and update the dimension/ + # metric SQL so the inner table qualifiers stay in scope when queried. + if sql: + known_tables = set(table_name_lookup.values()) + for join_def in flat_joins: + known_tables.add(join_def.get("source")) + known_tables.add(join_def.get("destination")) + known_tables.discard(None) + sql = _expose_joined_columns(sql, known_tables, dimensions, metrics, base_table, primary_key) + default_time_dimension = None default_grain = None for dim in dimensions: @@ -624,10 +727,6 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any default_grain = dim.granularity break - primary_key = "id" - if any(d.name.lower() == "id" for d in dimensions): - primary_key = next(d.name for d in dimensions if d.name.lower() == "id") - model = Model( name=name, table=base_table if not sql else None, @@ -717,6 +816,7 @@ def _build_join_sql( joins: list[dict[str, Any]], table_path_lookup: dict[str, str] | None = None, table_name_lookup: dict[str, str] | None = None, + alias_to_table: dict[str, str] | None = None, ) -> tuple[str | None, str | None]: base_table = None if tables: @@ -766,7 +866,12 @@ def _build_join_sql( "INNER": "INNER", }.get(join_type, "INNER") - clauses.append(f"{join_keyword} JOIN {right} ON {on_expr}") + # Role-playing joins keep the alias as the relation name; emit + # `
AS ` so two roles backed by the same table do not + # produce an ambiguous, duplicated join. + backing = alias_to_table.get(right) if alias_to_table else None + right_clause = f"{backing} AS {right}" if backing and backing != right else right + clauses.append(f"{join_keyword} JOIN {right_clause} ON {on_expr}") joined.add(right) if not clauses: diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index 2c1d323e..c0294cd6 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -706,10 +706,18 @@ def test_import_thoughtspot_model(): assert relationships["regions"].foreign_key == "region_id" assert relationships["regions"].primary_key == "id" - # Dimensions: column_id
:: resolves to table.column. + # A joined model becomes a derived table. Its `SELECT *` is rewritten into an + # explicit projection that aliases each inner `table.column` to a stable, + # unqualified output name so the columns stay in scope when queried. + assert "SELECT *" not in model.sql + assert "sales.id AS sales__id" in model.sql + assert "customers.name AS customers__name" in model.sql + + # Dimensions reference the unqualified aliased output names, not inner + # `table.column` qualifiers that are out of scope in the derived subquery. order_id = model.get_dimension("order_id") assert order_id is not None - assert order_id.sql == "sales.id" + assert order_id.sql == "sales__id" assert order_id.description == "Order identifier" order_date = model.get_dimension("order_date") @@ -722,22 +730,22 @@ def test_import_thoughtspot_model(): assert is_active is not None assert is_active.type == "boolean" - # Column from a joined table resolves to that table's name. + # Column from a joined table resolves to that table's aliased output name. customer_name = model.get_dimension("customer_name") assert customer_name is not None - assert customer_name.sql == "customers.name" + assert customer_name.sql == "customers__name" assert customer_name.label == "Customer" region_name = model.get_dimension("region_name") assert region_name is not None - assert region_name.sql == "regions.name" + assert region_name.sql == "regions__name" # Simple aggregated measure. gross_revenue = model.get_metric("gross_revenue") assert gross_revenue is not None assert gross_revenue.agg == "sum" assert gross_revenue.format == "$#,##0.00" - assert gross_revenue.sql == "sales.gross_revenue" + assert gross_revenue.sql == "sales__gross_revenue" # Formula-backed measures. net_revenue = model.get_metric("net_revenue") @@ -772,30 +780,112 @@ def test_import_thoughtspot_model_alias(): assert "orders_model" in graph.models model = graph.models["orders_model"] - # The alias `ship_country` in column_id / join `with` resolves to `countries`. + # The alias `ship_country` is the role identifier and is preserved as the + # join relation (`countries AS ship_country`) and relationship name, instead + # of being resolved away to the backing `countries` table. assert model.sql is not None - assert "JOIN countries" in model.sql + assert "JOIN countries AS ship_country" in model.sql relationships = {rel.name: rel for rel in model.relationships} - assert "countries" in relationships - assert relationships["countries"].type == "many_to_one" - assert relationships["countries"].foreign_key == "ship_country_id" - assert relationships["countries"].primary_key == "id" + assert "ship_country" in relationships + assert relationships["ship_country"].type == "many_to_one" + assert relationships["ship_country"].foreign_key == "ship_country_id" + assert relationships["ship_country"].primary_key == "id" ship_country = model.get_dimension("ship_country_name") assert ship_country is not None - assert ship_country.sql == "countries.name" + assert ship_country.sql == "ship_country__name" # `column_id` paths that use the table `name` (even when an `id` exists) - # keep their table qualifier instead of collapsing to a bare column. + # keep their table qualifier instead of collapsing to a bare column; the + # qualifier is carried through to the aliased output name. order_id = model.get_dimension("order_id") assert order_id is not None - assert order_id.sql == "orders.id" + assert order_id.sql == "orders__id" amount = model.get_metric("amount") assert amount is not None assert amount.agg == "sum" - assert amount.sql == "orders.amount" + assert amount.sql == "orders__amount" + + +def test_thoughtspot_joined_model_is_queryable(): + """A joined Model TML compiles to SQL that executes (columns stay in scope). + + Regression: joined models were exported as `FROM (SELECT * FROM sales JOIN ...) AS t` + while dimensions/metrics kept inner qualifiers like `sales.gross_revenue`, so a + normal query produced `SELECT sales.gross_revenue FROM (...) AS t` and failed with + "table sales not found". + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/sales.model.tml") + + layer = SemanticLayer() + for model in graph.models.values(): + layer.add_model(model) + + con = duckdb.connect() + con.execute( + "CREATE TABLE sales (id INT, order_date DATE, is_active BOOL, gross_revenue DOUBLE, " + "discount DOUBLE, order_count INT, customer_id INT, region_id INT, country_code VARCHAR)" + ) + con.execute( + "INSERT INTO sales VALUES (1, DATE '2024-01-01', true, 100.0, 10.0, 2, 5, 7, 'US'), " + "(2, DATE '2024-01-01', true, 50.0, 5.0, 1, 5, 7, 'US')" + ) + con.execute("CREATE TABLE customers (id INT, name VARCHAR)") + con.execute("INSERT INTO customers VALUES (5, 'Acme')") + con.execute("CREATE TABLE regions (id INT, country_code VARCHAR, name VARCHAR)") + con.execute("INSERT INTO regions VALUES (7, 'US', 'West')") + + sql = layer.compile( + metrics=["sales_model.gross_revenue", "sales_model.net_revenue", "sales_model.order_count"], + dimensions=["sales_model.order_date", "sales_model.customer_name", "sales_model.region_name"], + ) + rows = con.execute(sql).fetchall() + assert rows == [(__import__("datetime").date(2024, 1, 1), "Acme", "West", 150.0, 135.0, 2)] + + +def test_thoughtspot_role_playing_joins_stay_distinct(): + """Two aliases backed by the same table become distinct role-playing joins. + + Regression: resolving `with:` aliases to the backing table name collapsed + `ship_country` and `bill_country` (both backed by `countries`) into a single + ambiguous `countries` join/relationship. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/role_playing.model.tml") + model = graph.models["shipments_model"] + + # Both aliases survive as distinct, aliased joins and relationships. + assert "JOIN countries AS ship_country" in model.sql + assert "JOIN countries AS bill_country" in model.sql + rel_names = {rel.name for rel in model.relationships} + assert {"ship_country", "bill_country"} <= rel_names + assert model.get_dimension("ship_country_name").sql == "ship_country__name" + assert model.get_dimension("bill_country_name").sql == "bill_country__name" + + layer = SemanticLayer() + for m in graph.models.values(): + layer.add_model(m) + + con = duckdb.connect() + con.execute("CREATE TABLE orders (id INT, amount DOUBLE, ship_country_id INT, bill_country_id INT)") + con.execute("INSERT INTO orders VALUES (1, 100.0, 7, 8)") + con.execute("CREATE TABLE countries (id INT, name VARCHAR)") + con.execute("INSERT INTO countries VALUES (7, 'US'), (8, 'CA')") + + sql = layer.compile( + metrics=["shipments_model.amount"], + dimensions=["shipments_model.ship_country_name", "shipments_model.bill_country_name"], + ) + rows = con.execute(sql).fetchall() + # The two roles resolve to different countries (US shipped, CA billed). + assert rows == [("US", "CA", 100.0)] def test_thoughtspot_model_auto_detect_loader(): diff --git a/tests/fixtures/thoughtspot/role_playing.model.tml b/tests/fixtures/thoughtspot/role_playing.model.tml new file mode 100644 index 00000000..c52c6da9 --- /dev/null +++ b/tests/fixtures/thoughtspot/role_playing.model.tml @@ -0,0 +1,41 @@ +guid: "model-role-playing" +export_schema_version: "2" +model: + name: shipments_model + description: Role-playing model with two aliases backed by the same table + model_tables: + - name: orders + fqn: ANALYTICS.PUBLIC.orders + joins: + - with: ship_country + on: "[orders::ship_country_id] = [ship_country::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - with: bill_country + on: "[orders::bill_country_id] = [bill_country::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: countries + fqn: ANALYTICS.PUBLIC.countries + alias: ship_country + - name: countries + fqn: ANALYTICS.PUBLIC.countries + alias: bill_country + columns: + - name: order_id + column_id: orders::id + properties: + column_type: ATTRIBUTE + - name: ship_country_name + column_id: ship_country::name + properties: + column_type: ATTRIBUTE + - name: bill_country_name + column_id: bill_country::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: orders::amount + properties: + column_type: MEASURE + aggregation: SUM From 8f9777cd1faa4b3a63930d12cf520fd12199d625 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 08:31:36 -0700 Subject: [PATCH 04/17] fix(thoughtspot): resolve unqualified formula refs and aliased base tables in joined models --- sidemantic/adapters/thoughtspot.py | 65 ++++++++++++--- tests/adapters/thoughtspot/test_parsing.py | 80 +++++++++++++++++++ .../thoughtspot/model_base_alias.model.tml | 30 +++++++ .../model_unqualified_formula.model.tml | 38 +++++++++ 4 files changed, 201 insertions(+), 12 deletions(-) create mode 100644 tests/fixtures/thoughtspot/model_base_alias.model.tml create mode 100644 tests/fixtures/thoughtspot/model_unqualified_formula.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index b3c949ac..ceeef7a1 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -54,6 +54,9 @@ _SIMPLE_IDENTIFIER = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)*$") _TML_REF = re.compile(r"\[([^\]]+)\]") _TML_DOT_REF = re.compile(r"\b([A-Za-z_][A-Za-z0-9_]*)\.([A-Za-z_][A-Za-z0-9_]*)\b") +# A bare identifier that is NOT part of a `table.column` qualifier and is NOT a +# function call: not preceded by `.`/word char, not followed by `.` or `(`. +_BARE_IDENTIFIER = re.compile(r"(? str | None: @@ -233,6 +236,21 @@ def _collect(expr: str | None) -> None: if not projection: return sql + # ThoughtSpot formulas also use unqualified references (e.g. + # `[gross_revenue] - [sales::discount]`), which convert to bare identifiers + # like `gross_revenue`. Map each such column name to its projected alias when + # exactly one joined table projects that column, so the rewritten expression + # uses the in-scope output alias instead of an out-of-scope bare column. + bare_to_alias: dict[str, str] = {} + ambiguous: set[str] = set() + for (_table, column), alias in projection.items(): + if column in bare_to_alias and bare_to_alias[column] != alias: + ambiguous.add(column) + else: + bare_to_alias[column] = alias + for column in ambiguous: + bare_to_alias.pop(column, None) + def _rewrite(expr: str | None) -> str | None: if not expr: return expr @@ -243,7 +261,12 @@ def _replace(match: re.Match[str]) -> str: alias = projection.get((table, column)) return alias if alias else match.group(0) - return _TML_DOT_REF.sub(_replace, expr) + rewritten = _TML_DOT_REF.sub(_replace, expr) + + def _replace_bare(match: re.Match[str]) -> str: + return bare_to_alias.get(match.group(1), match.group(0)) + + return _BARE_IDENTIFIER.sub(_replace_bare, rewritten) for dim in dimensions: dim.sql = _rewrite(dim.sql) @@ -587,10 +610,13 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any path_lookup[alias] = alias # Flatten nested joins (one per model_tables entry) into the same shape - # the worksheet join helpers consume. + # the worksheet join helpers consume. When a table carries an `alias`, + # its `column_id`/`on` qualifiers use the alias (e.g. `o::id`), so the + # join `source` must be the alias too (not the backing table name) for + # the join-direction logic and SQL relation name to stay consistent. flat_joins: list[dict[str, Any]] = [] for table in model_tables: - source = table.get("name") or table.get("id") + source = table.get("alias") or table.get("name") or table.get("id") for join_def in table.get("joins") or []: destination = join_def.get("with") if not source or not destination: @@ -713,6 +739,10 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any # metric SQL so the inner table qualifiers stay in scope when queried. if sql: known_tables = set(table_name_lookup.values()) + # Aliases (role-playing or a base-table alias) are the in-scope + # relation names that qualify columns like `o.id`/`ship_country.name`, + # so they must be recognized when projecting the derived columns. + known_tables.update(alias_to_table.keys()) for join_def in flat_joins: known_tables.add(join_def.get("source")) known_tables.add(join_def.get("destination")) @@ -819,16 +849,24 @@ def _build_join_sql( alias_to_table: dict[str, str] | None = None, ) -> tuple[str | None, str | None]: base_table = None + base_alias = None if tables: base_table = tables[0].get("name") or tables[0].get("id") + base_alias = tables[0].get("alias") + + # When the base table is aliased, its `column_id`/`on` qualifiers use the + # alias (e.g. `o::id`), so the relation in scope is the alias. Emit + # `FROM
AS ` and key the join tracking on the alias so the + # alias resolves instead of failing with "table o not found". + base_relation = base_alias if (base_alias and base_alias != base_table) else base_table joined: set[str] = set() - if base_table: - joined.add(base_table) + if base_relation: + joined.add(base_relation) clauses: list[str] = [] - if base_table: - clauses.append(base_table) + if base_relation: + clauses.append(f"{base_table} AS {base_alias}" if base_relation != base_table else base_table) for join_def in joins: source = join_def.get("source") @@ -846,10 +884,11 @@ def _build_join_sql( source = table_name_lookup.get(source, source) destination = table_name_lookup.get(destination, destination) - if not base_table: + if not base_relation: base_table = source - clauses.append(base_table) - joined.add(base_table) + base_relation = source + clauses.append(base_relation) + joined.add(base_relation) if source in joined and destination not in joined: right = destination @@ -878,13 +917,15 @@ def _build_join_sql( return None, None if len(clauses) == 1: - return None, clauses[0] + # Single base table, no joins: `model.table` is the real table name, + # not an alias (no subquery wraps it, so there is nothing to alias). + return None, base_table sql = "SELECT * FROM " + clauses[0] for clause in clauses[1:]: sql += f"\n{clause}" - return sql, base_table + return sql, base_relation def _parse_join_relationships( self, diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index c0294cd6..0b6004f0 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -888,6 +888,86 @@ def test_thoughtspot_role_playing_joins_stay_distinct(): assert rows == [("US", "CA", 100.0)] +def test_thoughtspot_unqualified_formula_ref_is_queryable(): + """A joined-model formula using an unqualified column ref stays queryable. + + Regression: ThoughtSpot formulas often use the unqualified reference form, + e.g. `[gross_revenue] - [sales::discount]`. The derived projection exposed + `sales.gross_revenue AS sales__gross_revenue`, but the metric SQL kept the + bare `gross_revenue`, so a normal query failed with + `Referenced column "gross_revenue" not found`. The bare ref must resolve to + the projected output alias. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_unqualified_formula.model.tml") + model = graph.models["sales_model"] + + # The unqualified `gross_revenue` ref resolves to the projected output alias, + # and the qualified `sales::discount` ref to its alias. + net_revenue = model.get_metric("net_revenue") + assert net_revenue is not None + assert net_revenue.sql == "sales__gross_revenue - sales__discount" + + layer = SemanticLayer() + for m in graph.models.values(): + layer.add_model(m) + + con = duckdb.connect() + con.execute("CREATE TABLE sales (id INT, gross_revenue DOUBLE, discount DOUBLE, customer_id INT)") + con.execute("INSERT INTO sales VALUES (1, 100.0, 10.0, 5), (2, 50.0, 5.0, 5)") + con.execute("CREATE TABLE customers (id INT, name VARCHAR)") + con.execute("INSERT INTO customers VALUES (5, 'Acme')") + + sql = layer.compile( + metrics=["sales_model.gross_revenue", "sales_model.net_revenue"], + dimensions=["sales_model.customer_name"], + ) + rows = con.execute(sql).fetchall() + # gross_revenue = 150, net_revenue = (100-10) + (50-5) = 135 + assert rows == [("Acme", 150.0, 135.0)] + + +def test_thoughtspot_aliased_base_table_is_queryable(): + """A joined model whose base/source table is aliased stays queryable. + + Regression: aliases were applied only to the joined `right` relation, so a + base table declared as `name: orders, alias: o` (with `column_id: o::amount` + and an `on` clause using `[o::id]`) emitted `FROM orders` while the columns + referenced `o.*`, failing with `Referenced table "o" not found`. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_base_alias.model.tml") + model = graph.models["orders_model"] + + # The base table is aliased in the FROM clause so the `o` qualifier resolves. + assert model.sql is not None + assert "FROM orders AS o" in model.sql + # Base-table columns are projected under stable aliases keyed on the alias. + assert model.get_dimension("order_id").sql == "o__id" + assert model.get_metric("amount").sql == "o__amount" + + layer = SemanticLayer() + for m in graph.models.values(): + layer.add_model(m) + + con = duckdb.connect() + con.execute("CREATE TABLE orders (id INT, amount DOUBLE, customer_id INT)") + con.execute("INSERT INTO orders VALUES (1, 100.0, 5), (2, 25.0, 5)") + con.execute("CREATE TABLE customers (id INT, name VARCHAR)") + con.execute("INSERT INTO customers VALUES (5, 'Acme')") + + sql = layer.compile( + metrics=["orders_model.amount"], + dimensions=["orders_model.customer_name"], + ) + rows = con.execute(sql).fetchall() + assert rows == [("Acme", 125.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_base_alias.model.tml b/tests/fixtures/thoughtspot/model_base_alias.model.tml new file mode 100644 index 00000000..c1478e78 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_base_alias.model.tml @@ -0,0 +1,30 @@ +guid: "model-base-alias" +export_schema_version: "2" +model: + name: orders_model + description: Joined model whose base/source table carries an alias (orders AS o) + model_tables: + - name: orders + alias: o + fqn: ANALYTICS.PUBLIC.orders + joins: + - with: customers + on: "[o::customer_id] = [customers::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + columns: + - name: order_id + column_id: o::id + properties: + column_type: ATTRIBUTE + - name: customer_name + column_id: customers::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: o::amount + properties: + column_type: MEASURE + aggregation: SUM diff --git a/tests/fixtures/thoughtspot/model_unqualified_formula.model.tml b/tests/fixtures/thoughtspot/model_unqualified_formula.model.tml new file mode 100644 index 00000000..bf0c4e45 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_unqualified_formula.model.tml @@ -0,0 +1,38 @@ +guid: "model-unqualified-formula" +export_schema_version: "2" +model: + name: sales_model + description: Joined model whose formula uses an unqualified column reference + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: customers + on: "[sales::customer_id] = [customers::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + formulas: + - name: net_revenue + expr: "[gross_revenue] - [sales::discount]" + id: net_rev + columns: + - name: order_id + column_id: sales::id + properties: + column_type: ATTRIBUTE + - name: customer_name + column_id: customers::name + properties: + column_type: ATTRIBUTE + - name: gross_revenue + column_id: sales::gross_revenue + properties: + column_type: MEASURE + aggregation: SUM + - name: net_revenue + formula_id: net_rev + properties: + column_type: MEASURE + aggregation: SUM From 14a9a58219c8aef97dfaae5716fa7c7260a5efd6 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 09:10:52 -0700 Subject: [PATCH 05/17] fix(thoughtspot): preserve string literals and make single aliased tables queryable --- sidemantic/adapters/thoughtspot.py | 36 +++++++++++++--- tests/adapters/thoughtspot/test_parsing.py | 42 ++++++++++++++++++- .../model_single_table_alias.model.tml | 19 +++++++++ .../model_unqualified_formula.model.tml | 11 +++++ 4 files changed, 101 insertions(+), 7 deletions(-) create mode 100644 tests/fixtures/thoughtspot/model_single_table_alias.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index ceeef7a1..3b853ba0 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -57,6 +57,25 @@ # A bare identifier that is NOT part of a `table.column` qualifier and is NOT a # function call: not preceded by `.`/word char, not followed by `.` or `(`. _BARE_IDENTIFIER = re.compile(r"(? str: + """Apply ``pattern.sub(repl, ...)`` only to regions outside quoted literals. + + Column-reference rewriting must not touch string literals, otherwise a + formula like ``[status] = 'status'`` would rewrite the literal too and change + the predicate's meaning. + """ + result: list[str] = [] + last = 0 + for m in _STRING_LITERAL.finditer(text): + result.append(pattern.sub(repl, text[last : m.start()])) + result.append(m.group(0)) + last = m.end() + result.append(pattern.sub(repl, text[last:])) + return "".join(result) def _normalize(value: Any) -> str | None: @@ -261,12 +280,13 @@ def _replace(match: re.Match[str]) -> str: alias = projection.get((table, column)) return alias if alias else match.group(0) - rewritten = _TML_DOT_REF.sub(_replace, expr) - def _replace_bare(match: re.Match[str]) -> str: return bare_to_alias.get(match.group(1), match.group(0)) - return _BARE_IDENTIFIER.sub(_replace_bare, rewritten) + # Rewrite column references only outside quoted string literals so a + # literal that happens to match a column name is left untouched. + rewritten = _sub_outside_strings(_TML_DOT_REF, _replace, expr) + return _sub_outside_strings(_BARE_IDENTIFIER, _replace_bare, rewritten) for dim in dimensions: dim.sql = _rewrite(dim.sql) @@ -917,8 +937,14 @@ def _build_join_sql( return None, None if len(clauses) == 1: - # Single base table, no joins: `model.table` is the real table name, - # not an alias (no subquery wraps it, so there is nothing to alias). + if base_relation != base_table: + # Single but aliased base table (e.g. `name: orders, alias: o` + # with `column_id: o::amount`): the fields reference `o.*`, so the + # alias must be in scope. Emit `SELECT * FROM orders AS o`, which + # `_expose_joined_columns` rewrites into queryable output aliases. + return f"SELECT * FROM {clauses[0]}", base_relation + # Single, unaliased base table: `model.table` is the real table name + # (no subquery wraps it, so there is nothing to alias). return None, base_table sql = "SELECT * FROM " + clauses[0] diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index 0b6004f0..1ba74048 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -910,13 +910,19 @@ def test_thoughtspot_unqualified_formula_ref_is_queryable(): assert net_revenue is not None assert net_revenue.sql == "sales__gross_revenue - sales__discount" + # A string literal that matches a column name (`'status'`) must NOT be + # rewritten; only the bare column reference `status` is qualified. + is_open = model.get_dimension("is_open") + assert is_open is not None + assert is_open.sql == "sales__status = 'status'" + layer = SemanticLayer() for m in graph.models.values(): layer.add_model(m) con = duckdb.connect() - con.execute("CREATE TABLE sales (id INT, gross_revenue DOUBLE, discount DOUBLE, customer_id INT)") - con.execute("INSERT INTO sales VALUES (1, 100.0, 10.0, 5), (2, 50.0, 5.0, 5)") + con.execute("CREATE TABLE sales (id INT, gross_revenue DOUBLE, discount DOUBLE, customer_id INT, status VARCHAR)") + con.execute("INSERT INTO sales VALUES (1, 100.0, 10.0, 5, 'status'), (2, 50.0, 5.0, 5, 'closed')") con.execute("CREATE TABLE customers (id INT, name VARCHAR)") con.execute("INSERT INTO customers VALUES (5, 'Acme')") @@ -968,6 +974,38 @@ def test_thoughtspot_aliased_base_table_is_queryable(): assert rows == [("Acme", 125.0)] +def test_thoughtspot_single_aliased_table_is_queryable(): + """A single (join-less) aliased table stays queryable. + + Regression: `model_tables` with one entry `name: orders, alias: o` and + columns like `o::amount` converted fields to `o.amount` but emitted + `table=orders` with no SQL, so the compiled query selected `o.amount` from + `orders` with no `o` alias in scope. The base table must be aliased. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_single_table_alias.model.tml") + model = graph.models["orders_model"] + + # The alias is in scope: SQL wraps the single table as `orders AS o`. + assert model.sql is not None + assert "FROM orders AS o" in model.sql + assert model.get_metric("amount").sql == "o__amount" + + layer = SemanticLayer() + for m in graph.models.values(): + layer.add_model(m) + + con = duckdb.connect() + con.execute("CREATE TABLE orders (id INT, amount DOUBLE)") + con.execute("INSERT INTO orders VALUES (1, 100.0), (2, 25.0)") + + sql = layer.compile(metrics=["orders_model.amount"], dimensions=["orders_model.order_id"]) + rows = sorted(con.execute(sql).fetchall()) + assert rows == [(1, 100.0), (2, 25.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_single_table_alias.model.tml b/tests/fixtures/thoughtspot/model_single_table_alias.model.tml new file mode 100644 index 00000000..0eecd689 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_single_table_alias.model.tml @@ -0,0 +1,19 @@ +guid: "model-single-table-alias" +export_schema_version: "2" +model: + name: orders_model + description: Single aliased table (orders AS o) with no joins + model_tables: + - name: orders + alias: o + fqn: ANALYTICS.PUBLIC.orders + columns: + - name: order_id + column_id: o::id + properties: + column_type: ATTRIBUTE + - name: amount + column_id: o::amount + properties: + column_type: MEASURE + aggregation: SUM diff --git a/tests/fixtures/thoughtspot/model_unqualified_formula.model.tml b/tests/fixtures/thoughtspot/model_unqualified_formula.model.tml index bf0c4e45..43d3f689 100644 --- a/tests/fixtures/thoughtspot/model_unqualified_formula.model.tml +++ b/tests/fixtures/thoughtspot/model_unqualified_formula.model.tml @@ -17,6 +17,9 @@ model: - name: net_revenue expr: "[gross_revenue] - [sales::discount]" id: net_rev + - name: is_open + expr: "[status] = 'status'" + id: is_open_f columns: - name: order_id column_id: sales::id @@ -26,6 +29,14 @@ model: column_id: customers::name properties: column_type: ATTRIBUTE + - name: status + column_id: sales::status + properties: + column_type: ATTRIBUTE + - name: is_open + formula_id: is_open_f + properties: + column_type: ATTRIBUTE - name: gross_revenue column_id: sales::gross_revenue properties: From f1679a77bad4f8a9d44f095a38f9f33b7f181554 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 11:13:12 -0700 Subject: [PATCH 06/17] fix(thoughtspot): map bare formula refs by column name and project relationship keys --- sidemantic/adapters/thoughtspot.py | 81 +++++++++++++++-- tests/adapters/thoughtspot/test_parsing.py | 89 +++++++++++++++++++ .../model_relationship_fk.model.tml | 29 ++++++ .../model_renamed_formula.model.tml | 39 ++++++++ 4 files changed, 229 insertions(+), 9 deletions(-) create mode 100644 tests/fixtures/thoughtspot/model_relationship_fk.model.tml create mode 100644 tests/fixtures/thoughtspot/model_renamed_formula.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index 3b853ba0..2340e772 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -209,6 +209,23 @@ def _simple_column(sql: str | None, fallback: str | None) -> str | None: return fallback +def _simple_table_column(sql: str | None, tables: set[str]) -> tuple[str, str] | None: + """Return the ``(table, column)`` pair if ``sql`` is exactly ``table.column``. + + Only matches when ``table`` is one of the joined ``tables`` so the reference + corresponds to a column projected by the derived subquery. + """ + if not sql: + return None + match = _TML_DOT_REF.fullmatch(sql.strip()) + if not match: + return None + table, column = match.group(1), match.group(2) + if table in tables: + return (table, column) + return None + + def _expose_joined_columns( sql: str | None, tables: set[str], @@ -216,6 +233,7 @@ def _expose_joined_columns( metrics: list[Metric], base_table: str | None = None, primary_key: str | None = None, + foreign_keys: dict[str, tuple[str, str]] | None = None, ) -> str | None: """Rewrite a joined model's derived SQL so its columns are queryable. @@ -231,6 +249,12 @@ def _expose_joined_columns( The model's ``primary_key`` is also passed through by the SQL generator as a bare column, so the base table's primary key is exposed under that name when no model column already projects it. + + ``foreign_keys`` maps each relationship join key (the bare column name the + SQL generator selects when this model participates in a cross-model join) to + the ``(table, column)`` that backs it. Each one is projected under its bare + name when no model column already exposes it, so the derived subquery stays + joinable. """ if not sql or "SELECT * FROM " not in sql: return sql @@ -262,13 +286,27 @@ def _collect(expr: str | None) -> None: # uses the in-scope output alias instead of an out-of-scope bare column. bare_to_alias: dict[str, str] = {} ambiguous: set[str] = set() - for (_table, column), alias in projection.items(): - if column in bare_to_alias and bare_to_alias[column] != alias: - ambiguous.add(column) + + def _record_bare(name: str, alias: str) -> None: + if name in bare_to_alias and bare_to_alias[name] != alias: + ambiguous.add(name) else: - bare_to_alias[column] = alias - for column in ambiguous: - bare_to_alias.pop(column, None) + bare_to_alias[name] = alias + + for (_table, column), alias in projection.items(): + _record_bare(column, alias) + # Formulas can also reference another TML column by its model name even when + # that name differs from the backing DB column (e.g. a column `gross_revenue` + # mapped to `column_id: sales::gross_amt`, with formula `[gross_revenue] - + # [discount]`). The projection aliases the DB column (`sales__gross_amt`), so + # also map the model column name to that alias; otherwise the bare model name + # stays out of scope and the query fails. + for field in (*dimensions, *metrics): + ref = _simple_table_column(field.sql, tables) + if ref: + _record_bare(field.name, projection[ref]) + for name in ambiguous: + bare_to_alias.pop(name, None) def _rewrite(expr: str | None) -> str | None: if not expr: @@ -295,12 +333,25 @@ def _replace_bare(match: re.Match[str]) -> str: select_parts = [f"{table}.{column} AS {alias}" for (table, column), alias in projection.items()] + # Track which bare output names already exist so pass-through keys are not + # projected twice. + exposed: set[str] = set(projection.values()) + # The SQL generator passes through `model.primary_key` as a bare column when # querying derived models. Expose `.` under that # name so the key resolves instead of referencing an out-of-scope column. - aliases = set(projection.values()) - if base_table and primary_key and primary_key not in aliases: + if base_table and primary_key and primary_key not in exposed: select_parts.append(f"{base_table}.{primary_key} AS {primary_key}") + exposed.add(primary_key) + + # Relationship foreign keys are also passed through as bare columns when this + # model is joined to a separately loaded related model. A foreign key that is + # not already projected by a dimension/metric (or the primary key) would be + # missing from the subquery, so expose it from its backing table. + for fk, (fk_table, fk_column) in sorted((foreign_keys or {}).items()): + if fk and fk not in exposed and fk_table in tables: + select_parts.append(f"{fk_table}.{fk_column} AS {fk}") + exposed.add(fk) select_list = ", ".join(select_parts) return sql.replace("SELECT * FROM ", f"SELECT {select_list} FROM ", 1) @@ -767,7 +818,19 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any known_tables.add(join_def.get("source")) known_tables.add(join_def.get("destination")) known_tables.discard(None) - sql = _expose_joined_columns(sql, known_tables, dimensions, metrics, base_table, primary_key) + + # Resolve each relationship's foreign key to the `(table, column)` it + # comes from in the join `on` clauses, so the derived projection can + # expose those join keys for cross-model queries. + fk_refs: dict[str, tuple[str, str]] = {} + fk_names = {rel.foreign_key for rel in relationships if rel.foreign_key} + for join_def in flat_joins: + left, right = _extract_join_refs(join_def.get("on"), path_lookup) + for ref in (left, right): + if ref and ref[1] in fk_names and ref[0] in known_tables: + fk_refs.setdefault(ref[1], ref) + + sql = _expose_joined_columns(sql, known_tables, dimensions, metrics, base_table, primary_key, fk_refs) default_time_dimension = None default_grain = None diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index 1ba74048..b78b22c9 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -1006,6 +1006,95 @@ def test_thoughtspot_single_aliased_table_is_queryable(): assert rows == [(1, 100.0), (2, 25.0)] +def test_thoughtspot_renamed_column_formula_ref_is_queryable(): + """A formula referencing a TML column whose backing DB column differs stays queryable. + + Regression: a column `gross_revenue` mapped to `column_id: sales::gross_amt` + is projected as `sales.gross_amt AS sales__gross_amt`, but the bare-reference + map was keyed only on DB column names. A formula `[gross_revenue] - [discount]` + kept the bare model names `gross_revenue`/`discount`, which are out of scope in + the derived subquery, so a normal query failed. The bare model names must + resolve to their projected output aliases. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_renamed_formula.model.tml") + model = graph.models["sales_model"] + + # The formula's bare model-name refs resolve to the projected DB-column aliases. + net_revenue = model.get_metric("net_revenue") + assert net_revenue is not None + assert net_revenue.sql == "sales__gross_amt - sales__disc_amt" + + layer = SemanticLayer() + for m in graph.models.values(): + layer.add_model(m) + + con = duckdb.connect() + con.execute("CREATE TABLE sales (id INT, gross_amt DOUBLE, disc_amt DOUBLE, customer_id INT)") + con.execute("INSERT INTO sales VALUES (1, 100.0, 10.0, 5), (2, 50.0, 5.0, 5)") + con.execute("CREATE TABLE customers (id INT, name VARCHAR)") + con.execute("INSERT INTO customers VALUES (5, 'Acme')") + + sql = layer.compile( + metrics=["sales_model.gross_revenue", "sales_model.net_revenue"], + dimensions=["sales_model.customer_name"], + ) + rows = con.execute(sql).fetchall() + # gross_revenue = 150, net_revenue = (100-10) + (50-5) = 135 + assert rows == [("Acme", 150.0, 135.0)] + + +def test_thoughtspot_relationship_fk_is_projected_for_cross_model_query(): + """A relationship foreign key is exposed even when it is not also a column. + + Regression: the derived projection only emitted columns referenced by + dimensions/metrics plus the primary key. When this model joined a separately + loaded related model, the SQL generator selected the relationship's foreign + key (e.g. `region_id`) as a bare column from the derived subquery, but it was + never projected, so the cross-model query failed with "Column region_id ... + cannot be referenced before it is defined". + """ + import duckdb + + from sidemantic.core.dimension import Dimension + from sidemantic.core.model import Model + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_relationship_fk.model.tml") + sales_model = graph.models["sales_model"] + + # The foreign key is projected from the base table under its bare name even + # though no dimension/metric references it. + assert sales_model.sql is not None + assert "sales.region_id AS region_id" in sales_model.sql + rel = {r.name: r for r in sales_model.relationships}["regions"] + assert rel.foreign_key == "region_id" + + # A separately loaded `regions` model joined via the parsed relationship. + regions = Model( + name="regions", + table="regions", + primary_key="id", + dimensions=[Dimension(name="name", type="categorical", sql="name")], + ) + + layer = SemanticLayer() + layer.add_model(sales_model) + layer.add_model(regions) + + con = duckdb.connect() + con.execute("CREATE TABLE sales (id INT, amount DOUBLE, region_id INT)") + con.execute("INSERT INTO sales VALUES (1, 100.0, 7), (2, 25.0, 7)") + con.execute("CREATE TABLE regions (id INT, name VARCHAR)") + con.execute("INSERT INTO regions VALUES (7, 'West')") + + sql = layer.compile(metrics=["sales_model.amount"], dimensions=["regions.name"]) + rows = con.execute(sql).fetchall() + assert rows == [("West", 125.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_relationship_fk.model.tml b/tests/fixtures/thoughtspot/model_relationship_fk.model.tml new file mode 100644 index 00000000..e30681e9 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_relationship_fk.model.tml @@ -0,0 +1,29 @@ +guid: "model-relationship-fk" +export_schema_version: "2" +model: + name: sales_model + description: Joined model whose relationship foreign key is not also a column + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: regions + on: "[sales::region_id] = [regions::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: regions + fqn: ANALYTICS.PUBLIC.regions + columns: + - name: order_id + column_id: sales::id + properties: + column_type: ATTRIBUTE + - name: region_name + column_id: regions::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: sales::amount + properties: + column_type: MEASURE + aggregation: SUM diff --git a/tests/fixtures/thoughtspot/model_renamed_formula.model.tml b/tests/fixtures/thoughtspot/model_renamed_formula.model.tml new file mode 100644 index 00000000..ba3c3fcb --- /dev/null +++ b/tests/fixtures/thoughtspot/model_renamed_formula.model.tml @@ -0,0 +1,39 @@ +guid: "model-renamed-formula" +export_schema_version: "2" +model: + name: sales_model + description: Formula references a TML column whose backing DB column name differs + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: customers + on: "[sales::customer_id] = [customers::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + formulas: + - name: net_revenue + expr: "[gross_revenue] - [discount]" + id: net_rev + columns: + - name: customer_name + column_id: customers::name + properties: + column_type: ATTRIBUTE + - name: gross_revenue + column_id: sales::gross_amt + properties: + column_type: MEASURE + aggregation: SUM + - name: discount + column_id: sales::disc_amt + properties: + column_type: MEASURE + aggregation: SUM + - name: net_revenue + formula_id: net_rev + properties: + column_type: MEASURE + aggregation: SUM From ae95a8f56fbfddd73e87aafb0d8f795213f31afa Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 11:32:36 -0700 Subject: [PATCH 07/17] fix(thoughtspot): resolve single-table formula refs and one-to-many join direction --- sidemantic/adapters/thoughtspot.py | 48 ++++++++++++ tests/adapters/thoughtspot/test_parsing.py | 76 +++++++++++++++++++ .../thoughtspot/model_one_to_many.model.tml | 29 +++++++ ...del_single_table_renamed_formula.model.tml | 32 ++++++++ 4 files changed, 185 insertions(+) create mode 100644 tests/fixtures/thoughtspot/model_one_to_many.model.tml create mode 100644 tests/fixtures/thoughtspot/model_single_table_renamed_formula.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index 2340e772..becf1a83 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -357,6 +357,37 @@ def _replace_bare(match: re.Match[str]) -> str: return sql.replace("SELECT * FROM ", f"SELECT {select_list} FROM ", 1) +def _resolve_bare_refs_to_db_columns(dimensions: list[Dimension], metrics: list[Metric]) -> None: + """Rewrite formula bare model-name refs to their backing DB columns in place. + + For a join-less model the SQL generator queries the base table directly, so a + formula that references another TML column by its model name only works when + that name equals the backing DB column. Build a map of model name -> DB column + from the non-formula fields (whose SQL is a plain ``column`` or + ``table.column``) and rewrite each formula's bare references so they target + the real column. Only names that differ from their DB column are rewritten; + string literals are left untouched. + """ + name_to_db_col: dict[str, str] = {} + for field in (*dimensions, *metrics): + sql = field.sql + if not sql: + continue + _table, column = _split_sql_identifier(sql) + if column and column != field.name: + name_to_db_col[field.name] = column + + if not name_to_db_col: + return + + def _replace_bare(match: re.Match[str]) -> str: + return name_to_db_col.get(match.group(1), match.group(0)) + + for field in (*dimensions, *metrics): + if field.sql: + field.sql = _sub_outside_strings(_BARE_IDENTIFIER, _replace_bare, field.sql) + + class ThoughtSpotAdapter(BaseAdapter): """Adapter for ThoughtSpot TML (YAML) tables and worksheets.""" @@ -831,6 +862,15 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any fk_refs.setdefault(ref[1], ref) sql = _expose_joined_columns(sql, known_tables, dimensions, metrics, base_table, primary_key, fk_refs) + else: + # Single-table (join-less) model: no derived subquery wraps it, so the + # `_expose_joined_columns` rewrite never runs. A formula that refers to + # another TML column by its model name (e.g. column `gross_revenue` + # mapped from `sales::gross_amt`, formula `[gross_revenue] - + # [discount]`) keeps the bare model name, which is not a real column on + # the base table. Rewrite those bare refs to the backing DB column so + # the query stays valid. + _resolve_bare_refs_to_db_columns(dimensions, metrics) default_time_dimension = None default_grain = None @@ -912,6 +952,14 @@ def _parse_model_relationships( if join_type in {"RIGHT_OUTER", "FULL_OUTER", "OUTER"} and cardinality not in _CARDINALITY_MAP: rel_type = "many_to_many" + # The keys above follow the `many_to_one` convention: `foreign_key` on + # the source (local) side, `primary_key` on the destination (related) + # side. For a `one_to_many` relationship the related model holds the + # foreign key and the local model holds the primary key, so swap them + # to match how Sidemantic interprets `one_to_many` keys. + if rel_type == "one_to_many": + foreign_key, primary_key = primary_key, foreign_key + relationships.append( Relationship( name=destination, diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index b78b22c9..c2c03d34 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -1095,6 +1095,82 @@ def test_thoughtspot_relationship_fk_is_projected_for_cross_model_query(): assert rows == [("West", 125.0)] +def test_thoughtspot_single_table_renamed_formula_ref_is_queryable(): + """A join-less model's formula referencing a renamed column stays queryable. + + Regression: the renamed-column rewrite only ran on the derived (joined) SQL + path. For a single-table model (`sql is None`), a formula like + `[gross_revenue] - [discount]` kept the bare model names, so the compiled + `SELECT gross_revenue - discount FROM sales` failed because the table only has + the backing columns `gross_amt`/`disc_amt`. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_single_table_renamed_formula.model.tml") + model = graph.models["sales_model"] + + # No join, so the model queries the base table directly; the formula's bare + # model-name refs are rewritten to the backing DB columns. + assert model.sql is None + assert model.table == "sales" + assert model.get_metric("net_revenue").sql == "gross_amt - disc_amt" + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE sales (id INT, gross_amt DOUBLE, disc_amt DOUBLE)") + con.execute("INSERT INTO sales VALUES (1, 100.0, 10.0), (2, 50.0, 5.0)") + + sql = layer.compile( + metrics=["sales_model.gross_revenue", "sales_model.net_revenue"], + dimensions=["sales_model.order_id"], + ) + rows = sorted(con.execute(sql).fetchall()) + # net_revenue = gross_amt - disc_amt + assert rows == [(1, 100.0, 90.0), (2, 50.0, 45.0)] + + +def test_thoughtspot_one_to_many_join_keys_match_direction(): + """A `one_to_many` join maps the foreign key to the related (child) model. + + Regression: keys were assigned with the `many_to_one` convention (foreign key + on the local/source side). For a `one_to_many` join `[customers::id] = + [orders::customer_id]`, Sidemantic expects `foreign_key` on the related model + (`orders.customer_id`) and `primary_key` on the local model (`customers.id`), + so cross-model queries joined the wrong columns before this fix. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_one_to_many.model.tml") + model = graph.models["customers_model"] + + rel = {r.name: r for r in model.relationships}["orders"] + assert rel.type == "one_to_many" + # Related (child) model holds the FK; local model holds the PK. + assert rel.foreign_key == "customer_id" + assert rel.primary_key == "id" + + layer = SemanticLayer() + for m in graph.models.values(): + layer.add_model(m) + + con = duckdb.connect() + con.execute("CREATE TABLE customers (id INT, name VARCHAR)") + con.execute("INSERT INTO customers VALUES (5, 'Acme')") + con.execute("CREATE TABLE orders (id INT, customer_id INT, amount DOUBLE)") + con.execute("INSERT INTO orders VALUES (1, 5, 100.0), (2, 5, 25.0)") + + sql = layer.compile( + metrics=["customers_model.order_amount"], + dimensions=["customers_model.customer_name"], + ) + rows = con.execute(sql).fetchall() + assert rows == [("Acme", 125.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_one_to_many.model.tml b/tests/fixtures/thoughtspot/model_one_to_many.model.tml new file mode 100644 index 00000000..34d56ab9 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_one_to_many.model.tml @@ -0,0 +1,29 @@ +guid: "model-one-to-many" +export_schema_version: "2" +model: + name: customers_model + description: One-to-many parent-to-child join (FK lives on the related model) + model_tables: + - name: customers + fqn: ANALYTICS.PUBLIC.customers + joins: + - with: orders + on: "[customers::id] = [orders::customer_id]" + type: LEFT_OUTER + cardinality: ONE_TO_MANY + - name: orders + fqn: ANALYTICS.PUBLIC.orders + columns: + - name: customer_id + column_id: customers::id + properties: + column_type: ATTRIBUTE + - name: customer_name + column_id: customers::name + properties: + column_type: ATTRIBUTE + - name: order_amount + column_id: orders::amount + properties: + column_type: MEASURE + aggregation: SUM diff --git a/tests/fixtures/thoughtspot/model_single_table_renamed_formula.model.tml b/tests/fixtures/thoughtspot/model_single_table_renamed_formula.model.tml new file mode 100644 index 00000000..97467a0d --- /dev/null +++ b/tests/fixtures/thoughtspot/model_single_table_renamed_formula.model.tml @@ -0,0 +1,32 @@ +guid: "model-single-table-renamed-formula" +export_schema_version: "2" +model: + name: sales_model + description: Single-table model whose formula references renamed columns + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + formulas: + - name: net_revenue + expr: "[gross_revenue] - [discount]" + id: net_rev + columns: + - name: order_id + column_id: sales::id + properties: + column_type: ATTRIBUTE + - name: gross_revenue + column_id: sales::gross_amt + properties: + column_type: MEASURE + aggregation: SUM + - name: discount + column_id: sales::disc_amt + properties: + column_type: MEASURE + aggregation: SUM + - name: net_revenue + formula_id: net_rev + properties: + column_type: MEASURE + aggregation: SUM From 8829f2f22467daa28dd6403b43e32d7d3006790c Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 11:49:56 -0700 Subject: [PATCH 08/17] fix(thoughtspot): correct one-to-one join key direction and project relationship keys --- sidemantic/adapters/thoughtspot.py | 26 ++++++--- tests/adapters/thoughtspot/test_parsing.py | 56 ++++++++++++++++++- .../thoughtspot/model_one_to_one.model.tml | 29 ++++++++++ 3 files changed, 99 insertions(+), 12 deletions(-) create mode 100644 tests/fixtures/thoughtspot/model_one_to_one.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index becf1a83..b5cbba8f 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -850,15 +850,23 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any known_tables.add(join_def.get("destination")) known_tables.discard(None) - # Resolve each relationship's foreign key to the `(table, column)` it - # comes from in the join `on` clauses, so the derived projection can - # expose those join keys for cross-model queries. + # Resolve each relationship's join keys to the `(table, column)` they + # come from in the join `on` clauses, so the derived projection can + # expose them for cross-model queries. The SQL generator passes through + # the foreign key (many_to_one) or the local primary key (one_to_one/ + # one_to_many) as bare columns from this derived subquery, so cover + # both sides. fk_refs: dict[str, tuple[str, str]] = {} - fk_names = {rel.foreign_key for rel in relationships if rel.foreign_key} + key_names: set[str] = set() + for rel in relationships: + if rel.foreign_key: + key_names.add(rel.foreign_key) + if rel.primary_key: + key_names.add(rel.primary_key) for join_def in flat_joins: left, right = _extract_join_refs(join_def.get("on"), path_lookup) for ref in (left, right): - if ref and ref[1] in fk_names and ref[0] in known_tables: + if ref and ref[1] in key_names and ref[0] in known_tables: fk_refs.setdefault(ref[1], ref) sql = _expose_joined_columns(sql, known_tables, dimensions, metrics, base_table, primary_key, fk_refs) @@ -954,10 +962,10 @@ def _parse_model_relationships( # The keys above follow the `many_to_one` convention: `foreign_key` on # the source (local) side, `primary_key` on the destination (related) - # side. For a `one_to_many` relationship the related model holds the - # foreign key and the local model holds the primary key, so swap them - # to match how Sidemantic interprets `one_to_many` keys. - if rel_type == "one_to_many": + # side. Sidemantic treats `one_to_many` and `one_to_one` as edges + # where the related model owns the `foreign_key` and the local model + # owns the `primary_key`, so swap them to match that key direction. + if rel_type in ("one_to_many", "one_to_one"): foreign_key, primary_key = primary_key, foreign_key relationships.append( diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index c2c03d34..a2f1133f 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -701,10 +701,12 @@ def test_import_thoughtspot_model(): assert relationships["customers"].type == "many_to_one" assert relationships["customers"].foreign_key == "customer_id" assert relationships["customers"].primary_key == "id" - # cardinality: ONE_TO_ONE + # cardinality: ONE_TO_ONE. Sidemantic treats one_to_one (like one_to_many) as + # an edge where the related model owns the foreign key and the local model + # owns the primary key, so the parsed source-side FK/PK are stored swapped. assert relationships["regions"].type == "one_to_one" - assert relationships["regions"].foreign_key == "region_id" - assert relationships["regions"].primary_key == "id" + assert relationships["regions"].foreign_key == "id" + assert relationships["regions"].primary_key == "region_id" # A joined model becomes a derived table. Its `SELECT *` is rewritten into an # explicit projection that aliases each inner `table.column` to a stable, @@ -1171,6 +1173,54 @@ def test_thoughtspot_one_to_many_join_keys_match_direction(): assert rows == [("Acme", 125.0)] +def test_thoughtspot_one_to_one_join_keys_match_direction(): + """A `one_to_one` join with a source-side FK joins correctly cross-model. + + Regression: `one_to_one` keys were stored in the `many_to_one` positions + (FK on the source/local side). Sidemantic reads `one_to_one` like a has-one + edge where the related model owns the foreign key, so a cross-model query with + a separately loaded `regions` model joined `regions.region_id` to + `sales_model.id` and failed instead of joining `regions.id` to + `sales_model.region_id`. + """ + import duckdb + + from sidemantic.core.dimension import Dimension + from sidemantic.core.model import Model + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_one_to_one.model.tml") + sales_model = graph.models["sales_model"] + + rel = {r.name: r for r in sales_model.relationships}["regions"] + assert rel.type == "one_to_one" + # Related model owns the FK (regions.id), local model owns the PK (region_id). + assert rel.foreign_key == "id" + assert rel.primary_key == "region_id" + + # A separately loaded `regions` model joined via the parsed relationship. + regions = Model( + name="regions", + table="regions", + primary_key="id", + dimensions=[Dimension(name="zone", type="categorical", sql="zone")], + ) + + layer = SemanticLayer() + layer.add_model(sales_model) + layer.add_model(regions) + + con = duckdb.connect() + con.execute("CREATE TABLE sales (id INT, amount DOUBLE, region_id INT)") + con.execute("INSERT INTO sales VALUES (1, 100.0, 7), (2, 25.0, 7)") + con.execute("CREATE TABLE regions (id INT, name VARCHAR, zone VARCHAR)") + con.execute("INSERT INTO regions VALUES (7, 'West', 'PACIFIC')") + + sql = layer.compile(metrics=["sales_model.amount"], dimensions=["regions.zone"]) + rows = con.execute(sql).fetchall() + assert rows == [("PACIFIC", 125.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_one_to_one.model.tml b/tests/fixtures/thoughtspot/model_one_to_one.model.tml new file mode 100644 index 00000000..492fa89e --- /dev/null +++ b/tests/fixtures/thoughtspot/model_one_to_one.model.tml @@ -0,0 +1,29 @@ +guid: "model-one-to-one" +export_schema_version: "2" +model: + name: sales_model + description: One-to-one join where the source side owns the foreign key + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: regions + on: "[sales::region_id] = [regions::id]" + type: INNER + cardinality: ONE_TO_ONE + - name: regions + fqn: ANALYTICS.PUBLIC.regions + columns: + - name: order_id + column_id: sales::id + properties: + column_type: ATTRIBUTE + - name: region_name + column_id: regions::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: sales::amount + properties: + column_type: MEASURE + aggregation: SUM From c270e69c115b610a314aba9d069117240e67050b Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 12:08:10 -0700 Subject: [PATCH 09/17] fix(thoughtspot): preserve composite join keys and infer non-id primary keys --- sidemantic/adapters/thoughtspot.py | 101 ++++++++++++++---- tests/adapters/thoughtspot/test_parsing.py | 88 ++++++++++++++- .../thoughtspot/model_composite_key.model.tml | 29 +++++ .../thoughtspot/model_non_id_key.model.tml | 29 +++++ 4 files changed, 222 insertions(+), 25 deletions(-) create mode 100644 tests/fixtures/thoughtspot/model_composite_key.model.tml create mode 100644 tests/fixtures/thoughtspot/model_non_id_key.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index b5cbba8f..f5b145d6 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -164,6 +164,35 @@ def _extract_join_refs( return left, right +def _extract_all_join_refs( + expr: str | None, table_path_lookup: dict[str, str] | None = None +) -> list[tuple[tuple[str | None, str], tuple[str | None, str]]]: + """Extract every ``left = right`` key pair from a (possibly composite) join. + + A composite ON clause like ``[a::x] = [b::y] AND [a::p] = [b::q]`` yields all + consecutive ref pairs, so composite-key relationships keep both columns + instead of silently dropping all but the first pair. + """ + if not expr: + return [] + + tokens = _TML_REF.findall(expr) + if len(tokens) < 2: + tokens = [f"{t[0]}.{t[1]}" for t in _TML_DOT_REF.findall(expr)] + + pairs: list[tuple[tuple[str | None, str], tuple[str | None, str]]] = [] + for i in range(0, len(tokens) - 1, 2): + left = _parse_ref_token(tokens[i]) + right = _parse_ref_token(tokens[i + 1]) + if table_path_lookup: + if left[0] in table_path_lookup: + left = (table_path_lookup[left[0]], left[1]) + if right[0] in table_path_lookup: + right = (table_path_lookup[right[0]], right[1]) + pairs.append((left, right)) + return pairs + + def _split_sql_identifier(sql: str | None) -> tuple[str | None, str | None]: if not sql or not _SIMPLE_IDENTIFIER.match(sql): return None, None @@ -832,9 +861,12 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any ) dimensions.append(dim) - primary_key = "id" - if any(d.name.lower() == "id" for d in dimensions): - primary_key = next(d.name for d in dimensions if d.name.lower() == "id") + # The SQL generator always selects `model.primary_key` from derived models + # as a bare column, so it must name a column that exists on the base + # table. Prefer a dimension named `id`; otherwise infer the key from a + # base-table column so a model whose key is not literally `id` (e.g. + # `order_key`) does not project a non-existent `id` column. + primary_key = self._infer_model_primary_key(dimensions, base_table) # A joined model is exported as derived SQL (FROM () AS t); rewrite # its `SELECT *` into explicit aliased columns and update the dimension/ @@ -860,14 +892,14 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any key_names: set[str] = set() for rel in relationships: if rel.foreign_key: - key_names.add(rel.foreign_key) + key_names.update(rel.foreign_key_columns) if rel.primary_key: - key_names.add(rel.primary_key) + key_names.update(rel.primary_key_columns) for join_def in flat_joins: - left, right = _extract_join_refs(join_def.get("on"), path_lookup) - for ref in (left, right): - if ref and ref[1] in key_names and ref[0] in known_tables: - fk_refs.setdefault(ref[1], ref) + for left, right in _extract_all_join_refs(join_def.get("on"), path_lookup): + for ref in (left, right): + if ref and ref[1] in key_names and ref[0] in known_tables: + fk_refs.setdefault(ref[1], ref) sql = _expose_joined_columns(sql, known_tables, dimensions, metrics, base_table, primary_key, fk_refs) else: @@ -907,6 +939,31 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any return model + def _infer_model_primary_key(self, dimensions: list[Dimension], base_table: str | None) -> str: + """Infer a queryable primary key column for a TML Model. + + Prefer a dimension literally named ``id``. Otherwise, if the base table is + known, keep ``id`` when a base-table column actually resolves to ``id``; + failing that, use the first base-table column so the key references a real + column. Fall back to ``id`` only when no better candidate exists. + """ + for dim in dimensions: + if dim.name.lower() == "id": + return dim.name + + if base_table: + base_columns: list[str] = [] + for dim in dimensions: + table, column = _split_sql_identifier(dim.sql) + if column and (table is None or table == base_table): + base_columns.append(column) + if "id" in base_columns: + return "id" + if base_columns: + return base_columns[0] + + return "id" + def _parse_model_relationships( self, joins: list[dict[str, Any]], @@ -930,30 +987,34 @@ def _parse_model_relationships( join_type = _normalize(join_def.get("type")) or "INNER" on_value = join_def.get("on") lookup = table_path_lookup or table_name_lookup - left, right = _extract_join_refs(on_value, lookup) + # Composite predicates carry more than one key pair; keep all of them + # so cross-model joins do not silently drop part of the key. + ref_pairs = _extract_all_join_refs(on_value, lookup) if table_name_lookup: source = table_name_lookup.get(source, source) destination = table_name_lookup.get(destination, destination) - foreign_key = None - primary_key = None - - if left and right: + foreign_keys: list[str] = [] + primary_keys: list[str] = [] + for left, right in ref_pairs: left_table, left_col = left right_table, right_col = right if left_table == source and right_table == destination: - foreign_key = left_col - primary_key = right_col + foreign_keys.append(left_col) + primary_keys.append(right_col) elif left_table == destination and right_table == source: - foreign_key = right_col - primary_key = left_col + foreign_keys.append(right_col) + primary_keys.append(left_col) else: if left_table == source: - foreign_key = left_col + foreign_keys.append(left_col) if right_table == destination: - primary_key = right_col + primary_keys.append(right_col) + + foreign_key = foreign_keys[0] if len(foreign_keys) == 1 else (foreign_keys or None) + primary_key = primary_keys[0] if len(primary_keys) == 1 else (primary_keys or None) cardinality = _normalize(join_def.get("cardinality")) rel_type = _CARDINALITY_MAP.get(cardinality or "", "many_to_one") diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index a2f1133f..5e98649c 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -701,12 +701,14 @@ def test_import_thoughtspot_model(): assert relationships["customers"].type == "many_to_one" assert relationships["customers"].foreign_key == "customer_id" assert relationships["customers"].primary_key == "id" - # cardinality: ONE_TO_ONE. Sidemantic treats one_to_one (like one_to_many) as - # an edge where the related model owns the foreign key and the local model - # owns the primary key, so the parsed source-side FK/PK are stored swapped. + # cardinality: ONE_TO_ONE with a composite ON predicate. Sidemantic treats + # one_to_one (like one_to_many) as an edge where the related model owns the + # foreign key and the local model owns the primary key, so the parsed + # source-side FK/PK are stored swapped. Both key pairs from the composite + # predicate are preserved. assert relationships["regions"].type == "one_to_one" - assert relationships["regions"].foreign_key == "id" - assert relationships["regions"].primary_key == "region_id" + assert relationships["regions"].foreign_key == ["id", "country_code"] + assert relationships["regions"].primary_key == ["region_id", "country_code"] # A joined model becomes a derived table. Its `SELECT *` is rewritten into an # explicit projection that aliases each inner `table.column` to a stable, @@ -1221,6 +1223,82 @@ def test_thoughtspot_one_to_one_join_keys_match_direction(): assert rows == [("PACIFIC", 125.0)] +def test_thoughtspot_non_id_primary_key_is_queryable(): + """A joined model whose key is not literally `id` stays queryable. + + Regression: `primary_key` defaulted to `id`, and the derived projection + injected `orders.id AS id`. The SQL generator always selects the primary key + from derived models, so even a basic aggregate failed with + `Table "orders" does not have a column named "id"`. The key must be inferred + from a real base-table column (here `order_key`). + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_non_id_key.model.tml") + model = graph.models["orders_model"] + + # The primary key is inferred from a real base-table column, not the default. + assert model.primary_key == "order_key" + assert model.sql is not None + assert "orders.order_key AS order_key" in model.sql + assert "orders.id AS id" not in model.sql + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE orders (order_key INT, amount DOUBLE, customer_id INT)") + con.execute("INSERT INTO orders VALUES (1, 100.0, 5), (2, 25.0, 5)") + con.execute("CREATE TABLE customers (id INT, name VARCHAR)") + con.execute("INSERT INTO customers VALUES (5, 'Acme')") + + sql = layer.compile(metrics=["orders_model.amount"], dimensions=["orders_model.customer_name"]) + rows = con.execute(sql).fetchall() + assert rows == [("Acme", 125.0)] + + +def test_thoughtspot_composite_join_keys_are_preserved(): + """A composite-key join keeps every key pair instead of dropping all but one. + + Regression: only the first `left = right` pair of a composite ON predicate was + stored, so `[sales::region_id] = [regions::id] AND [sales::country_code] = + [regions::country_code]` became `region_id -> id` only. Cross-model joins then + used a truncated key and could mis-join rows. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_composite_key.model.tml") + model = graph.models["sales_model"] + + rel = {r.name: r for r in model.relationships}["regions"] + assert rel.type == "many_to_one" + # Both key pairs are preserved as composite lists. + assert rel.foreign_key == ["region_id", "country_code"] + assert rel.primary_key == ["id", "country_code"] + + # Both join columns are projected by the derived subquery so a composite + # cross-model join stays in scope. + assert model.sql is not None + assert "sales.region_id AS region_id" in model.sql + assert "sales.country_code AS country_code" in model.sql + + # The model still compiles and runs standalone. + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE sales (id INT, amount DOUBLE, region_id INT, country_code VARCHAR)") + con.execute("INSERT INTO sales VALUES (1, 100.0, 7, 'US'), (2, 25.0, 7, 'US')") + con.execute("CREATE TABLE regions (id INT, country_code VARCHAR, name VARCHAR)") + con.execute("INSERT INTO regions VALUES (7, 'US', 'West')") + + sql = layer.compile(metrics=["sales_model.amount"], dimensions=["sales_model.region_name"]) + rows = con.execute(sql).fetchall() + assert rows == [("West", 125.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_composite_key.model.tml b/tests/fixtures/thoughtspot/model_composite_key.model.tml new file mode 100644 index 00000000..3e643076 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_composite_key.model.tml @@ -0,0 +1,29 @@ +guid: "model-composite-key" +export_schema_version: "2" +model: + name: sales_model + description: Joined model with a composite-key join predicate + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: regions + on: "[sales::region_id] = [regions::id] AND [sales::country_code] = [regions::country_code]" + type: INNER + cardinality: MANY_TO_ONE + - name: regions + fqn: ANALYTICS.PUBLIC.regions + columns: + - name: order_id + column_id: sales::id + properties: + column_type: ATTRIBUTE + - name: region_name + column_id: regions::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: sales::amount + properties: + column_type: MEASURE + aggregation: SUM diff --git a/tests/fixtures/thoughtspot/model_non_id_key.model.tml b/tests/fixtures/thoughtspot/model_non_id_key.model.tml new file mode 100644 index 00000000..a7f16d9e --- /dev/null +++ b/tests/fixtures/thoughtspot/model_non_id_key.model.tml @@ -0,0 +1,29 @@ +guid: "model-non-id-key" +export_schema_version: "2" +model: + name: orders_model + description: Joined model whose base table key is not literally `id` + model_tables: + - name: orders + fqn: ANALYTICS.PUBLIC.orders + joins: + - with: customers + on: "[orders::customer_id] = [customers::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + columns: + - name: order_key + column_id: orders::order_key + properties: + column_type: ATTRIBUTE + - name: customer_name + column_id: customers::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: orders::amount + properties: + column_type: MEASURE + aggregation: SUM From 9f33276de3df21d85ac41486bf1873541b3b8edf Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 12:24:02 -0700 Subject: [PATCH 10/17] fix(thoughtspot): inline nested formula references in model formulas --- sidemantic/adapters/thoughtspot.py | 39 +++++++++++++++ tests/adapters/thoughtspot/test_parsing.py | 34 ++++++++++++++ .../model_nested_formula.model.tml | 47 +++++++++++++++++++ 3 files changed, 120 insertions(+) create mode 100644 tests/fixtures/thoughtspot/model_nested_formula.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index f5b145d6..4b440b37 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -130,6 +130,33 @@ def _replace(match: re.Match[str]) -> str: return _TML_REF.sub(_replace, expr) +def _inline_formula_refs( + expr: str | None, + formula_expr_by_name: dict[str, str], + _seen: frozenset[str] = frozenset(), +) -> str | None: + """Recursively inline ``[formula_name]`` references in a TML formula. + + A formula that references another formula by name (e.g. ``margin`` defined as + ``[net_revenue] / [gross_revenue]`` where ``net_revenue`` is itself a formula) + must have the nested formula expanded inline; otherwise the bare reference is + left unresolved and points at a column the derived subquery never projects. + Self/cyclic references are left untouched to avoid infinite recursion. + """ + if not expr or not formula_expr_by_name: + return expr + + def _replace(match: re.Match[str]) -> str: + token = match.group(1) + # Only inline unqualified references to a known formula name. + if "::" not in token and token in formula_expr_by_name and token not in _seen: + inner = _inline_formula_refs(formula_expr_by_name[token], formula_expr_by_name, _seen | {token}) + return f"({inner})" + return match.group(0) + + return _TML_REF.sub(_replace, expr) + + def _parse_ref_token(token: str) -> tuple[str | None, str]: if "::" in token: table, column = token.split("::", 1) @@ -778,6 +805,9 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any formulas = model_def.get("formulas") or [] formula_by_id = {f.get("id"): f for f in formulas if f.get("id")} formula_by_name = {f.get("name"): f for f in formulas if f.get("name")} + # Map formula name -> expression so nested formula references can be + # inlined before the expression is converted/aliased. + formula_expr_by_name = {f.get("name"): f.get("expr") for f in formulas if f.get("name") and f.get("expr")} dimensions: list[Dimension] = [] metrics: list[Metric] = [] @@ -803,12 +833,21 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any format_pattern = properties.get("format_pattern") sql_expr = None + is_formula = False if formula_id and formula_id in formula_by_id: sql_expr = formula_by_id[formula_id].get("expr") + is_formula = True elif formula_id and formula_id in formula_by_name: sql_expr = formula_by_name[formula_id].get("expr") + is_formula = True elif col_name in formula_by_name: sql_expr = formula_by_name[col_name].get("expr") + is_formula = True + + # Inline references to other formulas so nested formula expressions + # resolve to physical columns instead of unprojected formula names. + if is_formula: + sql_expr = _inline_formula_refs(sql_expr, formula_expr_by_name) if not sql_expr and column_id: if "::" in column_id: diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index 5e98649c..8401bd38 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -1299,6 +1299,40 @@ def test_thoughtspot_composite_join_keys_are_preserved(): assert rows == [("West", 125.0)] +def test_thoughtspot_nested_formula_ref_is_inlined_and_queryable(): + """A formula referencing another formula inlines the nested expression. + + Regression: a formula like `[net_revenue] / [gross_revenue]` (where + `net_revenue` is itself a formula) kept the bare `net_revenue` token, which the + derived subquery never projected, so the query failed with + `Referenced column "net_revenue" not found`. The nested formula must be + expanded inline so the expression resolves to physical columns. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_nested_formula.model.tml") + model = graph.models["sales_model"] + + # The nested `net_revenue` formula is inlined into `margin`. + assert model.get_metric("net_revenue").sql == "sales__gross_revenue - sales__discount" + assert model.get_metric("margin").sql == "(sales__gross_revenue - sales__discount) / sales__gross_revenue" + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE sales (id INT, gross_revenue DOUBLE, discount DOUBLE, customer_id INT)") + con.execute("INSERT INTO sales VALUES (1, 100.0, 10.0, 5)") + con.execute("CREATE TABLE customers (id INT, name VARCHAR)") + con.execute("INSERT INTO customers VALUES (5, 'Acme')") + + sql = layer.compile(metrics=["sales_model.margin"], dimensions=["sales_model.customer_name"]) + rows = con.execute(sql).fetchall() + # margin = (100 - 10) / 100 = 0.9 + assert rows == [("Acme", 0.9)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_nested_formula.model.tml b/tests/fixtures/thoughtspot/model_nested_formula.model.tml new file mode 100644 index 00000000..2b7a9dd3 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_nested_formula.model.tml @@ -0,0 +1,47 @@ +guid: "model-nested-formula" +export_schema_version: "2" +model: + name: sales_model + description: Joined model whose formula references another formula + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: customers + on: "[sales::customer_id] = [customers::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + formulas: + - name: net_revenue + expr: "[gross_revenue] - [discount]" + id: net_rev + - name: margin + expr: "[net_revenue] / [gross_revenue]" + id: margin_f + columns: + - name: customer_name + column_id: customers::name + properties: + column_type: ATTRIBUTE + - name: gross_revenue + column_id: sales::gross_revenue + properties: + column_type: MEASURE + aggregation: SUM + - name: discount + column_id: sales::discount + properties: + column_type: MEASURE + aggregation: SUM + - name: net_revenue + formula_id: net_rev + properties: + column_type: MEASURE + aggregation: SUM + - name: margin + formula_id: margin_f + properties: + column_type: MEASURE + aggregation: SUM From 5a590f767b8ea74033228a1bcacc220a3a194b26 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 12:38:50 -0700 Subject: [PATCH 11/17] fix(thoughtspot): prefer TML field names over colliding columns and resolve renamed id keys --- sidemantic/adapters/thoughtspot.py | 51 ++++++++------ tests/adapters/thoughtspot/test_parsing.py | 68 +++++++++++++++++++ .../model_name_collision.model.tml | 39 +++++++++++ .../model_renamed_id_key.model.tml | 29 ++++++++ 4 files changed, 167 insertions(+), 20 deletions(-) create mode 100644 tests/fixtures/thoughtspot/model_name_collision.model.tml create mode 100644 tests/fixtures/thoughtspot/model_renamed_id_key.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index 4b440b37..3569d139 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -340,29 +340,37 @@ def _collect(expr: str | None) -> None: # like `gross_revenue`. Map each such column name to its projected alias when # exactly one joined table projects that column, so the rewritten expression # uses the in-scope output alias instead of an out-of-scope bare column. - bare_to_alias: dict[str, str] = {} - ambiguous: set[str] = set() + def _build_map(pairs: list[tuple[str, str]]) -> dict[str, str]: + """Map name -> alias, dropping names that map to conflicting aliases.""" + mapping: dict[str, str] = {} + ambiguous: set[str] = set() + for name, alias in pairs: + if name in mapping and mapping[name] != alias: + ambiguous.add(name) + else: + mapping[name] = alias + for name in ambiguous: + mapping.pop(name, None) + return mapping - def _record_bare(name: str, alias: str) -> None: - if name in bare_to_alias and bare_to_alias[name] != alias: - ambiguous.add(name) - else: - bare_to_alias[name] = alias + # Physical DB column names map to their projected aliases. + physical_map = _build_map([(column, alias) for (_table, column), alias in projection.items()]) - for (_table, column), alias in projection.items(): - _record_bare(column, alias) # Formulas can also reference another TML column by its model name even when # that name differs from the backing DB column (e.g. a column `gross_revenue` # mapped to `column_id: sales::gross_amt`, with formula `[gross_revenue] - - # [discount]`). The projection aliases the DB column (`sales__gross_amt`), so - # also map the model column name to that alias; otherwise the bare model name - # stays out of scope and the query fails. - for field in (*dimensions, *metrics): - ref = _simple_table_column(field.sql, tables) - if ref: - _record_bare(field.name, projection[ref]) - for name in ambiguous: - bare_to_alias.pop(name, None) + # [discount]`). Build a separate field-name map and let it take precedence: + # a formula's `[amount]` refers to the TML field `amount`, which may resolve + # to a different physical column than one literally named `amount`. + field_map = _build_map( + [ + (field.name, projection[ref]) + for field in (*dimensions, *metrics) + if (ref := _simple_table_column(field.sql, tables)) + ] + ) + + bare_to_alias = {**physical_map, **field_map} def _rewrite(expr: str | None) -> str | None: if not expr: @@ -981,14 +989,17 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any def _infer_model_primary_key(self, dimensions: list[Dimension], base_table: str | None) -> str: """Infer a queryable primary key column for a TML Model. - Prefer a dimension literally named ``id``. Otherwise, if the base table is + Prefer a dimension named ``id`` but resolve it to its backing physical + column (a column named ``id`` may map to a differently named DB column, + e.g. ``column_id: orders::order_key``). Otherwise, if the base table is known, keep ``id`` when a base-table column actually resolves to ``id``; failing that, use the first base-table column so the key references a real column. Fall back to ``id`` only when no better candidate exists. """ for dim in dimensions: if dim.name.lower() == "id": - return dim.name + _table, column = _split_sql_identifier(dim.sql) + return column or dim.name if base_table: base_columns: list[str] = [] diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index 8401bd38..431fd79c 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -1333,6 +1333,74 @@ def test_thoughtspot_nested_formula_ref_is_inlined_and_queryable(): assert rows == [("Acme", 0.9)] +def test_thoughtspot_formula_ref_prefers_tml_field_over_physical_name(): + """A bare formula ref resolves to the TML field, not a colliding physical name. + + Regression: physical column names were recorded first, so adding TML field + names could mark a valid reference ambiguous and drop it. With + `gross_revenue -> sales::amount` and `amount -> sales::cost`, the formula + `[amount] + [gross_revenue]` must resolve the bare `amount` to `sales__cost` + (the TML field), not be dropped because `amount` also names a physical column. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_name_collision.model.tml") + model = graph.models["sales_model"] + + # The bare `amount` ref resolves to the TML field's backing column (cost), + # not the physical `amount` column. + assert model.get_metric("total").sql == "sales__cost + sales__amount" + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE sales (id INT, amount DOUBLE, cost DOUBLE, customer_id INT)") + con.execute("INSERT INTO sales VALUES (1, 100.0, 30.0, 5)") + con.execute("CREATE TABLE customers (id INT, name VARCHAR)") + con.execute("INSERT INTO customers VALUES (5, 'Acme')") + + sql = layer.compile(metrics=["sales_model.total"], dimensions=["sales_model.customer_name"]) + rows = con.execute(sql).fetchall() + # total = cost (30) + amount (100) = 130 + assert rows == [("Acme", 130.0)] + + +def test_thoughtspot_renamed_id_key_uses_backing_column(): + """A semantic column named `id` backed by another column resolves to it. + + Regression: `_infer_model_primary_key` returned the semantic name `id` even + when the column mapped to a different physical column (`column_id: + orders::order_key`). The derived projection then injected `orders.id AS id`, + failing because the base table only has `order_key`. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_renamed_id_key.model.tml") + model = graph.models["orders_model"] + + # The primary key resolves to the backing physical column, not the name `id`. + assert model.primary_key == "order_key" + assert model.sql is not None + assert "orders.order_key AS order_key" in model.sql + assert "orders.id AS id" not in model.sql + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE orders (order_key INT, amount DOUBLE, customer_id INT)") + con.execute("INSERT INTO orders VALUES (1, 100.0, 5), (2, 25.0, 5)") + con.execute("CREATE TABLE customers (id INT, name VARCHAR)") + con.execute("INSERT INTO customers VALUES (5, 'Acme')") + + sql = layer.compile(metrics=["orders_model.amount"], dimensions=["orders_model.customer_name"]) + rows = con.execute(sql).fetchall() + assert rows == [("Acme", 125.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_name_collision.model.tml b/tests/fixtures/thoughtspot/model_name_collision.model.tml new file mode 100644 index 00000000..7fcaa477 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_name_collision.model.tml @@ -0,0 +1,39 @@ +guid: "model-name-collision" +export_schema_version: "2" +model: + name: sales_model + description: A TML field name collides with a different physical column name + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: customers + on: "[sales::customer_id] = [customers::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + formulas: + - name: total + expr: "[amount] + [gross_revenue]" + id: total_f + columns: + - name: customer_name + column_id: customers::name + properties: + column_type: ATTRIBUTE + - name: gross_revenue + column_id: sales::amount + properties: + column_type: MEASURE + aggregation: SUM + - name: amount + column_id: sales::cost + properties: + column_type: MEASURE + aggregation: SUM + - name: total + formula_id: total_f + properties: + column_type: MEASURE + aggregation: SUM diff --git a/tests/fixtures/thoughtspot/model_renamed_id_key.model.tml b/tests/fixtures/thoughtspot/model_renamed_id_key.model.tml new file mode 100644 index 00000000..35d7581c --- /dev/null +++ b/tests/fixtures/thoughtspot/model_renamed_id_key.model.tml @@ -0,0 +1,29 @@ +guid: "model-renamed-id-key" +export_schema_version: "2" +model: + name: orders_model + description: A semantic column named id is backed by a different physical column + model_tables: + - name: orders + fqn: ANALYTICS.PUBLIC.orders + joins: + - with: customers + on: "[orders::customer_id] = [customers::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + columns: + - name: id + column_id: orders::order_key + properties: + column_type: ATTRIBUTE + - name: customer_name + column_id: customers::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: orders::amount + properties: + column_type: MEASURE + aggregation: SUM From e4aaadecf4fae650293953e41e8b89961703ef44 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 12:52:40 -0700 Subject: [PATCH 12/17] fix(thoughtspot): skip joined-table id columns when inferring primary key --- sidemantic/adapters/thoughtspot.py | 17 ++++++---- tests/adapters/thoughtspot/test_parsing.py | 34 +++++++++++++++++++ .../thoughtspot/model_joined_id_key.model.tml | 29 ++++++++++++++++ 3 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 tests/fixtures/thoughtspot/model_joined_id_key.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index 3569d139..eb8c3f61 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -991,15 +991,20 @@ def _infer_model_primary_key(self, dimensions: list[Dimension], base_table: str Prefer a dimension named ``id`` but resolve it to its backing physical column (a column named ``id`` may map to a differently named DB column, - e.g. ``column_id: orders::order_key``). Otherwise, if the base table is - known, keep ``id`` when a base-table column actually resolves to ``id``; - failing that, use the first base-table column so the key references a real - column. Fall back to ``id`` only when no better candidate exists. + e.g. ``column_id: orders::order_key``). Only accept it when its SQL is + unqualified or belongs to the base table; a joined-table ``id`` (e.g. + ``customers::id``) is not the base model's key. Otherwise, if the base + table is known, keep ``id`` when a base-table column actually resolves to + ``id``; failing that, use the first base-table column so the key + references a real column. Fall back to ``id`` only when no better + candidate exists. """ for dim in dimensions: if dim.name.lower() == "id": - _table, column = _split_sql_identifier(dim.sql) - return column or dim.name + table, column = _split_sql_identifier(dim.sql) + if table is None or base_table is None or table == base_table: + return column or dim.name + break if base_table: base_columns: list[str] = [] diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index 431fd79c..ef2e69cc 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -1401,6 +1401,40 @@ def test_thoughtspot_renamed_id_key_uses_backing_column(): assert rows == [("Acme", 125.0)] +def test_thoughtspot_joined_id_dimension_is_not_used_as_primary_key(): + """A joined-table column named `id` is not treated as the base model's key. + + Regression: `_infer_model_primary_key` returned the backing column of the + first dimension named `id` even when it came from a joined table + (`customers::id`). The derived projection then emitted `orders.id AS id`, + failing because the base `orders` table has no `id` (its key is `order_key`). + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_joined_id_key.model.tml") + model = graph.models["orders_model"] + + # The joined-table id is skipped; the base-table column is used as the key. + assert model.primary_key == "order_key" + assert model.sql is not None + assert "orders.order_key AS order_key" in model.sql + assert "orders.id AS id" not in model.sql + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE orders (order_key INT, amount DOUBLE, customer_id INT)") + con.execute("INSERT INTO orders VALUES (1, 100.0, 5), (2, 25.0, 5)") + con.execute("CREATE TABLE customers (id INT)") + con.execute("INSERT INTO customers VALUES (5)") + + sql = layer.compile(metrics=["orders_model.amount"], dimensions=["orders_model.order_key"]) + rows = sorted(con.execute(sql).fetchall()) + assert rows == [(1, 100.0), (2, 25.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_joined_id_key.model.tml b/tests/fixtures/thoughtspot/model_joined_id_key.model.tml new file mode 100644 index 00000000..22573a04 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_joined_id_key.model.tml @@ -0,0 +1,29 @@ +guid: "model-joined-id-key" +export_schema_version: "2" +model: + name: orders_model + description: A joined-table id dimension precedes the base table key + model_tables: + - name: orders + fqn: ANALYTICS.PUBLIC.orders + joins: + - with: customers + on: "[orders::customer_id] = [customers::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + columns: + - name: id + column_id: customers::id + properties: + column_type: ATTRIBUTE + - name: order_key + column_id: orders::order_key + properties: + column_type: ATTRIBUTE + - name: amount + column_id: orders::amount + properties: + column_type: MEASURE + aggregation: SUM From 608e374788db7010d684d2f9662c98c66eabccf0 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 13:07:13 -0700 Subject: [PATCH 13/17] fix(thoughtspot): only treat equality conjuncts as relationship join keys --- sidemantic/adapters/thoughtspot.py | 31 +++++++++---- tests/adapters/thoughtspot/test_parsing.py | 43 +++++++++++++++++++ .../thoughtspot/model_non_equi_join.model.tml | 29 +++++++++++++ 3 files changed, 94 insertions(+), 9 deletions(-) create mode 100644 tests/fixtures/thoughtspot/model_non_equi_join.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index eb8c3f61..00d452dd 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -59,6 +59,10 @@ _BARE_IDENTIFIER = re.compile(r"(?=`, `!=`, `<>`, or `==`). +_EQUALITY_OP = re.compile(r"(?=!])=(?![=])") def _sub_outside_strings(pattern: re.Pattern[str], repl: Any, text: str) -> str: @@ -197,20 +201,29 @@ def _extract_all_join_refs( """Extract every ``left = right`` key pair from a (possibly composite) join. A composite ON clause like ``[a::x] = [b::y] AND [a::p] = [b::q]`` yields all - consecutive ref pairs, so composite-key relationships keep both columns - instead of silently dropping all but the first pair. + equality pairs, so composite-key relationships keep both columns instead of + silently dropping all but the first pair. Only pure equality conjuncts are + treated as key pairs; range/non-equi predicates (e.g. + ``[a::date] BETWEEN [b::start] AND [b::end]``) are skipped so they are not + mistaken for additional equality keys. """ if not expr: return [] - tokens = _TML_REF.findall(expr) - if len(tokens) < 2: - tokens = [f"{t[0]}.{t[1]}" for t in _TML_DOT_REF.findall(expr)] - pairs: list[tuple[tuple[str | None, str], tuple[str | None, str]]] = [] - for i in range(0, len(tokens) - 1, 2): - left = _parse_ref_token(tokens[i]) - right = _parse_ref_token(tokens[i + 1]) + for conjunct in _AND_SPLIT.split(expr): + tokens = _TML_REF.findall(conjunct) + if len(tokens) < 2: + tokens = [f"{t[0]}.{t[1]}" for t in _TML_DOT_REF.findall(conjunct)] + + # Only an equality between exactly two refs is a join key pair. A `=` that + # is part of `<=`/`>=`/`!=` is not a plain equality. + equalities = _EQUALITY_OP.findall(conjunct) + if len(tokens) != 2 or len(equalities) != 1: + continue + + left = _parse_ref_token(tokens[0]) + right = _parse_ref_token(tokens[1]) if table_path_lookup: if left[0] in table_path_lookup: left = (table_path_lookup[left[0]], left[1]) diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index ef2e69cc..7f813e01 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -1435,6 +1435,49 @@ def test_thoughtspot_joined_id_dimension_is_not_used_as_primary_key(): assert rows == [(1, 100.0), (2, 25.0)] +def test_thoughtspot_non_equi_join_predicate_is_not_a_relationship_key(): + """A range predicate in a join ON clause is not mistaken for an equality key. + + Regression: the composite-key extractor paired every two consecutive refs, so + `[sales::region_id] = [regions::id] AND [sales::date] BETWEEN + [regions::start_date] AND [regions::end_date]` produced bogus extra keys + (`date -> start_date`). Only the real equality conjunct should become the + relationship key; the range predicate stays in the join SQL but is ignored as + a key. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_non_equi_join.model.tml") + model = graph.models["sales_model"] + + rel = {r.name: r for r in model.relationships}["regions"] + # Only the equality key pair is captured; the BETWEEN refs are not keys. + assert rel.foreign_key == "region_id" + assert rel.primary_key == "id" + + # The range predicate is preserved in the join SQL, but its columns are not + # projected as spurious key columns. + assert model.sql is not None + assert "BETWEEN regions.start_date AND regions.end_date" in model.sql + assert "AS start_date" not in model.sql + assert "AS date" not in model.sql + + # The model still compiles and runs. + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE sales (id INT, amount DOUBLE, region_id INT, date DATE)") + con.execute("INSERT INTO sales VALUES (1, 100.0, 7, DATE '2024-06-15'), (2, 25.0, 7, DATE '2024-06-20')") + con.execute("CREATE TABLE regions (id INT, name VARCHAR, start_date DATE, end_date DATE)") + con.execute("INSERT INTO regions VALUES (7, 'West', DATE '2024-06-01', DATE '2024-06-30')") + + sql = layer.compile(metrics=["sales_model.amount"], dimensions=["sales_model.region_name"]) + rows = con.execute(sql).fetchall() + assert rows == [("West", 125.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_non_equi_join.model.tml b/tests/fixtures/thoughtspot/model_non_equi_join.model.tml new file mode 100644 index 00000000..1b60ad5a --- /dev/null +++ b/tests/fixtures/thoughtspot/model_non_equi_join.model.tml @@ -0,0 +1,29 @@ +guid: "model-non-equi-join" +export_schema_version: "2" +model: + name: sales_model + description: Join with an equality key plus a BETWEEN range predicate + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: regions + on: "[sales::region_id] = [regions::id] AND [sales::date] BETWEEN [regions::start_date] AND [regions::end_date]" + type: INNER + cardinality: MANY_TO_ONE + - name: regions + fqn: ANALYTICS.PUBLIC.regions + columns: + - name: order_id + column_id: sales::id + properties: + column_type: ATTRIBUTE + - name: region_name + column_id: regions::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: sales::amount + properties: + column_type: MEASURE + aggregation: SUM From f4e34cd41b1f9bf30de4915cebf87a148bbb0553 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 13:21:32 -0700 Subject: [PATCH 14/17] fix(thoughtspot): resolve aliased table id and name refs to the alias --- sidemantic/adapters/thoughtspot.py | 13 +++++++ tests/adapters/thoughtspot/test_parsing.py | 37 +++++++++++++++++++ .../model_aliased_table_id_ref.model.tml | 31 ++++++++++++++++ 3 files changed, 81 insertions(+) create mode 100644 tests/fixtures/thoughtspot/model_aliased_table_id_ref.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index 00d452dd..edba394b 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -787,6 +787,19 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any path_lookup: dict[str, str] = dict(table_name_lookup) for alias in alias_to_table: path_lookup[alias] = alias + # An aliased entry may still be referenced by its `id`/`name` in a + # `column_id` or `on` expression (e.g. `id: countries_tbl, alias: + # ship_country` with `column_id: countries_tbl::name`). The SQL relation in + # scope is the alias (`JOIN countries AS ship_country`), so map the entry's + # id/name tokens to the alias too; otherwise the qualifier resolves to the + # backing table name, which is not in scope. + for table in model_tables: + alias = table.get("alias") + if not alias: + continue + for token in (table.get("id"), table.get("name")): + if token and token != alias: + path_lookup[token] = alias # Flatten nested joins (one per model_tables entry) into the same shape # the worksheet join helpers consume. When a table carries an `alias`, diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index 7f813e01..1fb96542 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -1478,6 +1478,43 @@ def test_thoughtspot_non_equi_join_predicate_is_not_a_relationship_key(): assert rows == [("West", 125.0)] +def test_thoughtspot_aliased_table_referenced_by_id_is_queryable(): + """An aliased table referenced by its id resolves to the alias, not the table. + + Regression: a `model_tables` entry `id: countries_tbl, alias: ship_country` + referenced by `column_id: countries_tbl::name` resolved to `countries.name`, + but the SQL emits `JOIN countries AS ship_country`, so `countries` is not in + scope. The id/name tokens of an aliased entry must resolve to the alias. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_aliased_table_id_ref.model.tml") + model = graph.models["shipments_model"] + + # The id-qualified field resolves to the alias relation, which is in scope. + assert model.get_dimension("ship_country_name").sql == "ship_country__name" + assert model.sql is not None + assert "JOIN countries AS ship_country" in model.sql + assert "ship_country.name AS ship_country__name" in model.sql + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE orders (id INT, amount DOUBLE, ship_country_id INT)") + con.execute("INSERT INTO orders VALUES (1, 100.0, 7), (2, 25.0, 7)") + con.execute("CREATE TABLE countries (id INT, name VARCHAR)") + con.execute("INSERT INTO countries VALUES (7, 'US')") + + sql = layer.compile( + metrics=["shipments_model.amount"], + dimensions=["shipments_model.ship_country_name"], + ) + rows = con.execute(sql).fetchall() + assert rows == [("US", 125.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_aliased_table_id_ref.model.tml b/tests/fixtures/thoughtspot/model_aliased_table_id_ref.model.tml new file mode 100644 index 00000000..014ade24 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_aliased_table_id_ref.model.tml @@ -0,0 +1,31 @@ +guid: "model-aliased-table-id-ref" +export_schema_version: "2" +model: + name: shipments_model + description: An aliased table is referenced by its id in a column_id + model_tables: + - name: orders + fqn: ANALYTICS.PUBLIC.orders + joins: + - with: ship_country + on: "[orders::ship_country_id] = [ship_country::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: countries + id: countries_tbl + alias: ship_country + fqn: ANALYTICS.PUBLIC.countries + columns: + - name: order_id + column_id: orders::id + properties: + column_type: ATTRIBUTE + - name: ship_country_name + column_id: countries_tbl::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: orders::amount + properties: + column_type: MEASURE + aggregation: SUM From 280e05bf8c07a60ac9f03328c3f38400d8264272 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 13:36:24 -0700 Subject: [PATCH 15/17] fix(thoughtspot): resolve aliased join targets and infer measure-backed keys --- sidemantic/adapters/thoughtspot.py | 42 ++++++----- tests/adapters/thoughtspot/test_parsing.py | 70 +++++++++++++++++++ .../model_join_target_by_id.model.tml | 31 ++++++++ .../thoughtspot/model_measure_key.model.tml | 30 ++++++++ 4 files changed, 157 insertions(+), 16 deletions(-) create mode 100644 tests/fixtures/thoughtspot/model_join_target_by_id.model.tml create mode 100644 tests/fixtures/thoughtspot/model_measure_key.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index edba394b..ad64fe4a 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -813,12 +813,15 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any destination = join_def.get("with") if not source or not destination: continue - # Keep an aliased destination as-is (the role name); only resolve - # non-aliased ids to their table name. + # Resolve the join target to the in-scope relation name. For an + # aliased entry that is the role alias (so a `with:` token that is + # the alias, or the table id/name of an aliased entry, becomes the + # alias); `path_lookup` already encodes both, falling back to the + # backing table name for non-aliased entries. if destination in alias_to_table: resolved_dest = destination else: - resolved_dest = table_name_lookup.get(destination, destination) + resolved_dest = path_lookup.get(destination, table_name_lookup.get(destination, destination)) # PyYAML (YAML 1.1) parses the bare `on:` key as the boolean True. on_value = join_def.get("on") if on_value is None and True in join_def: @@ -939,7 +942,7 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any # table. Prefer a dimension named `id`; otherwise infer the key from a # base-table column so a model whose key is not literally `id` (e.g. # `order_key`) does not project a non-existent `id` column. - primary_key = self._infer_model_primary_key(dimensions, base_table) + primary_key = self._infer_model_primary_key(dimensions, metrics, base_table) # A joined model is exported as derived SQL (FROM () AS t); rewrite # its `SELECT *` into explicit aliased columns and update the dimension/ @@ -1012,30 +1015,37 @@ def _parse_model(self, model_def: dict[str, Any] | None, full_def: dict[str, Any return model - def _infer_model_primary_key(self, dimensions: list[Dimension], base_table: str | None) -> str: + def _infer_model_primary_key( + self, + dimensions: list[Dimension], + metrics: list[Metric], + base_table: str | None, + ) -> str: """Infer a queryable primary key column for a TML Model. - Prefer a dimension named ``id`` but resolve it to its backing physical - column (a column named ``id`` may map to a differently named DB column, - e.g. ``column_id: orders::order_key``). Only accept it when its SQL is + Prefer a field named ``id`` but resolve it to its backing physical column + (a column named ``id`` may map to a differently named DB column, e.g. + ``column_id: orders::order_key``). Only accept it when its SQL is unqualified or belongs to the base table; a joined-table ``id`` (e.g. ``customers::id``) is not the base model's key. Otherwise, if the base table is known, keep ``id`` when a base-table column actually resolves to ``id``; failing that, use the first base-table column so the key - references a real column. Fall back to ``id`` only when no better - candidate exists. + references a real column. ThoughtSpot often exports numeric key columns as + measures, so dimensions are scanned first but metrics are considered too. + Fall back to ``id`` only when no better candidate exists. """ - for dim in dimensions: - if dim.name.lower() == "id": - table, column = _split_sql_identifier(dim.sql) + fields: list[Dimension | Metric] = [*dimensions, *metrics] + for field in fields: + if field.name.lower() == "id": + table, column = _split_sql_identifier(field.sql) if table is None or base_table is None or table == base_table: - return column or dim.name + return column or field.name break if base_table: base_columns: list[str] = [] - for dim in dimensions: - table, column = _split_sql_identifier(dim.sql) + for field in fields: + table, column = _split_sql_identifier(field.sql) if column and (table is None or table == base_table): base_columns.append(column) if "id" in base_columns: diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index 1fb96542..f12f1d90 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -1515,6 +1515,76 @@ def test_thoughtspot_aliased_table_referenced_by_id_is_queryable(): assert rows == [("US", 125.0)] +def test_thoughtspot_join_target_by_id_resolves_to_alias(): + """A join whose `with:` target is an aliased table id resolves to the alias. + + Regression: after id/name refs mapped to the alias, the join-flatten step + still resolved `with: countries_tbl` to the backing table name, so the SQL + emitted `JOIN countries ON ... ship_country.id` (alias not in scope) and the + relationship was named `countries`. The join target must resolve to the alias. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_join_target_by_id.model.tml") + model = graph.models["shipments_model"] + + # The join uses the alias as the relation name, matching the ON clause. + assert model.sql is not None + assert "JOIN countries AS ship_country" in model.sql + assert {r.name for r in model.relationships} == {"ship_country"} + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE orders (id INT, amount DOUBLE, ship_country_id INT)") + con.execute("INSERT INTO orders VALUES (1, 100.0, 7), (2, 25.0, 7)") + con.execute("CREATE TABLE countries (id INT, name VARCHAR)") + con.execute("INSERT INTO countries VALUES (7, 'US')") + + sql = layer.compile( + metrics=["shipments_model.amount"], + dimensions=["shipments_model.ship_country_name"], + ) + rows = con.execute(sql).fetchall() + assert rows == [("US", 125.0)] + + +def test_thoughtspot_measure_key_column_is_inferred_as_primary_key(): + """A base-table key exported as a measure is still inferred as the primary key. + + Regression: `_infer_model_primary_key` only scanned dimensions, but + ThoughtSpot often exports key columns as measures. With `order_key` as a + MEASURE and no physical `orders.id`, the primary key defaulted to `id` and the + derived projection injected a non-existent `orders.id AS id`, breaking even a + plain aggregate query. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_measure_key.model.tml") + model = graph.models["orders_model"] + + # The measure-backed base-table column is used as the key. + assert model.primary_key == "order_key" + assert model.sql is not None + assert "orders.id AS id" not in model.sql + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE orders (order_key INT, amount DOUBLE, customer_id INT)") + con.execute("INSERT INTO orders VALUES (1, 100.0, 5), (2, 25.0, 5)") + con.execute("CREATE TABLE customers (id INT, name VARCHAR)") + con.execute("INSERT INTO customers VALUES (5, 'Acme')") + + sql = layer.compile(metrics=["orders_model.amount"], dimensions=["orders_model.customer_name"]) + rows = con.execute(sql).fetchall() + assert rows == [("Acme", 125.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_join_target_by_id.model.tml b/tests/fixtures/thoughtspot/model_join_target_by_id.model.tml new file mode 100644 index 00000000..30a8c571 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_join_target_by_id.model.tml @@ -0,0 +1,31 @@ +guid: "model-join-target-by-id" +export_schema_version: "2" +model: + name: shipments_model + description: A join identifies an aliased target table by its id + model_tables: + - name: orders + fqn: ANALYTICS.PUBLIC.orders + joins: + - with: countries_tbl + on: "[orders::ship_country_id] = [ship_country::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: countries + id: countries_tbl + alias: ship_country + fqn: ANALYTICS.PUBLIC.countries + columns: + - name: order_id + column_id: orders::id + properties: + column_type: ATTRIBUTE + - name: ship_country_name + column_id: ship_country::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: orders::amount + properties: + column_type: MEASURE + aggregation: SUM diff --git a/tests/fixtures/thoughtspot/model_measure_key.model.tml b/tests/fixtures/thoughtspot/model_measure_key.model.tml new file mode 100644 index 00000000..5f0c1858 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_measure_key.model.tml @@ -0,0 +1,30 @@ +guid: "model-measure-key" +export_schema_version: "2" +model: + name: orders_model + description: The base table key is exported as a MEASURE and there is no id + model_tables: + - name: orders + fqn: ANALYTICS.PUBLIC.orders + joins: + - with: customers + on: "[orders::customer_id] = [customers::id]" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + columns: + - name: order_key + column_id: orders::order_key + properties: + column_type: MEASURE + aggregation: COUNT + - name: customer_name + column_id: customers::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: orders::amount + properties: + column_type: MEASURE + aggregation: SUM From 8a23d0d55f9bbf204c0844ead814b17d7b797279 Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 13:50:58 -0700 Subject: [PATCH 16/17] fix(thoughtspot): skip key-less and function-wrapped joins as relationship keys --- sidemantic/adapters/thoughtspot.py | 42 +++++++---- tests/adapters/thoughtspot/test_parsing.py | 69 +++++++++++++++++++ .../model_range_only_join.model.tml | 29 ++++++++ .../model_wrapped_join_key.model.tml | 29 ++++++++ 4 files changed, 156 insertions(+), 13 deletions(-) create mode 100644 tests/fixtures/thoughtspot/model_range_only_join.model.tml create mode 100644 tests/fixtures/thoughtspot/model_wrapped_join_key.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index ad64fe4a..18ef0d6a 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -202,28 +202,37 @@ def _extract_all_join_refs( A composite ON clause like ``[a::x] = [b::y] AND [a::p] = [b::q]`` yields all equality pairs, so composite-key relationships keep both columns instead of - silently dropping all but the first pair. Only pure equality conjuncts are - treated as key pairs; range/non-equi predicates (e.g. - ``[a::date] BETWEEN [b::start] AND [b::end]``) are skipped so they are not - mistaken for additional equality keys. + silently dropping all but the first pair. Only a plain equality between two + bare column references is a key pair; range/non-equi predicates (e.g. + ``[a::date] BETWEEN [b::start] AND [b::end]``) and wrapped expressions (e.g. + ``LOWER([a::email]) = LOWER([b::email])``) are skipped so they are not + mistaken for equality keys. """ if not expr: return [] + def _bare_ref(side: str) -> tuple[str | None, str] | None: + """Parse ``side`` only if it is exactly one bare ref, not a wrapped expr.""" + side = side.strip() + bracketed = _TML_REF.fullmatch(side) + if bracketed: + return _parse_ref_token(bracketed.group(1)) + if _TML_DOT_REF.fullmatch(side) or _SIMPLE_IDENTIFIER.match(side): + return _parse_ref_token(side) + return None + pairs: list[tuple[tuple[str | None, str], tuple[str | None, str]]] = [] for conjunct in _AND_SPLIT.split(expr): - tokens = _TML_REF.findall(conjunct) - if len(tokens) < 2: - tokens = [f"{t[0]}.{t[1]}" for t in _TML_DOT_REF.findall(conjunct)] - - # Only an equality between exactly two refs is a join key pair. A `=` that - # is part of `<=`/`>=`/`!=` is not a plain equality. + # A plain equality has exactly one `=` (not part of `<=`/`>=`/`!=`). equalities = _EQUALITY_OP.findall(conjunct) - if len(tokens) != 2 or len(equalities) != 1: + if len(equalities) != 1: + continue + left_side, right_side = _EQUALITY_OP.split(conjunct, maxsplit=1) + left = _bare_ref(left_side) + right = _bare_ref(right_side) + if not left or not right: continue - left = _parse_ref_token(tokens[0]) - right = _parse_ref_token(tokens[1]) if table_path_lookup: if left[0] in table_path_lookup: left = (table_path_lookup[left[0]], left[1]) @@ -1082,6 +1091,13 @@ def _parse_model_relationships( # so cross-model joins do not silently drop part of the key. ref_pairs = _extract_all_join_refs(on_value, lookup) + # A join with no equality keys (e.g. a pure range predicate) has no + # usable cross-model relationship key. Emitting a relationship with + # unset keys makes Sidemantic default to `_id`/`id`, which do not + # exist; skip it. The predicate stays in the derived join SQL. + if not ref_pairs: + continue + if table_name_lookup: source = table_name_lookup.get(source, source) destination = table_name_lookup.get(destination, destination) diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index f12f1d90..0d57c41e 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -1585,6 +1585,75 @@ def test_thoughtspot_measure_key_column_is_inferred_as_primary_key(): assert rows == [("Acme", 125.0)] +def test_thoughtspot_range_only_join_emits_no_relationship(): + """A join with only a range predicate emits no cross-model relationship. + + Regression: a join with no equality key (e.g. `[sales::date] BETWEEN + [calendar::start_date] AND [calendar::end_date]`) still appended a relationship + with unset keys, which Sidemantic defaults to `calendar_id`/`id` (non-existent + columns). No relationship should be emitted; the predicate stays in the join. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_range_only_join.model.tml") + model = graph.models["sales_model"] + + # No relationship is created from a key-less range join. + assert model.relationships == [] + # The range predicate is preserved in the derived join SQL. + assert model.sql is not None + assert "BETWEEN calendar.start_date AND calendar.end_date" in model.sql + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE sales (id INT, amount DOUBLE, date DATE)") + con.execute("INSERT INTO sales VALUES (1, 100.0, DATE '2024-06-15'), (2, 25.0, DATE '2024-06-20')") + con.execute("CREATE TABLE calendar (period_name VARCHAR, start_date DATE, end_date DATE)") + con.execute("INSERT INTO calendar VALUES ('June', DATE '2024-06-01', DATE '2024-06-30')") + + sql = layer.compile(metrics=["sales_model.amount"], dimensions=["sales_model.period_name"]) + rows = con.execute(sql).fetchall() + assert rows == [("June", 125.0)] + + +def test_thoughtspot_wrapped_equality_is_not_a_relationship_key(): + """A function-wrapped equality (e.g. LOWER()) is not stored as a join key. + + Regression: `LOWER([users::email]) = LOWER([contacts::email])` matched the + "two refs, one `=`" heuristic and stored a plain `email -> email` key. A + cross-model query would then use a case-sensitive `email = email` join that + differs from the `LOWER()` predicate. Only bare-ref equalities are keys. + """ + import duckdb + + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_wrapped_join_key.model.tml") + model = graph.models["users_model"] + + # The wrapped equality does not become a relationship key. + assert model.relationships == [] + # The LOWER() predicate is preserved in the derived join SQL. + assert model.sql is not None + assert "LOWER(users.email) = LOWER(contacts.email)" in model.sql + + layer = SemanticLayer() + layer.add_model(model) + + con = duckdb.connect() + con.execute("CREATE TABLE users (id INT, amount DOUBLE, email VARCHAR)") + con.execute("INSERT INTO users VALUES (1, 100.0, 'A@X.COM'), (2, 25.0, 'A@X.COM')") + con.execute("CREATE TABLE contacts (name VARCHAR, email VARCHAR)") + con.execute("INSERT INTO contacts VALUES ('Acme', 'a@x.com')") + + sql = layer.compile(metrics=["users_model.amount"], dimensions=["users_model.contact_name"]) + rows = con.execute(sql).fetchall() + # The case-insensitive join still matches via the preserved LOWER() predicate. + assert rows == [("Acme", 125.0)] + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_range_only_join.model.tml b/tests/fixtures/thoughtspot/model_range_only_join.model.tml new file mode 100644 index 00000000..6ff8b92a --- /dev/null +++ b/tests/fixtures/thoughtspot/model_range_only_join.model.tml @@ -0,0 +1,29 @@ +guid: "model-range-only-join" +export_schema_version: "2" +model: + name: sales_model + description: Join with only a range predicate and no equality key + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: calendar + on: "[sales::date] BETWEEN [calendar::start_date] AND [calendar::end_date]" + type: INNER + cardinality: MANY_TO_ONE + - name: calendar + fqn: ANALYTICS.PUBLIC.calendar + columns: + - name: order_id + column_id: sales::id + properties: + column_type: ATTRIBUTE + - name: period_name + column_id: calendar::period_name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: sales::amount + properties: + column_type: MEASURE + aggregation: SUM diff --git a/tests/fixtures/thoughtspot/model_wrapped_join_key.model.tml b/tests/fixtures/thoughtspot/model_wrapped_join_key.model.tml new file mode 100644 index 00000000..42659db8 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_wrapped_join_key.model.tml @@ -0,0 +1,29 @@ +guid: "model-wrapped-join-key" +export_schema_version: "2" +model: + name: users_model + description: Join keys wrapped in LOWER() are not plain equality keys + model_tables: + - name: users + fqn: ANALYTICS.PUBLIC.users + joins: + - with: contacts + on: "LOWER([users::email]) = LOWER([contacts::email])" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: contacts + fqn: ANALYTICS.PUBLIC.contacts + columns: + - name: user_id + column_id: users::id + properties: + column_type: ATTRIBUTE + - name: contact_name + column_id: contacts::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: users::amount + properties: + column_type: MEASURE + aggregation: SUM From 176fe8da03e4fd41709465f0e16275023ae2d2ca Mon Sep 17 00:00:00 2001 From: Nico Ritschel Date: Sun, 14 Jun 2026 14:05:03 -0700 Subject: [PATCH 17/17] fix(thoughtspot): strip surrounding parentheses when matching join equality keys --- sidemantic/adapters/thoughtspot.py | 5 +++- tests/adapters/thoughtspot/test_parsing.py | 18 ++++++++++++ .../model_parenthesized_join.model.tml | 29 +++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 tests/fixtures/thoughtspot/model_parenthesized_join.model.tml diff --git a/sidemantic/adapters/thoughtspot.py b/sidemantic/adapters/thoughtspot.py index 18ef0d6a..42dd15ce 100644 --- a/sidemantic/adapters/thoughtspot.py +++ b/sidemantic/adapters/thoughtspot.py @@ -213,7 +213,10 @@ def _extract_all_join_refs( def _bare_ref(side: str) -> tuple[str | None, str] | None: """Parse ``side`` only if it is exactly one bare ref, not a wrapped expr.""" - side = side.strip() + # Harmless surrounding parentheses/whitespace (e.g. `([a::x] = [b::y])` + # splits into `([a::x]` / `[b::y])`) are stripped; a bare ref never + # contains parentheses, so this does not admit function-wrapped exprs. + side = side.strip().strip("()").strip() bracketed = _TML_REF.fullmatch(side) if bracketed: return _parse_ref_token(bracketed.group(1)) diff --git a/tests/adapters/thoughtspot/test_parsing.py b/tests/adapters/thoughtspot/test_parsing.py index 0d57c41e..bb736851 100644 --- a/tests/adapters/thoughtspot/test_parsing.py +++ b/tests/adapters/thoughtspot/test_parsing.py @@ -1654,6 +1654,24 @@ def test_thoughtspot_wrapped_equality_is_not_a_relationship_key(): assert rows == [("Acme", 125.0)] +def test_thoughtspot_parenthesized_equality_is_a_relationship_key(): + """An equality wrapped in harmless parentheses still becomes a join key. + + Regression: requiring each equality side to be a bare ref dropped valid keys + when the predicate was parenthesized (e.g. `([sales::customer_id] = + [customers::id])`), since the sides reached the matcher as `([sales::...]` / + `[customers::id])`. Surrounding parentheses must be stripped first. + """ + adapter = ThoughtSpotAdapter() + graph = adapter.parse("tests/fixtures/thoughtspot/model_parenthesized_join.model.tml") + model = graph.models["sales_model"] + + rel = {r.name: r for r in model.relationships}["customers"] + assert rel.type == "many_to_one" + assert rel.foreign_key == "customer_id" + assert rel.primary_key == "id" + + def test_thoughtspot_model_auto_detect_loader(): """A model + model_tables + columns YAML file is auto-detected as ThoughtSpot.""" with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/fixtures/thoughtspot/model_parenthesized_join.model.tml b/tests/fixtures/thoughtspot/model_parenthesized_join.model.tml new file mode 100644 index 00000000..f53385e8 --- /dev/null +++ b/tests/fixtures/thoughtspot/model_parenthesized_join.model.tml @@ -0,0 +1,29 @@ +guid: "model-parenthesized-join" +export_schema_version: "2" +model: + name: sales_model + description: An equality join predicate wrapped in harmless parentheses + model_tables: + - name: sales + fqn: ANALYTICS.PUBLIC.sales + joins: + - with: customers + on: "([sales::customer_id] = [customers::id])" + type: LEFT_OUTER + cardinality: MANY_TO_ONE + - name: customers + fqn: ANALYTICS.PUBLIC.customers + columns: + - name: order_id + column_id: sales::id + properties: + column_type: ATTRIBUTE + - name: customer_name + column_id: customers::name + properties: + column_type: ATTRIBUTE + - name: amount + column_id: sales::amount + properties: + column_type: MEASURE + aggregation: SUM