DataBiosphere · dsotirho-ucsc · Apr 1, 2026 · Apr 21, 2026 · Mar 30, 2026 · Apr 21, 2026
@@ -26,7 +26,6 @@
 )
 
 from more_itertools import (
-    first,
     one,
 )
 
@@ -429,15 +428,27 @@ def from_index(self, value: str) -> str | None:
 null_datetime: NullableDateTime = NullableDateTime(str, str)
 
 
-class Nested(PassThrough[JSON]):
+class Nested(FieldType[JSON, JSON]):
+    allow_sorting_by_empty_lists = False
+    es_type = 'nested'
     properties: Mapping[str, FieldType]
-    agg_property: str
 
     def __init__(self, **properties):
-        super().__init__(JSON, es_type='nested')
-        self.agg_property = first(properties.keys())
+        super().__init__(JSON, JSON)
         self.properties = properties
 
+    def to_index(self, value: JSON) -> JSON:
+        return {
+            field: field_type.to_index(value[field])
+            for field, field_type in self.properties.items()
+        }
+
+    def from_index(self, value: JSON) -> JSON:
+        return {
+            field: field_type.from_index(value[field])
+            for field, field_type in self.properties.items()
+        }
+
     def api_filter_values_schema(self, operator: str, mode: Mode) -> JSON:
         assert operator == 'is'
         schema = super().api_filter_values_schema(operator, mode)

@@ -21,6 +21,7 @@
     FieldTypes,
     FieldTypes1,
     Nested,
+    pass_thru_bool,
 )
 from azul.indexer.document import (
     Aggregate,
@@ -43,6 +44,7 @@
     json_dict,
 )
 from azul.plugins import (
+    FieldName,
     MetadataPlugin,
 )
 
@@ -96,7 +98,6 @@ def field_type(self, catalog: CatalogName, path: FieldPath) -> FieldType:
         if isinstance(field_types, Nested):
             element = next(elements, None)
             if element is not None:
-                assert element == field_types.agg_property, (element, field_types)
                 field_types = field_types.properties[element]
         assert isinstance(field_types, FieldType), (path, field_types)
         element = next(elements, None)
@@ -122,6 +123,26 @@ def field_types(self, catalog: CatalogName) -> FieldTypes:
             # does not undergo translation
         )
 
+    @cache
+    def mapped_field_types(self, catalog: CatalogName) -> Mapping[FieldName, FieldType]:
+        """
+        Returns the field type for each supported sort and filter field, using
+        the name of the field as provided by clients. Unlike field_types(), this
+        is a flat mapping and includes the synthetic field 'accessible' that has
+        no entry in the plugin's field_mapping.
+
+        :return: a mapping from each field's name to its type
+        """
+        plugin = self.metadata_plugin(catalog)
+        result = {}
+        for field, path in plugin.field_mapping.items():
+            field_type = self.field_type(catalog, path)
+            result[field] = field_type
+        accessible_field = plugin.special_fields.accessible.name
+        assert accessible_field not in result, result
+        result[accessible_field] = pass_thru_bool
+        return result
+
     def catalogued_field_types(self) -> CataloguedFieldTypes:
         return {
             catalog: self.field_types(catalog)

@@ -164,6 +164,10 @@ def dotted(path_or_element: FieldPathElement | FieldPath,
         return dot.join(path_or_element)
 
 
+def undotted(path: DottedFieldPath) -> FieldPath:
+    return tuple(path.split('.'))
+
+
 class DocumentSlice(TypedDict, total=False):
     """
     Also known in OpenSearch land as a *source filter*, but that phrase has

@@ -203,8 +203,8 @@ def _accumulator(self, field) -> Accumulator | None:
             return SetOfDictAccumulator(key=compose_keys(none_safe_key(),
                                                          none_safe_itemgetter('accession')))
         elif field == 'tissue_atlas':
-            return SetOfDictAccumulator(key=compose_keys(none_safe_key(),
-                                                         none_safe_itemgetter('atlas')))
+            return SetOfDictAccumulator(key=compose_keys(none_safe_tuple_key(),
+                                                         none_safe_itemgetter('atlas', 'version')))
         else:
             return super()._accumulator(field)
 

@@ -38,6 +38,7 @@
     ilen,
     one,
     only,
+    unique_everseen,
 )
 
 from azul import (
@@ -735,7 +736,16 @@ def _project(self, project: api.Project) -> MutableJSON:
             'accessions': list(map(self._accession, project.accessions)),
             'is_tissue_atlas_project': any(bionetwork.atlas_project
                                            for bionetwork in project.bionetworks),
-            'tissue_atlas': list(map(self._tissue_atlas, project.bionetworks)),
+            # Despite this field being aggregated with a SetOfDictAccumulator,
+            # we need to manually deduplicate the field's values because the
+            # project inner entity is not aggregated, only copied, when part of
+            # a project outer entity. Since this field is a nested field, we
+            # need the values to be unique, otherwise we'd get incorrect term
+            # facet totals.
+            'tissue_atlas': list(unique_everseen(
+                map(self._tissue_atlas, project.bionetworks),
+                key=lambda d: frozenset(d.items())
+            )),
             'bionetwork_name': json_sorted(bionetwork.name
                                            for bionetwork in project.bionetworks),
             'estimated_cell_count': project.estimated_cell_count,

@@ -18,6 +18,9 @@
     one,
 )
 
+from azul.field_type import (
+    Nested,
+)
 from azul.lib import (
     cached_property,
 )
@@ -557,9 +560,11 @@ def file_type_summary(aggregate_file: JSON) -> FileTypeSummaryForHit:
             ]
             return summarized_hit
 
-    def make_terms(self, agg) -> Terms:
+    def make_terms(self, field_type, agg) -> Terms:
         def choose_entry(_term):
-            if 'key_as_string' in _term:
+            if nested_property_names is not None:
+                return dict(zip(nested_property_names, _term['key']))
+            elif 'key_as_string' in _term:
                 return _term['key_as_string']
             elif (term_key := _term['key']) is None:
                 return None
@@ -570,10 +575,14 @@ def choose_entry(_term):
             else:
                 return str(term_key)
 
+        if isinstance(field_type, Nested):
+            nested_property_names = field_type.properties.keys()
+        else:
+            nested_property_names = None
         terms: list[Term] = []
         for bucket in agg['myTerms']['buckets']:
-            term = Term(term=choose_entry(bucket),
-                        count=bucket['doc_count'])
+            doc_count = bucket['doc_count']
+            term = Term(term=choose_entry(bucket), count=doc_count)
             try:
                 sub_agg = bucket['myProjectIds']
             except KeyError:
@@ -604,8 +613,9 @@ def choose_entry(_term):
                      type='terms')
 
     def make_facets(self, aggs: JSON) -> dict[str, Terms]:
+        field_types = self.service.mapped_field_types(self.catalog)
         facets = {}
         for facet, agg in aggs.items():
             if facet != '_project_agg':  # Filter out project specific aggs
-                facets[facet] = self.make_terms(agg)
+                facets[facet] = self.make_terms(field_types[facet], agg)
         return facets
@@ -28,7 +28,6 @@
 from azul.field_type import (
     FieldType,
     Mode,
-    pass_thru_bool,
 )
 from azul.lib import (
     cache,
@@ -224,19 +223,4 @@ def _filter_schema_validator(self,
 
     @cache
     def _field_types(self, catalog: CatalogName) -> Mapping[str, FieldType]:
-        """
-        Returns the field type for each supported sort and filter field, using
-        the name of the field as provided by clients.
-        """
-        result = {}
-        plugin = self._metadata_plugin
-        for field, path in plugin.field_mapping.items():
-            field_type = self._service.field_type(catalog, path)
-            if isinstance(field_type, FieldType):
-                result[field] = field_type
-        # This field is a synthetic element of the response and will never be
-        # null. Including it here helps to streamline request validation.
-        accessible_field = plugin.special_fields.accessible.name
-        assert accessible_field not in result, result
-        result[accessible_field] = pass_thru_bool
-        return result
+        return self._service.mapped_field_types(catalog)
@@ -35,6 +35,7 @@
 )
 from opensearchpy.helpers.aggs import (
     Agg,
+    MultiTerms,
     Terms,
 )
 from opensearchpy.helpers.query import (
@@ -81,9 +82,11 @@
 )
 from azul.plugins import (
     DocumentSlice,
+    DottedFieldPath,
     FieldPath,
     MetadataPlugin,
     dotted,
+    undotted,
 )
 
 log = logging.getLogger(__name__)
@@ -327,21 +330,35 @@ def _prepare_aggregation(self, *, facet: str, facet_path: FieldPath) -> Agg:
 
         field_type = self.service.field_type(self.catalog, facet_path)
         if isinstance(field_type, Nested):
-            nested_agg = agg.bucket(name='nested',
-                                    agg_type='nested',
-                                    path=dotted(facet_path))
-            facet_path = dotted(facet_path, field_type.agg_property)
+            path = dotted(facet_path)
+            # A nested aggregation to aggregate on fields inside a nested field
+            agg.bucket(name='nested',
+                       agg_type='nested',
+                       path=path)
+            # A multi-terms aggregation to form composite keys made from the
+            # fields inside a nested field
+            agg.aggs.nested.bucket(name='myTerms',
+                                   agg_type='multi_terms',
+                                   terms=[
+                                       {'field': dotted(path, field, 'keyword')}
+                                       for field in field_type.properties
+                                   ],
+                                   size=config.terms_aggregation_size)
+            # A filter aggregation to account for any missing values, since we
+            # can't use a 'missing' aggregation with a nested field
+            # See https://github.com/elastic/elasticsearch/issues/9571
+            agg.bucket(name='untagged',
+                       agg_type='filter',
+                       filter=Q('bool', must_not=[
+                           Q('nested', path=path, query=Q('exists', field=path))
+                       ]))
         else:
-            nested_agg = agg
-        # Make an inner agg that will contain the terms in question
-        path = dotted(facet_path, 'keyword')
-        # FIXME: Approximation errors for terms aggregation are unchecked
-        #        https://github.com/DataBiosphere/azul/issues/3413
-        nested_agg.bucket(name='myTerms',
-                          agg_type='terms',
-                          field=path,
-                          size=config.terms_aggregation_size)
-        nested_agg.bucket('untagged', 'missing', field=path)
+            path = dotted(facet_path, 'keyword')
+            agg.bucket(name='myTerms',
+                       agg_type='terms',
+                       field=path,
+                       size=config.terms_aggregation_size)
+            agg.bucket('untagged', 'missing', field=path)
         return agg
 
     def _annotate_aggs_for_translation(self, request: Search):
@@ -351,14 +368,27 @@ def _annotate_aggs_for_translation(self, request: Search):
         the response.
         """
 
+        def convert_path(path: DottedFieldPath) -> FieldPath:
+            p = undotted(path)
+            assert p[-1] == 'keyword', path
+            return p[:-1]
+
         def annotate(agg: Agg):
-            if isinstance(agg, Terms):
-                path = agg.field.split('.')
-                if path[-1] == 'keyword':
-                    path.pop()
+            if isinstance(agg, (Terms, MultiTerms)):
                 if not hasattr(agg, 'meta'):
                     agg.meta = {}
-                agg.meta['path'] = path
+                agg.meta['paths'] = []
+                if isinstance(agg, Terms):
+                    # A Terms agg is for a single field, so we only need to
+                    # annotate with the one FieldPath for the field.
+                    agg.meta['paths'].append(convert_path(agg.field))
+                else:
+                    # A MultiTerms agg contains multiple fields, so we need the
+                    # FieldPath of each one. By storing these in the same order
+                    # that the fields occur in `agg.terms`, we can later pair
+                    # these FieldPaths to the values in the aggregation buckets.
+                    for term in agg.terms:
+                        agg.meta['paths'].append(convert_path(term['field']))
             if hasattr(agg, 'aggs'):
                 subs = agg.aggs
                 for sub_name in subs:
@@ -391,13 +421,25 @@ def translate(k, v: MutableJSON):
                         translate(k, v)
             else:
                 try:
-                    path = v['meta']['path']
+                    # `paths` is a key we added to `meta` to have available here
+                    # when processing the response. Each path is a FieldPath
+                    # (e.g. ['contents', 'projects', 'document_id']). There will
+                    # be only one FieldPath in the case of a Terms aggregation,
+                    # and multiple in the case of a MultiTerms aggregation.
+                    paths = v['meta']['paths']
                 except KeyError:
                     pass
                 else:
-                    field_type = self.service.field_type(self.catalog, tuple(path))
+                    for i, path in enumerate(paths):
+                        field_type = self.service.field_type(self.catalog, tuple(path))
+                        for bucket in buckets:
+                            # If the bucket is from a MultiTerms aggregation
+                            if isinstance(bucket['key'], list):
+                                bucket['key'][i] = field_type.from_index(bucket['key'][i])
+                            # If the bucket is from a Terms aggregation
+                            else:
+                                bucket['key'] = field_type.from_index(bucket['key'])
                     for bucket in buckets:
-                        bucket['key'] = field_type.from_index(bucket['key'])
                         translate(k, bucket)
 
         for k, v in aggs.items():

@@ -1746,6 +1746,41 @@ def test_organoid_priority(self):
         self.assertEqual(inner_cell_suspensions_in_contributions + inner_cell_suspensions_in_aggregates,
                          inner_cell_suspensions)
 
+    def test_nested_field_aggregation(self):
+        bundles = [
+            # Bundles with the following tissue_atlas (atlas/version) values:
+            # [None/None (x2), Lung/None, Retina/v1.0, Blood/v1.0]
+            self.bundle_fqid(uuid='2c7d06b8-658e-4c51-9de4-a768322f84c5',
+                             version='2021-09-21T17:27:23.898000Z'),
+            # [Blood/v1.0]
+            self.bundle_fqid(uuid='587d74b4-1075-4bbf-b96a-4d1ede0481b2',
+                             version='2018-10-10T02:23:43.182000Z'),
+            # [] (none)
+            self.bundle_fqid(uuid='97f0cc83-f0ac-417a-8a29-221c77debde8',
+                             version='2019-10-14T19:54:15.397406Z')
+        ]
+        for bundle in bundles:
+            self._index_canned_bundle(bundle)
+        hits = self._get_all_hits()
+        expected = {
+            '50151324-f3ed-4358-98af-ec352a940a61': [
+                {'atlas': '~null', 'version': '~null'},
+                {'atlas': 'Lung', 'version': '~null'},
+                {'atlas': 'Retina', 'version': 'v1.0'},
+                {'atlas': 'Blood', 'version': 'v1.0'}
+            ],
+            '6615efae-fca8-4dd2-a223-9cfcf30fe94d': [
+                {'atlas': 'Blood', 'version': 'v1.0'}
+            ],
+            '4e6f083b-5b9a-4393-9890-2a83da8188f1': [
+            ]
+        }
+        for hit in self._filter_hits(hits, DocumentType.aggregate, 'projects'):
+            contents = hit['_source']['contents']
+            project = cast(JSON, one(contents['projects']))
+            project_id = project['document_id']
+            self.assertEqual(expected[project_id], project['tissue_atlas'])
+
     def test_accessions_fields(self):
         bundle_fqid = self.bundle_fqid(uuid='fa5be5eb-2d64-49f5-8ed8-bd627ac9bc7a',
                                        version='2019-02-14T19:24:38.034764Z')