-
Notifications
You must be signed in to change notification settings - Fork 4
Add support for HCA tissue atlas (#7128) #7877
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
24b9f95
ccc79d3
a2031fe
10fb954
eb3e80b
79926f3
3dfe9db
f18e8bc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,9 @@ | |
| one, | ||
| ) | ||
|
|
||
| from azul.field_type import ( | ||
| Nested, | ||
| ) | ||
| from azul.lib import ( | ||
| cached_property, | ||
| ) | ||
|
|
@@ -557,9 +560,11 @@ def file_type_summary(aggregate_file: JSON) -> FileTypeSummaryForHit: | |
| ] | ||
| return summarized_hit | ||
|
|
||
| def make_terms(self, agg) -> Terms: | ||
| def make_terms(self, field_type, agg) -> Terms: | ||
| def choose_entry(_term): | ||
| if 'key_as_string' in _term: | ||
| if nested_property_names is not None: | ||
| return dict(zip(nested_property_names, _term['key'])) | ||
| elif 'key_as_string' in _term: | ||
| return _term['key_as_string'] | ||
| elif (term_key := _term['key']) is None: | ||
| return None | ||
|
|
@@ -570,10 +575,14 @@ def choose_entry(_term): | |
| else: | ||
| return str(term_key) | ||
|
|
||
| if isinstance(field_type, Nested): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PL
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. TODO: Rename |
||
| nested_property_names = field_type.properties.keys() | ||
| else: | ||
| nested_property_names = None | ||
| terms: list[Term] = [] | ||
| for bucket in agg['myTerms']['buckets']: | ||
| term = Term(term=choose_entry(bucket), | ||
| count=bucket['doc_count']) | ||
| doc_count = bucket['doc_count'] | ||
| term = Term(term=choose_entry(bucket), count=doc_count) | ||
| try: | ||
| sub_agg = bucket['myProjectIds'] | ||
| except KeyError: | ||
|
|
@@ -604,8 +613,9 @@ def choose_entry(_term): | |
| type='terms') | ||
|
|
||
| def make_facets(self, aggs: JSON) -> dict[str, Terms]: | ||
| field_types = self.service.mapped_field_types(self.catalog) | ||
| facets = {} | ||
| for facet, agg in aggs.items(): | ||
| if facet != '_project_agg': # Filter out project specific aggs | ||
| facets[facet] = self.make_terms(agg) | ||
| facets[facet] = self.make_terms(field_types[facet], agg) | ||
| return facets | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,6 +35,7 @@ | |
| ) | ||
| from opensearchpy.helpers.aggs import ( | ||
| Agg, | ||
| MultiTerms, | ||
| Terms, | ||
| ) | ||
| from opensearchpy.helpers.query import ( | ||
|
|
@@ -81,9 +82,11 @@ | |
| ) | ||
| from azul.plugins import ( | ||
| DocumentSlice, | ||
| DottedFieldPath, | ||
| FieldPath, | ||
| MetadataPlugin, | ||
| dotted, | ||
| undotted, | ||
| ) | ||
|
|
||
| log = logging.getLogger(__name__) | ||
|
|
@@ -327,21 +330,35 @@ def _prepare_aggregation(self, *, facet: str, facet_path: FieldPath) -> Agg: | |
|
|
||
| field_type = self.service.field_type(self.catalog, facet_path) | ||
| if isinstance(field_type, Nested): | ||
| nested_agg = agg.bucket(name='nested', | ||
| agg_type='nested', | ||
| path=dotted(facet_path)) | ||
| facet_path = dotted(facet_path, field_type.agg_property) | ||
| path = dotted(facet_path) | ||
| # A nested aggregation to aggregate on fields inside a nested field | ||
| agg.bucket(name='nested', | ||
| agg_type='nested', | ||
| path=path) | ||
| # A multi-terms aggregation to form composite keys made from the | ||
| # fields inside a nested field | ||
| agg.aggs.nested.bucket(name='myTerms', | ||
| agg_type='multi_terms', | ||
| terms=[ | ||
| {'field': dotted(path, field, 'keyword')} | ||
| for field in field_type.properties | ||
| ], | ||
| size=config.terms_aggregation_size) | ||
| # A filter aggregation to account for any missing values, since we | ||
| # can't use a 'missing' aggregation with a nested field | ||
| # See https://github.com/elastic/elasticsearch/issues/9571 | ||
| agg.bucket(name='untagged', | ||
| agg_type='filter', | ||
| filter=Q('bool', must_not=[ | ||
| Q('nested', path=path, query=Q('exists', field=path)) | ||
| ])) | ||
| else: | ||
| nested_agg = agg | ||
| # Make an inner agg that will contain the terms in question | ||
| path = dotted(facet_path, 'keyword') | ||
| # FIXME: Approximation errors for terms aggregation are unchecked | ||
| # https://github.com/DataBiosphere/azul/issues/3413 | ||
| nested_agg.bucket(name='myTerms', | ||
| agg_type='terms', | ||
| field=path, | ||
| size=config.terms_aggregation_size) | ||
| nested_agg.bucket('untagged', 'missing', field=path) | ||
| path = dotted(facet_path, 'keyword') | ||
| agg.bucket(name='myTerms', | ||
| agg_type='terms', | ||
| field=path, | ||
| size=config.terms_aggregation_size) | ||
| agg.bucket('untagged', 'missing', field=path) | ||
| return agg | ||
|
|
||
| def _annotate_aggs_for_translation(self, request: Search): | ||
|
|
@@ -351,14 +368,27 @@ def _annotate_aggs_for_translation(self, request: Search): | |
| the response. | ||
| """ | ||
|
|
||
| def convert_path(path: DottedFieldPath) -> FieldPath: | ||
| p = undotted(path) | ||
| assert p[-1] == 'keyword', path | ||
| return p[:-1] | ||
|
|
||
| def annotate(agg: Agg): | ||
| if isinstance(agg, Terms): | ||
| path = agg.field.split('.') | ||
| if path[-1] == 'keyword': | ||
| path.pop() | ||
| if isinstance(agg, (Terms, MultiTerms)): | ||
| if not hasattr(agg, 'meta'): | ||
| agg.meta = {} | ||
| agg.meta['path'] = path | ||
| agg.meta['paths'] = [] | ||
| if isinstance(agg, Terms): | ||
| # A Terms agg is for a single field, so we only need to | ||
| # annotate with the one FieldPath for the field. | ||
| agg.meta['paths'].append(convert_path(agg.field)) | ||
| else: | ||
| # A MultiTerms agg contains multiple fields, so we need the | ||
| # FieldPath of each one. By storing these in the same order | ||
| # that the fields occur in `agg.terms`, we can later pair | ||
| # these FieldPaths to the values in the aggregation buckets. | ||
| for term in agg.terms: | ||
| agg.meta['paths'].append(convert_path(term['field'])) | ||
| if hasattr(agg, 'aggs'): | ||
| subs = agg.aggs | ||
| for sub_name in subs: | ||
|
|
@@ -391,13 +421,25 @@ def translate(k, v: MutableJSON): | |
| translate(k, v) | ||
| else: | ||
| try: | ||
| path = v['meta']['path'] | ||
| # `paths` is a key we added to `meta` to have available here | ||
| # when processing the response. Each path is a FieldPath | ||
| # (e.g. ['contents', 'projects', 'document_id']). There will | ||
| # be only one FieldPath in the case of a Terms aggregation, | ||
| # and multiple in the case of a MultiTerms aggregation. | ||
| paths = v['meta']['paths'] | ||
| except KeyError: | ||
| pass | ||
| else: | ||
| field_type = self.service.field_type(self.catalog, tuple(path)) | ||
| for i, path in enumerate(paths): | ||
| field_type = self.service.field_type(self.catalog, tuple(path)) | ||
| for bucket in buckets: | ||
| # If the bucket is from a MultiTerms aggregation | ||
| if isinstance(bucket['key'], list): | ||
| bucket['key'][i] = field_type.from_index(bucket['key'][i]) | ||
| # If the bucket is from a Terms aggregation | ||
| else: | ||
| bucket['key'] = field_type.from_index(bucket['key']) | ||
| for bucket in buckets: | ||
| bucket['key'] = field_type.from_index(bucket['key']) | ||
| translate(k, bucket) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PL to explain why
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was mistaken in thinking that this is no longer needed. We still have occurrences of nested agg buckets such as in the HCA agg
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please don't ignore my PL request.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Decided in PL to refactor the changes to the translation of aggregates ( |
||
|
|
||
| for k, v in aggs.items(): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This comment was generated by Claude Code.
With these two overrides, does it still make sense for `Nested` to inherit `PassThrough`?

The defining characteristic of `PassThrough` is that `to_index` and `from_index` are identity operations. After this change, `Nested` overrides both with non-trivial logic that delegates to per-property field types — the values are no longer "passed through" unchanged.

What `Nested` still gets from `PassThrough`:
- the `es_type` property (trivial — stores and returns `_es_type`)
- `allow_sorting_by_empty_lists = False`
- the `__init__` convenience of setting both `native_form` and `index_form` to the same type (`JSON`)

All three are easy to replicate by inheriting `FieldType[JSON, JSON]` directly.