diff --git a/src/azul/plugins/metadata/anvil/service/response.py b/src/azul/plugins/metadata/anvil/service/response.py index bdd48bfbb0..3d1fccdbba 100644 --- a/src/azul/plugins/metadata/anvil/service/response.py +++ b/src/azul/plugins/metadata/anvil/service/response.py @@ -32,7 +32,6 @@ json_sequence_of_mappings, json_str, json_untyped_dict, - optional, ) from azul.plugins import ( SpecialFields, @@ -227,9 +226,7 @@ def _pivotal_entity(self, ) -> MutableJSON: inner_entity = copy_json(inner_entity) if inner_entity_type == 'files': - inner_entity['azul_url'] = self._file_url(uuid=json_str(inner_entity['document_id']), - version=json_str(inner_entity['version']), - drs_uri=optional(json_str, inner_entity['drs_uri'])) + inner_entity['azul_url'] = self._file_url(inner_entity) inner_entity['azul_mirror_uri'] = self._file_mirror_uri(source, inner_entity) inner_entity.pop('version', None) return inner_entity diff --git a/src/azul/plugins/metadata/hca/service/response.py b/src/azul/plugins/metadata/hca/service/response.py index 4b27eecebd..c1b99d7cc9 100644 --- a/src/azul/plugins/metadata/hca/service/response.py +++ b/src/azul/plugins/metadata/hca/service/response.py @@ -419,9 +419,7 @@ def make_file(self, source: SourceRef, file: JSON) -> JSON: 'version': file.get('version'), 'matrixCellCount': file.get('matrix_cell_count'), 'drs_uri': file.get('drs_uri'), - 'azul_url': self._file_url(uuid=json_str(file['uuid']), - version=json_str(file['version']), - drs_uri=optional(json_str, file['drs_uri'])), + 'azul_url': self._file_url(file), 'azul_mirror_uri': self._file_mirror_uri(source, file), } return translated_file diff --git a/src/azul/service/drs_controller.py b/src/azul/service/drs_controller.py index 8ff29c4742..5abe14e0ea 100644 --- a/src/azul/service/drs_controller.py +++ b/src/azul/service/drs_controller.py @@ -67,7 +67,7 @@ class DRSController(ServiceController): @cached_property def _service(self) -> IndexService: - return IndexService() + return 
IndexService(file_url_func=self._file_url) _drs_spec_description = fd(''' This is a partial implementation of the [DRS 1.0.0 spec][1]. Not all diff --git a/src/azul/service/index_controller.py b/src/azul/service/index_controller.py index ed07365a18..dfe2e69805 100644 --- a/src/azul/service/index_controller.py +++ b/src/azul/service/index_controller.py @@ -70,7 +70,7 @@ class IndexController(QueryController): @cached_property def _service(self) -> IndexService: - return IndexService() + return IndexService(file_url_func=self._file_url) _min_page_size = 1 @@ -365,7 +365,6 @@ def search(self, entity_type: str, entity_id: str | None = None) -> str | JSON: try: response = self._service.search(catalog=self.app.catalog, entity_type=entity_type, - file_url_func=self._file_url, item_id=entity_id, filters=filters, pagination=pagination) diff --git a/src/azul/service/index_service.py b/src/azul/service/index_service.py index d37345b025..1361d2c42c 100644 --- a/src/azul/service/index_service.py +++ b/src/azul/service/index_service.py @@ -32,12 +32,6 @@ from azul.filters import ( Filters, ) -from azul.indexer.mirror_service import ( - MirrorService, -) -from azul.lib import ( - cache, -) from azul.lib.types import ( JSON, MutableJSON, @@ -51,14 +45,13 @@ ) from azul.service import ( BadArgumentException, - FileUrlFunc, ) from azul.service.query_service import ( + FileUrlService, IndexNotFoundError, OpenSearchStage, Pagination, PaginationStage, - QueryService, ResponseTriple, ToDictStage, _OpenSearchStage, @@ -80,25 +73,15 @@ def __init__(self, entity_type: str, entity_id: str): class SearchResponseStage(_OpenSearchStage[ResponseTriple, MutableJSON], metaclass=ABCMeta): service: IndexService - file_url_func: FileUrlFunc def prepare_request(self, request: Search) -> Search: return request - def _file_url(self, *, uuid: str, version: str, drs_uri: str | None) -> str | None: - if drs_uri is None: - # To download a file we need its DRS URI - return None - else: - return 
str(self.file_url_func(catalog=self.catalog, - fetch=False, - file_uuid=uuid, - version=version)) + def _file_url(self, file: JSON) -> str | None: + return self.service.azul_file_url(self.catalog, file) def _file_mirror_uri(self, source: SourceRef, file: JSON) -> str | None: - file_cls = self.plugin.file_class - mirror_service = self.service.mirror_service(self.catalog) - return mirror_service.mirror_uri(source, file_cls, file) + return self.service.azul_mirror_uri(self.catalog, source, file) class SummaryResponseStage(OpenSearchStage[JSON, MutableJSON], @@ -113,17 +96,13 @@ def prepare_request(self, request: Search) -> Search: return request -class IndexService(QueryService): - - @cache - def mirror_service(self, catalog: CatalogName) -> MirrorService: - return MirrorService(catalog=catalog) +@attrs.frozen(auto_attribs=True, kw_only=True) +class IndexService(FileUrlService): def search(self, *, catalog: CatalogName, entity_type: str, - file_url_func: FileUrlFunc, item_id: str | None, filters: Filters, pagination: Pagination @@ -135,9 +114,6 @@ def search(self, :param pagination: A dictionary with pagination information as return from `_get_pagination()` :param filters: parsed JSON filters from the request :param item_id: If item_id is specified, only a single item is searched for - :param file_url_func: A function that is used only when getting a *list* of files data. - It creates the files URL based on info from the request. 
It should have the type - signature `(uuid: str, **params) -> str` :return: The OpenSearch JSON response """ if item_id is not None: @@ -148,8 +124,7 @@ def search(self, filters=filters, pagination=pagination, aggregate=item_id is None, - entity_type=entity_type, - file_url_func=file_url_func) + entity_type=entity_type) special_fields = self.metadata_plugin(catalog).special_fields for hit in response['hits']: @@ -169,7 +144,6 @@ def _search(self, aggregate: bool, filters: Filters, pagination: Pagination, - file_url_func: FileUrlFunc ) -> MutableJSON: """ This function does the whole transformation process. It takes the path @@ -225,8 +199,7 @@ def _search(self, response_stage_cls = plugin.search_response_stage chain = response_stage_cls(service=self, catalog=catalog, - entity_type=entity_type, - file_url_func=file_url_func).wrap(chain) + entity_type=entity_type).wrap(chain) request = self.create_request(catalog, entity_type) request = chain.prepare_request(request) diff --git a/src/azul/service/manifest_service.py b/src/azul/service/manifest_service.py index 4a02f52b57..ad9891c722 100644 --- a/src/azul/service/manifest_service.py +++ b/src/azul/service/manifest_service.py @@ -166,17 +166,16 @@ manifest_config_to_json, ) from azul.service import ( - FileUrlFunc, avro_pfb, ) from azul.service.avro_pfb import ( PFBRelation, ) from azul.service.query_service import ( + FileUrlService, OpenSearchChain, Pagination, PaginationStage, - QueryService, SortKey, ToDictStage, sort_key_from_json, @@ -573,8 +572,7 @@ class CachedManifestNotFound(Exception): @attrs.frozen(kw_only=True) -class ManifestService(QueryService): - file_url_func: FileUrlFunc +class ManifestService(FileUrlService): @cached_property def storage_service(self) -> StorageService: @@ -810,9 +808,9 @@ def format(cls) -> ManifestFormat: def metadata_plugin(self) -> MetadataPlugin: return self.service.metadata_plugin(self.catalog) - @cached_property + @property def mirror_service(self) -> MirrorService: - return 
MirrorService(catalog=self.catalog) + return self.service.mirror_service(self.catalog) @classmethod @abstractmethod @@ -964,7 +962,6 @@ def __init__(self, self.service = service self.catalog = catalog self.filters = filters - self.file_url_func = service.file_url_func manifest_namespace = UUID('ca1df635-b42c-4671-9322-b0a7209f0235') @@ -1142,20 +1139,10 @@ def _azul_file_url(self, file: JSON, args: Mapping = frozendict() ) -> str | None: - if file['drs_uri'] is None: - # To download a file we need its DRS URI - return None - else: - special_fields = self.metadata_plugin.special_fields - return str(self.file_url_func(catalog=self.catalog, - file_uuid=json_str(file[special_fields.file_uuid.name_in_hit]), - version=json_str(file['version']), - fetch=False, - **args)) + return self.service.azul_file_url(self.catalog, file, args) def _azul_mirror_uri(self, source: SourceRef, file: JSON) -> str | None: - file_cls = self.metadata_plugin.file_class - return self.mirror_service.mirror_uri(source, file_cls, file) + return self.service.azul_mirror_uri(self.catalog, source, file) @cache def _content_hash(self, *, by_bundle: bool) -> str: diff --git a/src/azul/service/query_service.py b/src/azul/service/query_service.py index 83c6f63af4..d2aab0c2d6 100644 --- a/src/azul/service/query_service.py +++ b/src/azul/service/query_service.py @@ -20,7 +20,7 @@ Self, ) -import attr +import attrs from furl import ( furl, ) @@ -62,8 +62,12 @@ from azul.indexer.document_service import ( DocumentService, ) +from azul.indexer.mirror_service import ( + MirrorService, +) from azul.lib import ( R, + cache, cached_property, ) from azul.lib.types import ( @@ -75,6 +79,7 @@ PrimitiveJSON, json_list, json_str, + optional, ) from azul.opensearch import ( OpenSearchClientFactory, @@ -85,6 +90,15 @@ MetadataPlugin, dotted, ) +from azul.service import ( + FileUrlFunc, +) +from azul.source import ( + SourceRef, +) +from azul.vendored.frozendict import ( + frozendict, +) log = logging.getLogger(__name__) 
@@ -120,7 +134,7 @@ def process_response(self, response: R1) -> R2: raise NotImplementedError -@attr.s(frozen=True, auto_attribs=True, kw_only=True) +@attrs.frozen(auto_attribs=True, kw_only=True) class OpenSearchChain[R0, R1, R2](OpenSearchStage[R0, R2]): """ The result of wrapping a stage or chain in another stage. @@ -151,7 +165,7 @@ def stages(self) -> Iterable[OpenSearchStage]: yield self.inner -@attr.s(frozen=True, auto_attribs=True, kw_only=True) +@attrs.frozen(auto_attribs=True, kw_only=True) class _OpenSearchStage[R1, R2](OpenSearchStage[R1, R2], metaclass=ABCMeta): """ A base implementation of a stage. @@ -171,7 +185,7 @@ def wrap[R0](self, other: OpenSearchStage[R0, R1]) -> OpenSearchChain[R0, R1, R2 TranslatedFilters = Mapping[FieldPath, Mapping[str, Sequence[PrimitiveJSON]]] -@attr.s(frozen=True, auto_attribs=True, kw_only=True) +@attrs.frozen(auto_attribs=True, kw_only=True) class FilterStage(_OpenSearchStage[Response, Response]): """ Converts the given filters to an OpenSearch query and adds that query as @@ -269,7 +283,7 @@ def prepare_query(self, skip_field_paths: tuple[FieldPath] = ()) -> Query: return Q('bool', must=query_list) -@attr.s(frozen=True, auto_attribs=True, kw_only=True) +@attrs.frozen(auto_attribs=True, kw_only=True) class AggregationStage(_OpenSearchStage[MutableJSON, MutableJSON]): """ Cooperate with the given filter stage to augment the request with an @@ -422,7 +436,7 @@ def _populate_accessible(self, aggs: MutableJSON) -> None: aggs[special_fields.accessible.name] = agg -@attr.s(frozen=True, auto_attribs=True, kw_only=True) +@attrs.frozen(auto_attribs=True, kw_only=True) class SlicingStage(_OpenSearchStage[Response, Response]): """ Augments the request with a document slice (known as a *source filter* in @@ -451,7 +465,7 @@ def _prepared_slice(self) -> DocumentSlice | None: # FIXME: Eliminate reliance on Elasticsearch DSL # https://github.com/DataBiosphere/azul/issues/4111 -@attr.s(frozen=True, auto_attribs=True,
kw_only=True) +@attrs.frozen(auto_attribs=True, kw_only=True) class ToDictStage(_OpenSearchStage[Response, MutableJSON]): def prepare_request(self, request: Search) -> Search: @@ -473,7 +487,7 @@ def sort_key_to_json(s: SortKey) -> AnyJSON: return list(s) -@attr.s(auto_attribs=True, kw_only=True, frozen=True) +@attrs.frozen(auto_attribs=True, kw_only=True) class Pagination: order: str size: int @@ -486,9 +500,9 @@ def advance(self, search_before: SortKey | None, search_after: SortKey | None ) -> Self: - return attr.evolve(self, - search_before=search_before, - search_after=search_after) + return attrs.evolve(self, + search_before=search_before, + search_after=search_after) def link(self, *, previous: bool, **params: str) -> furl | None: """ @@ -517,7 +531,7 @@ class ResponsePagination(JSONTypedDict): ResponseTriple = tuple[JSONs, ResponsePagination, JSON] -@attr.s(frozen=True, auto_attribs=True, kw_only=True) +@attrs.frozen(auto_attribs=True, kw_only=True) class PaginationStage(_OpenSearchStage[JSON, ResponseTriple]): """ Handles the pagination of search results @@ -709,3 +723,47 @@ def create_request(self, index=str(IndexName.create(catalog=catalog, qualifier=entity_type, doc_type=doc_type))) + + +@attrs.frozen(kw_only=True) +class FileUrlService(QueryService): + file_url_func: FileUrlFunc + + @cache + def mirror_service(self, catalog: CatalogName) -> MirrorService: + return MirrorService(catalog=catalog) + + def azul_mirror_uri(self, + catalog: CatalogName, + source: SourceRef, + file: JSON + ) -> str | None: + file_cls = self.metadata_plugin(catalog).file_class + return self.mirror_service(catalog).mirror_uri(source, file_cls, file) + + def azul_file_url(self, + catalog: CatalogName, + file: JSON, + args: Mapping = frozendict() + ) -> str | None: + drs_uri = optional(json_str, file['drs_uri']) + if drs_uri is None: + # To download a file we need its DRS URI + return None + elif ( + config.catalogs[catalog].atlas == 'lungmap' + and 
drs_uri.startswith('drs://dg.4503:') + ): + # LungMAP contains external files hosted on BioDataCatalyst. + # Downloading these files requires authentication that can't be + # provided by Azul, rendering our file URLs non-functional. If a + # user tries to follow such a URL, the request fails with a 401 + # status, so we avoid exposing them wherever possible. + return None + else: + special_fields = self.metadata_plugin(catalog).special_fields + return str(self.file_url_func(catalog=catalog, + file_uuid=json_str(file[special_fields.file_uuid.name_in_hit]), + version=json_str(file['version']), + fetch=False, + **args)) diff --git a/src/azul/service/repository_controller.py b/src/azul/service/repository_controller.py index 65fb800271..1582e1f3db 100644 --- a/src/azul/service/repository_controller.py +++ b/src/azul/service/repository_controller.py @@ -99,7 +99,7 @@ def _repository_service(self) -> RepositoryService: @cached_property def _index_service(self) -> IndexService: - return IndexService() + return IndexService(file_url_func=self._file_url) def _mirror_service(self, catalog: CatalogName) -> MirrorService: return self._index_service.mirror_service(catalog) diff --git a/test/service/test_response.py b/test/service/test_response.py index 0486d313c9..b129149928 100644 --- a/test/service/test_response.py +++ b/test/service/test_response.py @@ -39,6 +39,8 @@ LocalAppTestCase, ) from azul import ( + CatalogName, + Config, config, ) from azul.deployment import ( @@ -48,11 +50,13 @@ null_str, ) from azul.indexer import ( + Bundle, BundleFQID, SourcedBundleFQID, ) from azul.indexer.document import ( DocumentType, + EntityReference, IndexName, ) import azul.indexer.index_service @@ -208,8 +212,7 @@ def _service_index_service(self) -> IndexService: def _response_stage(self, entity_type: str) -> HCASearchResponseStage: return HCASearchResponseStage(service=self._service_index_service, entity_type=entity_type, - catalog=self.catalog, - file_url_func=self.file_url_func) + 
catalog=self.catalog) @property def paginations(self): @@ -3750,7 +3753,7 @@ def test(self): }, response.json()) -class TestResponseWithDCP2Cans(DCP2CannedBundleTestCase, WebServiceTestCase): +class DCP2ResponseTestCase(DCP2CannedBundleTestCase, WebServiceTestCase): @classmethod def setUpClass(cls): @@ -3771,6 +3774,15 @@ def bundles(cls) -> list[SourcedBundleFQID]: version='2022-08-23T17:25:02.565000Z') ] + def get_file(self, entry_id: str) -> JSON: + url = self.base_url.set(path=('index', 'files', entry_id)) + response = requests.get(str(url)) + response.raise_for_status() + return one(response.json()['files']) + + +class TestResponseWithDCP2Cans(DCP2ResponseTestCase): + def test_tdr_sources(self): url = self.base_url.set(path='/index/projects') response = requests.get(str(url)) @@ -3788,12 +3800,6 @@ def test_tdr_sources(self): prefix=Prefix.parse(source[prefix_field])) self.assertEqual(self.source.ref, source) - def get_file(self, entry_id: str) -> JSON: - url = self.base_url.set(path=('index', 'files', entry_id)) - response = requests.get(str(url)) - response.raise_for_status() - return one(response.json()['files']) - def test_file_urls(self): with self.subTest(phantom=False): file = self.get_file('507d2814-1688-54e7-b73e-2f831aa34368') @@ -3856,3 +3862,30 @@ def test_contributed_analyses_matrix(self): }} } self.assertEqual(expected_tree, project['contributedAnalyses']) + + +class TestExternalLungmapFiles(DCP2ResponseTestCase): + + @classmethod + def catalog_config(cls) -> dict[CatalogName, Config.Catalog]: + return { + name: attr.evolve(catalog, atlas='lungmap') + for name, catalog in super().catalog_config().items() + } + + external_file_uuid = '27fc1a2e-d70e-47ee-a4b7-92bf57e5b7a6' + # Compact identifier for BioDataCatalyst + external_drs_uri = 'drs://dg.4503:foo' + + @classmethod + def _index_bundle(cls, bundle: Bundle, *, delete: bool = False) -> None: + assert isinstance(bundle, HCABundle), bundle + for ref, entry in bundle.manifest.items(): + if 
EntityReference.parse(ref).entity_id == cls.external_file_uuid: + entry['drs_uri'] = cls.external_drs_uri + super()._index_bundle(bundle, delete=delete) + + def test_external_files(self): + file = self.get_file(self.external_file_uuid) + self.assertEqual(self.external_drs_uri, file['drs_uri']) + self.assertIsNone(file['azul_url'])