diff --git a/README.md b/README.md index b96c187fa1..4f78ffe180 100644 --- a/README.md +++ b/README.md @@ -668,6 +668,46 @@ These steps are performed once per deployment (multiple times per project). Follow the prompts to store the client secret. +### 3.2.3 Terra workspace for requester-pays with TDR + +TDR supports requester-pays via Google Cloud to allow for data egress charges +(e.g. from mirroring) to be billed to a specific [Terra billing project] instead +of the bucket owner. Billing projects are a concept internal to Terra, related +to but distinct from Google Cloud billing accounts. + +[Terra billing project]: https://support.terra.bio/hc/en-us/articles/360026182251-How-to-set-up-billing-in-Terra-GCP + +To use requester-pays, Azul must be configured with a Google Project ID +associated with a Terra workspace that belongs to the chosen Terra billing +project. This workspace must be creating and configuring manually. To set up a +workspace for requester-pays in a given deployment: + +1. If the deployment's service accounts have not already been + [registered with SAM](#234-google-cloud-tdr-and-sam), complete the steps in that section before + proceeding further. + +2. Go to [Terra](https://app.terra.bio/#workspace) and create a new workspace. + Give it the name `"azul-requester_pays-${AZUL_DEPLOYMENT_STAGE}"` + +3. Select the Terra billing project to be used for requester-pays. + +4. Add the deployment's public service account as a collaborator and assign it + the role of "Writer". The service account's email follows the pattern + `"${AZUL_GOOGLE_SERVICE_ACCOUNT_PUBLIC}@${GOOGLE_PROJECT}.iam.gserviceaccount.com"` + +5. If the deployment will be used to mirror managed-access files, repeat step 4 + for the indexer service account. + +Then, to enable requester-pays for that deployment: + +1. Copy the workspace's Google Project ID from the "Cloud Information" side + panel. + +2. Set `"AZUL_TDR_REQUESTER_PAYS_PROJECT"` to the copied project ID in the + chosen deployment's `environment.py`. + +3. `_refresh` and `make deploy`. + ## 3.3 Provisioning cloud infrastructure Once you've configured the project and your personal deployment or a shared diff --git a/deployments/anvilbox/environment.py b/deployments/anvilbox/environment.py index be309b56bc..2819f74f50 100644 --- a/deployments/anvilbox/environment.py +++ b/deployments/anvilbox/environment.py @@ -147,6 +147,7 @@ def env() -> Mapping[str, str | None]: 'AZUL_SAM_SERVICE_URL': 'https://sam.dsde-dev.broadinstitute.org', 'AZUL_DUOS_SERVICE_URL': 'https://consent.dsde-dev.broadinstitute.org', 'AZUL_TERRA_SERVICE_URL': 'https://firecloud-orchestration.dsde-dev.broadinstitute.org', + 'AZUL_TDR_REQUESTER_PAYS_PROJECT': 'terra-dev-8c6454ca', 'azul_ecm_service_url': 'https://externalcreds.dsde-dev.broadinstitute.org', **( @@ -184,9 +185,4 @@ def env() -> Mapping[str, str | None]: 'AZUL_GOOGLE_OAUTH2_CLIENT_ID': '561542988117-cpo2avhomdh6t7fetp91js78cdhm9p47.apps.googleusercontent.com', 'AZUL_ENABLE_MIRRORING': '1', - - # FIXME: Revert, once the underlying issue with requester-pays is fixed - # https://github.com/DataBiosphere/azul/issues/7955 - # - 'azul_it_flags': 'no_mirror', } diff --git a/deployments/anvildev/environment.py b/deployments/anvildev/environment.py index 176fb68a37..96a6a62e1c 100644 --- a/deployments/anvildev/environment.py +++ b/deployments/anvildev/environment.py @@ -134,6 +134,7 @@ def env() -> Mapping[str, str | None]: 'AZUL_SAM_SERVICE_URL': 'https://sam.dsde-dev.broadinstitute.org', 'AZUL_DUOS_SERVICE_URL': 'https://consent.dsde-dev.broadinstitute.org', 'AZUL_TERRA_SERVICE_URL': 'https://firecloud-orchestration.dsde-dev.broadinstitute.org', + 'AZUL_TDR_REQUESTER_PAYS_PROJECT': 'terra-dev-789f8dd1', 'azul_ecm_service_url': 'https://externalcreds.dsde-dev.broadinstitute.org', 'AZUL_ENABLE_MONITORING': '1', @@ -164,9 +165,4 @@ def env() -> Mapping[str, str | None]: 'workspace_id': 'T09P9H91S', # ucsc-gi.slack.com 'channel_id': 'C04K4BQET7G' # #team-boardwalk-anvildev }), - - # FIXME: Revert, once the underlying issue with requester-pays is fixed - # https://github.com/DataBiosphere/azul/issues/7955 - # - 'azul_it_flags': 'no_mirror', } diff --git a/deployments/anvilprod/environment.py b/deployments/anvilprod/environment.py index 08394e807d..622ca107d4 100644 --- a/deployments/anvilprod/environment.py +++ b/deployments/anvilprod/environment.py @@ -1393,6 +1393,7 @@ def env() -> Mapping[str, str | None]: 'AZUL_SAM_SERVICE_URL': 'https://sam.dsde-prod.broadinstitute.org', 'AZUL_DUOS_SERVICE_URL': 'https://consent.dsde-prod.broadinstitute.org', 'AZUL_TERRA_SERVICE_URL': 'https://firecloud-orchestration.dsde-prod.broadinstitute.org', + 'AZUL_TDR_REQUESTER_PAYS_PROJECT': 'terra-20cf0e48', 'azul_ecm_service_url': 'https://externalcreds.dsde-prod.broadinstitute.org', 'AZUL_ENABLE_MONITORING': '1', diff --git a/deployments/hammerbox/environment.py b/deployments/hammerbox/environment.py index 151bbf3ea3..47b2c5abe1 100644 --- a/deployments/hammerbox/environment.py +++ b/deployments/hammerbox/environment.py @@ -1407,6 +1407,7 @@ def env() -> Mapping[str, str | None]: 'AZUL_SAM_SERVICE_URL': 'https://sam.dsde-prod.broadinstitute.org', 'AZUL_DUOS_SERVICE_URL': 'https://consent.dsde-prod.broadinstitute.org', 'AZUL_TERRA_SERVICE_URL': 'https://firecloud-orchestration.dsde-prod.broadinstitute.org', + 'AZUL_TDR_REQUESTER_PAYS_PROJECT': 'terra-fb27b873', 'azul_ecm_service_url': 'https://externalcreds.dsde-prod.broadinstitute.org', # Personal deployments & `hammerbox` share an ES domain with `anvilprod` diff --git a/deployments/tempdev/environment.py b/deployments/tempdev/environment.py index e949f8232d..8eda5c6e73 100644 --- a/deployments/tempdev/environment.py +++ b/deployments/tempdev/environment.py @@ -162,9 +162,4 @@ def env() -> Mapping[str, str | None]: 'AZUL_GOOGLE_OAUTH2_CLIENT_ID': '807674395527-erth0gf1m7qme5pe6bu384vpdfjh06dg.apps.googleusercontent.com', 'AZUL_ENABLE_MIRRORING': '1', - - # FIXME: Revert, once the underlying issue with requester-pays is fixed - # https://github.com/DataBiosphere/azul/issues/7955 - # - 'azul_it_flags': 'no_mirror', } diff --git a/environment.py b/environment.py index 691f6e06f9..b859620a56 100644 --- a/environment.py +++ b/environment.py @@ -762,6 +762,14 @@ def env() -> Mapping[str, str | None]: # 'AZUL_TERRA_SERVICE_URL': None, + # The GCP project ID of the Terra workspace to charge for file downloads + # from TDR while mirroring. If left unset, egress charges are incurred + # to the owner of the GCS bucket the files are stored in. Otherwise, the + # egress will be charged to the GCP billing account associated with the + # workspace. See section 3.2.3 of the README. + # + 'AZUL_TDR_REQUESTER_PAYS_PROJECT': None, + # OAuth2 Client ID to be used for authenticating users. See section # 3.2 of the README # diff --git a/src/azul/__init__.py b/src/azul/__init__.py index 7f7b13e5fb..57cc4cdb0d 100644 --- a/src/azul/__init__.py +++ b/src/azul/__init__.py @@ -331,6 +331,10 @@ def terra_service_url(self) -> mutable_furl: def ecm_service_url(self) -> mutable_furl: return mutable_furl(self.environ['azul_ecm_service_url']) + @property + def tdr_requester_pays_project(self) -> str | None: + return self.environ.get('AZUL_TDR_REQUESTER_PAYS_PROJECT') + @property def dss_query_prefix(self) -> str: return self.environ.get('AZUL_DSS_QUERY_PREFIX', '') diff --git a/src/azul/drs.py b/src/azul/drs.py index b4dcf6561c..803ca1b743 100644 --- a/src/azul/drs.py +++ b/src/azul/drs.py @@ -356,15 +356,30 @@ class DRSObject: _http_client: HttpClient _url: furl - def get(self, access_method: AccessMethod = AccessMethod.https) -> Access: + def get(self, + access_method: AccessMethod = AccessMethod.https, + headers: Mapping[str, str] | None = None + ) -> Access: """ Returns access to the content of the data object identified by the - given URI. The scheme of the URL in the returned access object depends - on the access method specified. + given URI. + + :param access_method: The type of access method to use from the object + response. The scheme of the URL in the returned + access object depends on the access method + specified. + + :param headers: Optional request headers for accessing the object. Note + that this argument is only applied for requests to + .../objects/{object_id}/access/{access_id}/, and not to + .../objects/{object_id}. """ - return self._get(access_method) + return self._get(access_method, headers) - def _get(self, access_method: AccessMethod) -> Access: + def _get(self, + access_method: AccessMethod, + headers: Mapping[str, str] | None + ) -> Access: url = self._url while True: response = self._request(url) @@ -384,9 +399,9 @@ def _get(self, access_method: AccessMethod) -> Access: # https://github.com/ga4gh/data-repository-service-schemas/issues/361 assert access_method is AccessMethod.gs, R( 'Unexpected access method', access_method) - return self._get_access(access_id, AccessMethod.https) + return self._get_access(access_id, AccessMethod.https, headers) elif access_id is not None: - return self._get_access(access_id, access_method) + return self._get_access(access_id, access_method, headers) elif access_url is not None: scheme = furl(access_url['url']).scheme assert scheme == access_method.scheme, R( @@ -403,23 +418,28 @@ def _get(self, access_method: AccessMethod) -> Access: else: raise DRSStatusException(url, response) - def _get_access(self, access_id: str, access_method: AccessMethod) -> Access: + def _get_access(self, + access_id: str, + access_method: AccessMethod, + headers: Mapping[str, str] | None + ) -> Access: url = self._url.copy() url.path.add(['access', access_id]) while True: - response = self._request(url) + response = self._request(url, headers=headers) if response.status == 200: response_data = json_dict(json.loads(response.data)) scheme = furl(json_str(response_data['url'])).scheme assert scheme == access_method.scheme, R( 'Unexpected access URL scheme', scheme) access_url = json_str(response_data['url']) - headers = response_data.get('headers') - if headers is None: - access_headers = None - else: - access_headers = {k: json_str(v) for k, v in json_dict(headers).items()} - return Access(method=access_method, url=access_url, headers=access_headers) + access_headers = response_data.get('headers') + return Access(method=access_method, + url=access_url, + headers=None if access_headers is None else { + k: json_str(v) + for k, v in json_dict(access_headers).items() + }) elif response.status == 202: wait_time = int(response.headers['retry-after']) time.sleep(wait_time) @@ -428,8 +448,8 @@ def _get_access(self, access_id: str, access_method: AccessMethod) -> Access: else: raise DRSStatusException(url, response) - def _request(self, url: furl) -> urllib3.BaseHTTPResponse: - return self._http_client.request('GET', str(url), redirect=False) + def _request(self, url: furl, **kwargs) -> urllib3.BaseHTTPResponse: + return self._http_client.request('GET', str(url), **kwargs, redirect=False) class DRSStatusException(Exception): diff --git a/src/azul/indexer/mirror_service.py b/src/azul/indexer/mirror_service.py index 8818fb37bb..bd68ba9b79 100644 --- a/src/azul/indexer/mirror_service.py +++ b/src/azul/indexer/mirror_service.py @@ -42,9 +42,6 @@ from azul.deployment import ( aws, ) -from azul.drs import ( - AccessMethod, -) from azul.http import ( HasCachedHttpClient, ) @@ -78,6 +75,9 @@ RepositoryFileDownload, RepositoryPlugin, ) +from azul.plugins.repository.tdr import ( + TDRFileDownload, +) from azul.queues import ( Action, Queues, @@ -799,7 +799,7 @@ def _create_info(self, file: File): content_type='application/json', overwrite=False) - def _repository_url(self, file: File) -> furl: + def _repository_url(self, file: File) -> str: assert config.is_tdr_enabled(self.catalog), R( 'Only TDR catalogs are supported', self.catalog) assert file.drs_uri is not None, R( @@ -810,10 +810,16 @@ def _repository_url(self, file: File) -> furl: authentication = None else: authentication = indexer_authentication - object = self.repository_plugin.drs_object(file.drs_uri, authentication) - access = object.get(AccessMethod.gs) - assert access.method is AccessMethod.https, access - return furl(access.url) + authentication = None + download = TDRFileDownload(plugin=self.repository_plugin, + file=file, + replica=None, + token=None) + download.update(authentication, + requester_pays_project=config.tdr_requester_pays_project) + assert download.retry_after is None + assert download.location is not None + return download.location def _download(self, file: File, part: FilePart | None = None) -> bytes: url = self._repository_url(file) @@ -828,7 +834,7 @@ def _download(self, file: File, part: FilePart | None = None) -> bytes: expected_status = 206 # Ideally we would stream the response, but boto only supports uploading # from streams that are seekable. - response = self._http_client.request('GET', str(url), headers=headers) + response = self._http_client.request('GET', url, headers=headers) if response.status == expected_status: actual_size = len(response.data) log.info('Downloaded %d bytes in %.3fs from file %r', diff --git a/src/azul/plugins/repository/tdr.py b/src/azul/plugins/repository/tdr.py index cba6c272e2..45d3e89f13 100644 --- a/src/azul/plugins/repository/tdr.py +++ b/src/azul/plugins/repository/tdr.py @@ -273,15 +273,23 @@ def find_in_source(self, class TDRFileDownload(RepositoryFileDownload): _location: str | None = None - def update(self, authentication: Authentication | None) -> None: + def update(self, + authentication: Authentication | None, + *, + requester_pays_project: str | None = None + ) -> None: assert self.replica is None or self.replica == 'gcp', R( 'Invalid replica', self.replica) if self.file.drs_uri is None: assert self.location is None, self assert self.retry_after is None, self else: + if requester_pays_project is not None: + headers = {'x-user-project': requester_pays_project} + else: + headers = None drs_client = self._plugin.drs_object(self.file.drs_uri, authentication) - access = drs_client.get(access_method=AccessMethod.gs) + access = drs_client.get(access_method=AccessMethod.gs, headers=headers) assert access.method is AccessMethod.https, R(str(access.method)) assert access.headers is None, R(str(access.headers)) signed_url = access.url diff --git a/test/integration_test.py b/test/integration_test.py index 0cbcf8b82d..023737b73d 100644 --- a/test/integration_test.py +++ b/test/integration_test.py @@ -1209,12 +1209,13 @@ def _test_file_download(self, source: SourceSpec, file: JSON) -> mutable_furl | self.assertNotIn(str(config.tdr_service_url), msg) return None elif response.status == 403: - # FIXME: Treat this as an error if requester-pays is enabled - # https://github.com/DataBiosphere/azul/issues/7794 msg = json.loads(response.data)['Message'] prefix = 'DRS server requires requester-pays for ' self.assertEqual(prefix, msg[:len(prefix)]) - return None + if config.tdr_requester_pays_project is None: + return None + else: + self.fail(msg) else: self.assertEqual(200, response.status) response = json.loads(response.data)