diff --git a/core/grpc/proto/metastore-status.v1.proto b/core/grpc/proto/metastore-status.v1.proto index 70e3701503..529f85bf96 100644 --- a/core/grpc/proto/metastore-status.v1.proto +++ b/core/grpc/proto/metastore-status.v1.proto @@ -19,6 +19,7 @@ enum RequestProcessingStatus { COMPLETED = 1; // Request has completed successfully FAILED = 2; // Request has failed ANY = 3; // Special value to query any status + DELETED = 4; // Request has been deleted } // Information about the status of a specific request diff --git a/core/service/src/engine/centralized/service/crs_gen.rs b/core/service/src/engine/centralized/service/crs_gen.rs index f3a3b60dca..b68b109444 100644 --- a/core/service/src/engine/centralized/service/crs_gen.rs +++ b/core/service/src/engine/centralized/service/crs_gen.rs @@ -1,5 +1,3 @@ -use std::sync::Arc; - use aes_prng::AesRng; use alloy_sol_types::Eip712Domain; use anyhow::Result; @@ -10,6 +8,7 @@ use observability::metrics_names::{ CENTRAL_TAG, OP_CRS_GEN_ABORT, OP_CRS_GEN_REQUEST, OP_CRS_GEN_RESULT, OP_INSECURE_CRS_GEN_REQUEST, TAG_PARTY_ID, }; +use std::sync::Arc; use threshold_execution::tfhe_internals::parameters::DKGParams; use tokio_util::sync::CancellationToken; @@ -26,7 +25,8 @@ use crate::engine::validation::{ RequestIdParsingErr, parse_grpc_request_id, validate_crs_gen_request, }; use crate::util::meta_store::{ - MetaStore, add_req_to_meta_store, retrieve_from_meta_store, update_err_req_in_meta_store, + MetaStore, MetaStorePermit, add_req_to_meta_store, retrieve_from_meta_store, + update_err_req_in_meta_store, }; use crate::vault::storage::crypto_material::CentralizedCryptoMaterialStorage; use crate::vault::storage::{Storage, StorageExt}; @@ -74,17 +74,14 @@ pub async fn crs_gen_impl< // check that the request ID is not used yet // and then insert the request ID only if it's unused - // all validation must be done before inserting the request ID - add_req_to_meta_store( - &mut service.crs_meta_map.write().await, - &verified.req_id, - op_tag, - )?; + // all validation must be done before inserting the request ID. + // The meta-store permit is threaded into crs_gen_background and consumed + // by one of: the abort arm, the generation-error arm, or write_crs. + let meta_permit = + add_req_to_meta_store(&service.crs_meta_map, &verified.req_id, op_tag).await?; let meta_store = Arc::clone(&service.crs_meta_map); - let meta_store_cancel = Arc::clone(&service.crs_meta_map); let crypto_storage = service.crypto_storage.clone(); - let crypto_storage_cancel = service.crypto_storage.clone(); let sk = service .base_kms .sig_key() @@ -115,46 +112,26 @@ pub async fn crs_gen_impl< async move { let _timer = timer; let _permit = permit; - tokio::select! { - () = crs_gen_background( - &req_id, - &epoch_id, - rng, - meta_store, - crypto_storage, - sk, - verified.params, - verified.eip712_domain, - verified.extra_data, - max_bits, - op_tag, - ) => { - tracing::info!("CRS generation of request {} exiting normally.", req_id); - // Remove cancellation token since generation is now done. - ongoing.lock().await.remove(&req_id); - }, - () = token.cancelled() => { - MetricedError::handle_unreturnable_error( - OP_CRS_GEN_REQUEST, - Some(req_id), - anyhow::anyhow!("CRS generation of request {} exiting before completion because of an abort request.", req_id), - ); - let del_res = crypto_storage_cancel.inner.purge_crs_material(&req_id, &epoch_id).await; - { - let mut guarded_meta_store = meta_store_cancel.write().await; - let msg = if del_res { - let msg = format!("CRS generation aborted and CRS material deleted successfully for request {}", req_id); - tracing::info!(msg); - msg - } else { - let msg = format!("CRS generation aborted but failed to delete CRS material for request {}", req_id); - tracing::error!(msg); - msg - }; - update_err_req_in_meta_store(&mut guarded_meta_store, &req_id, msg, OP_CRS_GEN_REQUEST); - } - } - } + crs_gen_background( + meta_permit, + token, + &req_id, + &epoch_id, + rng, + meta_store, + crypto_storage, + sk, + verified.params, + verified.eip712_domain, + verified.extra_data, + max_bits, + op_tag, + ) + .await; + // Cleanup runs on every termination (normal completion, generation + // error, or abort) — the cancellation handling now lives inside + // `crs_gen_background`. + ongoing.lock().await.remove(&req_id); } .instrument(tracing::Span::current()), ); @@ -183,10 +160,9 @@ pub async fn get_crs_gen_result_impl< .map_err(|e| MetricedError::new(op_tag, None, e, tonic::Code::InvalidArgument))?; tracing::debug!("Received CRS gen result request with id {}", request_id); - let crs_info = - retrieve_from_meta_store(service.crs_meta_map.read().await, &request_id, op_tag).await?; + let crs_info = retrieve_from_meta_store(&service.crs_meta_map, &request_id, op_tag).await?; - match crs_info { + match (*crs_info).clone() { CrsGenMetadata::LegacyV0(_) => { // This is a legacy result, we cannot return the crs_digest or external_signature // as they're signed using a different SolStruct and hashed using a different domain separator @@ -259,12 +235,21 @@ pub async fn abort_crs_gen_impl< } } -/// Background task for CRS generation +/// Background task for CRS generation. +/// +/// Owns the meta-store permit for the entire lifetime of the request. The +/// cancellation token is checked only during the long-running +/// [`async_generate_crs`] phase via an inner `tokio::select!`; once +/// generation succeeds, the disk-write phase is intentionally uncancellable +/// so we never end up with "purged material on disk" + "Done state in +/// meta store". #[allow(clippy::too_many_arguments)] pub(crate) async fn crs_gen_background< PubS: Storage + Send + Sync + 'static, PrivS: StorageExt + Send + Sync + 'static, >( + permit: MetaStorePermit, + cancel_token: CancellationToken, req_id: &RequestId, epoch_id: &EpochId, rng: AesRng, @@ -279,42 +264,56 @@ pub(crate) async fn crs_gen_background< ) { let start = tokio::time::Instant::now(); - let (pp, crs_info) = match async_generate_crs( - &sk, - params, - max_number_bits, - eip712_domain, - extra_data, - req_id, - rng, - ) - .await - { - Ok((pp, crs_info)) => (pp, crs_info), - Err(e) => { - let _ = update_err_req_in_meta_store( - &mut meta_store.write().await, + // Race the long-running generation against cancellation. + let outcome: Result<(tfhe::zk::CompactPkeCrs, CrsGenMetadata), String> = tokio::select! { + biased; + () = cancel_token.cancelled() => Err(format!("CRS generation aborted for request {req_id}")), + result = async_generate_crs( + &sk, params, max_number_bits, eip712_domain, extra_data, req_id, rng, + ) => result.map_err(|e| e.to_string()), + }; + + match outcome { + Err(msg) => { + tracing::error!("{msg}"); + let del_res = crypto_storage + .inner + .purge_crs_material(req_id, epoch_id) + .await; + let msg = if del_res { + let m = format!( + "CRS generation aborted and CRS material deleted successfully for request {req_id}" + ); + tracing::info!(m); + m + } else { + let m = format!( + "CRS generation aborted but failed to delete CRS material for request {req_id}" + ); + tracing::error!(m); + m + }; + + let _ = update_err_req_in_meta_store(&meta_store, permit, msg, op_tag).await; + } + Ok((pp, crs_info)) => { + // write_crs consumes the permit via update_meta_store internally + // (Ok → update_ok_req_in_meta_store, Err → update_err). + if let Err(e) = crypto_storage + .inner + .write_crs(req_id, epoch_id, pp, crs_info, meta_store, permit, op_tag) + .await + { + tracing::error!("Failed to write CRS to storage: {e}"); + return; + } + tracing::info!( + "⏱️ Core Event Time for CRS-gen request id {}: {:?}", req_id, - e.to_string(), - op_tag, + start.elapsed() ); - return; } - }; - - if let Err(e) = crypto_storage - .inner - .write_crs(req_id, epoch_id, pp, crs_info, meta_store, op_tag) - .await - { - tracing::error!("Failed to write CRS to storage: {e}"); - return; } - tracing::info!( - "⏱️ Core Event Time for CRS-gen request id {}: {:?}", - req_id, - start.elapsed() - ); } #[cfg(test)] diff --git a/core/service/src/engine/centralized/service/decryption.rs b/core/service/src/engine/centralized/service/decryption.rs index 3dbac41a76..0b7180f579 100644 --- a/core/service/src/engine/centralized/service/decryption.rs +++ b/core/service/src/engine/centralized/service/decryption.rs @@ -83,11 +83,12 @@ pub async fn user_decrypt_impl< let meta_store = Arc::clone(&service.user_dec_meta_store); let mut rng = service.base_kms.new_rng().await; - add_req_to_meta_store( - &mut service.user_dec_meta_store.write().await, + let meta_permit = add_req_to_meta_store( + &service.user_dec_meta_store, &request_id, OP_USER_DECRYPT_REQUEST, - )?; + ) + .await?; let sig_key = service.base_kms.sig_key().map_err(|e| { MetricedError::new( OP_USER_DECRYPT_REQUEST, @@ -110,6 +111,7 @@ pub async fn user_decrypt_impl< async move { let _timer = timer; let _permit = permit; + let meta_permit = meta_permit; tracing::info!( "Starting user decryption using key_id {} for request ID {}", @@ -131,11 +133,12 @@ pub async fn user_decrypt_impl< .await; let res_with_extra_data = res.map(|(payload, sig)| (payload, sig, extra_data)); let _ = update_req_in_meta_store( - &mut meta_store.write().await, - &request_id, + &meta_store, + meta_permit, res_with_extra_data, OP_USER_DECRYPT_REQUEST, - ); + ) + .await; } .instrument(tracing::Span::current()), ); @@ -165,12 +168,13 @@ pub async fn get_user_decryption_result_impl< }, )?; - let (payload, external_signature, extra_data) = retrieve_from_meta_store( - service.user_dec_meta_store.read().await, + let arc = retrieve_from_meta_store( + &service.user_dec_meta_store, &request_id, OP_USER_DECRYPT_RESULT, ) .await?; + let (payload, external_signature, extra_data) = (*arc).clone(); // sign the response let sig_payload_vec = bc2wrap::serialize(&payload).map_err(|e| { @@ -258,11 +262,12 @@ pub async fn public_decrypt_impl< // if the request already exists, then return the AlreadyExists error // otherwise attempt to insert it to the meta store - add_req_to_meta_store( - &mut service.pub_dec_meta_store.write().await, + let meta_permit = add_req_to_meta_store( + &service.pub_dec_meta_store, &request_id, OP_PUBLIC_DECRYPT_REQUEST, - )?; + ) + .await?; let meta_store = Arc::clone(&service.pub_dec_meta_store); let sig_key = service.base_kms.sig_key().map_err(|e| { @@ -279,6 +284,7 @@ pub async fn public_decrypt_impl< let _handle = tokio::spawn(async move { let _timer = timer; let _permit = permit; + let meta_permit = meta_permit; tracing::info!( "Starting decryption using key_id {} for request ID {}", &key_id, @@ -315,12 +321,8 @@ pub async fn public_decrypt_impl< Err(e) => Err(format!("Error collecting decrypt result: {e:?}")), Ok(Err(e)) => Err(format!("Error during decryption computation: {e}")), }; - let _ = update_req_in_meta_store( - &mut meta_store.write().await, - &request_id, - res, - OP_PUBLIC_DECRYPT_REQUEST, - ); + let _ = update_req_in_meta_store(&meta_store, meta_permit, res, OP_PUBLIC_DECRYPT_REQUEST) + .await; tracing::info!( "⏱️ Core Event Time for decryption computation: {:?}", start.elapsed() @@ -355,12 +357,13 @@ pub async fn get_public_decryption_result_impl< })?; tracing::debug!("Received get key gen result request with id {}", request_id); - let (retrieved_req_id, plaintexts, external_signature, extra_data) = retrieve_from_meta_store( - service.pub_dec_meta_store.read().await, + let arc = retrieve_from_meta_store( + &service.pub_dec_meta_store, &request_id, OP_PUBLIC_DECRYPT_RESULT, ) .await?; + let (retrieved_req_id, plaintexts, external_signature, extra_data) = (*arc).clone(); if retrieved_req_id != request_id { return Err(MetricedError::new( diff --git a/core/service/src/engine/centralized/service/initiator.rs b/core/service/src/engine/centralized/service/initiator.rs index 348fc29ac4..82e1513d04 100644 --- a/core/service/src/engine/centralized/service/initiator.rs +++ b/core/service/src/engine/centralized/service/initiator.rs @@ -84,17 +84,14 @@ pub async fn init_impl< )); } } - add_req_to_meta_store( - &mut service.epoch_ids.write().await, + let permit = add_req_to_meta_store( + &service.epoch_ids, &verified_request.epoch_id.into(), OP_NEW_EPOCH, - )?; - update_req_in_meta_store::<(), anyhow::Error>( - &mut service.epoch_ids.write().await, - &verified_request.epoch_id.into(), - Ok(()), - OP_NEW_EPOCH, - ); + ) + .await?; + update_req_in_meta_store::<(), anyhow::Error>(&service.epoch_ids, permit, Ok(()), OP_NEW_EPOCH) + .await; tracing::warn!( "Init called on centralized KMS with ID {} - no action taken", verified_request.epoch_id diff --git a/core/service/src/engine/centralized/service/key_gen.rs b/core/service/src/engine/centralized/service/key_gen.rs index ad56d2c65e..6d0739b75d 100644 --- a/core/service/src/engine/centralized/service/key_gen.rs +++ b/core/service/src/engine/centralized/service/key_gen.rs @@ -11,7 +11,7 @@ use crate::engine::validation::{ RequestIdParsingErr, parse_grpc_request_id, validate_key_gen_request, }; use crate::util::meta_store::{ - MetaStore, add_req_to_meta_store, handle_res, retrieve_from_meta_store, + EntryState, MetaStore, MetaStorePermit, add_req_to_meta_store, retrieve_from_meta_store, update_err_req_in_meta_store, }; use crate::vault::storage::crypto_material::{CentralizedCryptoMaterialStorage, PublicKeySet}; @@ -26,14 +26,12 @@ use observability::metrics_names::{ CENTRAL_TAG, OP_INSECURE_KEYGEN_REQUEST, OP_INSECURE_KEYGEN_RESULT, OP_KEYGEN_ABORT, OP_KEYGEN_REQUEST, OP_KEYGEN_RESULT, TAG_PARTY_ID, }; -use std::collections::HashMap; -use std::future::Future; use std::sync::Arc; use threshold_execution::keyset_config::KeySetConfig; use threshold_execution::tfhe_internals::parameters::DKGParams; use tokio_util::sync::CancellationToken; -use tokio::sync::{Mutex, RwLock}; +use tokio::sync::RwLock; use tonic::{Request, Response}; use tracing::Instrument; @@ -54,7 +52,7 @@ pub async fn key_gen_impl< } else { OP_KEYGEN_REQUEST }; - let permit = service.rate_limiter.start_keygen(op_tag).await?; + let rate_limiter_permit = service.rate_limiter.start_keygen(op_tag).await?; // TODO add serial lock to have similar flow to threshold case // Acquire the serial lock to make sure no other keygen is running concurrently // let _guard = service.serial_lock.lock().await; @@ -96,24 +94,21 @@ pub async fn key_gen_impl< // also check that the request ID is not used yet // If all is ok write the request ID to the meta store // All validation must be done before inserting the request ID - let (params, permit) = { + let (params, permit, meta_permit) = { // If we're in insecure mode, we skip removing preprocessed material since it may not exist let params = if !insecure { - let preproc = retrieve_from_meta_store( - service.preprocessing_meta_store.read().await, - &preproc_id, - op_tag, - ) - .await - .map_err(|e| { - // Remap the error to include the correct request ID - MetricedError::new( - op_tag, - Some(req_id), - anyhow::anyhow!(e.internal_err().to_string()), - e.code(), - ) - })?; + let preproc = + retrieve_from_meta_store(&service.preprocessing_meta_store, &preproc_id, op_tag) + .await + .map_err(|e| { + // Remap the error to include the correct request ID + MetricedError::new( + op_tag, + Some(req_id), + anyhow::anyhow!(e.internal_err().to_string()), + e.code(), + ) + })?; preproc.dkg_param } else { dkg_params @@ -138,16 +133,13 @@ pub async fn key_gen_impl< )); } - // check that the request ID is not used yet - // and then insert the request ID only if it's unused - // all validation must be done before inserting the request ID - add_req_to_meta_store(&mut service.key_meta_map.write().await, &req_id, op_tag)?; + // Check that the request ID is not used yet and insert it. + let meta_permit = add_req_to_meta_store(&service.key_meta_map, &req_id, op_tag).await?; - (params, permit) + (params, rate_limiter_permit, meta_permit) }; let meta_store = Arc::clone(&service.key_meta_map); - let key_meta_store_cancel = Arc::clone(&service.key_meta_map); let sk = service .base_kms .sig_key() @@ -170,56 +162,64 @@ pub async fn key_gen_impl< } let ongoing = Arc::clone(&service.ongoing_key_gen); let crypto_storage = service.crypto_storage.clone(); - let crypto_storage_cancel = service.crypto_storage.clone(); let preproc_meta_store = Arc::clone(&service.preprocessing_meta_store); - let keygen_background = async move { - // "Remove" the preprocessing material by deleting its entry from the meta store - tracing::info!("Deleting preprocessed material with ID {preproc_id} from meta store"); - let handle = { - let mut meta_store_guard = preproc_meta_store.write().await; - meta_store_guard.delete(&preproc_id) - }; - match handle_res(handle, &preproc_id).await { - Ok(_) => { - tracing::info!( - "Successfully deleted preprocessing ID {preproc_id} after keygen completion for request ID {req_id}" - ); - } - Err(e) => { - MetricedError::handle_unreturnable_error(op_tag, Some(req_id), e); - } - } - key_gen_background( - &req_id, - &preproc_id, - &epoch_id, - meta_store, - crypto_storage, - sk, - params, - internal_keyset_config, - eip712_domain, - extra_data, - op_tag, - ) - .await; - }; service.tracker.spawn( async move { let _timer = timer; let _permit = permit; - run_keygen_with_cancel( - keygen_background, + // "Remove" the preprocessing material by deleting its entry from the meta store + tracing::info!("Deleting preprocessed material with ID {preproc_id} from meta store"); + let delete_res = { + let mut meta_store_guard = preproc_meta_store.write().await; + meta_store_guard.try_delete(&preproc_id) + }; + match delete_res { + Ok(EntryState::Done(Ok(_))) => { + tracing::info!( + "Successfully deleted preprocessing ID {preproc_id} after keygen completion for request ID {req_id}" + ); + } + Ok(EntryState::Done(Err(e))) => { + MetricedError::handle_unreturnable_error( + op_tag, + Some(req_id), + anyhow::anyhow!( + "Preprocessing ID {preproc_id} finished with error: {e}" + ), + ); + } + Ok(_) => { + MetricedError::handle_unreturnable_error( + op_tag, + Some(req_id), + anyhow::anyhow!( + "Preprocessing ID {preproc_id} deleted but was not in Done state" + ), + ); + } + Err(e) => { + MetricedError::handle_unreturnable_error(op_tag, Some(req_id), e); + } + } + key_gen_background( + meta_permit, token, - req_id, - preproc_id, - epoch_id, - ongoing, - key_meta_store_cancel, - crypto_storage_cancel, + &req_id, + &preproc_id, + &epoch_id, + meta_store, + crypto_storage, + sk, + params, + internal_keyset_config, + eip712_domain, + extra_data, + op_tag, ) .await; + // Cleanup runs on every termination (normal completion, error, or abort). + ongoing.lock().await.remove(&preproc_id); } .instrument(tracing::Span::current()), ); @@ -227,64 +227,6 @@ pub async fn key_gen_impl< Ok(Response::new(Empty {})) } -/// Runs the key-generation background future under a cancellation token. If the -/// token is cancelled before the future completes, the key meta store is -/// updated with an `Aborted` error and any partial key material is purged. -/// -/// Extracted from `key_gen_impl` so the cancel arm can be exercised -/// deterministically in tests with a pending future. -#[allow(clippy::too_many_arguments)] -pub(crate) async fn run_keygen_with_cancel< - Fut: Future, - PubS: Storage + Sync + Send + 'static, - PrivS: StorageExt + Sync + Send + 'static, ->( - keygen_background: Fut, - token: CancellationToken, - req_id: RequestId, - preproc_id: RequestId, - epoch_id: EpochId, - ongoing: Arc>>, - key_meta_store_cancel: Arc>>, - crypto_storage_cancel: CentralizedCryptoMaterialStorage, -) { - tokio::select! { - () = keygen_background => { - tracing::info!( - "Key generation of request {} with preproc id {} exiting normally.", - req_id, preproc_id - ); - // Remove cancellation token since generation is now done. - ongoing.lock().await.remove(&preproc_id); - }, - () = token.cancelled() => { - MetricedError::handle_unreturnable_error( - OP_KEYGEN_REQUEST, - Some(req_id), - format!( - "Key generation background with preprocessing id {} failed since the task got aborted", - preproc_id - ), - ); - tracing::error!( - "Key generation of request {} exiting before completion because of an abort request.", - &req_id - ); - let mut guarded_meta_store = key_meta_store_cancel.write().await; - if let Err(e) = guarded_meta_store - .update(&req_id, Result::Err("Key generation was aborted".to_string())) - { - tracing::warn!( - "Failed to mark request {req_id} as aborted in the key meta store: {e}" - ); - } - crypto_storage_cancel - .purge_fhe_keys(&req_id, &epoch_id) - .await; - } - } -} - /// Implementation of the get_key_gen_result endpoint pub async fn get_key_gen_result_impl< PubS: Storage + Sync + Send + 'static, @@ -308,9 +250,8 @@ pub async fn get_key_gen_result_impl< tracing::debug!("Received get key gen result request with id {}", request_id); - let key_gen_res = - retrieve_from_meta_store(service.key_meta_map.read().await, &request_id, op_tag).await?; - match key_gen_res { + let key_gen_res = retrieve_from_meta_store(&service.key_meta_map, &request_id, op_tag).await?; + match (*key_gen_res).clone() { KeyGenMetadata::Current(res) => { if request_id != res.key_id { return Err(MetricedError::new( @@ -394,12 +335,15 @@ pub async fn abort_key_gen_impl< } } -/// Background task for key generation +/// Background task for key generation. Owns the meta-store permit for the +/// entire lifetime of the request. #[allow(clippy::too_many_arguments)] pub(crate) async fn key_gen_background< PubS: Storage + Send + Sync + 'static, PrivS: StorageExt + Send + Sync + 'static, >( + permit: MetaStorePermit, + cancel_token: CancellationToken, req_id: &RequestId, preproc_id: &RequestId, epoch_id: &EpochId, @@ -415,26 +359,30 @@ pub(crate) async fn key_gen_background< let start = tokio::time::Instant::now(); match internal_keyset_config.keyset_config() { KeySetConfig::Standard(standard_key_set_config) => { - let keygen_result = match async_generate_fhe_keys( - &sk, - params, - standard_key_set_config.to_owned(), - req_id, - preproc_id, - None, - eip712_domain, - extra_data, - ) - .await - { + // Race the long-running fhe-key generation against the cancel token. + let outcome = tokio::select! { + biased; // favor the cancel branch to reduce wasted work on cancellation + () = cancel_token.cancelled() => Err("Key generation was aborted".to_string()), + res = async_generate_fhe_keys( + &sk, + params, + standard_key_set_config.to_owned(), + req_id, + preproc_id, + None, + eip712_domain, + extra_data, + ) => res.map_err(|e| format!("Failed key generation: {e}")), + }; + + let keygen_result = match outcome { Ok(result) => result, - Err(e) => { - let _ = update_err_req_in_meta_store( - &mut meta_store.write().await, - req_id, - format!("Failed key generation: {e}"), - op_tag, - ); + Err(msg) => { + // Purge any partial key material on cancellation + if cancel_token.is_cancelled() { + crypto_storage.purge_fhe_keys(req_id, epoch_id).await; + } + let _ = update_err_req_in_meta_store(&meta_store, permit, msg, op_tag).await; return; } }; @@ -456,7 +404,7 @@ pub(crate) async fn key_gen_background< ), }; if let Err(e) = crypto_storage - .write_fhe_keys(req_id, epoch_id, key_info, pks, meta_store, op_tag) + .write_fhe_keys(req_id, epoch_id, key_info, pks, meta_store, permit, op_tag) .await { tracing::error!("Failed to write centralized keys for request {req_id}: {e}"); @@ -469,30 +417,32 @@ pub(crate) async fn key_gen_background< Ok((from, to)) => (from, to), Err(e) => { let _ = update_err_req_in_meta_store( - &mut meta_store.write().await, - req_id, + &meta_store, + permit, format!("Failed to use decompression key generation parameters: {e}"), op_tag, - ); + ) + .await; return; } }; - let decompression_key = match async_generate_decompression_keys( - crypto_storage.clone(), - epoch_id, - &from, - &to, - ) - .await - { - Ok(decompression_key) => decompression_key, - Err(e) => { - let _ = update_err_req_in_meta_store( - &mut meta_store.write().await, - req_id, - format!("Failed decompression key generation: {e}"), - op_tag, - ); + let outcome = tokio::select! { + biased; // favor the cancel branch to reduce wasted work on cancellation + () = cancel_token.cancelled() => Err("Key generation was aborted".to_string()), + res = async_generate_decompression_keys( + crypto_storage.clone(), + epoch_id, + &from, + &to, + ) => res.map_err(|e| format!("Failed decompression key generation: {e}")), + }; + let decompression_key = match outcome { + Ok(k) => k, + Err(msg) => { + if cancel_token.is_cancelled() { + crypto_storage.purge_fhe_keys(req_id, epoch_id).await; + } + let _ = update_err_req_in_meta_store(&meta_store, permit, msg, op_tag).await; return; } }; @@ -508,17 +458,18 @@ pub(crate) async fn key_gen_background< Ok(info) => info, Err(e) => { let _ = update_err_req_in_meta_store( - &mut meta_store.write().await, - req_id, + &meta_store, + permit, format!("Failed to compute decompression key info: {e}"), op_tag, - ); + ) + .await; return; } }; if let Err(e) = crypto_storage .inner - .write_decompression_key(req_id, info, decompression_key, meta_store) + .write_decompression_key(req_id, info, decompression_key, meta_store, permit) .await { tracing::error!( @@ -950,76 +901,4 @@ pub(crate) mod tests { .unwrap_err(); assert_eq!(err.code(), tonic::Code::NotFound); } - - /// Drives the cancel arm of `run_keygen_with_cancel` deterministically by - /// passing a never-completing keygen future, then issues an abort via - /// `abort_key_gen_impl` and checks that `get_key_gen_result_impl` returns - /// `Aborted` rather than waiting the full timeout. Mirrors the threshold - /// `abort_during_key_gen` test that uses `SlowOnlineDistributedKeyGen128`. - #[tokio::test] - async fn abort_during_key_gen() { - use crate::consts::DEFAULT_EPOCH_ID; - use crate::util::meta_store::add_req_to_meta_store; - - let mut rng = AesRng::seed_from_u64(42); - let preproc_id = derive_request_id("test_central_abort_during_key_gen_preproc").unwrap(); - let req_id = derive_request_id("test_central_abort_during_key_gen_reqid").unwrap(); - let (kms, _) = setup_test_kms_with_preproc(&mut rng, &preproc_id).await; - - // Register the request id in the key meta store so `get_key_gen_result_impl` - // has a cell to wait on, mirroring what `key_gen_impl` does before spawning. - add_req_to_meta_store( - &mut kms.key_meta_map.write().await, - &req_id, - OP_KEYGEN_REQUEST, - ) - .unwrap(); - - // Register the cancellation token in `ongoing_key_gen` so `abort_key_gen_impl` - // finds it, and spawn the cancel-aware task with a pending future so only the - // cancel arm can fire. - let token = CancellationToken::new(); - kms.ongoing_key_gen - .lock() - .await - .insert(preproc_id, token.clone()); - - let ongoing = Arc::clone(&kms.ongoing_key_gen); - let key_meta = Arc::clone(&kms.key_meta_map); - let crypto_storage = kms.crypto_storage.clone(); - let epoch_id = *DEFAULT_EPOCH_ID; - let task = tokio::spawn(async move { - run_keygen_with_cancel( - std::future::pending::<()>(), - token, - req_id, - preproc_id, - epoch_id, - ongoing, - key_meta, - crypto_storage, - ) - .await; - }); - - // Abort should succeed and trigger the cancel arm. - abort_key_gen_impl(&kms, Request::new(preproc_id.into())) - .await - .unwrap(); - // Second abort returns NotFound since the token was removed. - let err = abort_key_gen_impl(&kms, Request::new(preproc_id.into())) - .await - .unwrap_err(); - assert_eq!(err.code(), tonic::Code::NotFound); - - // Wait for the cancel arm to finish writing the meta store update. - task.await.unwrap(); - - // The key meta store now holds the aborted error, so `get_key_gen_result_impl` - // returns `Aborted` immediately instead of blocking for the 60s timeout. - let err = get_key_gen_result_impl(&kms, Request::new(req_id.into()), false) - .await - .unwrap_err(); - assert_eq!(err.code(), tonic::Code::Aborted); - } } diff --git a/core/service/src/engine/centralized/service/preprocessing.rs b/core/service/src/engine/centralized/service/preprocessing.rs index acd431c593..ae603817e1 100644 --- a/core/service/src/engine/centralized/service/preprocessing.rs +++ b/core/service/src/engine/centralized/service/preprocessing.rs @@ -59,11 +59,12 @@ pub async fn preprocessing_impl< let (req_id, _context_id, _epoch_id, dkg_param, _key_set_config, eip712_domain, extra_data) = validate_preproc_request(inner)?; - add_req_to_meta_store( - &mut service.preprocessing_meta_store.write().await, + let permit = add_req_to_meta_store( + &service.preprocessing_meta_store, &req_id, OP_KEYGEN_PREPROC_REQUEST, - )?; + ) + .await?; let sk = service.base_kms.sig_key().map_err(|e| { MetricedError::new( @@ -83,11 +84,12 @@ pub async fn preprocessing_impl< }); let _ = update_req_in_meta_store( - &mut service.preprocessing_meta_store.write().await, - &req_id, + &service.preprocessing_meta_store, + permit, preproc_bucket, OP_KEYGEN_PREPROC_REQUEST, - ); + ) + .await; tracing::warn!( "Received a preprocessing request for the central server {} - No action taken", req_id @@ -138,7 +140,7 @@ pub async fn get_preprocessing_res_impl< })?; let preproc_data = retrieve_from_meta_store( - service.preprocessing_meta_store.read().await, + &service.preprocessing_meta_store, &request_id, OP_KEYGEN_PREPROC_RESULT, ) @@ -146,7 +148,7 @@ pub async fn get_preprocessing_res_impl< Ok(Response::new(KeyGenPreprocResult { preprocessing_id: Some(request_id.into()), - external_signature: preproc_data.external_signature, + external_signature: preproc_data.external_signature.clone(), })) } #[cfg(test)] diff --git a/core/service/src/engine/context_manager.rs b/core/service/src/engine/context_manager.rs index e0d50a6498..e3b0b1aac7 100644 --- a/core/service/src/engine/context_manager.rs +++ b/core/service/src/engine/context_manager.rs @@ -12,7 +12,7 @@ use crate::engine::utils::MetricedError; use crate::engine::validation::{ RequestIdParsingErr, parse_grpc_request_id, parse_optional_grpc_request_id, }; -use crate::util::meta_store::add_req_to_meta_store; +use crate::util::meta_store::{MetaStorePermit, add_req_to_meta_store}; use crate::vault::keychain::KeychainProxy; use crate::vault::storage::crypto_material::{CryptoMaterialStorage, data_exists}; use crate::vault::storage::{ @@ -182,11 +182,12 @@ where } } - add_req_to_meta_store( - &mut self.custodian_meta_store.write().await, + let meta_permit = add_req_to_meta_store( + &self.custodian_meta_store, &custodian_context_id, OP_NEW_CUSTODIAN_CONTEXT, - )?; + ) + .await?; tracing::info!( "Custodian context addition under MPC context {:?} starting with context_id={:?}, threshold={} from {} custodians", mpc_context_id, @@ -194,7 +195,7 @@ where custodian_context.threshold, custodian_context.custodian_nodes.len() ); - self.inner_new_custodian_context(custodian_context, mpc_context_id) + self.inner_new_custodian_context(custodian_context, mpc_context_id, meta_permit) .await .map_err(|e| { MetricedError::new(OP_NEW_CUSTODIAN_CONTEXT, None, e, tonic::Code::Internal) @@ -224,12 +225,17 @@ where // Note that care must be taken in the order of getting locks here // Use meta store as sync point - let mut cus_meta_store = self.custodian_meta_store.write().await; - if cus_meta_store.delete(&context_id).is_none() { - tracing::warn!( - "Custodian context with id {context_id} to be deleted does not exist in meta store" - ); - } + let permit = { + let mut cus_meta_store = self.custodian_meta_store.write().await; + cus_meta_store.lock_entry(&context_id).map_err(|e| { + MetricedError::new( + OP_DESTROY_CUSTODIAN_CONTEXT, + Some(context_id), + e, + tonic::Code::FailedPrecondition, + ) + })? + }; let mut guarded_pub_storage = self.crypto_storage.public_storage.lock().await; let guarded_backup_storage_ref = self.crypto_storage.backup_vault.as_ref().ok_or_else(|| { @@ -266,6 +272,7 @@ where &self, context: CustodianContext, mpc_context_id: ContextId, + meta_permit: MetaStorePermit, ) -> anyhow::Result<()> { let backup_vault = match self.crypto_storage.backup_vault { Some(ref backup_vault) => backup_vault, @@ -328,7 +335,11 @@ where ); // Then store the results self.crypto_storage - .write_backup_keys(recovery_validation, Arc::clone(&self.custodian_meta_store)) + .write_backup_keys( + recovery_validation, + Arc::clone(&self.custodian_meta_store), + meta_permit, + ) .await?; tracing::info!( "New custodian context created with context_id={}, threshold={} from {} custodians", diff --git a/core/service/src/engine/threshold/service/crs_generator.rs b/core/service/src/engine/threshold/service/crs_generator.rs index 5e95a1e2bc..f87143e57a 100644 --- a/core/service/src/engine/threshold/service/crs_generator.rs +++ b/core/service/src/engine/threshold/service/crs_generator.rs @@ -29,6 +29,7 @@ use tonic::{Request, Response}; use tracing::Instrument; // === Internal Crate === +use crate::engine::utils::MetricedError; use crate::{ cryptography::signatures::PrivateSigKey, engine::{ @@ -37,12 +38,14 @@ use crate::{ validation::{RequestIdParsingErr, parse_grpc_request_id, validate_crs_gen_request}, }, util::{ - meta_store::{MetaStore, add_req_to_meta_store, retrieve_from_meta_store}, + meta_store::{ + MetaStore, MetaStorePermit, add_req_to_meta_store, retrieve_from_meta_store, + update_err_req_in_meta_store, + }, rate_limiter::RateLimiter, }, vault::storage::{Storage, StorageExt, crypto_material::ThresholdCryptoMaterialStorage}, }; -use crate::{engine::utils::MetricedError, util::meta_store::update_err_req_in_meta_store}; // === Insecure Feature-Specific Imports === cfg_if::cfg_if! { @@ -93,7 +96,7 @@ impl< } else { OP_CRS_GEN_REQUEST }; - let permit = self.rate_limiter.start_crsgen(op_tag).await?; + let rate_limiter_permit = self.rate_limiter.start_crsgen(op_tag).await?; let mut timer = metrics::METRICS.time_operation(op_tag).start(); @@ -133,11 +136,8 @@ impl< )); } - add_req_to_meta_store( - &mut self.crs_meta_store.write().await, - &verified.req_id, - op_tag, - )?; + let meta_permit = + add_req_to_meta_store(&self.crs_meta_store, &verified.req_id, op_tag).await?; let sigkey = self.base_kms.sig_key().map_err(|e| { MetricedError::new( op_tag, @@ -161,7 +161,8 @@ impl< verified.params, &verified.eip712_domain, verified.extra_data, - permit, + rate_limiter_permit, + meta_permit, verified.epoch_id, verified.context_id, sigkey, @@ -182,19 +183,14 @@ impl< dkg_params: DKGParams, eip712_domain: &alloy_sol_types::Eip712Domain, extra_data: Vec, - permit: OwnedSemaphorePermit, + rate_limiter_permit: OwnedSemaphorePermit, + meta_permit: MetaStorePermit, epoch_id: EpochId, context_id: ContextId, sk: Arc, timer: DurationGuard<'static>, insecure: bool, ) -> anyhow::Result<()> { - // Retrieve the correct tag - let op_tag = if insecure { - OP_INSECURE_CRS_GEN_REQUEST - } else { - OP_CRS_GEN_REQUEST - }; let session_id = req_id.derive_session_id()?; // CRS ceremony requires a sync network let session = self @@ -203,9 +199,7 @@ impl< .await?; let meta_store = Arc::clone(&self.crs_meta_store); - let meta_store_cancelled = Arc::clone(&self.crs_meta_store); let crypto_storage = self.crypto_storage.clone(); - let crypto_storage_cancelled = self.crypto_storage.clone(); let eip712_domain_copy = eip712_domain.clone(); // we do not need to hold the handle, @@ -217,43 +211,36 @@ impl< self.ongoing.lock().await.insert(req_id, token.clone()); } let ongoing = Arc::clone(&self.ongoing); - self.tracker - .spawn(async move { - // Capture the timer inside the generation tasks, such that when the task - // exits, the timer is dropped and thus exported + self.tracker.spawn( + async move { + // Capture the timer inside the generation task, such that when the + // task exits, the timer is dropped and thus exported. let _inner_timer = timer; - let _inner_permit = permit; - tokio::select! { - () = Self::crs_gen_background(&req_id, &epoch_id, witness_dim, max_num_bits, session, rng, meta_store, crypto_storage, sk, dkg_params.to_owned(), eip712_domain_copy, extra_data, insecure) => { - // Remove cancellation token since generation is now done. - ongoing.lock().await.remove(&req_id); - tracing::info!("CRS generation of request {} exiting normally.", req_id); - }, - () = token.cancelled() => { - MetricedError::handle_unreturnable_error( - op_tag, - Some(req_id), - anyhow::anyhow!("CRS generation of request exiting before completion because of a cancellation event") - ); - let del_res = crypto_storage_cancelled.inner.purge_crs_material(&req_id, &epoch_id).await; - { - let mut guarded_meta_store = meta_store_cancelled.write().await; - let msg = if del_res { - // Successful deletion - let msg = format!("CRS generation aborted and crs material deleted successfully for request {}", req_id); - tracing::info!(msg); - msg - } else { - // Error during deletion - let msg = format!("CRS generation aborted but failed to delete crs material for request {}", req_id); - tracing::error!(msg); - msg - }; - update_err_req_in_meta_store(&mut guarded_meta_store, &req_id, msg, op_tag); - } - }, - } - }.instrument(tracing::Span::current())); + let _inner_rate_limiter_permit = rate_limiter_permit; + Self::crs_gen_background( + meta_permit, + token, + &req_id, + &epoch_id, + witness_dim, + max_num_bits, + session, + rng, + meta_store, + crypto_storage, + sk, + dkg_params.to_owned(), + eip712_domain_copy, + extra_data, + insecure, + ) + .await; + // Cleanup runs on every termination (normal completion, + // generation error, or abort). + ongoing.lock().await.remove(&req_id); + } + .instrument(tracing::Span::current()), + ); Ok(()) } @@ -272,10 +259,9 @@ impl< parse_grpc_request_id(&request.into_inner(), RequestIdParsingErr::CrsGenResponse) .map_err(|e| MetricedError::new(op_tag, None, e, tonic::Code::InvalidArgument))?; - let crs_data = - retrieve_from_meta_store(self.crs_meta_store.read().await, &request_id, op_tag).await?; + let crs_data = retrieve_from_meta_store(&self.crs_meta_store, &request_id, op_tag).await?; - match crs_data { + match (*crs_data).clone() { CrsGenMetadata::Current(crs_data) => { if crs_data.crs_id != request_id { return Err(MetricedError::new( @@ -348,6 +334,8 @@ impl< #[allow(clippy::too_many_arguments)] async fn crs_gen_background( + permit: MetaStorePermit, + cancel_token: CancellationToken, req_id: &RequestId, epoch_id: &EpochId, witness_dim: usize, @@ -376,105 +364,114 @@ impl< let pke_params = params .get_params_basics_handle() .get_compact_pk_enc_params(); - let pp = if insecure { - // sanity check to make sure we're using the insecure feature - #[cfg(not(feature = "insecure"))] - { - let _ = rng; // stop clippy from complaining - panic!("attempting to call insecure crsgen when the insecure feature is not set"); - } - #[cfg(feature = "insecure")] - { - let my_role = base_session.my_role(); - // We let the first party sample the seed (we are using 1-based party IDs) - let input_party_id = 1; - let domain = eip712_domain.clone(); - if my_role.one_based() == input_party_id { - let crs_res = async_generate_crs( - &sk, - params, - max_num_bits, - domain, - extra_data.clone(), - req_id, - rng, - ) - .await; - let crs = match crs_res { - Ok((crs, _)) => crs, - Err(e) => { - let _ = update_err_req_in_meta_store( - &mut meta_store.write().await, - req_id, - e.to_string(), - op_tag, - ); - return; - } - }; - transfer_crs(&base_session, Some(crs), input_party_id).await - } else { - transfer_crs(&base_session, None, input_party_id).await + // Build the long-running generation as a single async block, then race + // it against the cancellation token in an inner select!. Only the + // long-running phase is cancellable; the disk-write phase is + // intentionally uncancellable to avoid torn states. + let extra_data_for_compute = extra_data.clone(); + let domain_for_compute = eip712_domain.clone(); + let generate_pp = async { + if insecure { + // sanity check to make sure we're using the insecure feature + #[cfg(not(feature = "insecure"))] + { + let _ = rng; // stop clippy from complaining + panic!( + "attempting to call insecure crsgen when the insecure feature is not set" + ); } + #[cfg(feature = "insecure")] + { + let my_role = base_session.my_role(); + // We let the first party sample the seed (we are using 1-based party IDs) + let input_party_id = 1; + let domain = eip712_domain.clone(); + if my_role.one_based() == input_party_id { + let crs_res = async_generate_crs( + &sk, + params, + max_num_bits, + domain, + extra_data.clone(), + req_id, + rng, + ) + .await; + let crs = crs_res + .map(|(crs, _)| crs) + .map_err(|e| anyhow::anyhow!(e.to_string()))?; + transfer_crs(&base_session, Some(crs), input_party_id).await + } else { + transfer_crs(&base_session, None, input_party_id).await + } + } + } else { + // secure ceremony (insecure = false) + // CRS ceremony with production-sized params (e.g. 2048 bits) is CPU-intensive, + // each party's turn can take minutes. Increase the timeout to match BK SNS generation + // to prevent other parties from timing out and marking the computing party as corrupt. + base_session.network.set_timeout_for_bk_sns().await; + let real_ceremony = C::default(); + let internal_pp = real_ceremony + .execute::(&mut base_session, witness_dim, max_num_bits) + .await; + internal_pp.and_then(|internal| { + internal.try_into_tfhe_zk_pok_pp(&pke_params, base_session.session_id()) + }) } - } else { - // secure ceremony (insecure = false) - // CRS ceremony with production-sized params (e.g. 2048 bits) is CPU-intensive, - // each party's turn can take minutes. Increase the timeout to match BK SNS generation - // to prevent other parties from timing out and marking the computing party as corrupt. - base_session.network.set_timeout_for_bk_sns().await; - let real_ceremony = C::default(); - let internal_pp = real_ceremony - .execute::(&mut base_session, witness_dim, max_num_bits) - .await; - internal_pp.and_then(|internal| { - internal.try_into_tfhe_zk_pok_pp(&pke_params, base_session.session_id()) - }) }; - let res_info_pp = pp.and_then(|pp| { - compute_info_crs( - &sk, - &DSEP_PUBDATA_CRS, - req_id, - &pp, - &eip712_domain, - extra_data, - ) - .map(|pub_info| (pp, pub_info)) - }); - - let (pp, crs_info) = match res_info_pp { - Ok((pp, pp_id)) => (pp, pp_id), - Err(e) => { - let _ = update_err_req_in_meta_store( - &mut meta_store.write().await, + // Race the long-running generation against cancellation. + let outcome: Result<(tfhe::zk::CompactPkeCrs, CrsGenMetadata), String> = tokio::select! { + biased; + () = cancel_token.cancelled() => Err(format!( + "CRS generation of request {req_id} aborted before completion because of a cancellation event" + )), + pp = generate_pp => pp + .and_then(|pp| compute_info_crs( + &sk, + &DSEP_PUBDATA_CRS, req_id, - e.to_string(), + &pp, + &domain_for_compute, + extra_data_for_compute, + ).map(|info| (pp, info))) + .map_err(|e| e.to_string()), + }; + + match outcome { + Err(msg) => { + MetricedError::handle_unreturnable_error( op_tag, + Some(*req_id), + anyhow::anyhow!(msg.clone()), ); - return; + let _ = crypto_storage + .inner + .purge_crs_material(req_id, epoch_id) + .await; + let _ = update_err_req_in_meta_store(&meta_store, permit, msg, op_tag).await; + } + Ok((pp, crs_info)) => { + tracing::info!( + "CRS generation completed for req_id={req_id} with digest={}, storing the CRS.", + hex::encode(crs_info.digest()) + ); + // write_crs consumes the permit via update_meta_store internally + let res = crypto_storage + .inner + .write_crs(req_id, epoch_id, pp, crs_info, meta_store, permit, op_tag) + .await; + let elapsed_time = Instant::now().duration_since(crs_start_timer); + if let Err(e) = res { + tracing::error!("Failed to write CRS for request {req_id}: {e}"); + } else { + tracing::info!( + "CRS stored. CRS ceremony time was {} ms", + elapsed_time.as_millis() + ); + } } - }; - - tracing::info!( - "CRS generation completed for req_id={req_id} with digest={}, storing the CRS.", - hex::encode(crs_info.digest()) - ); - - let res = crypto_storage - .inner - .write_crs(req_id, epoch_id, pp, crs_info, meta_store, op_tag) - .await; - let crs_stop_timer = Instant::now(); - let elapsed_time = crs_stop_timer.duration_since(crs_start_timer); - if let Err(e) = res { - tracing::error!("Failed to write CRS for request {req_id}: {e}"); - } else { - tracing::info!( - "CRS stored. CRS ceremony time was {:?} ms", - (elapsed_time).as_millis() - ); } } } diff --git a/core/service/src/engine/threshold/service/epoch_manager.rs b/core/service/src/engine/threshold/service/epoch_manager.rs index f485ec083d..02d88075c0 100644 --- a/core/service/src/engine/threshold/service/epoch_manager.rs +++ b/core/service/src/engine/threshold/service/epoch_manager.rs @@ -88,8 +88,8 @@ use crate::{ }, util::{ meta_store::{ - MetaStore, retrieve_from_meta_store, update_err_req_in_meta_store, - update_req_in_meta_store, + MetaStore, add_req_to_meta_store, retrieve_from_meta_store, + update_err_req_in_meta_store, update_req_in_meta_store, }, rate_limiter::RateLimiter, }, @@ -461,7 +461,7 @@ impl< new_epoch_id: EpochId, verified_previous_epoch: VerifiedPreviousEpochInfo, ) -> Result< - impl Future> + use, + impl Future> + use, MetricedError, > { let epoch_id_as_request_id = new_epoch_id.into(); @@ -474,8 +474,6 @@ impl< .fetch_existing_crs_metadata(epoch_id_as_request_id, &verified_previous_epoch) .await?; - let meta_store = Arc::clone(&self.reshare_pubinfo_meta_store); - let immutable_session_maker = self.session_maker.make_immutable(); let task = async move { @@ -524,14 +522,9 @@ impl< keys_metadata.push(key_metadata); } - // We update the meta store with the same metadata as in the epoch we reshare from - // i.e. parties in Set 1 only do NOT re-sign the metadata - meta_store.write().await.update( - &epoch_id_as_request_id, - Ok(EpochOutput::Reshare((keys_metadata, crs_metadata))), - )?; - - Ok(()) + // Parties in Set 1 only do NOT re-sign the metadata — we return the + // same metadata as in the epoch we reshare from. + Ok(EpochOutput::Reshare((keys_metadata, crs_metadata))) }; Ok(task) @@ -603,7 +596,6 @@ impl< #[allow(clippy::too_many_arguments)] async fn store_reshared_keys( crypto_storage: &ThresholdCryptoMaterialStorage, - meta_store: Arc>>, sk: &PrivateSigKey, new_epoch_id: EpochId, new_extra_data: Vec, @@ -612,7 +604,7 @@ impl< new_private_keysets: Vec>, eip712_domain: &Eip712Domain, crs_info: Vec, - ) -> anyhow::Result<()> { + ) -> anyhow::Result { let mut fhe_key_infos = Vec::new(); let mut storage_tasks = Vec::new(); for (verified_material, (new_private_keyset, key_info)) in @@ -750,13 +742,11 @@ impl< let res = join_all(storage_tasks).await; let error_agg = res.iter().filter(|r| r.is_err()).collect::>(); - let mut err_msgs = Vec::new(); - let agg_res = if !error_agg.is_empty() { + if !error_agg.is_empty() { let storage_err_msg = format!( "Failed to store all reshared keys for new epoch {}: {:?}", new_epoch_id, error_agg ); - err_msgs.push(storage_err_msg.clone()); // Roll back any partial successes in case something fails during the resharing, // to not leave the storage in a partial state. @@ -786,32 +776,18 @@ impl< } } - Err(storage_err_msg) - } else { - // If the resharing went well, then update the backup - crypto_storage - .inner - .update_backup_vault(false, OP_NEW_EPOCH) - .await; - Ok(EpochOutput::Reshare((fhe_key_infos, crs_metadatas))) - }; - // Finally update the meta store - if !update_req_in_meta_store( - &mut meta_store.write().await, - &new_epoch_id.into(), - agg_res, - OP_NEW_EPOCH, - ) { - err_msgs.push(format!( - "Failed to update the meta store with error for new epoch {}.", - new_epoch_id - )); - } - if err_msgs.is_empty() { - Ok(()) - } else { - Err(anyhow::anyhow!(err_msgs.join(", "))) + return Err(anyhow::anyhow!(storage_err_msg)); } + + // If the resharing went well, then update the backup + crypto_storage + .inner + .update_backup_vault(false, OP_NEW_EPOCH) + .await; + // The caller (the resharing task) propagates this EpochOutput back up + // to the outer spawn, which consumes the meta-store permit to record + // the terminal-state transition. + Ok(EpochOutput::Reshare((fhe_key_infos, crs_metadatas))) } #[allow(clippy::too_many_arguments)] @@ -825,7 +801,7 @@ impl< eip712_domain: Eip712Domain, crs_info: Vec, ) -> Result< - impl Future> + use, + impl Future> + use, MetricedError, > { let epoch_id_as_request_id = new_epoch_id.into(); @@ -858,7 +834,6 @@ impl< ) })?; - let meta_store = Arc::clone(&self.reshare_pubinfo_meta_store); let crypto_storage = self.crypto_storage.clone(); let task = async move { @@ -914,7 +889,6 @@ impl< Self::store_reshared_keys( &crypto_storage, - meta_store, &sk, new_epoch_id, new_extra_data, @@ -941,7 +915,7 @@ impl< eip712_domain: Eip712Domain, crs_info: Vec, ) -> Result< - impl Future> + use, + impl Future> + use, MetricedError, > { let epoch_id_as_request_id = new_epoch_id.into(); @@ -977,7 +951,6 @@ impl< ) })?; - let meta_store = Arc::clone(&self.reshare_pubinfo_meta_store); let crypto_storage = self.crypto_storage.clone(); let task = async move { @@ -1053,7 +1026,6 @@ impl< Self::store_reshared_keys( &crypto_storage, - meta_store, &sk, new_epoch_id, new_extra_data, @@ -1169,7 +1141,7 @@ impl< new_extra_data: &[u8], previous_epoch: PreviousEpochInfo, eip712_domain: Eip712Domain, - ) -> Result>, MetricedError> { + ) -> Result>, MetricedError> { tracing::info!( "Received initiate resharing request from context {:?} to context {:?} for Key IDs {:?} for epoch ID {:?}", previous_epoch.context_id, @@ -1271,7 +1243,7 @@ impl< &self, request: Request, ) -> Result, MetricedError> { - let permit = self.rate_limiter.start_new_epoch().await?; + let rate_limiter_permit = self.rate_limiter.start_new_epoch().await?; let inner = request.into_inner(); let VerifiedNewMpcEpochRequest { @@ -1324,25 +1296,17 @@ impl< .is_some(); let meta_store = Arc::clone(&self.reshare_pubinfo_meta_store); - // Update status - { - let mut guarded_meta_store = self.reshare_pubinfo_meta_store.write().await; - guarded_meta_store.insert(&epoch_id.into()).map_err(|e| { - MetricedError::new( - OP_NEW_EPOCH, - Some(epoch_id.into()), - e, - // Note that there are other reason why insert can fail, but - // AlreadyExists seems the most appropriate - tonic::Code::AlreadyExists, - ) - })?; - } + let meta_permit = add_req_to_meta_store( + &self.reshare_pubinfo_meta_store, + &epoch_id.into(), + OP_NEW_EPOCH, + ) + .await?; let session_maker = self.session_maker.clone(); let crypto_storage = self.crypto_storage.clone(); self.tracker.spawn(async move { - let _permit = permit; + let _rate_limiter_permit = rate_limiter_permit; let crypto_storage = crypto_storage; let context_id = context_id; let epoch_id = epoch_id; @@ -1353,31 +1317,26 @@ impl< .await { let err = format!("PRSS initialization failed during epoch creation: {e:?}"); - let _ = update_err_req_in_meta_store( - &mut meta_store.write().await, - &epoch_id.into(), - err, - OP_NEW_EPOCH, - ); + let _ = + update_err_req_in_meta_store(&meta_store, meta_permit, err, OP_NEW_EPOCH).await; return; } - if let Some(resharing_task) = resharing_task { - if let Err(e) = resharing_task.await { - let err = format!("Resharing failed during epoch creation: {e:?}"); - let _ = update_err_req_in_meta_store( - &mut meta_store.write().await, - &epoch_id.into(), - err, - OP_NEW_EPOCH, - ); - } - } else { - // Can't do much if inserts fails here - let _ = meta_store - .write() + // Either reshare and commit the resulting EpochOutput, or commit + // PRSSInitOnly. Either way, the permit is consumed exactly once. + let result: Result = if let Some(resharing_task) = resharing_task { + resharing_task .await - .update(&epoch_id.into(), Ok(EpochOutput::PRSSInitOnly)); - } + .map_err(|e| format!("Resharing failed during epoch creation: {e:?}")) + } else { + Ok(EpochOutput::PRSSInitOnly) + }; + let _ = update_req_in_meta_store::<_, String>( + &meta_store, + meta_permit, + result, + OP_NEW_EPOCH, + ) + .await; }); Ok(Response::new(Empty {})) @@ -1415,13 +1374,13 @@ impl< })?; let res = retrieve_from_meta_store( - self.reshare_pubinfo_meta_store.read().await, + &self.reshare_pubinfo_meta_store, &request_id, OP_GET_EPOCH_RESULT, ) .await?; - match res { + match (*res).clone() { EpochOutput::PRSSInitOnly => { tracing::info!( "New Epoch with only PRSS initialization for request ID {:?}.", diff --git a/core/service/src/engine/threshold/service/key_generator.rs b/core/service/src/engine/threshold/service/key_generator.rs index e2fd6ef4f8..2c056706a7 100644 --- a/core/service/src/engine/threshold/service/key_generator.rs +++ b/core/service/src/engine/threshold/service/key_generator.rs @@ -42,7 +42,7 @@ use threshold_execution::{ public_keysets::FhePubKeySet, }, }; -use tokio::sync::{Mutex, OwnedSemaphorePermit, RwLock, RwLockReadGuard}; +use tokio::sync::{Mutex, OwnedSemaphorePermit, RwLock}; use tokio_util::{sync::CancellationToken, task::TaskTracker}; use tonic::{Request, Response, Status}; use tracing::Instrument; @@ -72,8 +72,8 @@ use crate::{ }, util::{ meta_store::{ - MetaStore, add_req_to_meta_store, handle_res, retrieve_from_meta_store, - retrieve_from_meta_store_with_timeout, update_err_req_in_meta_store, + EntryState, MetaStore, MetaStorePermit, add_req_to_meta_store, + retrieve_from_meta_store, update_err_req_in_meta_store, }, rate_limiter::RateLimiter, }, @@ -112,6 +112,16 @@ enum ThresholdKeyGenResult { ), } +/// Post-DKG, pre-storage bundle produced by `prepare_threshold_keys`. The +/// `migration_copy_target` is set only on the compressed UseExisting path, +/// which triggers a post-write copy of the new compressed keyset to the +/// original key id. +struct PreparedThresholdKeys { + threshold_fhe_keys: ThresholdFheKeys, + pub_key_set: PublicKeySet, + migration_copy_target: Option, +} + // === Insecure Feature-Specific Imports === #[cfg(feature = "insecure")] use crate::engine::base::INSECURE_PREPROCESSING_ID; @@ -253,6 +263,7 @@ impl< KG: OnlineDistributedKeyGen + 'static, > RealKeyGenerator { + #[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)] async fn launch_dkg( &self, @@ -265,6 +276,7 @@ impl< context_id: ContextId, epoch_id: EpochId, permit: OwnedSemaphorePermit, + meta_permit: MetaStorePermit, ) -> anyhow::Result<()> { //Retrieve the right metric tag let op_tag = match ( @@ -332,10 +344,8 @@ impl< // Clone all the Arcs to give them to the tokio thread let meta_store = Arc::clone(&self.dkg_pubinfo_meta_store); - let meta_store_cancelled = Arc::clone(&self.dkg_pubinfo_meta_store); let sk = self.base_kms.sig_key()?; let crypto_storage = self.crypto_storage.clone(); - let crypto_storage_cancelled = self.crypto_storage.clone(); let eip712_domain_copy = eip712_domain.clone(); let ongoing = Arc::clone(&self.ongoing); @@ -369,136 +379,109 @@ impl< // we must validate the parameter before passing it into the background process internal_keyset_config.validate()?; - // Read existing key tag from public storage if needed - let existing_key_tag: Option = if internal_keyset_config.use_existing_key_tag() { - let existing_keyset_id = internal_keyset_config.get_existing_keyset_id()?; - Some(Self::read_existing_key_tag(&crypto_storage, &existing_keyset_id).await?) - } else { - None - }; - - // For compressed keygen that recycles an existing private keyset, load the OLD - // CompactPublicKey so we can sign and store it instead of the new one derived from - // the newly-generated CompressedXofKeySet. Keeps the externally-visible public key - // stable for clients that already cached it. The digest of the loaded pk is - // verified against the old ThresholdFheKeys.meta_data; generation from existing - // shares stays within the same epoch, so the current epoch_id is used. - let existing_compact_pk: Option = - match internal_keyset_config.keyset_config() { - ddec_keyset_config::KeySetConfig::Standard(inner) - if matches!( - inner.secret_key_config, - ddec_keyset_config::KeyGenSecretKeyConfig::UseExisting - ) && matches!( - inner.compressed_key_config, - ddec_keyset_config::CompressedKeyConfig::All - ) => - { - let existing_keyset_id = internal_keyset_config.get_existing_keyset_id()?; - Some( - Self::read_existing_compact_public_key( - &crypto_storage, - &existing_keyset_id, - &epoch_id, - ) - .await?, - ) - } - _ => None, - }; + // Existing-keyset reads (tag + compact public key) used by the + // UseExisting paths are loaded lazily inside `key_gen_background` so + // their failures surface via the meta store, mirroring how DKG + // failures are handled. This also matches the centralized analogue. - let keygen_background = async move { - // Remove the preprocessing material - match &preproc_handle_w_mode { - PreprocHandleWithMode::Secure((preproc_id, _)) => { - tracing::info!( - "Deleting preprocessed material with ID {preproc_id} from meta store" - ); - let handle = { - let mut meta_store_guard = preproc_bucket.write().await; - meta_store_guard.delete(preproc_id) - }; - match handle_res(handle, preproc_id).await { - Ok(_) => { - tracing::info!( - "Successfully deleted preprocessing ID {preproc_id} after keygen completion for request ID {req_id}" - ); - } - Err(e) => { - MetricedError::handle_unreturnable_error(op_tag, Some(req_id), e); + let token_for_bg = token.clone(); + self.tracker.spawn( + async move { + //Start the metric timer, it will end on drop + let _timer = timer.start(); + // Remove the preprocessing material + match &preproc_handle_w_mode { + PreprocHandleWithMode::Secure((preproc_id, _)) => { + tracing::info!( + "Deleting preprocessed material with ID {preproc_id} from meta store" + ); + let delete_res = { + let mut meta_store_guard = preproc_bucket.write().await; + meta_store_guard.try_delete(preproc_id) + }; + match delete_res { + Ok(EntryState::Done(Ok(_))) => { + tracing::info!( + "Successfully deleted preprocessing ID {preproc_id} after keygen completion for request ID {req_id}" + ); + } + Ok(EntryState::Done(Err(e))) => { + MetricedError::handle_unreturnable_error( + op_tag, + Some(req_id), + anyhow::anyhow!( + "Preprocessing ID {preproc_id} finished with error: {e}" + ), + ); + } + Ok(_) => { + MetricedError::handle_unreturnable_error( + op_tag, + Some(req_id), + anyhow::anyhow!( + "Preprocessing ID {preproc_id} deleted but was not in Done state" + ), + ); + } + Err(e) => { + MetricedError::handle_unreturnable_error(op_tag, Some(req_id), e); + } } } + PreprocHandleWithMode::Insecure => { + // Nothing to remove + } } - PreprocHandleWithMode::Insecure => { - // Nothing to remove - } - } - match internal_keyset_config.keyset_config() { - ddec_keyset_config::KeySetConfig::Standard(inner_config) => { - Self::key_gen_background( - &req_id_clone, - &epoch_id_clone, - dkg_sessions, - meta_store, - crypto_storage, - preproc_handle_w_mode, - sk, - dkg_params, - inner_config.to_owned(), - &internal_keyset_config, - eip712_domain_copy, - extra_data, - permit, - op_tag, - existing_key_tag, - existing_compact_pk, - ) - .await - } - ddec_keyset_config::KeySetConfig::DecompressionOnly => { - Self::decompression_key_gen_background( - &req_id_clone, - &epoch_id_clone, - dkg_sessions.session_z128.base_session, - meta_store, - crypto_storage, - preproc_handle_w_mode, - sk, - dkg_params, - internal_keyset_config - .keyset_added_info().expect("keyset added info must be set for secure key generation and should have been validated before starting key generation").to_owned(), - eip712_domain_copy, - extra_data, - permit, - ) - .await + match internal_keyset_config.keyset_config() { + ddec_keyset_config::KeySetConfig::Standard(inner_config) => { + Self::key_gen_background( + meta_permit, + token_for_bg, + &req_id_clone, + &epoch_id_clone, + dkg_sessions, + meta_store, + crypto_storage, + preproc_handle_w_mode, + sk, + dkg_params, + inner_config.to_owned(), + &internal_keyset_config, + eip712_domain_copy, + extra_data, + permit, + op_tag, + ) + .await; + } + ddec_keyset_config::KeySetConfig::DecompressionOnly => { + Self::decompression_key_gen_background( + meta_permit, + token_for_bg, + &req_id_clone, + &epoch_id_clone, + dkg_sessions.session_z128.base_session, + meta_store, + crypto_storage, + preproc_handle_w_mode, + sk, + dkg_params, + internal_keyset_config + .keyset_added_info().expect("keyset added info must be set for secure key generation and should have been validated before starting key generation").to_owned(), + eip712_domain_copy, + extra_data, + permit, + op_tag, + ) + .await; + } } + // Cleanup runs on every termination. + ongoing.lock().await.remove(&preproc_id); } - }; - self.tracker - .spawn(async move { - //Start the metric timer, it will end on drop - let _timer = timer.start(); - tokio::select! { - () = keygen_background => { - tracing::info!("Key generation of request {} with preproc id {} exiting normally.", req_id, preproc_id); - // Remove cancellation token since generation is now done. - ongoing.lock().await.remove(&preproc_id); - }, - () = token.cancelled() => { - MetricedError::handle_unreturnable_error( - OP_KEYGEN_REQUEST, - Some(req_id), - format!("Key generation background with preprocessing id {} failed since the task got aborted", preproc_id), - ); - tracing::error!("Key generation of request {} exiting before completion because of an abort request.", &req_id); - let mut guarded_meta_store = meta_store_cancelled.write().await; - let _ = guarded_meta_store.update(&req_id, Result::Err("Key generation was aborted".to_string())); - crypto_storage_cancelled.purge_fhe_keys(&req_id, &epoch_id).await; - }, - } - }.instrument(tracing::Span::current())); + .instrument(tracing::Span::current()), + ); Ok(()) } @@ -547,7 +530,7 @@ impl< let (preproc_handle, dkg_params) = // Processes the bucket meta information. This is a slightly funky as in certain situations it may override the DKGParams sepcified in the request Self::retrieve_preproc_handle( - self.preproc_buckets.read().await, + &self.preproc_buckets, req_id, preproc_id, params, @@ -574,11 +557,10 @@ impl< )); } - add_req_to_meta_store( - &mut self.dkg_pubinfo_meta_store.write().await, - &req_id, - op_tag, - )?; + // The meta-store permit is threaded into the spawned background so the + // strong invariant ("only the rightful holder updates") holds end-to-end. + let meta_permit = + add_req_to_meta_store(&self.dkg_pubinfo_meta_store, &req_id, op_tag).await?; // TODO should have specific error we can react to and then disc should be checked later. I think that is also how it is in centralized tracing::info!( "Keygen starting with request_id={:?}, insecure={}", @@ -596,6 +578,7 @@ impl< context_id, epoch_id, permit, + meta_permit, ) .await .map_err(|e| MetricedError::new(op_tag, Some(req_id), e, tonic::Code::Internal))?; @@ -624,7 +607,7 @@ impl< /// Retrieve the preprocessing handle, parameters and preprocessing ID from the request. /// This method also does NOT delete the preprocessing entry from the meta store async fn retrieve_preproc_handle( - bucket_metastore: RwLockReadGuard<'_, MetaStore>, + bucket_metastore: &Arc>>, key_req_id: RequestId, preproc_id: RequestId, params: Option, @@ -656,6 +639,7 @@ impl< e.code(), ) })?; + let preproc_bucket = (*preproc_bucket).clone(); if preproc_bucket.preprocessing_id != preproc_id { return Err(MetricedError::new( op_tag, @@ -692,15 +676,10 @@ impl< let request_id = parse_grpc_request_id(&request.into_inner(), RequestIdParsingErr::KeyGenResponse) .map_err(|e| MetricedError::new(op_tag, None, e, tonic::Code::InvalidArgument))?; - let key_gen_res = retrieve_from_meta_store_with_timeout( - self.dkg_pubinfo_meta_store.read().await, - &request_id, - op_tag, - crate::consts::DURATION_WAITING_ON_KEYGEN_RESULT_SECONDS, - ) - .await?; + let key_gen_res = + retrieve_from_meta_store(&self.dkg_pubinfo_meta_store, &request_id, op_tag).await?; - match key_gen_res { + match (*key_gen_res).clone() { KeyGenMetadata::Current(res) => { if res.key_id != request_id { return Err(MetricedError::new( @@ -996,8 +975,11 @@ impl< .await } + #[allow(clippy::too_many_arguments)] #[allow(clippy::too_many_arguments)] pub async fn decompression_key_gen_background( + meta_permit: MetaStorePermit, + cancel_token: CancellationToken, req_id: &RequestId, epoch_id: &EpochId, mut base_session: BaseSession, @@ -1010,72 +992,86 @@ impl< eip712_domain: alloy_sol_types::Eip712Domain, extra_data: Vec, permit: OwnedSemaphorePermit, + op_tag: &'static str, ) { let _permit = permit; + let _ = op_tag; // reserved for future metric tagging consistency let start = Instant::now(); - let (prep_id, dkg_res) = match preproc_handle_w_mode { - PreprocHandleWithMode::Insecure => { - // sanity check to make sure we're using the insecure feature - #[cfg(not(feature = "insecure"))] - { - panic!( - "attempting to call insecure compression keygen when the insecure feature is not set" - ); - } - #[cfg(feature = "insecure")] - { - ( - *INSECURE_PREPROCESSING_ID, - match Self::get_glwe_and_compression_key_shares( - keyset_added_info, - epoch_id, - crypto_storage.clone(), - ) - .await + // Run the cancellable DKG phase. Neither arm of this select! owns the + // meta-store permit; it lives in this function scope and is consumed + // by one of the terminal arms below. + let crypto_storage_for_dkg = crypto_storage.clone(); + let dkg_outcome: Result<(RequestId, _), String> = tokio::select! { + biased; + () = cancel_token.cancelled() => Err("Key generation was aborted".to_string()), + v = async { + match preproc_handle_w_mode { + PreprocHandleWithMode::Insecure => { + // sanity check to make sure we're using the insecure feature + #[cfg(not(feature = "insecure"))] { - Ok((glwe_shares, compression_shares)) => { - Self::reconstruct_glwe_and_compression_key_shares( - req_id, - &base_session, - params, - glwe_shares, - compression_shares, + panic!( + "attempting to call insecure compression keygen when the insecure feature is not set" + ); + } + #[cfg(feature = "insecure")] + { + ( + *INSECURE_PREPROCESSING_ID, + match Self::get_glwe_and_compression_key_shares( + keyset_added_info, + epoch_id, + crypto_storage_for_dkg, ) .await - } - Err(e) => Err(e), - }, - ) + { + Ok((glwe_shares, compression_shares)) => { + Self::reconstruct_glwe_and_compression_key_shares( + req_id, + &base_session, + params, + glwe_shares, + compression_shares, + ) + .await + } + Err(e) => Err(e), + }, + ) + } + } + PreprocHandleWithMode::Secure((prep_id, preproc_handle)) => { + let mut preproc_handle = preproc_handle.lock().await; + ( + prep_id, + Self::decompression_key_gen_closure( + epoch_id, + &mut base_session, + crypto_storage_for_dkg, + params, + keyset_added_info, + preproc_handle.as_mut(), + ) + .await, + ) + } } - } - PreprocHandleWithMode::Secure((prep_id, preproc_handle)) => { - let mut preproc_handle = preproc_handle.lock().await; - ( - prep_id, - Self::decompression_key_gen_closure( - epoch_id, - &mut base_session, - crypto_storage.clone(), - params, - keyset_added_info, - preproc_handle.as_mut(), - ) - .await, - ) - } + } => match v { + (prep_id, Ok(k)) => Ok((prep_id, k)), + (_, Err(e)) => Err(format!("Failed to construct decompression key: {e}")), + }, }; - // Make sure the dkg ended nicely - let decompression_key = match dkg_res { - Ok(k) => k, - Err(e) => { - // If dkg errored out, update status - update_err_req_in_meta_store( - &mut meta_store.write().await, - req_id, - format!("Failed to construct decompression key: {e}"), + let (prep_id, decompression_key) = match dkg_outcome { + Ok(v) => v, + Err(msg) => { + let _ = update_err_req_in_meta_store( + &meta_store, + meta_permit, + msg, OP_DECOMPRESSION_KEYGEN, - ); + ) + .await; return; } }; @@ -1092,19 +1088,21 @@ impl< ) { Ok(info) => info, Err(e) => { - update_err_req_in_meta_store( - &mut meta_store.write().await, - req_id, + let _ = update_err_req_in_meta_store( + &meta_store, + meta_permit, format!("Failed to compute key info: {e}"), OP_DECOMPRESSION_KEYGEN, - ); + ) + .await; return; } }; + // write_decompression_key consumes the permit via update_meta_store internally. if let Err(e) = crypto_storage .inner - .write_decompression_key(req_id, info, decompression_key, meta_store) + .write_decompression_key(req_id, info, decompression_key, meta_store, meta_permit) .await { tracing::error!( @@ -1223,9 +1221,11 @@ impl< #[allow(clippy::too_many_arguments)] async fn key_gen_background( + meta_permit: MetaStorePermit, + cancel_token: CancellationToken, req_id: &RequestId, epoch_id: &EpochId, - mut dkg_sessions: DkgSessions, + dkg_sessions: DkgSessions, meta_store: Arc>>, crypto_storage: ThresholdCryptoMaterialStorage, preproc_handle_w_mode: PreprocHandleWithMode, @@ -1235,210 +1235,381 @@ impl< internal_keyset_config: &InternalKeySetConfig, eip712_domain: alloy_sol_types::Eip712Domain, extra_data: Vec, - permit: OwnedSemaphorePermit, + rate_limiter_permit: OwnedSemaphorePermit, op_tag: &'static str, - existing_key_tag: Option, - existing_compact_pk: Option, ) { - let _permit = permit; + let _rate_limiter_permit = rate_limiter_permit; let start = Instant::now(); - let (prep_id, dkg_res) = match preproc_handle_w_mode { + + let outcome: Result<(RequestId, ThresholdKeyGenResult), String> = tokio::select! { + biased; + () = cancel_token.cancelled() => Err("Key generation was aborted".to_string()), + v = Self::run_dkg( + dkg_sessions, + preproc_handle_w_mode, + &crypto_storage, + params, + req_id, + epoch_id, + keyset_config, + internal_keyset_config, + ) => v, + }; + + let (prep_id, dkg_res) = match outcome { + Ok(v) => v, + Err(msg) => { + let _ = update_err_req_in_meta_store( + &meta_store, + meta_permit, + format!("Standard key generation failed: {msg}"), + op_tag, + ) + .await; + return; + } + }; + + let prepared = match Self::prepare_threshold_keys( + dkg_res, + &sk, + &prep_id, + req_id, + epoch_id, + &eip712_domain, + extra_data, + &keyset_config, + internal_keyset_config, + &crypto_storage, + ) + .await + { + Ok(p) => p, + Err(msg) => { + let _ = update_err_req_in_meta_store(&meta_store, meta_permit, msg, op_tag).await; + return; + } + }; + + if let Err(e) = crypto_storage + .write_fhe_keys( + req_id, + epoch_id, + prepared.threshold_fhe_keys, + prepared.pub_key_set, + Arc::clone(&meta_store), + meta_permit, + op_tag, + ) + .await + { + tracing::error!("Failed to write threshold keys for request {req_id}: {e}"); + return; + } + + // Compressed-UseExisting only: copy the compressed key to the original + // key id. At this point the new keygen is already Done in the meta + // store; a failure here only affects the migration target, so we log + // loudly but don't fail the parent request. + if let Some(old_key_id) = prepared.migration_copy_target { + // UseExisting reads the old private shares at the current epoch_id + // (see key_gen_from_existing_private_keyset), so the copy targets + // the same (old_key_id, epoch_id) pair. + if let Err(e) = crypto_storage + .copy_compressed_key_to_original( + req_id, + epoch_id, + &old_key_id, + epoch_id, + &sk, + &eip712_domain, + Arc::clone(&meta_store), + ) + .await + { + tracing::error!( + "Compressed keygen for {req_id} committed successfully, but the \ + follow-up copy to original key id {old_key_id} failed: {e}. \ + The new keys at {req_id} are valid; \ + the migration to {old_key_id} must be retried." + ); + } + } + + // Update the backup vault (failures are incremented in the metrics). + crypto_storage + .inner + .update_backup_vault(false, op_tag) + .await; + + tracing::info!( + "DKG protocol took {} ms to complete for request {req_id}", + start.elapsed().as_millis() + ); + } + + /// Dispatches the DKG over (preproc mode × keyset configuration) and + /// returns the produced material paired with the preprocessing id that + /// was actually used. Stringifies errors for the meta-store path. + #[allow(clippy::too_many_arguments)] + async fn run_dkg( + dkg_sessions: DkgSessions, + preproc_handle_w_mode: PreprocHandleWithMode, + crypto_storage: &ThresholdCryptoMaterialStorage, + params: DKGParams, + req_id: &RequestId, + epoch_id: &EpochId, + keyset_config: ddec_keyset_config::StandardKeySetConfig, + internal_keyset_config: &InternalKeySetConfig, + ) -> Result<(RequestId, ThresholdKeyGenResult), String> { + match preproc_handle_w_mode { PreprocHandleWithMode::Insecure => { - // sanity check to make sure we're using the insecure feature + #[cfg(feature = "insecure")] + { + let res = + Self::run_insecure_dkg(dkg_sessions, params, req_id, keyset_config).await?; + Ok((*INSECURE_PREPROCESSING_ID, res)) + } #[cfg(not(feature = "insecure"))] { panic!( "attempting to call insecure keygen when the insecure feature is not set" ); } - #[cfg(feature = "insecure")] - { - ( - *INSECURE_PREPROCESSING_ID, - match ( - keyset_config.secret_key_config, - keyset_config.computation_key_type, - keyset_config.compressed_key_config, - ) { - // Insecure standard keygen - ( - ddec_keyset_config::KeyGenSecretKeyConfig::GenerateAll, - ddec_keyset_config::ComputeKeyType::Cpu, - ddec_keyset_config::CompressedKeyConfig::None, - ) => insecure_initialize_key_material( - &mut dkg_sessions.session_z128, - params, - req_id.into(), - ) - .await - .map(|(pk, sk)| ThresholdKeyGenResult::Uncompressed(pk, sk)), - // Insecure compressed keygen - ( - ddec_keyset_config::KeyGenSecretKeyConfig::GenerateAll, - ddec_keyset_config::ComputeKeyType::Cpu, - ddec_keyset_config::CompressedKeyConfig::All, - ) => initialize_compressed_key_material( - &mut dkg_sessions.session_z128, - params, - req_id.into(), - ) - .await - .map(|(compressed_keyset, sk)| { - ThresholdKeyGenResult::Compressed(compressed_keyset, sk) - }), - _ => { - // insecure keygen from existing compression key is not supported - update_err_req_in_meta_store(&mut meta_store.write().await, req_id, "insecure keygen from existing compression key is not supported".to_string(), op_tag); - return; - } - }, - ) - } } PreprocHandleWithMode::Secure((prep_id, preproc_handle)) => { let mut preproc_handle = preproc_handle.lock().await; - ( - prep_id, - match ( - keyset_config.secret_key_config, - keyset_config.computation_key_type, - keyset_config.compressed_key_config, - ) { - ( - ddec_keyset_config::KeyGenSecretKeyConfig::GenerateAll, - ddec_keyset_config::ComputeKeyType::Cpu, - ddec_keyset_config::CompressedKeyConfig::None, - ) => KG::keygen( - &mut dkg_sessions.session_z128, - preproc_handle.as_mut(), - params, - req_id.into(), - ) - .await - .map(|(pk, sk)| ThresholdKeyGenResult::Uncompressed(pk, sk)), - ( - ddec_keyset_config::KeyGenSecretKeyConfig::GenerateAll, - ddec_keyset_config::ComputeKeyType::Cpu, - ddec_keyset_config::CompressedKeyConfig::All, - ) => KG::compressed_keygen( - &mut dkg_sessions.session_z128, - preproc_handle.as_mut(), - params, - req_id.into(), - ) - .await - .map(|(compressed_keyset, sk)| { - ThresholdKeyGenResult::Compressed(compressed_keyset, sk) - }), - ( - ddec_keyset_config::KeyGenSecretKeyConfig::UseExisting, - ddec_keyset_config::ComputeKeyType::Cpu, - ddec_keyset_config::CompressedKeyConfig::None, - ) => { - let existing_keyset_id = internal_keyset_config - .get_existing_keyset_id() - .expect("Standard UseExisting keygen must have a validated keyset_added_info.existing_keyset_id"); - let tag: tfhe::Tag = existing_key_tag.unwrap_or_else(|| req_id.into()); - Self::key_gen_from_existing_private_keyset( - &mut dkg_sessions, - crypto_storage.clone(), - params, - existing_keyset_id, - *epoch_id, - preproc_handle.as_mut(), - tag, - ) - .await - .map(|(pk, sk)| ThresholdKeyGenResult::Uncompressed(pk, sk)) - } - ( - ddec_keyset_config::KeyGenSecretKeyConfig::UseExisting, - ddec_keyset_config::ComputeKeyType::Cpu, - ddec_keyset_config::CompressedKeyConfig::All, - ) => { - let existing_keyset_id = internal_keyset_config - .get_existing_keyset_id() - .expect("Standard UseExisting compressed keygen must have a validated keyset_added_info.existing_keyset_id"); - let tag: tfhe::Tag = existing_key_tag.unwrap_or_else(|| req_id.into()); - Self::compressed_key_gen_from_existing_private_keyset( - &mut dkg_sessions, - crypto_storage.clone(), - params, - existing_keyset_id, - *epoch_id, - preproc_handle.as_mut(), - tag, - ) - .await - .map(|(compressed_keyset, sk)| { - ThresholdKeyGenResult::Compressed(compressed_keyset, sk) - }) - } - }, + let res = Self::run_secure_dkg( + dkg_sessions, + preproc_handle.as_mut(), + crypto_storage, + params, + req_id, + epoch_id, + keyset_config, + internal_keyset_config, ) + .await?; + Ok((prep_id, res)) } - }; + } + } - //Make sure the dkg ended nicely - let dkg_result = match dkg_res { - Ok(result) => result, - Err(e) => { - update_err_req_in_meta_store( - &mut meta_store.write().await, + /// Insecure DKG path: deterministic key material generation, no + /// preprocessing. Panics if compiled without the `insecure` feature. + #[cfg(feature = "insecure")] + async fn run_insecure_dkg( + mut dkg_sessions: DkgSessions, + params: DKGParams, + req_id: &RequestId, + keyset_config: ddec_keyset_config::StandardKeySetConfig, + ) -> Result { + match ( + keyset_config.secret_key_config, + keyset_config.computation_key_type, + keyset_config.compressed_key_config, + ) { + ( + ddec_keyset_config::KeyGenSecretKeyConfig::GenerateAll, + ddec_keyset_config::ComputeKeyType::Cpu, + ddec_keyset_config::CompressedKeyConfig::None, + ) => insecure_initialize_key_material( + &mut dkg_sessions.session_z128, + params, + req_id.into(), + ) + .await + .map(|(pk, sk)| ThresholdKeyGenResult::Uncompressed(pk, sk)) + .map_err(|e| e.to_string()), + ( + ddec_keyset_config::KeyGenSecretKeyConfig::GenerateAll, + ddec_keyset_config::ComputeKeyType::Cpu, + ddec_keyset_config::CompressedKeyConfig::All, + ) => initialize_compressed_key_material( + &mut dkg_sessions.session_z128, + params, + req_id.into(), + ) + .await + .map(|(compressed_keyset, sk)| ThresholdKeyGenResult::Compressed(compressed_keyset, sk)) + .map_err(|e| e.to_string()), + _ => Err("insecure keygen from existing compression key is not supported".to_string()), + } + } + + #[cfg(not(feature = "insecure"))] + async fn run_insecure_dkg( + _dkg_sessions: DkgSessions, + _params: DKGParams, + _req_id: &RequestId, + _keyset_config: ddec_keyset_config::StandardKeySetConfig, + ) -> Result { + panic!("attempting to call insecure keygen when the insecure feature is not set"); + } + + /// Secure DKG path: dispatches over `(secret_key_config, compressed_key_config)`, + /// fetching the existing keyset tag inline for the UseExisting arms. + #[allow(clippy::too_many_arguments)] + async fn run_secure_dkg

( + mut dkg_sessions: DkgSessions, + preproc_handle: &mut P, + crypto_storage: &ThresholdCryptoMaterialStorage, + params: DKGParams, + req_id: &RequestId, + epoch_id: &EpochId, + keyset_config: ddec_keyset_config::StandardKeySetConfig, + internal_keyset_config: &InternalKeySetConfig, + ) -> Result + where + P: DKGPreprocessing> + + Send + + ?Sized, + { + match ( + keyset_config.secret_key_config, + keyset_config.computation_key_type, + keyset_config.compressed_key_config, + ) { + ( + ddec_keyset_config::KeyGenSecretKeyConfig::GenerateAll, + ddec_keyset_config::ComputeKeyType::Cpu, + ddec_keyset_config::CompressedKeyConfig::None, + ) => KG::keygen( + &mut dkg_sessions.session_z128, + preproc_handle, + params, + req_id.into(), + ) + .await + .map(|(pk, sk)| ThresholdKeyGenResult::Uncompressed(pk, sk)) + .map_err(|e| e.to_string()), + ( + ddec_keyset_config::KeyGenSecretKeyConfig::GenerateAll, + ddec_keyset_config::ComputeKeyType::Cpu, + ddec_keyset_config::CompressedKeyConfig::All, + ) => KG::compressed_keygen( + &mut dkg_sessions.session_z128, + preproc_handle, + params, + req_id.into(), + ) + .await + .map(|(compressed_keyset, sk)| ThresholdKeyGenResult::Compressed(compressed_keyset, sk)) + .map_err(|e| e.to_string()), + ( + ddec_keyset_config::KeyGenSecretKeyConfig::UseExisting, + ddec_keyset_config::ComputeKeyType::Cpu, + ddec_keyset_config::CompressedKeyConfig::None, + ) => { + let existing_keyset_id = internal_keyset_config + .get_existing_keyset_id() + .expect("Standard UseExisting keygen must have a validated keyset_added_info.existing_keyset_id"); + let tag = Self::existing_key_tag_or_default( + crypto_storage, + internal_keyset_config, req_id, - format!("Standard key generation failed: {e}"), - op_tag, - ); - return; + &existing_keyset_id, + ) + .await?; + Self::key_gen_from_existing_private_keyset( + &mut dkg_sessions, + crypto_storage.clone(), + params, + existing_keyset_id, + *epoch_id, + preproc_handle, + tag, + ) + .await + .map(|(pk, sk)| ThresholdKeyGenResult::Uncompressed(pk, sk)) + .map_err(|e| e.to_string()) } - }; + ( + ddec_keyset_config::KeyGenSecretKeyConfig::UseExisting, + ddec_keyset_config::ComputeKeyType::Cpu, + ddec_keyset_config::CompressedKeyConfig::All, + ) => { + let existing_keyset_id = internal_keyset_config + .get_existing_keyset_id() + .expect("Standard UseExisting compressed keygen must have a validated keyset_added_info.existing_keyset_id"); + let tag = Self::existing_key_tag_or_default( + crypto_storage, + internal_keyset_config, + req_id, + &existing_keyset_id, + ) + .await?; + Self::compressed_key_gen_from_existing_private_keyset( + &mut dkg_sessions, + crypto_storage.clone(), + params, + existing_keyset_id, + *epoch_id, + preproc_handle, + tag, + ) + .await + .map(|(compressed_keyset, sk)| { + ThresholdKeyGenResult::Compressed(compressed_keyset, sk) + }) + .map_err(|e| e.to_string()) + } + } + } - // Handle both compressed and uncompressed keygen results - match dkg_result { + // TODO simpler inline + /// Returns the tag to use for the freshly generated key material on the + /// UseExisting paths: the existing keyset's tag if `use_existing_key_tag` + /// is set, otherwise a tag derived from `req_id`. + async fn existing_key_tag_or_default( + crypto_storage: &ThresholdCryptoMaterialStorage, + internal_keyset_config: &InternalKeySetConfig, + req_id: &RequestId, + existing_keyset_id: &RequestId, + ) -> Result { + if internal_keyset_config.use_existing_key_tag() { + Self::read_existing_key_tag(crypto_storage, existing_keyset_id) + .await + .map_err(|e| e.to_string()) + } else { + Ok(req_id.into()) + } + } + + /// Post-DKG processing: signs metadata, packs the public key material, + /// and decides whether a compressed-UseExisting run needs the follow-up + /// migration copy. On the compressed-UseExisting path this loads the OLD + /// `CompactPublicKey` from storage so signatures + stored bytes stay + /// stable for clients that already hold it. + #[allow(clippy::too_many_arguments)] + async fn prepare_threshold_keys( + dkg_res: ThresholdKeyGenResult, + sk: &PrivateSigKey, + prep_id: &RequestId, + req_id: &RequestId, + epoch_id: &EpochId, + eip712_domain: &alloy_sol_types::Eip712Domain, + extra_data: Vec, + keyset_config: &ddec_keyset_config::StandardKeySetConfig, + internal_keyset_config: &InternalKeySetConfig, + crypto_storage: &ThresholdCryptoMaterialStorage, + ) -> Result { + match dkg_res { ThresholdKeyGenResult::Uncompressed(pub_key_set, private_keys) => { - //Compute all the info required for storing - let info = match compute_info_uncompressed_keygen( - &sk, + let info = compute_info_uncompressed_keygen( + sk, &DSEP_PUBDATA_KEY, - &prep_id, + prep_id, req_id, &pub_key_set, - &eip712_domain, + eip712_domain, extra_data, - ) { - Ok(info) => info, - Err(e) => { - update_err_req_in_meta_store( - &mut meta_store.write().await, - req_id, - format!( - "Computation of meta data in standard key generation failed: {e}" - ), - op_tag, - ); - return; - } - }; - - let (integer_server_key, decompression_key, sns_key) = { - let ( - raw_server_key, - _raw_ksk_material, - _raw_compression_key, - raw_decompression_key, - raw_noise_squashing_key, - _raw_noise_squashing_compression_key, - _raw_rerandomization_key, - _raw_oprf_key, - _raw_tag, - ) = pub_key_set.server_key.clone().into_raw_parts(); - ( - raw_server_key, - raw_decompression_key, - raw_noise_squashing_key, - ) - }; - + ) + .map_err(|e| { + format!("Computation of meta data in standard key generation failed: {e}") + })?; + let (integer_server_key, _, _, decompression_key, sns_key, _, _, _, _) = + pub_key_set.server_key.clone().into_raw_parts(); let threshold_fhe_keys = ThresholdFheKeys::new( Arc::new(private_keys), PublicKeyMaterial::new_uncompressed( @@ -1448,156 +1619,79 @@ impl< ), info, ); - - //Note: We can't easily check here whether we succeeded writing to the meta store - //thus we can't increment the error counter if it fails - if let Err(e) = crypto_storage - .write_fhe_keys( - req_id, + Ok(PreparedThresholdKeys { + threshold_fhe_keys, + pub_key_set: PublicKeySet::Uncompressed(Arc::new(pub_key_set)), + migration_copy_target: None, + }) + } + ThresholdKeyGenResult::Compressed(compressed_keyset, private_keys) => { + let is_use_existing = matches!( + keyset_config.secret_key_config, + ddec_keyset_config::KeyGenSecretKeyConfig::UseExisting + ); + // When migrating from an existing keyset, load the OLD compact + // public key so signatures + stored bytes stay stable for + // clients that already cached it. For a fresh keygen we use + // the public key derived from the new compressed keyset. + let compact_public_key = if is_use_existing { + let existing_keyset_id = internal_keyset_config + .get_existing_keyset_id() + .expect("Standard UseExisting compressed keygen must have a validated keyset_added_info.existing_keyset_id"); + Self::read_existing_compact_public_key( + crypto_storage, + &existing_keyset_id, epoch_id, - threshold_fhe_keys, - PublicKeySet::Uncompressed(Arc::new(pub_key_set)), - meta_store, - op_tag, ) .await - { - tracing::error!("Failed to write threshold keys for request {req_id}: {e}"); - return; - } - } - ThresholdKeyGenResult::Compressed(compressed_keyset, private_keys) => { - // When migrating from an existing keyset (UseExisting), preserve the OLD - // CompactPublicKey so that signatures and stored bytes stay stable for - // clients that already hold it. For a fresh keygen, use the public key - // derived from the newly generated compressed keyset. - let compact_public_key = match existing_compact_pk { - Some(old_pk) => old_pk, - None => match compressed_keyset.decompress() { - Ok(ks) => ks.into_raw_parts().0, - Err(e) => { - update_err_req_in_meta_store( - &mut meta_store.write().await, - req_id, - format!( - "Failed to decompress freshly generated compressed keyset: {e}" - ), - op_tag, - ); - return; - } - }, + .map_err(|e| e.to_string())? + } else { + compressed_keyset + .decompress() + .map(|ks| ks.into_raw_parts().0) + .map_err(|e| { + format!("Failed to decompress freshly generated compressed keyset: {e}") + })? }; - - // Compute info for compressed keygen - let info = match compute_info_compressed_keygen( - &sk, + let info = compute_info_compressed_keygen( + sk, &DSEP_PUBDATA_KEY, - &prep_id, + prep_id, req_id, &compressed_keyset, &compact_public_key, - &eip712_domain, + eip712_domain, extra_data, - ) { - Ok(info) => info, - Err(e) => { - update_err_req_in_meta_store( - &mut meta_store.write().await, - req_id, - format!( - "Computation of meta data in standard compressed key generation failed: {e}" - ), - op_tag, - ); - return; - } - }; - + ) + .map_err(|e| { + format!( + "Computation of meta data in standard compressed key generation failed: {e}" + ) + })?; let threshold_fhe_keys = ThresholdFheKeys::new( Arc::new(private_keys), PublicKeyMaterial::new(compressed_keyset.clone()), info, ); - - // NOTE: when there is an existing compact pk from an older keygen (an older key ID), - // then this pk is effectively copied to the new key ID. - if let Err(e) = crypto_storage - .write_fhe_keys( - req_id, - epoch_id, - threshold_fhe_keys, - PublicKeySet::Compressed { - compact_public_key: Arc::new(compact_public_key), - compressed_keyset: Arc::new(compressed_keyset), - }, - Arc::clone(&meta_store), - op_tag, - ) - .await + let migration_copy_target = if is_use_existing + && internal_keyset_config.copy_compressed_key_to_original() { - tracing::error!( - "Failed to write compressed threshold keys for request {req_id}: {e}" - ); - return; - } - - // If requested, copy the compressed key to the original key ID. - // - // Note: at this point the *new* keygen has already been committed - // and its meta_store entry is Done — that part of the request - // succeeded. The copy is a follow-up on a *different* key id - // (old_key_id), so a failure here does not invalidate the - // new_key_id material. We log loudly so operators can detect - // partial success and retry the migration, but we do not try - // to mark the new keygen itself as failed. - // - // Even if this migration copy fails, continue so the successfully - // committed new key material at req_id is swept into the backup - // vault below. - if matches!( - keyset_config.secret_key_config, - ddec_keyset_config::KeyGenSecretKeyConfig::UseExisting - ) && internal_keyset_config.copy_compressed_key_to_original() - { - let old_key_id = internal_keyset_config + Some(internal_keyset_config .get_existing_keyset_id() - .expect("copy_compressed_key_to_original requires the validated UseExisting keyset_added_info.existing_keyset_id"); - // UseExisting reads the old private shares at the current - // epoch_id (see key_gen_from_existing_private_keyset), so - // the copy targets the same (old_key_id, epoch_id) pair. - if let Err(e) = crypto_storage - .copy_compressed_key_to_original( - req_id, - epoch_id, - &old_key_id, - epoch_id, - &sk, - &eip712_domain, - Arc::clone(&meta_store), - ) - .await - { - tracing::error!( - "Compressed keygen for {req_id} committed successfully, but the \ - follow-up copy to original key id {old_key_id} failed: {e}. \ - The new keys at {req_id} are valid; \ - the migration to {old_key_id} must be retried." - ); - } - } + .expect("copy_compressed_key_to_original requires the validated UseExisting keyset_added_info.existing_keyset_id")) + } else { + None + }; + Ok(PreparedThresholdKeys { + threshold_fhe_keys, + pub_key_set: PublicKeySet::Compressed { + compact_public_key: Arc::new(compact_public_key), + compressed_keyset: Arc::new(compressed_keyset), + }, + migration_copy_target, + }) } } - // Update the backup and handle potential failures by incrementing backup errors in the metrics - crypto_storage - .inner - .update_backup_vault(false, op_tag) - .await; - - tracing::info!( - "DKG protocol took {} ms to complete for request {req_id}", - start.elapsed().as_millis() - ); } /// Reads the tag from an existing keyset in public storage. @@ -1753,6 +1847,7 @@ mod tests { consts::{DEFAULT_EPOCH_ID, DEFAULT_MPC_CONTEXT, TEST_PARAM}, dummy_domain, engine::threshold::service::session::SessionMaker, + util::meta_store::update_ok_req_in_meta_store, vault::storage::ram, }; @@ -1863,11 +1958,18 @@ mod tests { )))), dkg_param: TEST_PARAM, }; - let mut guarded_prep_bucket = kg.preproc_buckets.write().await; - (*guarded_prep_bucket).insert(prep_id).unwrap(); - (*guarded_prep_bucket) - .update(prep_id, Ok(dummy_prep)) + let permit = add_req_to_meta_store(&kg.preproc_buckets, prep_id, OP_KEYGEN_REQUEST) + .await .unwrap(); + assert!( + update_ok_req_in_meta_store( + &kg.preproc_buckets, + permit, + dummy_prep, + OP_KEYGEN_REQUEST + ) + .await + ); } (prep_ids, kg) } @@ -2133,8 +2235,9 @@ mod tests { #[tokio::test] async fn use_existing_key_tag_with_wrong_keyset_id() { // When use_existing_key_tag is true but existing_keyset_id points to a - // non-existent key in storage, launch_dkg should fail with Internal - // because read_existing_key_tag cannot find any key material. + // non-existent key in storage, the request itself is accepted (Empty + // is returned); the failure is reported via the meta store and + // surfaces from get_result. let (prep_ids, kg) = setup_key_generator::< DroppingOnlineDistributedKeyGen128<{ ResiduePolyF4Z128::EXTENSION_DEGREE }>, >() @@ -2171,13 +2274,21 @@ mod tests { extra_data: vec![], }); - let res = kg.key_gen(request).await.unwrap_err(); - assert_eq!(res.code(), tonic::Code::Internal); + // The request itself is accepted: the existing-keyset read happens in + // the background after the response. + kg.key_gen(request).await.unwrap(); + // get_result returns the failure recorded in the meta store. + let res = kg + .get_result(tonic::Request::new(key_id.into())) + .await + .unwrap_err(); assert!( res.internal_err() .to_string() - .contains(ERR_FAILED_TO_READ_EXISTING_TAG) + .contains(ERR_FAILED_TO_READ_EXISTING_TAG), + "expected meta-store error to mention {ERR_FAILED_TO_READ_EXISTING_TAG}, got: {}", + res.internal_err() ); } diff --git a/core/service/src/engine/threshold/service/preprocessor.rs b/core/service/src/engine/threshold/service/preprocessor.rs index eb513d8b76..6e0516c2de 100644 --- a/core/service/src/engine/threshold/service/preprocessor.rs +++ b/core/service/src/engine/threshold/service/preprocessor.rs @@ -34,7 +34,6 @@ use tracing::Instrument; // === Internal Crate === use crate::{ anyhow_error_and_log, - consts::DURATION_WAITING_ON_PREPROC_RESULT_SECONDS, cryptography::signatures::PrivateSigKey, engine::{ base::{BaseKmsStruct, compute_external_signature_preprocessing}, @@ -46,7 +45,10 @@ use crate::{ validation::{RequestIdParsingErr, parse_grpc_request_id, validate_preproc_request}, }, util::{ - meta_store::{MetaStore, add_req_to_meta_store, retrieve_from_meta_store_with_timeout}, + meta_store::{ + MetaStore, MetaStorePermit, add_req_to_meta_store, retrieve_from_meta_store, + update_req_in_meta_store, + }, rate_limiter::RateLimiter, }, }; @@ -81,7 +83,8 @@ impl>> Rea extra_data: Vec, domain: &alloy_sol_types::Eip712Domain, timer: DurationGuard<'static>, - permit: OwnedSemaphorePermit, + rate_limiting_permit: OwnedSemaphorePermit, + meta_permit: MetaStorePermit, #[cfg(feature = "insecure")] percentage_offline: Option< kms_grpc::kms::v1::PartialKeyGenPreprocParams, >, @@ -107,7 +110,6 @@ impl>> Rea let factory = Arc::clone(&self.preproc_factory); let bucket_store = Arc::clone(&self.preproc_buckets); - let bucket_store_cancellation = Arc::clone(&self.preproc_buckets); let token = CancellationToken::new(); { @@ -129,48 +131,25 @@ impl>> Rea async move { // Keep timer in the async task, will drop at the end of the task let _timer = timer; - tokio::select! { - res = Self::preprocessing_background( - sk, - &request_id, - &domain_clone, - small_sessions, - bucket_store, - my_identity, - dkg_params, - keyset_config, - extra_data, - factory, - permit, - #[cfg(feature = "insecure")] percentage_offline - ) => { - match res { - Ok(()) => { - tracing::info!("Preprocessing of request {} exiting normally.", &request_id); - }, - Err(()) => { - MetricedError::handle_unreturnable_error( - OP_KEYGEN_PREPROC_REQUEST, - Some(request_id), - "Preprocessing background task failed".to_string(), - ); - } - } - // Remove cancellation token since generation is now done. - ongoing.lock().await.remove(&request_id); - }, - () = token.cancelled() => { - // NOTE: Any correlated randomness that was already generated should be cleaned up from Redis on drop. - tracing::error!("Preprocessing of request {} exiting before completion because of an abort request.", &request_id); - let mut guarded_bucket_store = bucket_store_cancellation.write().await; - let _ = guarded_bucket_store.update(&request_id, Result::Err("Preprocessing was aborted".to_string())); - MetricedError::handle_unreturnable_error( - OP_KEYGEN_PREPROC_REQUEST, - Some(request_id), - format!("Preprocessing background with preprocessing id {} failed since the task got aborted", request_id), - ); - }, - } + Self::preprocessing_background( + sk, + &request_id, + &domain_clone, + small_sessions, + bucket_store, + my_identity, + dkg_params, + keyset_config, + extra_data, + factory, + rate_limiting_permit, + meta_permit, + token, + #[cfg(feature = "insecure")] + percentage_offline, + ) + .await; + ongoing.lock().await.remove(&request_id); } .instrument(tracing::Span::current()), ); @@ -189,13 +168,15 @@ impl>> Rea keyset_config: ddec_keyset_config::KeySetConfig, extra_data: Vec, factory: Arc>>>, - permit: OwnedSemaphorePermit, + rate_limiting_permit: OwnedSemaphorePermit, + meta_permit: MetaStorePermit, + cancel_token: CancellationToken, #[cfg(feature = "insecure")] partial_params: Option< kms_grpc::kms::v1::PartialKeyGenPreprocParams, >, - ) -> Result<(), ()> { + ) { // dropped at the end of the function - let _permit = permit; + let _rate_limiting_permit = rate_limiting_permit; let preprocessing_started_at = Instant::now(); #[cfg(feature = "insecure")] @@ -245,15 +226,21 @@ impl>> Rea PreprocessingOrchestrator::::new(factory, params, keyset_config) }; - // Process the result of orchestration or orchestrator creation - let handle_update = match orchestrator_result { + // Process the result of orchestration or orchestrator creation. + // Only the long-running orchestration is cancellable; orchestrator + // construction is fast and synchronous, and the post-orchestration + // steps (external sig + meta store update) must commit atomically. + let handle_update: Result<_, String> = match orchestrator_result { Ok(orchestrator) => { tracing::info!("Starting Preproc Orchestration on P[{:?}]", own_identity); - // Execute the orchestration with the successfully created orchestrator - match orchestrator - .orchestrate_dkg_processing_small_session::

(sessions) - .await - { + let orchestration_outcome = tokio::select! { + biased; + () = cancel_token.cancelled() => Err("aborted".to_string()), + res = orchestrator + .orchestrate_dkg_processing_small_session::

(sessions) + => res.map_err(|e| e.to_string()), + }; + match orchestration_outcome { Ok((sessions, preproc_handle)) => { tracing::info!( "Preproc orchestration phase finished for request {} on P[{:?}] (elapsed: {:.1}s). Finalizing result...", @@ -265,7 +252,7 @@ impl>> Rea } Err(error) => { tracing::error!("Failed during preprocessing orchestration: {}", error); - Err(error.to_string()) + Err(error) } } } @@ -276,7 +263,7 @@ impl>> Rea }; #[cfg(feature = "insecure")] - let handle_update = { + let handle_update: Result<_, String> = { use threshold_execution::online::preprocessing::{ DKGPreprocessing, dummy::DummyPreprocessing, }; @@ -291,16 +278,20 @@ impl>> Rea own_identity, partial_params.percentage_offline, ); - let preproc = Box::new(DummyPreprocessing::new( - 0, - sessions.first().ok_or_else(|| { + match sessions.first() { + Some(first_session) => { + let preproc: Box> = + Box::new(DummyPreprocessing::new(0, first_session)); + Ok((sessions, Arc::new(Mutex::new(preproc)))) + } + None => { tracing::error!( "Could not retrieve any session after partial preprocessing" - ) - })?, - )); - let preproc: Box> = preproc; - Ok((sessions, Arc::new(Mutex::new(preproc)))) + ); + Err("Could not retrieve any session after partial preprocessing" + .to_string()) + } + } } else { tracing::debug!( "Preproc request {} on P[{:?}] keeping real preprocessing handle (partial={}%)", @@ -315,64 +306,63 @@ impl>> Rea } }; - tracing::debug!( - "Preproc request {} on P[{:?}] computing external signature", - req_id, - own_identity, - ); - let external_signature = - match compute_external_signature_preprocessing(&sk, req_id, domain, extra_data) { - Ok(sig) => sig, - Err(e) => { - tracing::error!("Failed to compute external signature: {}", e); - return Err(()); + // Build the final bucket result by combining the orchestration outcome + // with the external signature computation. + let bucket_result: Result = match handle_update { + Err(e) => Err(e), + Ok((_sessions, inner)) => { + tracing::debug!( + "Preproc request {} on P[{:?}] computing external signature", + req_id, + own_identity, + ); + match compute_external_signature_preprocessing(&sk, req_id, domain, extra_data) { + Ok(external_signature) => Ok(BucketMetaStore { + external_signature, + preprocessing_id: *req_id, + preprocessing_store: inner, + dkg_param: params, + }), + Err(e) => { + tracing::error!("Failed to compute external signature: {}", e); + Err(format!("Failed to compute external signature: {e}")) + } } - }; - - let mut guarded_meta_store = bucket_store.write().await; + } + }; - let handle_update = handle_update.map(|(_sessions, inner)| inner); - // We cannot do much if updating the storage fails at this point... - let meta_store_write = guarded_meta_store.update( - req_id, - handle_update.clone().map(|inner| BucketMetaStore { - external_signature, - preprocessing_id: *req_id, - preprocessing_store: inner, - dkg_param: params, - }), - ); + // Consume the meta-store permit in exactly one terminal-state write. + let bucket_result_ok = bucket_result.is_ok(); + let meta_store_ok = update_req_in_meta_store::<_, String>( + &bucket_store, + meta_permit, + bucket_result, + OP_KEYGEN_PREPROC_REQUEST, + ) + .await; - // Log completion status - match (handle_update, meta_store_write) { - (Ok(_), Ok(_)) => tracing::info!( + if bucket_result_ok && meta_store_ok { + tracing::info!( "Preproc Finished Successfully P[{:?}] for request {} (total elapsed: {:.1}s)", own_identity, req_id, preprocessing_started_at.elapsed().as_secs_f64(), - ), - (Err(e), _) => { - tracing::error!( - "Preproc Failed P[{:?}] for request {} after {:.1}s with error: {}", - own_identity, - req_id, - preprocessing_started_at.elapsed().as_secs_f64(), - e - ); - return Err(()); - } - (_, Err(e)) => { - tracing::error!( - "Preproc Failed due to meta store issue P[{:?}] for request {} after {:.1}s with error: {}", - own_identity, - req_id, - preprocessing_started_at.elapsed().as_secs_f64(), - e - ); - return Err(()); - } + ); + } else if !bucket_result_ok { + tracing::error!( + "Preproc Failed P[{:?}] for request {} after {:.1}s", + own_identity, + req_id, + preprocessing_started_at.elapsed().as_secs_f64(), + ); + } else { + tracing::error!( + "Preproc meta store update failed P[{:?}] for request {} after {:.1}s", + own_identity, + req_id, + preprocessing_started_at.elapsed().as_secs_f64(), + ); } - Ok(()) } async fn inner_key_gen_preproc( @@ -382,7 +372,7 @@ impl>> Rea kms_grpc::kms::v1::PartialKeyGenPreprocParams, >, ) -> Result, MetricedError> { - let permit = self.rate_limiter.start_preproc().await?; + let rate_limiting_permit = self.rate_limiter.start_preproc().await?; let mut timer = METRICS.time_operation(OP_KEYGEN_PREPROC_REQUEST).start(); let ( @@ -405,12 +395,13 @@ impl>> Rea let metric_tags = vec![(TAG_PARTY_ID, my_role.to_string())]; timer.tags(metric_tags); - // Add preprocessing to metastore and fail in case it is already present - add_req_to_meta_store( - &mut self.preproc_buckets.write().await, + // Add preprocessing to metastore and fail in case it is already present. + let meta_permit = add_req_to_meta_store( + &self.preproc_buckets, &request_id, OP_KEYGEN_PREPROC_REQUEST, - )?; + ) + .await?; tracing::info!("Starting preproc generation for Request ID {}", request_id); @@ -423,7 +414,8 @@ impl>> Rea extra_data, &eip712_domain, timer, - permit, + rate_limiting_permit, + meta_permit, #[cfg(feature = "insecure")] partial_params ).await.map_err(|e| MetricedError::new(OP_KEYGEN_PREPROC_REQUEST, @@ -484,19 +476,11 @@ impl> + Se ) })?; - tracing::info!( - "Polling preproc result for request {} (waiting up to {}s)", - request_id, - DURATION_WAITING_ON_PREPROC_RESULT_SECONDS, - ); + tracing::info!("Reading preproc result for request {}", request_id); - let preproc_data = retrieve_from_meta_store_with_timeout( - self.preproc_buckets.read().await, - &request_id, - OP_KEYGEN_PREPROC_RESULT, - DURATION_WAITING_ON_PREPROC_RESULT_SECONDS, - ) - .await?; + let preproc_data = + retrieve_from_meta_store(&self.preproc_buckets, &request_id, OP_KEYGEN_PREPROC_RESULT) + .await?; tracing::info!("Preproc result ready for request {}", request_id); @@ -515,7 +499,7 @@ impl> + Se Ok(Response::new(KeyGenPreprocResult { preprocessing_id: Some(request_id.into()), - external_signature: preproc_data.external_signature, + external_signature: preproc_data.external_signature.clone(), })) } diff --git a/core/service/src/engine/threshold/service/public_decryptor.rs b/core/service/src/engine/threshold/service/public_decryptor.rs index cb17d640f9..f6985d63b5 100644 --- a/core/service/src/engine/threshold/service/public_decryptor.rs +++ b/core/service/src/engine/threshold/service/public_decryptor.rs @@ -318,11 +318,9 @@ impl< // So we need to update it everytime something bad happens, // or put all the code that may error before the first write to the meta-store, // otherwise it'll be in the "Started" state forever. - add_req_to_meta_store( - &mut self.pub_dec_meta_store.write().await, - &req_id, - OP_PUBLIC_DECRYPT_REQUEST, - )?; + let meta_permit = + add_req_to_meta_store(&self.pub_dec_meta_store, &req_id, OP_PUBLIC_DECRYPT_REQUEST) + .await?; let ext_handles_bytes = ciphertexts .iter() @@ -541,6 +539,7 @@ impl< // it when decryptions are available let _timer = timer; // NOTE: _permit should be dropped at the end of this function + let mut meta_permit = Some(meta_permit); let mut decs = HashMap::new(); // Collect all results first, without holding any locks @@ -561,11 +560,14 @@ impl< } }; let _ = update_err_req_in_meta_store( - &mut meta_store.write().await, - &req_id, + &meta_store, + meta_permit + .take() //todo why this? + .expect("permit must still be present on first error"), err_msg, OP_PUBLIC_DECRYPT_INNER, - ); + ) + .await; return; } // All the inner decrypts succeeded ok... @@ -600,11 +602,14 @@ impl< }; update_req_in_meta_store( - &mut meta_store.write().await, - &req_id, + &meta_store, + meta_permit + .take() + .expect("permit must still be present on success path"), res, OP_PUBLIC_DECRYPT_REQUEST, - ); + ) + .await; }; // Increment the error counter if ever the task fails self.tracker.spawn(async move { @@ -633,13 +638,13 @@ impl< ) })?; - let (retrieved_req_id, plaintexts, external_signature, extra_data) = - retrieve_from_meta_store( - self.pub_dec_meta_store.read().await, - &request_id, - OP_PUBLIC_DECRYPT_RESULT, - ) - .await?; + let arc = retrieve_from_meta_store( + &self.pub_dec_meta_store, + &request_id, + OP_PUBLIC_DECRYPT_RESULT, + ) + .await?; + let (retrieved_req_id, plaintexts, external_signature, extra_data) = (*arc).clone(); if request_id != retrieved_req_id { return Err(MetricedError::new( @@ -859,8 +864,7 @@ mod tests { .unwrap(); let mut key_store = MetaStore::new_unlimited(); - key_store.insert(&key_id).unwrap(); - // key_store.update(&key_id, Ok(info.clone())).unwrap(); + let meta_store_permit = key_store.insert(&key_id).unwrap(); let key_meta_store = Arc::new(RwLock::new(key_store)); let public_decryptor = RealPublicDecryptor::init_test_dummy_decryptor( @@ -877,6 +881,7 @@ mod tests { threshold_fhe_keys, PublicKeySet::Uncompressed(Arc::new(fhe_key_set)), Arc::clone(&key_meta_store), + meta_store_permit, "", ) .await diff --git a/core/service/src/engine/threshold/service/user_decryptor.rs b/core/service/src/engine/threshold/service/user_decryptor.rs index 9b6f490097..740aa64a01 100644 --- a/core/service/src/engine/threshold/service/user_decryptor.rs +++ b/core/service/src/engine/threshold/service/user_decryptor.rs @@ -495,11 +495,8 @@ impl< // So we need to update it everytime something bad happens, // or put all the code that may error before the first write to the meta-store, // otherwise it'll be in the "Started" state forever. - add_req_to_meta_store( - &mut meta_store.write().await, - &req_id, - OP_USER_DECRYPT_REQUEST, - )?; + let meta_permit = + add_req_to_meta_store(&meta_store, &req_id, OP_USER_DECRYPT_REQUEST).await?; let sk = (*self.base_kms.sig_key().map_err(|e| { MetricedError::new( @@ -548,6 +545,7 @@ impl< let inner_dec_future = move |_permit| async move { // Capture the timer, it is stopped when it's dropped let _timer = timer; + let meta_permit = meta_permit; let result = Self::inner_user_decrypt( &req_id, @@ -566,12 +564,8 @@ impl< metric_tags, ) .await; - update_req_in_meta_store( - &mut meta_store.write().await, - &req_id, - result, - OP_USER_DECRYPT_REQUEST, - ); + update_req_in_meta_store(&meta_store, meta_permit, result, OP_USER_DECRYPT_REQUEST) + .await; }; self.tracker.spawn(async move { // Ignore the result since this is a background thread. @@ -598,12 +592,13 @@ impl< })?; // Retrieve the UserDecryptMetaStore object - let (payload, external_signature, extra_data) = retrieve_from_meta_store( - self.user_decrypt_meta_store.read().await, + let arc = retrieve_from_meta_store( + &self.user_decrypt_meta_store, &request_id, OP_USER_DECRYPT_RESULT, ) .await?; + let (payload, external_signature, extra_data) = (*arc).clone(); let sig_payload_vec = bc2wrap::serialize(&payload).map_err(|e| { MetricedError::new( @@ -752,7 +747,7 @@ mod tests { let key_id = RequestId::new_random(rng); let mut key_store = MetaStore::new_unlimited(); - key_store.insert(&key_id).unwrap(); + let meta_store_permit = key_store.insert(&key_id).unwrap(); let key_meta_store = Arc::new(RwLock::new(key_store)); let user_decryptor = @@ -782,6 +777,7 @@ mod tests { threshold_fhe_keys, PublicKeySet::Uncompressed(Arc::new(fhe_key_set)), Arc::clone(&key_meta_store), + meta_store_permit, "", ) .await diff --git a/core/service/src/grpc/metastore_status_service.rs b/core/service/src/grpc/metastore_status_service.rs index ca5240fa85..30379d59fe 100644 --- a/core/service/src/grpc/metastore_status_service.rs +++ b/core/service/src/grpc/metastore_status_service.rs @@ -22,7 +22,7 @@ use crate::{ base::{CrsGenMetadata, KeyGenMetadata, PubDecCallValues, UserDecryptCallValues}, threshold::service::BucketMetaStore, }, - util::meta_store::MetaStore, + util::meta_store::{EntryState, MetaStore}, }; use kms_grpc::{ kms::v1::Empty, @@ -169,9 +169,22 @@ impl MetaStoreStatusServiceImpl { store_type ); - let cell_data = store_guard.get_cell(internal_id).map(|cell| cell.try_get()); - - data.push((original_id.clone(), *internal_id, cell_data)); + let entry_status: Option<(RequestProcessingStatus, Option)> = + match store_guard.retrieve(internal_id) { + Some(EntryState::Done(Ok(_))) => { + Some((RequestProcessingStatus::Completed, None)) + } + Some(EntryState::Done(Err(err))) => { + Some((RequestProcessingStatus::Failed, Some(err))) + } + Some(EntryState::Pending) => { + Some((RequestProcessingStatus::Processing, None)) + } + Some(EntryState::Deleted) => Some((RequestProcessingStatus::Deleted, None)), + None => None, + }; + + data.push((original_id.clone(), *internal_id, entry_status)); } data @@ -179,43 +192,21 @@ impl MetaStoreStatusServiceImpl { // Process results without holding lock (expensive operations) let mut statuses = Vec::new(); - for (original_id, internal_id, cell_data) in request_data { - let (status, error_message) = match cell_data { - Some(Some(Ok(_))) => { - tracing::debug!( - "Request {} in {:?} store has COMPLETED status", - internal_id, - store_type - ); - (RequestProcessingStatus::Completed, None) - } - Some(Some(Err(err))) => { - tracing::debug!( - "Request {} in {:?} store has FAILED status: {}", - internal_id, - store_type, - err - ); - (RequestProcessingStatus::Failed, Some(err.clone())) - } - Some(None) => { - tracing::debug!( - "Request {} in {:?} store has PROCESSING status", - internal_id, - store_type - ); - (RequestProcessingStatus::Processing, None) - } - None => { - tracing::debug!( - "Request {} not found in {:?} store", - internal_id, - store_type - ); - // Continue to search other stores - continue; - } + for (original_id, internal_id, entry_status) in request_data { + let Some((status, error_message)) = entry_status else { + tracing::debug!( + "Request {} not found in {:?} store", + internal_id, + store_type + ); + continue; }; + tracing::debug!( + "Request {} in {:?} store has status {:?}", + internal_id, + store_type, + status + ); statuses.push(RequestStatusInfo { request_id: original_id, @@ -268,6 +259,7 @@ impl MetaStoreStatusServiceImpl { Some(RequestProcessingStatus::Completed) => store_guard.get_completed_request_ids(), Some(RequestProcessingStatus::Failed) => store_guard.get_failed_request_ids(), Some(RequestProcessingStatus::Any) | None => store_guard.get_all_request_ids(), + Some(RequestProcessingStatus::Deleted) => store_guard.get_deleted_request_ids(), }; // Handle pagination @@ -322,62 +314,40 @@ impl MetaStoreStatusServiceImpl { }; // Batch collect all request data while holding lock once - let mut request_data = Vec::new(); + let mut request_data: Vec<(_, (RequestProcessingStatus, Option))> = Vec::new(); for request_id in paginated_ids { - if let Some(cell) = store_guard.retrieve(request_id) { - let status_result = cell.try_get(); - request_data.push((*request_id, Some(status_result))); - } else { - request_data.push((*request_id, None)); - } - } - drop(store_guard); // Explicitly release the read lock - - // Convert to RequestStatusInfo with enhanced status detection (without holding lock) - let mut requests = Vec::new(); - let total_request_count = request_ids.len(); - for (request_id, cell_data) in request_data { - let (status, error_message) = match cell_data { - Some(Some(Ok(_))) => { - tracing::debug!( - "Request {} in {:?} store has COMPLETED status", - request_id, - store_type - ); - (RequestProcessingStatus::Completed, None) - } - Some(Some(Err(err))) => { - tracing::debug!( - "Request {} in {:?} store has FAILED status: {}", - request_id, - store_type, - err - ); - (RequestProcessingStatus::Failed, Some(err)) - } - Some(None) => { - tracing::debug!( - "Request {} in {:?} store has PROCESSING status", - request_id, - store_type - ); - (RequestProcessingStatus::Processing, None) - } + let status_pair = match store_guard.retrieve(request_id) { + Some(EntryState::Done(Ok(_))) => (RequestProcessingStatus::Completed, None), + Some(EntryState::Done(Err(err))) => (RequestProcessingStatus::Failed, Some(err)), + Some(EntryState::Pending) => (RequestProcessingStatus::Processing, None), + Some(EntryState::Deleted) => (RequestProcessingStatus::Deleted, None), None => { // INVARIANT VIOLATION: Request ID from store's own collection is not retrievable - // This indicates data corruption or race condition tracing::error!( "INVARIANT VIOLATION: Request {} found in {:?} store ID list but not retrievable - data corruption detected", request_id, store_type ); - // Mark as FAILED since this represents a system error ( RequestProcessingStatus::Failed, Some("Internal error: Request data corrupted".to_string()), ) } }; + request_data.push((*request_id, status_pair)); + } + drop(store_guard); // Explicitly release the read lock + + // Convert to RequestStatusInfo with enhanced status detection (without holding lock) + let mut requests = Vec::new(); + let total_request_count = request_ids.len(); + for (request_id, (status, error_message)) in request_data { + tracing::debug!( + "Request {} in {:?} store has status {:?}", + request_id, + store_type, + status + ); requests.push(RequestStatusInfo { request_id: request_id.to_string(), diff --git a/core/service/src/util/meta_store.rs b/core/service/src/util/meta_store.rs index 8293996f32..4e9a3443e1 100644 --- a/core/service/src/util/meta_store.rs +++ b/core/service/src/util/meta_store.rs @@ -1,38 +1,118 @@ -use crate::{anyhow_error_and_log, consts::DURATION_WAITING_ON_RESULT_SECONDS, some_or_err}; -use anyhow::bail; -use async_cell::sync::AsyncCell; +use crate::{anyhow_error_and_log, some_or_err}; use kms_grpc::RequestId; use std::{ collections::{HashMap, VecDeque}, sync::Arc, }; -#[cfg(feature = "non-wasm")] -use tokio::sync::RwLockReadGuard; use tracing; cfg_if::cfg_if! { if #[cfg(feature = "non-wasm")] { + use crate::consts::DURATION_WAITING_ON_RESULT_SECONDS; use crate::engine::utils::MetricedError; use anyhow::anyhow; use std::fmt::{self}; - use tokio::sync::{RwLock, RwLockWriteGuard}; + use std::time::Duration; + use tokio::sync::RwLock; + use tokio::time::Instant; + } +} + +/// Cadence at which `retrieve_from_meta_store` re-checks a pending entry while +/// waiting for it to complete. Picked to keep worst-case lock acquisitions +/// bounded (60s / 250ms = 240 reads) while keeping latency-after-completion +/// well under half a second. +#[cfg(feature = "non-wasm")] +const META_STORE_POLL_INTERVAL: Duration = Duration::from_millis(250); + +/// Token proving the holder is the rightful updater of a single MetaStore entry. +/// +/// Returned by [`MetaStore::insert`]. Required by [`MetaStore::update`] and +/// [`MetaStore::delete`] to perform the mutation. If the holder drops the +/// permit without consuming it, the entry is left in `Pending`, and any +/// other caller can finish or remove it via [`MetaStore::try_delete`]. +/// +/// "Permit alive" is tracked by the strong count of an internal `Arc<()>`: +/// while the permit exists, `Arc::strong_count` of the entry's claim arc is +/// at least 2 (one held by the entry, one by the permit). `try_*` paths +/// check `strong_count == 1` to determine that no permit is outstanding. +pub struct MetaStorePermit { + req_id: RequestId, + _claim: Arc<()>, +} + +impl MetaStorePermit { + pub fn req_id(&self) -> &RequestId { + &self.req_id + } +} + +impl Drop for MetaStorePermit { + fn drop(&mut self) { + tracing::debug!( + "MetaStorePermit for request ID {} dropped; claim released", + self.req_id + ); + } +} + +/// Public-facing snapshot of an entry's state. +pub enum EntryState { + /// Entry exists, but is being worked on (either being computed or being deleted) + Pending, + /// Entry finished processing with either a success or error result. + Done(Result, String>), + /// The entry has been deleted. + Deleted, +} + +enum StoredEntry { + Pending(Arc<()>), + Done(Result, String>, Arc<()>), + Deleted, +} + +impl StoredEntry { + fn done(value: Result, String>) -> Self { + StoredEntry::Done(value, Arc::new(())) + } +} + +impl From> for EntryState { + fn from(stored: StoredEntry) -> Self { + match stored { + StoredEntry::Pending(_claim) => EntryState::Pending, + StoredEntry::Done(res, _claim) => EntryState::Done(res), + StoredEntry::Deleted => EntryState::Deleted, + } + } +} + +impl From<&StoredEntry> for EntryState { + fn from(stored: &StoredEntry) -> Self { + match stored { + StoredEntry::Pending(_) => EntryState::Pending, + StoredEntry::Done(Ok(arc), _) => EntryState::Done(Ok(Arc::clone(arc))), + StoredEntry::Done(Err(e), _) => EntryState::Done(Err(e.clone())), + StoredEntry::Deleted => EntryState::Deleted, + } } } -/// Data structure that stores elements that are being processed and their status (Started, Done, Error). -/// It holds elements up to a given capacity, and once it is full, it will remove old elements that have status [Done]/[Error], if there are sufficiently many. +/// Data structure that stores elements that are being processed and their status (Pending, Done, Deleted). +/// It holds elements up to a given capacity, and once it is full, it will remove old elements that have status [Done], if there are sufficiently many. pub struct MetaStore { // The maximum amount of entries in total (finished and unfinished) capacity: usize, // The minimum amount of entries that should be kept in the cache after completion and before old ones are evicted min_cache: usize, // Storage of all elements in the system - storage: HashMap>>>, + storage: HashMap>, // Queue of all elements that have been completed complete_queue: VecDeque, } -impl MetaStore { +impl MetaStore { /// Creates a new MetaStore with a given capacity and minimal cache size. /// In more detail, this means that the MetaStore will be able to hold [capacity] of total elements, /// of which we can be sure that at least [min_cache] elements are kept in the cache after completion @@ -57,14 +137,14 @@ impl MetaStore { } } - // Creates a MetaStore with unlimited storage capacity and minimum cache size and populates it with the given map + /// Creates a MetaStore with unlimited storage capacity and minimum cache size and populates it with the given map pub fn new_from_map(map: HashMap) -> Self { let mut completed_queue = VecDeque::new(); let storage = map .into_iter() .map(|(key, value)| { completed_queue.push_back(key); - (key, Arc::new(AsyncCell::new_with(Ok(value)))) + (key, StoredEntry::done(Ok(Arc::new(value)))) }) .collect(); @@ -80,12 +160,6 @@ impl MetaStore { self.storage.contains_key(request_id) } - /// Get cell reference for status checking without cloning Arc - /// Returns borrowed reference to avoid Arc::clone() overhead - pub fn get_cell(&self, request_id: &RequestId) -> Option<&Arc>>> { - self.storage.get(request_id) - } - /// Verify the invariant that storage.len() >= complete_queue.len() /// This is critical for preventing underflow in get_processing_count() /// Logs error if invariant is violated but does not panic @@ -105,35 +179,37 @@ impl MetaStore { // fails, which means that the queue is empty, and thus we do not have any non-local effects. #[allow(unknown_lints)] #[allow(non_local_effect_before_error_return)] - /// Insert a new element, throwing an exception if the element already exists or if the system is fully loaded. + /// Insert a new element, throwing an error if the element already exists or if the system is fully loaded. /// - /// Elements can trivially be inserted until the store reaches its [capacity]. - /// Once the store is full, we will remove old elements that have completed, but only once we have at least [min_cache] elements of them. - /// This is to ensure that: - /// 1. there are never more than [capacity] - [min_cache] elements currently being processed (id not in `complete_queue`) and - /// 2. there is enough time to retrieve an element before it is removed. This timespan is the time it takes to process [min_cache] elements. + /// On success, returns a [`MetaStorePermit`] granting the caller the right to + /// later [`update`] or [`delete`] this entry. Hold the permit until the + /// mutation; drop it if the work is abandoned (other callers can recover via + /// `try_delete`). /// - /// If the store is at max capacity and not enough elements have been completed, we will not accept new elements to be inserted. - pub fn insert(&mut self, request_id: &RequestId) -> anyhow::Result<()> { - if self.exists(request_id) { + /// Elements can trivially be inserted until the store reaches its [capacity]. + /// Once the store is full, old completed elements are evicted, but only once we have at least [min_cache] of them. + /// This ensures: + /// 1. there are never more than [capacity] - [min_cache] elements currently being processed, and + /// 2. there is enough time to retrieve an element before it is removed. + pub fn insert(&mut self, request_id: &RequestId) -> anyhow::Result { + // `Deleted` is a permanent tombstone: a request id, once deleted, may + // not be reused for a fresh insert. Callers that need to overwrite an + // existing entry should use [`replace_done`] instead. + if self.storage.contains_key(request_id) { return Err(anyhow::anyhow!( "The element with ID {request_id} already stored exists. Can not insert it more than once.", )); } if self.storage.len() >= self.capacity { - // We have reached the capacity limit. Delete an old element. if self.complete_queue.len() <= self.min_cache { return Err(anyhow_error_and_log( "The system is fully loaded and the cache of finished elements is not at minimum size yet. Cannot insert new element.", )); } else { - // Remove the first (oldest) element from the age queue let old_request_id = some_or_err( self.complete_queue.pop_front(), "Could not remove an old request from the cache".to_string(), )?; - // and also remove it from the storage map - // If storage removal fails, we need to restore the complete_queue to maintain invariant if self.storage.remove(&old_request_id).is_none() { self.complete_queue.push_front(old_request_id); return Err(anyhow_error_and_log(format!( @@ -142,78 +218,182 @@ impl MetaStore { } } } - // Ignore the result since we have already checked that the element does not exist - let cell = AsyncCell::shared(); - let _ = self.storage.insert(request_id.to_owned(), cell); - - Ok(()) + let claim = Arc::new(()); + let permit = MetaStorePermit { + req_id: *request_id, + _claim: Arc::clone(&claim), + }; + self.storage + .insert(*request_id, StoredEntry::Pending(claim)); + Ok(permit) } - /// Sets the value of an already existing element. Returns an error if something goes wrong, like - /// the element does not exist or the value was already set. - pub fn update( - &mut self, - request_id: &RequestId, - update: Result, - ) -> anyhow::Result<()> { - let cell = if let Some(cell) = self.storage.get(request_id) { - cell - } else { - return Err(anyhow_error_and_log(format!( - "The element with ID {request_id} does not exist, update is not allowed" - ))); + pub fn lock_entry(&mut self, request_id: &RequestId) -> anyhow::Result { + let claim = { + let entry = self.storage.get(request_id).ok_or_else(|| { + anyhow_error_and_log(format!( + "The element with ID {request_id} does not exist, locking is not allowed" + )) + })?; + match entry { + StoredEntry::Pending(arc) => { + if Arc::strong_count(arc) > 1 { + anyhow::bail!( + "The element with ID {request_id} is currently already locked" + ); + } + Arc::clone(arc) + } + StoredEntry::Done(_, arc) => { + if Arc::strong_count(arc) > 1 { + anyhow::bail!( + "The element with ID {request_id} is currently already locked" + ); + } + Arc::clone(arc) + } + StoredEntry::Deleted => { + return Err(anyhow_error_and_log(format!( + "The element with ID {request_id} has been deleted, locking is not allowed" + ))); + } + } }; + Ok(MetaStorePermit { + req_id: *request_id, + _claim: claim, + }) + } - // We only allow setting the result once - if cell.is_set() { + /// Sets the value of an already existing element. Consumes the permit. + /// + /// Enforces the state transitions: + /// - Pending -> Done(Ok) + /// - Pending -> Done(Err) + /// + /// Returns an error if the element does not exist, has already been + /// completed, or has been deleted. + fn update(&mut self, update: Result, permit: MetaStorePermit) -> anyhow::Result<()> { + let req_id = permit.req_id; + let cell = self.storage.get_mut(&req_id).ok_or_else(|| { + anyhow_error_and_log(format!( + "The element with ID {req_id} does not exist, update is not allowed" + )) + })?; + if !matches!(cell, StoredEntry::Pending(_)) { return Err(anyhow_error_and_log(format!( - "The element with ID {request_id} is already done, update is not allowed" + "The element with ID {req_id} is not in a pending state, update is not allowed" ))); } + *cell = StoredEntry::done(update.map(Arc::new)); + self.complete_queue.push_back(req_id); + // `permit` (and its Arc<()>) dropped at end of scope. + Ok(()) + } - cell.set(update); - self.complete_queue.push_back(*request_id); - + /// Set the entry at `request_id` directly to `Done(Ok(value))`, either by + /// overwriting an existing `Done` entry or by inserting a fresh one if no + /// entry exists. Returns an error if the entry is currently `Pending` + /// (a live operation is in progress) or `Deleted` (tombstone). + /// + /// Intended for migration / admin flows that rewrite the metadata of an + /// already-completed entry — or seed a fresh one — without going through + /// the standard `insert → update` lifecycle. Bypasses the permit + /// mechanism on purpose: no live permit can exist for a `Done` entry, and + /// fresh inserts via this path are explicitly admin-scoped. + pub fn replace_done(&mut self, permit: MetaStorePermit, value: T) -> anyhow::Result<()> { + let request_id = permit.req_id; + match self.storage.get(&request_id) { + Some(StoredEntry::Pending(_)) => { + return Err(anyhow_error_and_log(format!( + "The element with ID {request_id} is currently pending, replace_done is not allowed" + ))); + } + Some(StoredEntry::Deleted) => { + return Err(anyhow_error_and_log(format!( + "The element with ID {request_id} has been deleted, replace_done is not allowed" + ))); + } + Some(StoredEntry::Done(_, claim)) => { + if Arc::strong_count(claim) > 1 { + return Err(anyhow_error_and_log(format!( + "The element with ID {request_id} is currently locked, replace_done is not allowed" + ))); + } + // Existing Done entry — overwrite in place, keep complete_queue slot. + self.storage + .insert(request_id, StoredEntry::done(Ok(Arc::new(value)))); + } + None => { + // Fresh insert — record completion. + self.storage + .insert(request_id, StoredEntry::done(Ok(Arc::new(value)))); + self.complete_queue.push_back(request_id); + } + } Ok(()) } - /// Retrieve the cell of an element and return None if it does not exist - pub fn retrieve(&self, request_id: &RequestId) -> Option>>> { - self.storage.get(request_id).cloned() - } - - /// Deletes an element from the meta store and returns the value. - /// Warning: This is a slow operation if the request_id has been completed - /// and should be avoided if possible, since values are automatically removed when running out of space - pub fn delete(&mut self, request_id: &RequestId) -> Option>>> { - match self.storage.remove(request_id) { - Some(handle) => { - // If the cell is set, it means the task has been processed - // and thus added to the complete queue - if handle.is_set() { - let mut found = false; - self.complete_queue.retain(|id| { - if id == request_id { - found = true; - false // Remove this item - } else { - true // Keep this item - } - }); - - // If we couldn't find it in complete_queue but it was completed, - // this indicates a potential invariant violation that we should log - if !found { - tracing::error!( - "INVARIANT VIOLATION: Completed item {request_id} not found in complete_queue during delete - data corruption detected" + /// Retrieve the state of an element and return None if it does not exist. + /// + /// Returns an [`EntryState`] snapshot by value; the internal claim arc on + /// `Pending` / `Done` is intentionally hidden from external callers, who + /// should not depend on locking state. + pub(crate) fn retrieve(&self, request_id: &RequestId) -> Option> { + self.storage.get(request_id).map(EntryState::from) + } + + /// Mark an existing entry as deleted, regardless of whether it was Pending + /// or Done. Consumes the permit. Returns the previous state. If the previous + /// state was `Done`, the entry is also removed from the completion queue. + fn delete(&mut self, permit: MetaStorePermit) -> anyhow::Result> { + let req_id = permit.req_id; + let cell = self.storage.get_mut(&req_id).ok_or_else(|| { + anyhow::anyhow!("The element with ID {req_id} does not exist, deletion is not allowed") + })?; + if matches!(cell, StoredEntry::Deleted) { + anyhow::bail!("The element with ID {req_id} has already been deleted"); + } + // TODO don't use mem replace + let old = std::mem::replace(cell, StoredEntry::Deleted); + if matches!(old, StoredEntry::Done(_, _),) { + self.complete_queue.retain(|id| id != &req_id); + } + Ok(old.into()) + } + + /// Like [`delete`], but for callers that do not hold a permit. Succeeds + /// for any non-Deleted state when no live permit is outstanding. Returns + /// the previous state. + pub fn try_delete(&mut self, request_id: &RequestId) -> anyhow::Result> { + { + let entry = self.storage.get(request_id).ok_or_else(|| { + anyhow_error_and_log(format!( + "The element with ID {request_id} does not exist, deletion is not allowed" + )) + })?; + match entry { + StoredEntry::Pending(arc) => { + if Arc::strong_count(arc) > 1 { + anyhow::bail!( + "The element with ID {request_id} is currently locked and cannot be deleted" ); } } - - Some(handle) + StoredEntry::Done(_, _) => { /* no permit possible on Done */ } + StoredEntry::Deleted => { + anyhow::bail!("The element with ID {request_id} has already been deleted"); + } } - None => None, } + // Safe: we just verified the entry exists and is not Deleted. + let cell = self.storage.get_mut(request_id).unwrap(); + // TODO ensure we can just clone the old entry + let old = std::mem::replace(cell, StoredEntry::Deleted); + if matches!(old, StoredEntry::Done(_, _)) { + self.complete_queue.retain(|id| id != request_id); + } + Ok(old.into()) } /// Get the maximum capacity of this MetaStore @@ -238,10 +418,7 @@ impl MetaStore { /// Get the number of items currently being processed pub fn get_processing_count(&self) -> usize { - // Non-fallible invariant check that logs errors but doesn't panic self.verify_invariant(); - - // Ensure invariant: storage.len() >= complete_queue.len() to prevent underflow self.storage.len().saturating_sub(self.complete_queue.len()) } @@ -265,120 +442,100 @@ impl MetaStore { } /// Get failed request IDs (completed with errors) + /// WARNING: This is a slow operation pub fn get_failed_request_ids(&self) -> Vec { self.complete_queue .iter() - .filter_map(|id| { - // Direct HashMap access - items in complete_queue should exist in storage - match self.storage.get(id) { - Some(cell) => { - if let Some(Err(_)) = cell.try_get() { - Some(*id) - } else { - None - } - } - None => { - // This should never happen - indicates invariant violation - tracing::error!("INVARIANT VIOLATION: Completed item {id} not found in storage - data corruption detected"); - None - } + .filter_map(|id| match self.storage.get(id) { + Some(StoredEntry::Done(Err(_), _)) => Some(*id), + Some(_) => None, + None => { + tracing::error!("INVARIANT VIOLATION: Completed item {id} not found in storage - data corruption detected"); + None } }) .collect() } + + /// Get deleted request IDs (requests that have been deleted) + /// WARNING: This is a slow operation + pub fn get_deleted_request_ids(&self) -> Vec { + self.storage + .iter() + .filter_map(|(id, state)| match state { + StoredEntry::Deleted => Some(*id), + _ => None, + }) + .collect() + } } #[cfg(feature = "non-wasm")] -pub(crate) fn add_req_to_meta_store( - meta_store: &mut RwLockWriteGuard<'_, MetaStore>, +pub(crate) async fn add_req_to_meta_store( + meta_store: &Arc>>, req_id: &RequestId, request_metric: &'static str, -) -> Result<(), MetricedError> { - if meta_store.exists(req_id) { - return Err(MetricedError::new( - request_metric, - Some(*req_id), - anyhow::anyhow!("Duplicate request ID in meta store"), - tonic::Code::AlreadyExists, - )); - } - meta_store.insert(req_id).map_err(|e| { +) -> Result { + meta_store.write().await.insert(req_id).map_err(|e| { // We likely reached capacity here + // TODO update MetricedError::new(request_metric, Some(*req_id), e, tonic::Code::Aborted) - })?; - Ok(()) + }) } #[cfg(feature = "non-wasm")] -pub(crate) fn update_req_in_meta_store< - T: Clone, +pub(crate) async fn update_req_in_meta_store< + T, E: Into> + fmt::Debug, >( - meta_store: &mut RwLockWriteGuard<'_, MetaStore>, - req_id: &RequestId, + meta_store: &Arc>>, + permit: MetaStorePermit, result: Result, request_metric: &'static str, ) -> bool { match result { - Ok(res) => update_ok_req_in_meta_store(meta_store, req_id, res, request_metric), - Result::Err(e) => { - update_err_req_in_meta_store(meta_store, req_id, format!("{e:?}"), request_metric) + Ok(res) => update_ok_req_in_meta_store(meta_store, permit, res, request_metric).await, + Err(e) => { + update_err_req_in_meta_store(meta_store, permit, format!("{e:?}"), request_metric).await } } } #[cfg(feature = "non-wasm")] -pub(crate) fn update_ok_req_in_meta_store( - meta_store: &mut RwLockWriteGuard<'_, MetaStore>, - req_id: &RequestId, +pub(crate) async fn update_ok_req_in_meta_store( + meta_store: &Arc>>, + permit: MetaStorePermit, result: T, request_metric: &'static str, ) -> bool { - match meta_store.update(req_id, Ok(result)) { + let req_id = permit.req_id; + match meta_store.write().await.update(Ok(result), permit) { Ok(()) => true, Err(e) => { - // Update error counter for meta-store update failure - MetricedError::handle_unreturnable_error(request_metric, Some(*req_id), e); + MetricedError::handle_unreturnable_error(request_metric, Some(req_id), e); false } } } +/// Helper method for updating the meta store with an error result. +/// The method gracefully handles potential update failures by logging and updating metrics. +/// [permit] is consumed to mark the entry done. +/// [error] is the error message to store. +/// [request_metric] is a free-form string used only for error logging the origin of the failure. +/// Returns true if the update was successful, false otherwise. #[cfg(feature = "non-wasm")] -pub(crate) async fn ensure_meta_store_request_pending( +pub(crate) async fn update_err_req_in_meta_store( meta_store: &Arc>>, - req_id: &RequestId, -) -> anyhow::Result<()> { - let guarded_meta_store = meta_store.read().await; - let cell = guarded_meta_store.get_cell(req_id).ok_or_else(|| { - anyhow::anyhow!("Error while updating meta store for {req_id}: request is missing") - })?; - if cell.is_set() { - anyhow::bail!("Error while updating meta store for {req_id}: request is already completed"); - } - Ok(()) -} - -/// Helper method for updating the meta store with an error result -/// The method gracefully handles potential update failures by logging and updating metrics -/// [req_id] is the request ID to update -/// [error] is the error message to store -/// [request_metric] is a free-form string used only for error logging the origin of the failure -/// Returns true if the update was successful, false otherwise -#[cfg(feature = "non-wasm")] -pub(crate) fn update_err_req_in_meta_store( - meta_store: &mut RwLockWriteGuard<'_, MetaStore>, - req_id: &RequestId, + permit: MetaStorePermit, error: String, request_metric: &'static str, ) -> bool { - MetricedError::handle_unreturnable_error(request_metric, Some(*req_id), error.clone()); - - match meta_store.update(req_id, Err(error.clone())) { + let req_id = permit.req_id; + MetricedError::handle_unreturnable_error(request_metric, Some(req_id), error.clone()); + match meta_store.write().await.update(Err(error.clone()), permit) { Ok(()) => true, Err(e) => { - // We already logged the original error, so just log the update failure here as there is not much else we can do tracing::error!( "Failed to update meta store on request ID {req_id} with error message \"{error}\" due to update error: {e}" ); @@ -387,147 +544,109 @@ pub(crate) fn update_err_req_in_meta_store( } } -/// Helper method for retrieving the result of a request from an appropriate meta store -/// [req_id] is the request ID to retrieve -/// [request_type] is a free-form string used only for error logging the origin of the failure #[cfg(feature = "non-wasm")] -pub(crate) async fn retrieve_from_meta_store( - meta_store: RwLockReadGuard<'_, MetaStore>, - req_id: &RequestId, - metric_scope: &'static str, -) -> Result { - let handle = meta_store.retrieve(req_id); - drop(meta_store); // Release the read lock early as we otherwise risk holding it for up to DURATION_WAITING_ON_RESULT_SECONDS! - handle_res_metriced(handle, req_id, metric_scope).await +pub(crate) async fn delete_in_meta_store( + meta_store: &Arc>>, + permit: MetaStorePermit, + error: String, + request_metric: &'static str, +) -> bool { + let req_id = permit.req_id; + MetricedError::handle_unreturnable_error(request_metric, Some(req_id), error.clone()); + match meta_store.write().await.delete(permit) { + Ok(_) => true, + Err(e) => { + tracing::error!( + "Failed to delete request ID {req_id} from meta-store, with error message {e}" + ); + false + } + } } -/// Like [`retrieve_from_meta_store`] but with a custom server-side wait duration. -/// Use this for operations (e.g. preprocessing) that are inherently slower than -/// the default 60-second window. +/// Helper for retrieving the result of a request from a meta store. +/// +/// Polls the meta store every [`META_STORE_POLL_INTERVAL`] until either the +/// entry is `Done`, becomes unreachable, or the timeout expires. +/// +/// Returns `Arc` on success; `Unavailable` if the entry is still pending at +/// timeout; `NotFound` if missing or deleted; `Internal`/`Aborted` if the entry +/// completed with an error. +/// +/// Each poll iteration acquires the read lock briefly and releases it before +/// sleeping, so other writers are not blocked. #[cfg(feature = "non-wasm")] -pub(crate) async fn retrieve_from_meta_store_with_timeout( - meta_store: RwLockReadGuard<'_, MetaStore>, +pub(crate) async fn retrieve_from_meta_store( + meta_store: &Arc>>, req_id: &RequestId, metric_scope: &'static str, - wait_secs: u64, -) -> Result { - let handle = meta_store.retrieve(req_id); - drop(meta_store); // Release the read lock early - handle_res_metriced_with_timeout(handle, req_id, metric_scope, wait_secs).await -} - -#[cfg(feature = "non-wasm")] -pub async fn handle_res( - handle: Option>>>, - req_id: &RequestId, -) -> anyhow::Result { - match handle { - None => { - bail!("Could not retrieve the result with request ID {req_id}. It does not exist") - } - Some(cell) => { - let result = tokio::time::timeout( - tokio::time::Duration::from_secs(DURATION_WAITING_ON_RESULT_SECONDS), - cell.get(), - ) - .await; - // Peel off the potential errors - if let Ok(result) = result { - match result { - Ok(result) => Ok(result), - Err(e) => { - let msg = format!( - "Could not retrieve the result with request ID {req_id} since it finished with an error: {e}" - ); - tracing::warn!(msg); - bail!(msg); - } +) -> Result, MetricedError> { + let deadline = Instant::now() + Duration::from_secs(DURATION_WAITING_ON_RESULT_SECONDS); + loop { + match poll_entry(meta_store, req_id, metric_scope).await { + PollOutcome::Done(res) => return res, + PollOutcome::Pending => { + if Instant::now() >= deadline { + let msg = format!( + "Result in scope {metric_scope} with request ID {req_id} not completed after {DURATION_WAITING_ON_RESULT_SECONDS} seconds" + ); + tracing::info!(msg); + return Err(MetricedError::new( + metric_scope, + Some(*req_id), + anyhow!(msg), + tonic::Code::Unavailable, + )); } - } else { - let msg = format!( - "Could not retrieve the result with request ID {req_id} since it is not completed yet after waiting for {DURATION_WAITING_ON_RESULT_SECONDS} seconds" - ); - tracing::info!(msg); - bail!(msg); + tokio::time::sleep(META_STORE_POLL_INTERVAL).await; } - // Note that this is not logged as an error as we expect calls to take some time to be completed } } } #[cfg(feature = "non-wasm")] -async fn handle_res_metriced( - handle: Option>>>, - req_id: &RequestId, - metric_scope: &'static str, -) -> Result { - handle_res_metriced_with_timeout( - handle, - req_id, - metric_scope, - DURATION_WAITING_ON_RESULT_SECONDS, - ) - .await +enum PollOutcome { + Done(Result, MetricedError>), + Pending, } #[cfg(feature = "non-wasm")] -async fn handle_res_metriced_with_timeout( - handle: Option>>>, +async fn poll_entry( + meta_store: &Arc>>, req_id: &RequestId, metric_scope: &'static str, - wait_secs: u64, -) -> Result { - match handle { - None => { +) -> PollOutcome { + let guard = meta_store.read().await; + match guard.retrieve(req_id) { + None | Some(EntryState::Deleted) => { let msg = format!( "Could not retrieve the result in scope {metric_scope} with request ID {req_id}. It does not exist" ); - - Err(MetricedError::new( + PollOutcome::Done(Err(MetricedError::new( metric_scope, Some(*req_id), anyhow!(msg), tonic::Code::NotFound, - )) + ))) } - Some(cell) => { - let result = - tokio::time::timeout(tokio::time::Duration::from_secs(wait_secs), cell.get()).await; - // Peel off the potential errors - if let Ok(result) = result { - match result { - Ok(result) => Ok(result), - Err(e) => { - let msg = format!( - "Could not retrieve the result in scope {metric_scope} with request ID {req_id} since it finished with an error: {e}" - ); - tracing::warn!(msg); - let code = if e.to_ascii_lowercase().contains("abort") { - tonic::Code::Aborted - } else { - tonic::Code::Internal - }; - Err(MetricedError::new( - metric_scope, - Some(*req_id), - anyhow!(msg), - code, - )) - } - } + Some(EntryState::Pending) => PollOutcome::Pending, + Some(EntryState::Done(Ok(arc))) => PollOutcome::Done(Ok(arc)), + Some(EntryState::Done(Err(e))) => { + let msg = format!( + "Could not retrieve the result in scope {metric_scope} with request ID {req_id} since it finished with an error: {e}" + ); + tracing::warn!(msg); + let code = if e.to_ascii_lowercase().contains("abort") { + tonic::Code::Aborted } else { - let msg = format!( - "Could not retrieve the result in scope {metric_scope} with request ID {req_id} since it is not completed yet after waiting for {wait_secs} seconds" - ); - tracing::info!(msg); - Err(MetricedError::new( - metric_scope, - Some(*req_id), - anyhow!(msg), - tonic::Code::Unavailable, - )) - } - // Note that this is not logged as an error as we expect calls to take some time to be completed + tonic::Code::Internal + }; + PollOutcome::Done(Err(MetricedError::new( + metric_scope, + Some(*req_id), + anyhow!(msg), + code, + ))) } } } @@ -537,169 +656,131 @@ mod tests { use super::*; use crate::engine::base::derive_request_id; use kms_grpc::RequestId; - use tokio::sync::RwLock; + + fn assert_done_ok( + store: &MetaStore, + id: &RequestId, + expected: &T, + ) { + match store.retrieve(id).expect("entry missing") { + EntryState::Done(Ok(arc)) => assert_eq!(arc.as_ref(), expected), + other => panic!( + "expected Done(Ok), got {:?}", + match other { + EntryState::Pending => "Pending", + EntryState::Done(Err(_)) => "Done(Err)", + EntryState::Deleted => "Deleted", + EntryState::Done(Ok(_)) => unreachable!(), + } + ), + } + } #[test] fn sunshine() { let mut meta_store: MetaStore = MetaStore::new(2, 1); let request_id: RequestId = derive_request_id("meta_store").unwrap(); - // Data does not exist assert!(!meta_store.exists(&request_id)); - assert!( - meta_store - .update(&request_id, Ok("OK".to_string())) - .is_err() - ); - meta_store.insert(&request_id).unwrap(); - // Data exits + let permit = meta_store.insert(&request_id).unwrap(); assert!(meta_store.exists(&request_id)); - assert!(meta_store.update(&request_id, Ok("OK".to_string())).is_ok()); - - // Re-update not allowed - assert!( - meta_store - .update(&request_id, Ok("NOK".to_string())) - .is_err() - ); + assert!(meta_store.update(Ok("OK".to_string()), permit).is_ok()); + assert_done_ok(&meta_store, &request_id, &"OK".to_string()); } #[test] fn test_kickout_of_errors() { let mut meta_store: MetaStore = MetaStore::new(2, 1); - let request_id_1: RequestId = derive_request_id("1").unwrap(); - let request_id_2: RequestId = derive_request_id("2").unwrap(); - let request_id_3: RequestId = derive_request_id("3").unwrap(); - meta_store.insert(&request_id_1).unwrap(); - assert!( - meta_store - .update(&request_id_1, Err("Err1".to_string())) - .is_ok() - ); - meta_store.insert(&request_id_2).unwrap(); - assert!( - meta_store - .update(&request_id_2, Ok("OK2".to_string())) - .is_ok() - ); - // The storage is full so we should kick the oldest element out - meta_store.insert(&request_id_3).unwrap(); - assert!( - meta_store - .update(&request_id_3, Err("Err3".to_string())) - .is_ok() - ); - - // Validate the oldest element is removed - assert!(!meta_store.exists(&request_id_1)); - // Validate the two newer elements are still there - assert!(meta_store.exists(&request_id_2)); - assert!(meta_store.exists(&request_id_3)); + let id1: RequestId = derive_request_id("1").unwrap(); + let id2: RequestId = derive_request_id("2").unwrap(); + let id3: RequestId = derive_request_id("3").unwrap(); + let p1 = meta_store.insert(&id1).unwrap(); + assert!(meta_store.update(Err("Err1".to_string()), p1).is_ok()); + let p2 = meta_store.insert(&id2).unwrap(); + assert!(meta_store.update(Ok("OK2".to_string()), p2).is_ok()); + // storage full, eviction should kick id1 out + let p3 = meta_store.insert(&id3).unwrap(); + assert!(meta_store.update(Err("Err3".to_string()), p3).is_ok()); + + assert!(!meta_store.exists(&id1)); + assert!(meta_store.exists(&id2)); + assert!(meta_store.exists(&id3)); } #[test] fn double_insert() { let mut meta_store: MetaStore = MetaStore::new(2, 1); - let request_id: RequestId = derive_request_id("meta_store").unwrap(); - meta_store.insert(&request_id).unwrap(); - // We cannot insert the same request_id twice - assert!(meta_store.insert(&request_id).is_err()); + let id: RequestId = derive_request_id("meta_store").unwrap(); + let _p = meta_store.insert(&id).unwrap(); + assert!(meta_store.insert(&id).is_err()); } #[test] fn too_many_elements() { let mut meta_store: MetaStore = MetaStore::new(2, 1); - let req_1: RequestId = derive_request_id("1").unwrap(); - let req_2: RequestId = derive_request_id("2").unwrap(); - let req_3: RequestId = derive_request_id("3").unwrap(); - meta_store.insert(&req_1).unwrap(); - meta_store.insert(&req_2).unwrap(); - // Only room for 2 elements - assert!(meta_store.insert(&req_3).is_err()); + let id1: RequestId = derive_request_id("1").unwrap(); + let id2: RequestId = derive_request_id("2").unwrap(); + let id3: RequestId = derive_request_id("3").unwrap(); + let _p1 = meta_store.insert(&id1).unwrap(); + let _p2 = meta_store.insert(&id2).unwrap(); + assert!(meta_store.insert(&id3).is_err()); } #[test] fn auto_remove() { let mut meta_store: MetaStore = MetaStore::new(2, 1); - let req_1: RequestId = derive_request_id("1").unwrap(); - let req_2: RequestId = derive_request_id("2").unwrap(); - let req_3: RequestId = derive_request_id("3").unwrap(); - meta_store.insert(&req_1).unwrap(); - assert!(meta_store.update(&req_1, Ok("OK".to_string())).is_ok()); - assert!(meta_store.retrieve(&req_1).is_some()); - meta_store.insert(&req_2).unwrap(); - assert!(meta_store.update(&req_2, Ok("OK".to_string())).is_ok()); - assert!(meta_store.retrieve(&req_1).is_some()); - assert!(meta_store.retrieve(&req_2).is_some()); - meta_store.insert(&req_3).unwrap(); - // Only room for 2 elements - assert!(meta_store.retrieve(&req_3).is_some()); - assert!(meta_store.retrieve(&req_2).is_some()); - // The oldest element is removed - assert!(meta_store.retrieve(&req_1).is_none()); - // But the other two elements are - } - - #[tokio::test] - async fn test_subscription() { + let id1: RequestId = derive_request_id("1").unwrap(); + let id2: RequestId = derive_request_id("2").unwrap(); + let id3: RequestId = derive_request_id("3").unwrap(); + let p1 = meta_store.insert(&id1).unwrap(); + assert!(meta_store.update(Ok("OK".to_string()), p1).is_ok()); + assert!(meta_store.retrieve(&id1).is_some()); + let p2 = meta_store.insert(&id2).unwrap(); + assert!(meta_store.update(Ok("OK".to_string()), p2).is_ok()); + assert!(meta_store.retrieve(&id1).is_some()); + assert!(meta_store.retrieve(&id2).is_some()); + let p3 = meta_store.insert(&id3).unwrap(); + assert!(meta_store.retrieve(&id3).is_some()); + assert!(meta_store.retrieve(&id2).is_some()); + // Oldest removed during insert's eviction step + assert!(meta_store.retrieve(&id1).is_none()); + let _ = p3; + } + + #[test] + fn try_delete_blocked_by_live_permit() { let mut meta_store: MetaStore = MetaStore::new(2, 1); - let req_1: RequestId = derive_request_id("1").unwrap(); - meta_store.insert(&req_1).unwrap(); - let meta_store = Arc::new(RwLock::new(meta_store)); - - let cloned_meta_store = Arc::clone(&meta_store); - let handle = tokio::spawn(async move { - let meta_store = Arc::clone(&cloned_meta_store); - let handle = meta_store.read().await.retrieve(&req_1); - handle_res_metriced(handle, &req_1, "test").await - }); - tokio::time::sleep(tokio::time::Duration::from_secs(5)).await; - meta_store - .write() - .await - .update(&req_1, Ok("OK".to_string())) - .unwrap(); - - let result = handle.await.unwrap().unwrap(); - assert_eq!(result, "OK".to_string()); + let id: RequestId = derive_request_id("locked-del").unwrap(); + let permit = meta_store.insert(&id).unwrap(); + assert!(meta_store.try_delete(&id).is_err()); + drop(permit); + assert!(meta_store.try_delete(&id).is_ok()); + assert!(matches!( + meta_store.retrieve(&id), + Some(EntryState::Deleted) + )); + } + + #[test] + fn delete_consumes_permit() { + let mut meta_store: MetaStore = MetaStore::new(2, 1); + let id: RequestId = derive_request_id("perm-del").unwrap(); + let permit = meta_store.insert(&id).unwrap(); + let prev = meta_store.delete(permit).unwrap(); + assert!(matches!(prev, EntryState::Pending)); + assert!(matches!( + meta_store.retrieve(&id), + Some(EntryState::Deleted) + )); + // Cannot delete twice. + assert!(meta_store.try_delete(&id).is_err()); } #[test] - fn delete() { - let mut meta_store: MetaStore = MetaStore::new(10, 2); - let req_1: RequestId = derive_request_id("1").unwrap(); - let req_2: RequestId = derive_request_id("2").unwrap(); - let req_3: RequestId = derive_request_id("3").unwrap(); - let req_4: RequestId = derive_request_id("4").unwrap(); - - meta_store.insert(&req_1).unwrap(); - meta_store.insert(&req_2).unwrap(); - meta_store.insert(&req_3).unwrap(); - meta_store.insert(&req_4).unwrap(); - - assert_eq!(meta_store.complete_queue.len(), 0); - - // set req1 to Done and req2 to Error - assert!(meta_store.update(&req_1, Ok("OK".to_string())).is_ok()); - assert!(meta_store.update(&req_2, Err("Err".to_string())).is_ok()); - assert_eq!(meta_store.complete_queue.len(), 2); - - // check that we can delete req_1 (Done) - assert!(meta_store.delete(&req_1).is_some()); - // check that we cannot delete req_1 again - assert!(meta_store.delete(&req_1).is_none()); - assert_eq!(meta_store.complete_queue.len(), 1); - - // check that we can delete req_2 (Err) - assert!(meta_store.delete(&req_2).is_some()); - // check that we cannot delete req_2 again - assert!(meta_store.delete(&req_2).is_none()); - assert_eq!(meta_store.complete_queue.len(), 0); - - // check that we can delete req_3 (Started) - assert!(meta_store.delete(&req_3).is_some()); - // check that we cannot delete req_3 again - assert!(meta_store.delete(&req_3).is_none()); - assert_eq!(meta_store.complete_queue.len(), 0); + fn permit_send_across_spawn() { + // Compile-time check: MetaStorePermit must be Send so it can be + // moved into tokio::spawn closures. + fn assert_send() {} + assert_send::(); } } diff --git a/core/service/src/vault/storage/crypto_material/base.rs b/core/service/src/vault/storage/crypto_material/base.rs index 8376a37adc..aaac4c4f89 100644 --- a/core/service/src/vault/storage/crypto_material/base.rs +++ b/core/service/src/vault/storage/crypto_material/base.rs @@ -3,8 +3,9 @@ //! This module provides the foundational storage implementation used by //! both centralized and threshold KMS variants. use crate::engine::threshold::service::session::PRSSSetupCombined; -use crate::util::meta_store::update_ok_req_in_meta_store; -use crate::util::meta_store::{ensure_meta_store_request_pending, update_err_req_in_meta_store}; +use crate::util::meta_store::{ + MetaStorePermit, update_err_req_in_meta_store, update_ok_req_in_meta_store, +}; use crate::vault::storage::crypto_material::{data_exists, data_exists_at_epoch}; use crate::vault::storage::store_versioned_at_request_and_epoch_id; use crate::{ @@ -46,7 +47,7 @@ use tfhe::xof_key_set::CompressedXofKeySet; use tfhe::{integer::compression_keys::DecompressionKey, zk::CompactPkeCrs}; use thiserror::Error; use threshold_execution::tfhe_internals::public_keysets::FhePubKeySet; -use tokio::sync::{Mutex, OwnedRwLockReadGuard, RwLock, RwLockWriteGuard}; +use tokio::sync::{Mutex, OwnedRwLockReadGuard, RwLock}; #[derive(Error, Debug, PartialEq, Eq)] pub enum StorageError { @@ -99,6 +100,7 @@ pub enum PublicKeySet { /// /// Warning: In relation to concurrency where multiple locks are needed always lock as follows: /// meta_store -> public_storage -> private_storage second -> backup_vault -> pk_cache. +/// The exception is if you hold a permit to the meta store, then the storages can be used one at a time pub struct CryptoMaterialStorage< PubS: Storage + Send + Sync + 'static, PrivS: StorageExt + Send + Sync + 'static, @@ -305,28 +307,17 @@ where priv_data: Option<(&'a PrivData, PrivDataType)>, meta_data: MetaT, meta_store: Arc>>, + permit: MetaStorePermit, op_metric_tag: &'static str, ) -> Result<(), StorageError> where ::Versioned<'a>: Send + Sync, ::Versioned<'a>: Send + Sync, { - // First ensure that the meta store request is pending - ensure_meta_store_request_pending(&meta_store, req_id) - .await - .map_err(|e| StorageError::MetaStore(e.to_string()))?; let res = self .write_all(req_id, epoch_id, pub_data, priv_data, true, op_metric_tag) .await; - let mut guarded_meta_store = meta_store.write().await; - update_meta_store( - res, - req_id, - meta_data, - &mut guarded_meta_store, - op_metric_tag, - ) - .await + update_meta_store(res, meta_data, &meta_store, permit, op_metric_tag).await } /// General method for handling the storage of both public and private material along with the backup. @@ -738,6 +729,7 @@ where /// Write the CRS to public and private storage and update the meta /// store with the outcome. On a write failure the partial data is /// purged before the error is returned. + #[allow(clippy::too_many_arguments)] pub(crate) async fn write_crs( &self, crs_id: &RequestId, @@ -745,6 +737,7 @@ where pp: CompactPkeCrs, crs_info: CrsGenMetadata, meta_store: Arc>>, + permit: MetaStorePermit, op_metric_tag: &'static str, ) -> Result<(), StorageError> { self.handle_persistent_and_meta_storage( @@ -754,6 +747,7 @@ where Some((&crs_info.clone(), PrivDataType::CrsInfo)), crs_info, meta_store, + permit, op_metric_tag, ) .await @@ -765,11 +759,8 @@ where meta_data: KeyGenMetadata, decompression_key: DecompressionKey, meta_store: Arc>>, + permit: MetaStorePermit, ) -> Result<(), StorageError> { - // First ensure that the meta store request is pending - ensure_meta_store_request_pending(&meta_store, key_id) - .await - .map_err(|e| StorageError::MetaStore(e.to_string()))?; let res = self .write_all::( key_id, @@ -780,15 +771,7 @@ where OP_DECOMPRESSION_KEYGEN, ) .await; - let mut guarded_meta_store = meta_store.write().await; - update_meta_store( - res, - key_id, - meta_data, - &mut guarded_meta_store, - OP_DECOMPRESSION_KEYGEN, - ) - .await + update_meta_store(res, meta_data, &meta_store, permit, OP_DECOMPRESSION_KEYGEN).await } /// Write the backup keys to the storage and update the meta store. @@ -812,16 +795,9 @@ where &self, recovery_material: RecoveryValidationMaterial, meta_store: Arc>, + permit: MetaStorePermit, ) -> Result<(), StorageError> { let req_id = recovery_material.custodian_context().context_id; - // First ensure that the meta store request is pending - ensure_meta_store_request_pending(&meta_store, &req_id) - .await - .map_err(|e| { - StorageError::MetaStore(format!( - "Meta store is not ready for request ID {req_id}: {e}" - )) - })?; // Ensure we have a backup vault before starting let vault = match self.backup_vault.as_ref() { Some(vault) => vault, @@ -854,12 +830,11 @@ where StorageError::BackupVaultPurging }); } - let mut guarded_meta_store = meta_store.write().await; update_meta_store( res, - &req_id, recovery_material, - &mut guarded_meta_store, + &meta_store, + permit, OP_NEW_CUSTODIAN_CONTEXT, ) .await @@ -1089,22 +1064,24 @@ where /// If the meta store update fails, then a MetaStoreError is returned, which includes the original StorageError. pub(in crate::vault::storage::crypto_material) async fn update_meta_store( storage_res: Result<(), StorageError>, - req_id: &RequestId, meta_data: MetaT, - guarded_meta_store: &mut RwLockWriteGuard<'_, MetaStore>, + meta_store: &Arc>>, + permit: MetaStorePermit, op_metric_tag: &'static str, ) -> Result<(), StorageError> { - let mut meta_store_ok = true; - if let Err(e) = &storage_res - && e != &StorageError::Backup - { - // We don't want to fail on backup errors - meta_store_ok &= - update_err_req_in_meta_store(guarded_meta_store, req_id, e.to_string(), op_metric_tag); + let req_id = *permit.req_id(); + let is_storage_err = matches!(&storage_res, Err(e) if e != &StorageError::Backup); + let meta_store_ok = if is_storage_err { + update_err_req_in_meta_store( + meta_store, + permit, + storage_res.as_ref().err().unwrap().to_string(), + op_metric_tag, + ) + .await } else { - meta_store_ok &= - update_ok_req_in_meta_store(guarded_meta_store, req_id, meta_data, op_metric_tag); - } + update_ok_req_in_meta_store(meta_store, permit, meta_data, op_metric_tag).await + }; if !meta_store_ok { // NOTE this would indicate a bug since we have just verified that the meta can be updated in the start of this method // Thus the meta store update can only fail in case of a race condition, which would indicate a bug diff --git a/core/service/src/vault/storage/crypto_material/centralized.rs b/core/service/src/vault/storage/crypto_material/centralized.rs index 10eb733f37..a41213933e 100644 --- a/core/service/src/vault/storage/crypto_material/centralized.rs +++ b/core/service/src/vault/storage/crypto_material/centralized.rs @@ -14,7 +14,7 @@ use kms_grpc::{ use crate::{ engine::base::{KeyGenMetadata, KmsFheKeyHandles}, - util::meta_store::{MetaStore, ensure_meta_store_request_pending}, + util::meta_store::{MetaStore, MetaStorePermit}, vault::{ Vault, storage::{ @@ -69,6 +69,7 @@ impl>>, + permit: MetaStorePermit, op_metric_tag: &'static str, ) -> Result<(), StorageError> { - // First ensure that the meta store request is pending - ensure_meta_store_request_pending(&meta_store, key_id) - .await - .map_err(|e| StorageError::MetaStore(e.to_string()))?; let meta_res = central_fhe_keys.public_key_info.clone(); let res = self .inner @@ -96,15 +94,7 @@ impl>> = Arc::new(RwLock::new(MetaStore::new_unlimited())); - { - let mut write_guard = meta_store.write().await; - add_req_to_meta_store(&mut write_guard, &req_ok, TEST_METRIC).unwrap(); - add_req_to_meta_store(&mut write_guard, &req_backup, TEST_METRIC).unwrap(); - add_req_to_meta_store(&mut write_guard, &req_writing, TEST_METRIC).unwrap(); - assert!( - update_meta_store(Ok(()), &req_ok, 42_u32, &mut write_guard, TEST_METRIC) - .await - .is_ok() - ); - assert_eq!( - update_meta_store( - Err(StorageError::Backup), - &req_backup, - 7_u32, - &mut write_guard, - TEST_METRIC, - ) - .await, + let permit_ok = add_req_to_meta_store(&meta_store, &req_ok, TEST_METRIC) + .await + .unwrap(); + let permit_backup = add_req_to_meta_store(&meta_store, &req_backup, TEST_METRIC) + .await + .unwrap(); + let permit_writing = add_req_to_meta_store(&meta_store, &req_writing, TEST_METRIC) + .await + .unwrap(); + assert!( + update_meta_store(Ok(()), 42_u32, &meta_store, permit_ok, TEST_METRIC) + .await + .is_ok() + ); + assert_eq!( + update_meta_store( Err(StorageError::Backup), - ); - assert_eq!( - update_meta_store( - Err(StorageError::Writing), - &req_writing, - 0_u32, - &mut write_guard, - TEST_METRIC, - ) - .await, + 7_u32, + &meta_store, + permit_backup, + TEST_METRIC, + ) + .await, + Err(StorageError::Backup), + ); + assert_eq!( + update_meta_store( Err(StorageError::Writing), - ); - } + 0_u32, + &meta_store, + permit_writing, + TEST_METRIC, + ) + .await, + Err(StorageError::Writing), + ); assert_eq!( - retrieve_from_meta_store::(meta_store.read().await, &req_ok, TEST_METRIC) + *retrieve_from_meta_store::(&meta_store, &req_ok, TEST_METRIC) .await .unwrap(), 42 ); assert_eq!( - retrieve_from_meta_store::(meta_store.read().await, &req_backup, TEST_METRIC) + *retrieve_from_meta_store::(&meta_store, &req_backup, TEST_METRIC) .await .unwrap(), 7 ); assert!( - retrieve_from_meta_store::(meta_store.read().await, &req_writing, TEST_METRIC) + retrieve_from_meta_store::(&meta_store, &req_writing, TEST_METRIC) .await .is_err(), ); } -#[tokio::test] -async fn update_meta_store_failure_paths() { - // Failure paths: meta store update itself fails, in three flavours: - // - storage Ok + missing entry -> MetaStoreError "but storage succeeded"; - // - storage Err + missing entry -> MetaStoreError "Also failed to store data"; - // - storage Ok + already-set entry -> MetaStoreError (cell.is_set() rejects update). - let missing = derive_request_id("ums_missing").unwrap(); - let already_set = derive_request_id("ums_already_set").unwrap(); - let meta_store: Arc>> = Arc::new(RwLock::new(MetaStore::new_unlimited())); - add_req_to_meta_store(&mut meta_store.write().await, &already_set, TEST_METRIC).unwrap(); - assert!(update_ok_req_in_meta_store( - &mut meta_store.write().await, - &already_set, - 99_u32, - TEST_METRIC, - )); - - let mut guard = meta_store.write().await; - match update_meta_store(Ok(()), &missing, 1_u32, &mut guard, TEST_METRIC) - .await - .unwrap_err() - { - StorageError::MetaStore(msg) => assert!( - msg.contains("but storage succeeded"), - "expected 'but storage succeeded' in: {msg}" - ), - other => panic!("expected MetaStoreError, got {other:?}"), - } - match update_meta_store( - Err(StorageError::Writing), - &missing, - 2_u32, - &mut guard, - TEST_METRIC, - ) - .await - .unwrap_err() - { - StorageError::MetaStore(msg) => assert!( - msg.contains("Also failed to store data"), - "expected combined error in: {msg}" - ), - other => panic!("expected MetaStoreError, got {other:?}"), - } - assert!(matches!( - update_meta_store(Ok(()), &already_set, 1_u32, &mut guard, TEST_METRIC).await, - Err(StorageError::MetaStore(_)), - )); -} +// The previous `update_meta_store_failure_paths` test exercised "missing +// entry" and "already-set entry" failure modes by passing a `None` permit. +// Under the strong-permit invariant, these states are unreachable from the +// outside: `insert` produces a permit only for a fresh `Pending` entry, and +// no API constructs a permit for a missing or already-completed one. The +// fallback `MetaStore::try_delete` primitive still covers the recovery path, +// exercised in `util::meta_store::tests`. #[tokio::test] async fn inner_update_backup_vault_paths() { diff --git a/core/service/src/vault/storage/crypto_material/tests/migration.rs b/core/service/src/vault/storage/crypto_material/tests/migration.rs index cae244ceb5..20fc55bab1 100644 --- a/core/service/src/vault/storage/crypto_material/tests/migration.rs +++ b/core/service/src/vault/storage/crypto_material/tests/migration.rs @@ -379,15 +379,21 @@ async fn test_copy_compressed_key_to_original_success() { // dkg_pubinfo_meta_store now holds the new metadata for old_key_id. { let guard = meta_store.read().await; - let cell = guard - .get_cell(&old_key_id) + let entry = guard + .retrieve(&old_key_id) .unwrap_or_else(|| panic!("meta_store entry should exist for old_key_id={old_key_id}")); - let value = cell.try_get().unwrap_or_else(|| { - panic!("meta_store entry should be set for old_key_id={old_key_id}") - }); - let meta = value.unwrap_or_else(|err| { - panic!("meta_store should hold Ok(metadata) for old_key_id={old_key_id}: {err:?}") - }); + let meta = match entry { + crate::util::meta_store::EntryState::Done(Ok(arc)) => arc, + other => panic!( + "meta_store should hold Done(Ok(metadata)) for old_key_id={old_key_id}, got {state}", + state = match other { + crate::util::meta_store::EntryState::Pending => "Pending".to_owned(), + crate::util::meta_store::EntryState::Done(Err(e)) => format!("Done(Err: {e})"), + crate::util::meta_store::EntryState::Deleted => "Deleted".to_owned(), + crate::util::meta_store::EntryState::Done(Ok(_)) => unreachable!(), + } + ), + }; assert_current_compressed_metadata(&meta, &old_key_id); assert_current_metadata_extra_data(&meta, &extra_data); } @@ -725,7 +731,7 @@ async fn test_copy_compressed_key_validation_failure_is_atomic() { { let guard = meta_store.read().await; assert!( - guard.get_cell(&old_key_id).is_none(), + guard.retrieve(&old_key_id).is_none(), "meta_store must not be mutated for old_key_id={old_key_id} on validation failure while copying from new_key_id={new_key_id}" ); } diff --git a/core/service/src/vault/storage/crypto_material/threshold.rs b/core/service/src/vault/storage/crypto_material/threshold.rs index bdae27b396..7e64ca5d75 100644 --- a/core/service/src/vault/storage/crypto_material/threshold.rs +++ b/core/service/src/vault/storage/crypto_material/threshold.rs @@ -15,7 +15,7 @@ use crate::{ threshold::service::{ThresholdFheKeys, session::PRSSSetupCombined}, utils::verify_public_key_digest_from_bytes, }, - util::meta_store::{MetaStore, ensure_meta_store_request_pending}, + util::meta_store::{MetaStore, MetaStorePermit}, vault::{ Vault, storage::{ @@ -160,6 +160,7 @@ impl>>, + permit: MetaStorePermit, op_metric_tag: &'static str, ) -> Result<(), StorageError> { - // First ensure that the meta store request is pending - ensure_meta_store_request_pending(&meta_store, key_id) - .await - .map_err(|e| StorageError::MetaStore(e.to_string()))?; let meta_res = threshold_fhe_keys.meta_data.clone(); let res = self .inner @@ -187,15 +185,7 @@ impl>>, ) -> anyhow::Result<()> { - // Lock order: meta_store -> pub -> priv -> backup -> fhe_keys. - let mut guarded_meta_store = dkg_pubinfo_meta_store.write().await; - let mut pub_storage = self.inner.public_storage.lock().await; - let mut priv_storage = self.inner.private_storage.lock().await; - let mut back_vault = match self.inner.backup_vault { - Some(ref x) => Some(x.lock().await), - None => None, - }; + let permit = dkg_pubinfo_meta_store + .write() + .await + .lock_entry(new_key_id)?; // --- Phase A: validate everything before mutating anything. --- - - // Source of the migrated compressed keyset. - let compressed_keyset: CompressedXofKeySet = read_versioned_at_request_id( - &*pub_storage, - new_key_id, - &PubDataType::CompressedXofKeySet.to_string(), - ) - .await?; - - // Source of the migrated ThresholdFheKeys (stored by the keygen at - // (new_key_id, new_epoch_id); this may differ from old_epoch_id). - let migrated_fhe_keys: ThresholdFheKeys = read_versioned_at_request_and_epoch_id( - &*priv_storage, - new_key_id, - new_epoch_id, - &PrivDataType::FheKeyInfo.to_string(), - ) - .await?; - - // Validate that the original key exists before mutating any backend. - let _: ThresholdFheKeys = read_versioned_at_request_and_epoch_id( - &*priv_storage, - old_key_id, - old_epoch_id, - &PrivDataType::FheKeyInfo.to_string(), - ) - .await - .map_err(|e| { - anyhow::anyhow!( - "No existing ThresholdFheKeys at (old_key_id={old_key_id}, \ + let migrated_fhe_keys: ThresholdFheKeys = { + let priv_storage = self.inner.private_storage.lock().await; + // Validate that the original key exists before mutating any backend. + let _: ThresholdFheKeys = read_versioned_at_request_and_epoch_id( + &*priv_storage, + old_key_id, + old_epoch_id, + &PrivDataType::FheKeyInfo.to_string(), + ) + .await + .map_err(|e| { + anyhow::anyhow!( + "No existing ThresholdFheKeys at (old_key_id={old_key_id}, \ old_epoch_id={old_epoch_id:?}): {e}" + ) + })?; + // Source of the migrated ThresholdFheKeys (stored by the keygen at + // (new_key_id, new_epoch_id); this may differ from old_epoch_id). + read_versioned_at_request_and_epoch_id( + &*priv_storage, + new_key_id, + new_epoch_id, + &PrivDataType::FheKeyInfo.to_string(), ) - })?; - + .await? + }; // Reject LegacyV0 migrated metadata (we can't re-sign without the // structured digest map) and confirm the CompressedXofKeySet digest // is present. @@ -302,6 +280,7 @@ impl Some(x.lock().await), + None => None, + }; + + // Backup vault (if configured): delete + re-store at the same location + // so a restore brings back the migrated keys, not the pre-migration + // uncompressed ones. + if let Some(vault) = back_vault.as_deref_mut() { + delete_at_request_and_epoch_id( + vault, + old_key_id, + old_epoch_id, + &PrivDataType::FheKeyInfo.to_string(), + ) + .await?; + store_versioned_at_request_and_epoch_id( + vault, + old_key_id, + old_epoch_id, + &updated_fhe_keys, + &PrivDataType::FheKeyInfo.to_string(), + ) + .await?; + } else { + tracing::warn!( + "No backup vault configured. Skipping backup update during \ copy_compressed_key_to_original for {old_key_id}" - ); + ); + } } // --- Phase C: refresh dkg_pubinfo_meta_store for old_key_id. --- - // `MetaStore::update` only accepts pending entries, so replace the - // existing completed entry with delete + insert + update, all under - // the meta_store write guard already held since the top of the function. - let _ = guarded_meta_store.delete(old_key_id); - guarded_meta_store.insert(old_key_id).map_err(|e| { - anyhow::anyhow!("Failed to insert {old_key_id} into keygen meta-store: {e}") - })?; - guarded_meta_store - .update(old_key_id, Ok(new_metadata)) - .map_err(|e| { - anyhow::anyhow!("Failed to update {old_key_id} in keygen meta-store: {e}") - })?; + { + let mut guarded_meta_store = dkg_pubinfo_meta_store.write().await; - tracing::info!( - "Copied compressed key from {new_key_id} to original {old_key_id} \ + // The entry is already in Done state from the original key generation; + // overwrite the metadata in place. `replace_done` keeps the entry's + // slot in the completion queue, which matches what we want here. + guarded_meta_store.replace_done(permit, new_metadata)?; + tracing::info!( + "Copied compressed key from {new_key_id} to original {old_key_id} \ and updated metadata" - ); + ); + } // In-memory cache. {