Skip to content

Commit 438f292

Browse files
authored
Fix timeout to be configurable and retryable for SP1 network proof requests (#38)
actually fix the issue, copilot missed (subtle, I suppose) Authored-by: Nuke <nuke-web3@proton.me>
1 parent 717a56e commit 438f292

File tree

3 files changed

+61
-61
lines changed

3 files changed

+61
-61
lines changed

example.env

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ NETWORK_PRIVATE_KEY=
7878
# https://github.com/succinctlabs/sp1/blob/11ab6b783cfce295b6f1113af088cc5f0a8caa5b/crates/sdk/src/network/prover.rs#L138
7979
SP1_FULFILLMENT_STRATEGY=HOSTED
8080

81+
# Timeout in seconds for network proof requests to ultimately fail (default: 600 seconds = 10 minutes)
82+
PROOF_GEN_TIMEOUT_SECONDS_REMOTE=600
83+
8184
#### Dependent & Provider Settings
8285

8386
# To get a new token, see https://docs.celestia.org/tutorials/node-tutorial#auth-token

service/src/internal/runner.rs

Lines changed: 42 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use sp1_sdk::{
99
SP1Stdin,
1010
network::{Error as SP1NetworkError, FulfillmentStrategy},
1111
};
12-
use std::sync::Arc;
12+
use std::{sync::Arc, time::Duration};
1313
use tokio::sync::{OnceCell, mpsc};
1414

1515
/// Hardcoded ELF binary for the crate `program-keccak-inclusion`
@@ -40,12 +40,14 @@ pub async fn get_program_id() -> SuccNetProgramId {
4040

4141
static CHACHA_SETUP: OnceCell<Arc<SP1ProofSetup>> = OnceCell::const_new();
4242

43-
/// TODO: setup ability to config as needed
44-
pub struct PdaRunnerConfig {}
43+
/// Configuration for the PDA Runner
44+
pub struct PdaRunnerConfig {
45+
/// Timeout in seconds for prover network proof requests
46+
pub zk_proof_gen_timeout_remote: Duration,
47+
}
4548

4649
/// The main service runner.
4750
pub struct PdaRunner {
48-
#[allow(dead_code)] // TODO: use config
4951
pub config: PdaRunnerConfig,
5052
pub config_db: SledTree,
5153
pub queue_db: SledTree,
@@ -247,30 +249,22 @@ impl PdaRunner {
247249
}
248250
Err(e) => {
249251
error!("{job:?} failed to request a proof: {e}");
250-
job_status = JobStatus::Failed(
251-
e,
252-
Some(JobStatus::RemoteZkProofRequesting.into()),
253-
);
254-
self.finalize_job(&job_key, job_status)?;
252+
// NOTE: we internally finalize the job in `handle_zk_client_error`
255253
}
256254
};
257255
debug!("ZK proof request sent");
258256
}
259257
JobStatus::RemoteZkProofPending(zk_request_id) => {
260258
debug!("ZK request waiting");
261-
match self.wait_for_zk_proof(&job_key, zk_request_id).await {
259+
match self.wait_for_zk_proof(&job, &job_key, zk_request_id).await {
262260
Ok(zk_proof) => {
263261
info!("🎉 {job:?} Finished!");
264262
job_status = JobStatus::ZkProofFinished(zk_proof);
265263
self.finalize_job(&job_key, job_status)?;
266264
}
267265
Err(e) => {
268266
error!("{job:?} failed progressing RemoteZkProofPending: {e}");
269-
job_status = JobStatus::Failed(
270-
e,
271-
Some(JobStatus::RemoteZkProofPending(zk_request_id).into()),
272-
);
273-
self.finalize_job(&job_key, job_status)?;
267+
// NOTE: we internally finalize the job in `handle_zk_client_error`
274268
}
275269
}
276270
debug!("Remote ZK request fulfilled, result stored in DB");
@@ -287,14 +281,8 @@ impl PdaRunner {
287281
self.finalize_job(&job_key, job_status)?;
288282
}
289283
Err(e) => {
290-
error!(
291-
"0x{} - Failed progressing LocalZkProofPending: {e}",
292-
hex::encode(&job_key)
293-
);
294-
job_status = JobStatus::Failed(
295-
e, None, // TODO: should this be retry-able?
296-
);
297-
self.finalize_job(&job_key, job_status)?;
284+
error!("{job:?} - Failed progressing LocalZkProofPending: {e}",);
285+
// NOTE: we internally finalize the job in `handle_zk_client_error`
298286
}
299287
};
300288
debug!("0x{} - ZKP stored in finalized DB", hex::encode(job_key));
@@ -415,45 +403,42 @@ impl PdaRunner {
415403
fn handle_zk_client_error(
416404
&self,
417405
zk_client_error: &SP1NetworkError,
406+
job: &Job,
418407
job_key: &[u8],
419408
) -> PdaRunnerError {
420409
error!("SP1 Client error: {zk_client_error}");
421410
let (e, job_status);
422411
match zk_client_error {
423412
SP1NetworkError::SimulationFailed | SP1NetworkError::RequestUnexecutable { .. } => {
424-
e = PdaRunnerError::DaClientError(format!(
425-
"ZKP program critical failure: {zk_client_error} occurred for job 0x{} PLEASE REPORT!",
426-
hex::encode(job_key)
413+
e = PdaRunnerError::ZkClientError(format!(
414+
"ZKP program critical failure: {zk_client_error} occurred for {job:?} PLEASE REPORT!"
427415
));
428416
job_status = JobStatus::Failed(e.clone(), None);
429417
}
430418
SP1NetworkError::RequestUnfulfillable { .. } => {
431-
e = PdaRunnerError::DaClientError(format!(
432-
"ZKP network failure: {zk_client_error} occurred for job 0x{} PLEASE REPORT!",
433-
hex::encode(job_key)
419+
e = PdaRunnerError::ZkClientError(format!(
420+
"ZKP network failure: {zk_client_error} occurred for {job:?} PLEASE REPORT!"
434421
));
435422
job_status = JobStatus::Failed(e.clone(), None);
436423
}
437-
SP1NetworkError::RequestTimedOut { request_id } => {
438-
e = PdaRunnerError::DaClientError(format!(
439-
"ZKP network: {zk_client_error} occurred for job 0x{}",
440-
hex::encode(job_key)
424+
SP1NetworkError::RequestTimedOut { .. } => {
425+
e = PdaRunnerError::ZkClientError(format!(
426+
"ZKP network: {zk_client_error} occurred for {job:?} - callback to start the job over"
441427
));
442-
let id = request_id
443-
.as_slice()
444-
.try_into()
445-
.expect("request ID is always correct length");
428+
429+
// TODO: We cannot clone KeccakInclusionToDataRootProofInput thus we cannot insert into a JobStatus::DataAvailable(proof_input)
430+
// So we just redo the work from scratch for the DA side as a stupid workaround
446431
job_status =
447-
JobStatus::Failed(e.clone(), Some(JobStatus::RemoteZkProofPending(id).into()));
432+
JobStatus::Failed(e.clone(), Some(JobStatus::RemoteZkProofRequesting.into()));
448433
}
449434
SP1NetworkError::RpcError(_) | SP1NetworkError::Other(_) => {
450-
e = PdaRunnerError::DaClientError(format!(
451-
"ZKP network failure: {zk_client_error} occurred for job 0x{} PLEASE REPORT!",
452-
hex::encode(job_key)
435+
e = PdaRunnerError::ZkClientError(format!(
436+
"ZKP network failure: {zk_client_error} occurred for {job:?} PLEASE REPORT!"
453437
));
454-
// TODO: We cannot clone thus we cannot insert into a JobStatus::...(proof_input)
438+
// TODO: We cannot clone KeccakInclusionToDataRootProofInput thus we cannot insert into a JobStatus::DataAvailable(proof_input)
455439
// So we just redo the work from scratch for the DA side as a stupid workaround
456-
job_status = JobStatus::Failed(e.clone(), None);
440+
job_status =
441+
JobStatus::Failed(e.clone(), Some(JobStatus::RemoteZkProofRequesting.into()));
457442
}
458443
}
459444
match self.finalize_job(job_key, job_status) {
@@ -502,8 +487,9 @@ impl PdaRunner {
502487
// TODO: how to handle errors without a concrete type? Anyhow is not the right thing for us...
503488
.map_err(|e| {
504489
if let Some(down) = e.downcast_ref::<SP1NetworkError>() {
505-
return self.handle_zk_client_error(down, job_key);
490+
return self.handle_zk_client_error(down, job, job_key);
506491
}
492+
error!("Unhandled Error: {e:?}");
507493
PdaRunnerError::ZkClientError(format!("Unhandled Error: {e} PLEASE REPORT"))
508494
})?;
509495
debug!("0x{} - Proof complete", hex::encode(job_key));
@@ -554,13 +540,13 @@ impl PdaRunner {
554540
.strategy(strategy)
555541
.groth16()
556542
.skip_simulation(false)
557-
// .timeout(std::time::Duration::from_secs(60))
543+
.timeout(self.config.zk_proof_gen_timeout_remote) // Time allowed for provers to *attempt* a job, or it permanently fails.
558544
.request_async()
559545
.await
560546
// TODO: how to handle errors without a concrete type? Anyhow is not the right thing for us...
561547
.map_err(|e| {
562548
if let Some(down) = e.downcast_ref::<SP1NetworkError>() {
563-
return self.handle_zk_client_error(down, job_key);
549+
return self.handle_zk_client_error(down, job, job_key);
564550
}
565551
PdaRunnerError::ZkClientError(format!("Unhandled Error: {e} PLEASE REPORT"))
566552
})?
@@ -576,31 +562,28 @@ impl PdaRunner {
576562
/// Await a proof request from Succinct's prover network
577563
async fn wait_for_zk_proof(
578564
&self,
565+
job: &Job,
579566
job_key: &[u8],
580567
request_id: util::SuccNetJobId,
581568
) -> Result<SP1ProofWithPublicValues, PdaRunnerError> {
569+
debug!(
570+
"Waiting for proof from prover network with timeout: {} seconds",
571+
self.config.zk_proof_gen_timeout_remote.as_secs()
572+
);
582573
debug!("Waiting for proof from prover network");
583574
let zk_client_handle = self.get_zk_client_remote().await;
584575

585576
let proof = zk_client_handle
586577
.wait_proof(request_id.into(), None)
587578
.await
588579
.map_err(|e| {
589-
error!("UNHANDLED ZK client error: {e:?}");
590-
let e = PdaRunnerError::ZkClientError(
591-
"UNKNOWN ZK client error. PLEASE REPORT!".to_string(),
592-
);
593-
match self.finalize_job(
594-
job_key,
595-
JobStatus::Failed(
596-
e.clone(),
597-
Some(JobStatus::RemoteZkProofPending(request_id).into()),
598-
),
599-
) {
600-
Ok(_) => e,
601-
Err(internal_err) => internal_err,
580+
if let Some(down) = e.downcast_ref::<SP1NetworkError>() {
581+
return self.handle_zk_client_error(down, job, job_key);
602582
}
583+
error!("UNHANDLED ZK client error: {e:?}");
584+
PdaRunnerError::ZkClientError(format!("Unhandled Error: {e} PLEASE REPORT"))
603585
})?;
586+
604587
Ok(proof)
605588
}
606589

service/src/main.rs

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ use rustls::ServerConfig;
1717
use serde_json::json;
1818
use sha2::{Digest, Sha256};
1919
use sp1_sdk::SP1ProofWithPublicValues;
20-
use std::{net::SocketAddr, sync::Arc};
20+
use std::{net::SocketAddr, sync::Arc, time::Duration};
2121
use tokio::{
2222
net::TcpListener,
2323
sync::{OnceCell, mpsc},
@@ -96,8 +96,22 @@ async fn main() -> Result<()> {
9696

9797
info!("Building clients and service setup");
9898
let (job_sender, job_receiver) = mpsc::unbounded_channel::<Option<Job>>();
99+
100+
let zk_proof_gen_timeout_remote = Duration::from_secs(
101+
std::env::var("PROOF_GEN_TIMEOUT_SECONDS_REMOTE")
102+
.expect("PROOF_GEN_TIMEOUT_SECONDS_REMOTE env var required")
103+
.parse()
104+
.expect("PROOF_GEN_TIMEOUT_SECONDS_REMOTE must be integer"),
105+
);
106+
info!(
107+
"Prover network proof generation timeout configured to {} seconds (permanent failure, and new request needed if hit)",
108+
zk_proof_gen_timeout_remote.as_secs()
109+
);
110+
99111
let pda_runner = Arc::new(PdaRunner::new(
100-
PdaRunnerConfig {},
112+
PdaRunnerConfig {
113+
zk_proof_gen_timeout_remote,
114+
},
101115
OnceCell::new(),
102116
OnceCell::new(),
103117
config_db.clone(),

0 commit comments

Comments
 (0)