@@ -9,7 +9,7 @@ use sp1_sdk::{
99 SP1Stdin ,
1010 network:: { Error as SP1NetworkError , FulfillmentStrategy } ,
1111} ;
12- use std:: sync:: Arc ;
12+ use std:: { sync:: Arc , time :: Duration } ;
1313use tokio:: sync:: { OnceCell , mpsc} ;
1414
1515/// Hardcoded ELF binary for the crate `program-keccak-inclusion`
@@ -40,12 +40,14 @@ pub async fn get_program_id() -> SuccNetProgramId {
4040
4141static CHACHA_SETUP : OnceCell < Arc < SP1ProofSetup > > = OnceCell :: const_new ( ) ;
4242
43- /// TODO: setup ability to config as needed
44- pub struct PdaRunnerConfig { }
43+ /// Configuration for the PDA Runner
44+ pub struct PdaRunnerConfig {
45+ /// Timeout in seconds for prover network proof requests
46+ pub zk_proof_gen_timeout_remote : Duration ,
47+ }
4548
4649/// The main service runner.
4750pub struct PdaRunner {
48- #[ allow( dead_code) ] // TODO: use config
4951 pub config : PdaRunnerConfig ,
5052 pub config_db : SledTree ,
5153 pub queue_db : SledTree ,
@@ -247,30 +249,22 @@ impl PdaRunner {
247249 }
248250 Err ( e) => {
249251 error ! ( "{job:?} failed to request a proof: {e}" ) ;
250- job_status = JobStatus :: Failed (
251- e,
252- Some ( JobStatus :: RemoteZkProofRequesting . into ( ) ) ,
253- ) ;
254- self . finalize_job ( & job_key, job_status) ?;
252+ // NOTE: we internally finalize the job in `handle_zk_client_error`
255253 }
256254 } ;
257255 debug ! ( "ZK proof request sent" ) ;
258256 }
259257 JobStatus :: RemoteZkProofPending ( zk_request_id) => {
260258 debug ! ( "ZK request waiting" ) ;
261- match self . wait_for_zk_proof ( & job_key, zk_request_id) . await {
259+ match self . wait_for_zk_proof ( & job , & job_key, zk_request_id) . await {
262260 Ok ( zk_proof) => {
263261 info ! ( "🎉 {job:?} Finished!" ) ;
264262 job_status = JobStatus :: ZkProofFinished ( zk_proof) ;
265263 self . finalize_job ( & job_key, job_status) ?;
266264 }
267265 Err ( e) => {
268266 error ! ( "{job:?} failed progressing RemoteZkProofPending: {e}" ) ;
269- job_status = JobStatus :: Failed (
270- e,
271- Some ( JobStatus :: RemoteZkProofPending ( zk_request_id) . into ( ) ) ,
272- ) ;
273- self . finalize_job ( & job_key, job_status) ?;
267+ // NOTE: we internally finalize the job in `handle_zk_client_error`
274268 }
275269 }
276270 debug ! ( "Remote ZK request fulfilled, result stored in DB" ) ;
@@ -287,14 +281,8 @@ impl PdaRunner {
287281 self . finalize_job ( & job_key, job_status) ?;
288282 }
289283 Err ( e) => {
290- error ! (
291- "0x{} - Failed progressing LocalZkProofPending: {e}" ,
292- hex:: encode( & job_key)
293- ) ;
294- job_status = JobStatus :: Failed (
295- e, None , // TODO: should this be retry-able?
296- ) ;
297- self . finalize_job ( & job_key, job_status) ?;
284+ error ! ( "{job:?} - Failed progressing LocalZkProofPending: {e}" , ) ;
285+ // NOTE: we internally finalize the job in `handle_zk_client_error`
298286 }
299287 } ;
300288 debug ! ( "0x{} - ZKP stored in finalized DB" , hex:: encode( job_key) ) ;
@@ -415,45 +403,42 @@ impl PdaRunner {
415403 fn handle_zk_client_error (
416404 & self ,
417405 zk_client_error : & SP1NetworkError ,
406+ job : & Job ,
418407 job_key : & [ u8 ] ,
419408 ) -> PdaRunnerError {
420409 error ! ( "SP1 Client error: {zk_client_error}" ) ;
421410 let ( e, job_status) ;
422411 match zk_client_error {
423412 SP1NetworkError :: SimulationFailed | SP1NetworkError :: RequestUnexecutable { .. } => {
424- e = PdaRunnerError :: DaClientError ( format ! (
425- "ZKP program critical failure: {zk_client_error} occurred for job 0x{} PLEASE REPORT!" ,
426- hex:: encode( job_key)
413+ e = PdaRunnerError :: ZkClientError ( format ! (
414+ "ZKP program critical failure: {zk_client_error} occurred for {job:?} PLEASE REPORT!"
427415 ) ) ;
428416 job_status = JobStatus :: Failed ( e. clone ( ) , None ) ;
429417 }
430418 SP1NetworkError :: RequestUnfulfillable { .. } => {
431- e = PdaRunnerError :: DaClientError ( format ! (
432- "ZKP network failure: {zk_client_error} occurred for job 0x{} PLEASE REPORT!" ,
433- hex:: encode( job_key)
419+ e = PdaRunnerError :: ZkClientError ( format ! (
420+ "ZKP network failure: {zk_client_error} occurred for {job:?} PLEASE REPORT!"
434421 ) ) ;
435422 job_status = JobStatus :: Failed ( e. clone ( ) , None ) ;
436423 }
437- SP1NetworkError :: RequestTimedOut { request_id } => {
438- e = PdaRunnerError :: DaClientError ( format ! (
439- "ZKP network: {zk_client_error} occurred for job 0x{}" ,
440- hex:: encode( job_key)
424+ SP1NetworkError :: RequestTimedOut { .. } => {
425+ e = PdaRunnerError :: ZkClientError ( format ! (
426+ "ZKP network: {zk_client_error} occurred for {job:?} - callback to start the job over"
441427 ) ) ;
442- let id = request_id
443- . as_slice ( )
444- . try_into ( )
445- . expect ( "request ID is always correct length" ) ;
428+
429+ // TODO: We cannot clone KeccakInclusionToDataRootProofInput thus we cannot insert into a JobStatus::DataAvailable(proof_input)
430+ // So we just redo the work from scratch for the DA side as a stupid workaround
446431 job_status =
447- JobStatus :: Failed ( e. clone ( ) , Some ( JobStatus :: RemoteZkProofPending ( id ) . into ( ) ) ) ;
432+ JobStatus :: Failed ( e. clone ( ) , Some ( JobStatus :: RemoteZkProofRequesting . into ( ) ) ) ;
448433 }
449434 SP1NetworkError :: RpcError ( _) | SP1NetworkError :: Other ( _) => {
450- e = PdaRunnerError :: DaClientError ( format ! (
451- "ZKP network failure: {zk_client_error} occurred for job 0x{} PLEASE REPORT!" ,
452- hex:: encode( job_key)
435+ e = PdaRunnerError :: ZkClientError ( format ! (
436+ "ZKP network failure: {zk_client_error} occurred for {job:?} PLEASE REPORT!"
453437 ) ) ;
454- // TODO: We cannot clone thus we cannot insert into a JobStatus::... (proof_input)
438+ // TODO: We cannot clone KeccakInclusionToDataRootProofInput thus we cannot insert into a JobStatus::DataAvailable (proof_input)
455439 // So we just redo the work from scratch for the DA side as a stupid workaround
456- job_status = JobStatus :: Failed ( e. clone ( ) , None ) ;
440+ job_status =
441+ JobStatus :: Failed ( e. clone ( ) , Some ( JobStatus :: RemoteZkProofRequesting . into ( ) ) ) ;
457442 }
458443 }
459444 match self . finalize_job ( job_key, job_status) {
@@ -502,8 +487,9 @@ impl PdaRunner {
502487 // TODO: how to handle errors without a concrete type? Anyhow is not the right thing for us...
503488 . map_err ( |e| {
504489 if let Some ( down) = e. downcast_ref :: < SP1NetworkError > ( ) {
505- return self . handle_zk_client_error ( down, job_key) ;
490+ return self . handle_zk_client_error ( down, job , job_key) ;
506491 }
492+ error ! ( "Unhandled Error: {e:?}" ) ;
507493 PdaRunnerError :: ZkClientError ( format ! ( "Unhandled Error: {e} PLEASE REPORT" ) )
508494 } ) ?;
509495 debug ! ( "0x{} - Proof complete" , hex:: encode( job_key) ) ;
@@ -554,13 +540,13 @@ impl PdaRunner {
554540 . strategy ( strategy)
555541 . groth16 ( )
556542 . skip_simulation ( false )
557- // .timeout(std::time::Duration::from_secs(60))
543+ . timeout ( self . config . zk_proof_gen_timeout_remote ) // Time allowed for provers to *attempt* a job, or it permanently fails.
558544 . request_async ( )
559545 . await
560546 // TODO: how to handle errors without a concrete type? Anyhow is not the right thing for us...
561547 . map_err ( |e| {
562548 if let Some ( down) = e. downcast_ref :: < SP1NetworkError > ( ) {
563- return self . handle_zk_client_error ( down, job_key) ;
549+ return self . handle_zk_client_error ( down, job , job_key) ;
564550 }
565551 PdaRunnerError :: ZkClientError ( format ! ( "Unhandled Error: {e} PLEASE REPORT" ) )
566552 } ) ?
@@ -576,31 +562,28 @@ impl PdaRunner {
576562 /// Await a proof request from Succinct's prover network
577563 async fn wait_for_zk_proof (
578564 & self ,
565+ job : & Job ,
579566 job_key : & [ u8 ] ,
580567 request_id : util:: SuccNetJobId ,
581568 ) -> Result < SP1ProofWithPublicValues , PdaRunnerError > {
569+ debug ! (
570+ "Waiting for proof from prover network with timeout: {} seconds" ,
571+ self . config. zk_proof_gen_timeout_remote. as_secs( )
572+ ) ;
582573 debug ! ( "Waiting for proof from prover network" ) ;
583574 let zk_client_handle = self . get_zk_client_remote ( ) . await ;
584575
585576 let proof = zk_client_handle
586577 . wait_proof ( request_id. into ( ) , None )
587578 . await
588579 . map_err ( |e| {
589- error ! ( "UNHANDLED ZK client error: {e:?}" ) ;
590- let e = PdaRunnerError :: ZkClientError (
591- "UNKNOWN ZK client error. PLEASE REPORT!" . to_string ( ) ,
592- ) ;
593- match self . finalize_job (
594- job_key,
595- JobStatus :: Failed (
596- e. clone ( ) ,
597- Some ( JobStatus :: RemoteZkProofPending ( request_id) . into ( ) ) ,
598- ) ,
599- ) {
600- Ok ( _) => e,
601- Err ( internal_err) => internal_err,
580+ if let Some ( down) = e. downcast_ref :: < SP1NetworkError > ( ) {
581+ return self . handle_zk_client_error ( down, job, job_key) ;
602582 }
583+ error ! ( "UNHANDLED ZK client error: {e:?}" ) ;
584+ PdaRunnerError :: ZkClientError ( format ! ( "Unhandled Error: {e} PLEASE REPORT" ) )
603585 } ) ?;
586+
604587 Ok ( proof)
605588 }
606589
0 commit comments