94 changes: 93 additions & 1 deletion xfel/merging/application/phil/phil.py
@@ -790,6 +790,97 @@
}
"""

prepare_phil = """
prepare
.help = The PREPARE section defines operations to prepare data for downstream analysis
{
spread
.help = Prepare data for SpReAD (Spectral Resolved Anomalous Diffraction) analysis.
.help = Bins experiments by energy, writes to disk, and generates batch scripts
.help = for stage 2 merging and phenix refinement.
{
binning = *count width
.type = choice
.help = Binning mode: count for equal-count percentile bins, width for equal-width energy bins
n_energy_bins = 100
.type = int
.help = Number of energy bins (percentiles) for partitioning the dataset (count mode only)
window_width = 20
.type = int
.help = Width of the sliding window in percentile units for stage 2 slices (count mode only)
window_step = 1
.type = int
.help = Step size for sliding window center in percentile units (count mode only)
bin_start_eV = None
.type = float
.help = Start energy in eV for width binning mode
bin_end_eV = None
.type = float
.help = End energy in eV for width binning mode
bin_width_eV = 8
.type = float
.help = Width of sliding window in eV for stage 2 slices (width mode only)
bin_step_eV = 1
.type = float
.help = Step size for sliding window in eV (width mode only)
output_dir = None
.type = path
.help = Directory for writing energy-binned files. If None, uses output.output_dir
stage2_phil = None
.type = path
.help = Path to phil file containing base parameters for stage 2 merge jobs
stage2_nproc = 128
.type = int
.help = Number of MPI ranks for stage 2 merge jobs
stage2_nnodes = 2
.type = int
.help = Number of nodes for stage 2 merge jobs
stage2_output_dir = None
.type = path
.help = Directory for stage 2 merge outputs. If None, uses output_dir/stage2
phenix_phil = None
.type = path
.help = Path to phil/eff file containing phenix.refine parameters
phenix_pdb = None
.type = path
.help = Path to starting model PDB for phenix refinement
n_anomalous_scatterers = 1
.type = int
.help = Number of anomalous scatterers to extract f' and f'' values for
statistics_bin_i = None
.type = int
.help = Resolution bin index (1-based, from merging statistics table) to extract multiplicity from
mtz_name = iobs_all.mtz
.type = str
.help = Name of the merged MTZ file produced by stage 2 merge
slurm_partition = None
.type = str
.help = SLURM partition for batch jobs
slurm_account = None
.type = str
.help = SLURM account for batch jobs
slurm_time_limit = 00:30:00
.type = str
.help = SLURM time limit for each array task
slurm_constraint = None
.type = str
.help = SLURM constraint (e.g. cpu, gpu) for batch jobs
slurm_qos = None
.type = str
.help = SLURM QOS (quality of service) for batch jobs
slurm_array_concurrency = 8
.type = int
.help = Maximum number of SLURM array tasks to run simultaneously
cctbx_activate = None
.type = path
.help = Path to cctbx activation script (sourced before stage 2 merge jobs)
phenix_activate = None
.type = path
.help = Path to phenix activation script (sourced before phenix refinements)
}
}
"""


# A place to override any defaults included from elsewhere
program_defaults_phil_str = """
@@ -799,7 +890,8 @@
master_phil = dispatch_phil + input_phil + tdata_phil + filter_phil + modify_phil + \
select_phil + scaling_phil + postrefinement_phil + merging_phil + \
output_phil + statistics_phil + group_phil + lunus_phil + \
publish_phil + diffbragg_phil + monitor_phil + filter_global_phil
publish_phil + diffbragg_phil + monitor_phil + filter_global_phil + \
prepare_phil

import os, importlib
custom_phil_pathstr = os.environ.get('XFEL_CUSTOM_WORKER_PATH')
138 changes: 138 additions & 0 deletions xfel/merging/application/prepare/README.md
@@ -0,0 +1,138 @@
# Prepare for the SpReAD pipeline

In recent experiments we have attempted SpReAD refinements with a "moving window"
of energy-sliced sub-datasets. In conjunction with an energy-scanned X-ray beam,
this energy slicing technique can provide quasi-monochromatic datasets across the
full range of an absorption edge, thus allowing the refinement of anomalous
scattering curves.

This worker provides a convenient interface to the full quasi-monochromatic SpReAD
pipeline:

- Stage 1: The full energy-scanning dataset is sorted by energy and sliced into
a large number (say 100) of sub-datasets.
- Stage 2: Following the "moving window" technique, the fine-sliced datasets
are reassembled into larger, overlapping slices for merging. The first merge
might accumulate data from percentiles 0 through 20; the second then covers
percentiles 1 through 21, the third percentiles 2 through 22, and so on. Thus,
for a window width of 20% and a step of 1%, we generate 81 sub-datasets.
- For each merged sub-dataset, the anomalous scattering factors are refined in
Phenix.
- The Phenix logs are analyzed to give a results file with the sub-dataset average
wavelengths and the refined anomalous scattering factors.
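The window bookkeeping for the count-based slicing above can be sketched as
follows (a minimal illustration with hypothetical names, not the worker's
actual code):

```python
def sliding_windows(n_bins=100, width=20, step=1):
    """Yield (start, stop) percentile ranges for the moving window.

    With 100 fine slices, a window width of 20 and a step of 1,
    this produces 81 overlapping windows:
    (0, 20), (1, 21), ..., (80, 100).
    """
    for start in range(0, n_bins - width + 1, step):
        yield (start, start + width)
```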

### Job coordination

Stage 1 is a single task performed on a large dataset; we have implemented
it as a standard merging worker, `prepare_spread`. Stage 2 is a series of many
smaller merging jobs, followed by the same number of Phenix refinement jobs.
Since it is not always practical to run the full Stage 1/Stage 2 pipeline in
a single pass, we write a Stage 2 batch script that the user may submit to
the queuing system at their discretion. The Stage 2 batch script is implemented
as a Slurm "array job" in which N tasks are launched from a common batch
script. The first N-1 tasks are cctbx.xfel.merge runs that generate .mtz files
for the windowed datasets; the final task is a parallel run of all N-1 Phenix
refinements.
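In outline, the generated batch script follows the pattern below (a simplified
Python sketch that composes a hypothetical script; the `--array` and `--time`
directives are standard Slurm options, but the worker's actual output differs
in detail):

```python
def write_stage2_script(n_windows, concurrency=8, time_limit="00:30:00"):
    """Compose a Slurm array-job script: tasks 0..n_windows-1 each run
    one cctbx.xfel.merge job; the final task (index n_windows) runs all
    Phenix refinements in parallel once the merges are available."""
    return "\n".join([
        "#!/bin/bash",
        # n_windows + 1 array tasks, at most `concurrency` running at once
        f"#SBATCH --array=0-{n_windows}%{concurrency}",
        f"#SBATCH --time={time_limit}",
        f'if [ "$SLURM_ARRAY_TASK_ID" -lt {n_windows} ]; then',
        "  cctbx.xfel.merge window_${SLURM_ARRAY_TASK_ID}.phil",
        "else",
        f"  for i in $(seq 0 {n_windows - 1}); do",
        "    phenix.refine refine_${i}.eff &",
        "  done",
        "  wait",
        "fi",
        "",
    ])
```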

### Usage

Include this worker in the step list of a standard merging run. A possible
example follows; this text would be suitable for pasting into an XFEL GUI
dataset merging task.

```
dispatch.step_list=input model_scaling statistics_unitcell statistics_beam model_statistics statistics_resolution prepare_spread group errors_merge statistics_intensity merge statistics_intensity_cxi publish
input.parallel_file_load.method=uniform
scaling.model=/path/to/model.pdb
scaling.resolution_scalar=0.96
statistics.n_bins=20
merging.d_min=1.9
merging.merge_anomalous=False
prepare.spread {
stage2_phil=/path/to/merging/params.phil
phenix_phil=/path/to/refinement/params.phil
phenix_pdb=/path/to/model.pdb
slurm_qos=realtime
slurm_account=lcls
slurm_constraint=cpu
slurm_time_limit=120
stage2_nnodes=2
n_anomalous_scatterers=8
binning=width
bin_start_eV=6515
bin_end_eV=6585
bin_width_eV=8
statistics_bin_i=16
cctbx_activate=/path/to/cctbx/environment/setup.sh
phenix_activate=/path/to/phenix/environment/setup.sh
}
publish.drive.credential_file=<redacted>
publish.drive.shared_folder_id=<redacted>
```

#### prepare.spread phil parameters

- `stage2_phil`: The path to a separate phil file for the Stage 2 merging jobs.
Paths will be included automatically. A suitable example:

```
dispatch.step_list=input model_scaling filter statistics_unitcell statistics_beam model_statistics statistics_resolution group errors_merge statistics_intensity merge statistics_intensity_cxi
input.parallel_file_load.method=uniform
scaling.model=/path/to/model.pdb
scaling.resolution_scalar=0.96
statistics.n_bins=20
merging.d_min=1.9
merging.merge_anomalous=False
merging.error.model=mm24
```
- `phenix_phil`: A phenix phil file with refinement of the anomalous scattering
factors activated. The important part is this excerpt:
```
refine {
strategy = group_anomalous

anomalous_scatterers {
group {
selection = "element Ca"
f_prime = 0.2893659
f_double_prime = 1.637287
}
group {
selection = "name Mn1 and chain A"
f_prime = -1.837302
f_double_prime = 3.480123
}
group { [...]
}
}
}
```
- `phenix_pdb`: The starting model for the phenix refinements. This should be a
fully converged model from a refinement against a "remote" dataset, where there
is no significant uncertainty in the anomalous scattering factors.
- `slurm_qos`, `slurm_account`, `slurm_constraint`, `slurm_time_limit`: Configuration
items for your local queuing system. Only Slurm is currently supported. Depending
on your local environment, you may replace `slurm_qos` with `slurm_partition`.
- `stage2_nnodes`: Number of nodes to request for the Stage 2 merging tasks.
- `n_anomalous_scatterers`: The number of entries to scrape from the phenix logs.
- `binning`: Choose `width` or `count`. See below for discussion.
- `bin_start_eV`: Left edge of the first energy bin.
- `bin_end_eV`: Right edge of the last energy bin.
- `bin_width_eV`: In this example, the first bin contains energies 6515 to 6523,
the second bin contains 6516 to 6524, third bin 6517 to 6525, etc.
- `statistics_bin_i`: The multiplicity of the sliced sub-datasets is reported for
the `i`-th bin in the merging logs.
- `cctbx_activate`: The path to an activation script for the cctbx/dials
installation (the same one used for regular processing).
- `phenix_activate`: The path to an activation script for a phenix installation.

#### Binning modes

In `count` mode, energy slicing is by percentile, so that each sub-dataset will
have the same multiplicity. Where energy coverage is weaker, the sub-dataset ranges
will be wider.

In `width` mode, energy slicing is by absolute energy, so that each sub-dataset
will cover the same range in eV. Where energy coverage is weaker, the sub-dataset
multiplicity will be lower.
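The contrast between the two modes can be sketched as follows (a toy
illustration with hypothetical names, not the worker's actual code):

```python
def width_windows(start_eV, end_eV, width_eV, step_eV=1):
    """width mode: sliding windows of fixed eV width; the number of
    experiments falling in each window varies with spectral coverage."""
    windows = []
    lo = start_eV
    while lo + width_eV <= end_eV:
        windows.append((lo, lo + width_eV))
        lo += step_eV
    return windows

def count_slices(energies, n_bins):
    """count mode: percentile slicing; every slice holds (nearly) the
    same number of experiments, while its eV range floats."""
    ordered = sorted(energies)
    n = len(ordered)
    return [ordered[i * n // n_bins:(i + 1) * n // n_bins]
            for i in range(n_bins)]
```

With the example parameters from the usage section (6515 to 6585 eV, 8 eV
windows, 1 eV step), `width_windows` yields 63 windows from (6515, 6523)
through (6577, 6585).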
Empty file.
10 changes: 10 additions & 0 deletions xfel/merging/application/prepare/factory.py
@@ -0,0 +1,10 @@
from __future__ import absolute_import, division, print_function
from xfel.merging.application.prepare.prepare_spread import prepare_spread
from xfel.merging.application.worker import factory as factory_base

class factory(factory_base):
"""Factory class for preparing data for additional analysis."""
@staticmethod
def from_parameters(params, additional_info=None, mpi_helper=None, mpi_logger=None):
assert additional_info[0] in ['spread',]
return [prepare_spread(params, mpi_helper, mpi_logger)]