Skip to content

Commit dcb444f

Browse files
committed
Minimal changes for working tst_memory_policy.py
Memory savings are achieved through code specialization, for the case where pixel values are simulated on a small whitelist. The specializations are not yet optimal, as there is still a lot of code duplication. The changes give a ~4.5x reduction in memory footprint, but resizing the array m_accumulate_floatimage has not yet succeeded; attempts so far lead to a CUDA memory allocation error.
1 parent 510bc4d commit dcb444f

File tree

6 files changed

+71
-113
lines changed

6 files changed

+71
-113
lines changed

simtbx/kokkos/detector.cpp

Lines changed: 15 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -93,16 +93,16 @@ namespace simtbx { namespace Kokkos {
9393
}
9494

9595
template<>
96-
void kokkos_detector<small_whitelist_policy>::hello(){
97-
SCITBX_EXAMINE("small small small");
96+
std::string kokkos_detector<small_whitelist_policy>::hello(){
97+
return("small small small");
9898
}
9999
template<>
100-
void kokkos_detector<large_array_policy>::hello(){
101-
SCITBX_EXAMINE("large large large");
100+
std::string kokkos_detector<large_array_policy>::hello(){
101+
return("large large large");
102102
}
103103

104104
template<> void
105-
kokkos_detector<large_array_policy>::each_image_allocate() {
105+
kokkos_detector<large_array_policy>::each_image_allocate(const std::size_t& n_pixels) {
106106
resize(m_rangemap, m_total_pixel_count);
107107
resize(m_omega_reduction, m_total_pixel_count);
108108
resize(m_max_I_x_reduction, m_total_pixel_count);
@@ -140,9 +140,17 @@ namespace simtbx { namespace Kokkos {
140140

141141
// printf("DONE.\n");
142142
}
143+
143144
template<> void
144-
kokkos_detector<small_whitelist_policy>::each_image_allocate() {
145-
resize(m_maskimage, m_total_pixel_count);
145+
kokkos_detector<small_whitelist_policy>::each_image_allocate(const std::size_t& n_pixels) {
146+
SCITBX_ASSERT(n_pixels > 0);
147+
resize(m_rangemap, n_pixels);
148+
resize(m_omega_reduction, n_pixels);
149+
resize(m_max_I_x_reduction, n_pixels);
150+
resize(m_max_I_y_reduction, n_pixels);
151+
resize(m_floatimage, n_pixels);
152+
153+
resize(m_maskimage, n_pixels);
146154
kokkostbx::transfer_shared2kokkos(m_sdet_vector, metrology.sdet);
147155
kokkostbx::transfer_shared2kokkos(m_fdet_vector, metrology.fdet);
148156
kokkostbx::transfer_shared2kokkos(m_odet_vector, metrology.odet);
@@ -152,80 +160,5 @@ namespace simtbx { namespace Kokkos {
152160
kokkostbx::transfer_shared2kokkos(m_Ybeam, metrology.Ybeam);
153161
fence();
154162
}
155-
156-
template<>
157-
void
158-
kokkos_detector<large_array_policy>::set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value) {
159-
m_active_pixel_size = active_pixel_list_value.size();
160-
kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value);
161-
active_pixel_list = active_pixel_list_value;
162-
}
163-
164-
template<>
165-
void
166-
kokkos_detector<small_whitelist_policy>::set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value) {
167-
m_active_pixel_size = active_pixel_list_value.size();
168-
kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value);
169-
active_pixel_list = active_pixel_list_value;
170-
resize(m_rangemap, m_active_pixel_size);
171-
resize(m_omega_reduction, m_active_pixel_size);
172-
resize(m_max_I_x_reduction, m_active_pixel_size);
173-
resize(m_max_I_y_reduction, m_active_pixel_size);
174-
resize(m_floatimage, m_active_pixel_size);
175-
resize(m_accumulate_floatimage, m_active_pixel_size);
176-
fence();
177-
}
178-
179-
template<> af::shared<double>
180-
kokkos_detector<large_array_policy>::get_whitelist_raw_pixels(af::shared<std::size_t> selection) {
181-
hello();
182-
//return the data array for the multipanel detector case, but only for whitelist pixels
183-
vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", selection.size());
184-
kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection);
185-
186-
size_t output_pixel_size = selection.size();
187-
vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size);
188-
189-
auto temp = m_accumulate_floatimage;
190-
191-
parallel_for("get_active_pixel_selection",
192-
range_policy(0, output_pixel_size),
193-
KOKKOS_LAMBDA (const int i) {
194-
size_t index = active_pixel_selection( i );
195-
active_pixel_results( i ) = temp( index );
196-
});
197-
198-
af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
199-
kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results);
200-
201-
SCITBX_ASSERT(output_array.size() == output_pixel_size);
202-
return output_array;
203-
}
204-
template<> af::shared<double>
205-
kokkos_detector<small_whitelist_policy>::get_whitelist_raw_pixels(af::shared<std::size_t> selection) {
206-
SCITBX_CHECK_POINT;
207-
hello();
208-
//return the data array for the multipanel detector case, but only for whitelist pixels
209-
210-
std::size_t output_pixel_size = selection.size();
211-
//vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size);
212-
213-
//auto temp = m_accumulate_floatimage;
214-
215-
//parallel_for("get_active_pixel_selection2",
216-
// range_policy(0, output_pixel_size),
217-
// KOKKOS_LAMBDA (const int i) {
218-
// active_pixel_results( i ) = temp( i );
219-
//});
220-
221-
af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
222-
SCITBX_CHECK_POINT;
223-
kokkostbx::transfer_kokkos2shared(output_array, m_accumulate_floatimage);//active_pixel_results);
224-
SCITBX_CHECK_POINT;
225-
226-
SCITBX_ASSERT(output_array.size() == output_pixel_size);
227-
return output_array;
228-
}
229-
230163
} // Kokkos
231164
} // simtbx

simtbx/kokkos/detector.h

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ using vec3 = kokkostbx::vector3<CUDAREAL>;
2323
using mat3 = kokkostbx::matrix3<CUDAREAL>;
2424
using Kokkos::fence;
2525

26+
2627
namespace simtbx { namespace Kokkos {
2728

2829
namespace af = scitbx::af;
@@ -45,8 +46,7 @@ struct large_array_policy {};
4546
struct small_whitelist_policy {};
4647

4748
template <typename MemoryPolicy>
48-
struct kokkos_detector
49-
{
49+
struct kokkos_detector{
5050
inline kokkos_detector(){printf("NO OPERATION, DEVICE NUMBER IS NEEDED");};
5151
//kokkos_detector(int const&, const simtbx::nanoBragg::nanoBragg& nB);
5252
//kokkos_detector(int const&, dxtbx::model::Detector const &, dxtbx::model::Beam const &);
@@ -56,12 +56,12 @@ struct kokkos_detector
5656
std::cout << "Detector size: " << m_panel_count << " panel" << ( (m_panel_count>1)? "s" : "" ) << std::endl;
5757
metrology.show();
5858
}
59-
//void each_image_allocate();
59+
void each_image_allocate(const std::size_t&);
6060
//void scale_in_place(const double&);
6161
//void write_raw_pixels(simtbx::nanoBragg::nanoBragg&);
6262
//af::flex_double get_raw_pixels();
6363
//void set_active_pixels_on_GPU(af::shared<std::size_t>);
64-
af::shared<double> get_whitelist_raw_pixels(af::shared<std::size_t>);
64+
//af::shared<double> get_whitelist_raw_pixels(af::shared<std::size_t>);
6565
inline void each_image_free(){} //no op in Kokkos
6666
int h_deviceID;
6767

@@ -155,8 +155,6 @@ struct kokkos_detector
155155
return view_floatimage;
156156
};
157157

158-
void each_image_allocate();
159-
160158
inline void
161159
scale_in_place(const double& factor){
162160
auto local_accumulate_floatimage = m_accumulate_floatimage;
@@ -165,8 +163,6 @@ struct kokkos_detector
165163
});
166164
}
167165

168-
void set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value);
169-
170166
inline void
171167
write_raw_pixels(simtbx::nanoBragg::nanoBragg& nB) {
172168
//only implement the monolithic detector case, one panel
@@ -203,11 +199,41 @@ struct kokkos_detector
203199
return output_array;
204200
}
205201

206-
void hello();
202+
inline void
203+
set_active_pixels_on_GPU(af::shared<std::size_t> active_pixel_list_value) {
204+
m_active_pixel_size = active_pixel_list_value.size();
205+
kokkostbx::transfer_shared2kokkos(m_active_pixel_list, active_pixel_list_value);
206+
active_pixel_list = active_pixel_list_value;
207+
}
207208

208-
};
209+
// Return the values of m_accumulate_floatimage at the indices listed in
// `selection`, as a 1D array with one entry per requested index.
//
// selection: indices into m_accumulate_floatimage. They are used unchecked
//            inside the kernel, so every index must be within that view's
//            extent.
// returns:   af::shared<double> of length selection.size(); element i holds
//            m_accumulate_floatimage(selection[i]).
inline af::shared<double>
get_whitelist_raw_pixels(af::shared<std::size_t> selection) {
  const std::size_t output_pixel_size = selection.size();

  // The size is a std::size_t, so it must be printed with %zu; the previous
  // %d conversion mismatched the argument type (undefined behavior, and wrong
  // output where size_t is wider than int).
  printf("algorithm: %20s selection size %10zu\n", hello().c_str(), output_pixel_size);

  //return the data array for the multipanel detector case, but only for whitelist pixels
  vector_size_t active_pixel_selection = vector_size_t("active_pixel_selection", output_pixel_size);
  // NOTE(review): presumably copies the host selection into the Kokkos view -- confirm.
  kokkostbx::transfer_shared2kokkos(active_pixel_selection, selection);

  vector_cudareal_t active_pixel_results = vector_cudareal_t("active_pixel_results", output_pixel_size);

  // Local handle to the member view for use inside the lambda.
  // NOTE(review): presumably so that KOKKOS_LAMBDA captures the view itself
  // rather than `this` -- confirm.
  auto temp = m_accumulate_floatimage;

  // Gather: result(i) = accumulated_image(selection(i)).
  parallel_for("get_active_pixel_selection",
               range_policy(0, output_pixel_size),
               KOKKOS_LAMBDA (const int i) {
    size_t index = active_pixel_selection( i );
    active_pixel_results( i ) = temp( index );
  });

  // Output storage is left uninitialized (init_functor_null) because the
  // transfer below is expected to fill it.
  af::shared<double> output_array(output_pixel_size, af::init_functor_null<double>());
  kokkostbx::transfer_kokkos2shared(output_array, active_pixel_results);

  SCITBX_ASSERT(output_array.size() == output_pixel_size);
  return output_array;
}
234+
235+
std::string hello();
236+
};
211237
} // Kokkos
212238
} // simtbx
213239
#endif // SIMTBX_KOKKOS_DETECTOR_H

simtbx/kokkos/kokkos_ext.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ namespace simtbx { namespace Kokkos {
8787
.def("show_summary",&simtbx::Kokkos::kokkos_detector<memory_t>::show_summary)
8888
.def("each_image_allocate",
8989
&simtbx::Kokkos::kokkos_detector<memory_t>::each_image_allocate,
90+
( arg_("n_pixels")=0 ),
9091
"Allocate large pixel arrays")
9192
.def("scale_in_place", &simtbx::Kokkos::kokkos_detector<memory_t>::scale_in_place,
9293
"Multiply by a scale factor on the GPU")
@@ -95,7 +96,6 @@ namespace simtbx { namespace Kokkos {
9596
.def("get_raw_pixels",&simtbx::Kokkos::kokkos_detector<memory_t>::get_raw_pixels,
9697
"return multipanel detector raw pixels as a flex array")
9798
.def("get_whitelist_raw_pixels",
98-
(af::shared<double> (simtbx::Kokkos::kokkos_detector<memory_t>::*)(af::shared<std::size_t>))
9999
&simtbx::Kokkos::kokkos_detector<memory_t>::get_whitelist_raw_pixels,
100100
"return only those raw pixels requested by the whitelist selection, as a 1D flex array")
101101
.def("each_image_free", &simtbx::Kokkos::kokkos_detector<memory_t>::each_image_free)

simtbx/kokkos/simulation.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,8 @@ namespace Kokkos {
211211
//don't want to free the kec data when the nanoBragg goes out of scope, so switch the pointer
212212
// cu_current_channel_Fhkl = NULL;
213213

214-
add_array(kdt.m_accumulate_floatimage, kdt.m_floatimage);
214+
//for the small_whitelist specialization, have a special version of add_array() that specifies size
215+
add_array_limit(kdt.m_accumulate_floatimage, kdt.m_floatimage, kdt.m_floatimage.span());
215216
}// loop over channels
216217
}
217218

simtbx/kokkos/simulation_kernels.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -955,6 +955,14 @@ void add_array( view_1d_t<T> lhs, const view_1d_t<U> rhs ) {
955955
});
956956
}
957957

958+
template <typename T, typename U>
959+
void add_array_limit( view_1d_t<T> lhs, const view_1d_t<U> rhs, const std::size_t& limit ) {
960+
Kokkos::parallel_for("add_arrays", limit, KOKKOS_LAMBDA(const int& i) {
961+
lhs( i ) = lhs( i ) + (T)rhs( i );
962+
rhs( i ) = 0;
963+
});
964+
}
965+
958966
void add_background_kokkos_kernel(int sources, int nanoBragg_oversample, int override_source,
959967
CUDAREAL pixel_size, int spixels, int fpixels, int detector_thicksteps,
960968
CUDAREAL detector_thickstep, CUDAREAL detector_attnlen,

simtbx/tests/tst_memory_policy.py

Lines changed: 9 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg
215215
self.gpu_simulation.allocate()
216216
self.gpu_detector = get_exascale("gpu_detector_small_whitelist",params.context)(
217217
deviceId=self.SIM.device_Id, detector=self.DETECTOR, beam=self.BEAM)
218-
self.gpu_detector.each_image_allocate()
218+
219+
self.gpu_detector.each_image_allocate(n_pixels = whitelist_pixels.size() )
219220
# self.gpu_detector.show_summary()
220221

221222
assert sources
@@ -233,9 +234,8 @@ def specialized_api_for_whitelist_low_memory(self, params, whitelist_pixels, arg
233234
per_image_scale_factor = self.domains_per_crystal # 1.0
234235
self.gpu_detector.scale_in_place(per_image_scale_factor) # apply scale directly on GPU
235236
self.reset_pythony_beams(self.SIM)
236-
print("AAA")
237-
self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_pixels)
238-
print("BBB")
237+
whitelist_idx = flex.size_t(range(whitelist_pixels.size()))
238+
self.whitelist_values = self.gpu_detector.get_whitelist_raw_pixels(whitelist_idx)
239239

240240
def get_whitelist_from_refls(prefix,SIM=None):
241241
#image_size = len(SIM.raw_pixels)
@@ -347,7 +347,7 @@ def run_all(params):
347347
# Now reproduce whitelist sims showing accumulation of large persistent memory
348348
SWCs=[]
349349
for x in range(NTRIALS):
350-
print("Whitelist-only iteration",x)
350+
print("\nWhitelist-only iteration",x)
351351
SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.])))
352352
SWCs[-1].specialized_api_for_whitelist(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True)
353353

@@ -365,7 +365,7 @@ def run_all(params):
365365
# Reproduce whitelist sims with small-memory mechanism
366366
SWCs=[]
367367
for x in range(NTRIALS):
368-
print("Whitelist-only iteration with small memory",x)
368+
print("\nWhitelist-only iteration with small memory",x)
369369
SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.])))
370370
SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True)
371371
#produce an output image file for intermediate debugging
@@ -407,25 +407,15 @@ def run_subset_for_NESAP_debug(params):
407407
# Reproduce whitelist sims with small-memory mechanism
408408
SWCs=[]
409409
for x in range(NTRIALS):
410-
print("Whitelist-only iteration with small memory",x)
410+
print("\n Whitelist-only iteration with small memory",x)
411411
SWCs.append(several_wavelength_case_policy(BEAM,DETECTOR,CRYSTAL,SF_model,weights=flex.double([1.])))
412412
SWCs[-1].specialized_api_for_whitelist_low_memory(whitelist_pixels=whitelist_pixels,params=params,argchk=False,sources=True)
413-
#produce an output image file for intermediate debugging
414-
working_raw_pixels = flex.double(image_size) # blank array
415-
working_raw_pixels.set_selected(whitelist_pixels, SWCs[-1].whitelist_values)
416-
working_raw_pixels.reshape(flex.grid(SWCs[-1].SIM.raw_pixels.focus()))
417-
418-
free_gpu_before = get_gpu_memory()[0]
419-
del SWCs
420-
free_gpu_after = get_gpu_memory()[0]
421-
new_memory_use = (free_gpu_after - free_gpu_before)/NTRIALS
422-
print(new_memory_use,"free")
423413

424414
if __name__=="__main__":
425415
params,options = parse_input()
426416
# Initialize based on GPU context
427417
gpu_instance_type = get_exascale("gpu_instance", params.context)
428418
gpu_instance = gpu_instance_type(deviceId = 0)
429-
#run_all(params)
430-
run_subset_for_NESAP_debug(params)
419+
run_all(params)
420+
#run_subset_for_NESAP_debug(params)
431421
print("OK")

0 commit comments

Comments
 (0)