Skip to content

Commit 2e920dc

Browse files
olegsobolevclaude
andcommitted
iotbx.cif.reader: zero-copy parse_file when engine="xcif" + file_path
When the reader is given a plain (uncompressed) file_path and engine="xcif", dispatch to xcif_ext.parse_file (memory-mapped in C++) instead of reading the file into a Python string and calling xcif_ext.parse. Avoids a whole-file str() allocation on the hot path. Binary detection runs via detect_binary_file.from_initial_block on the first 1000 bytes, preserving the "Binary file detected" abort. Compressed files (.gz/.Z/.bz2) and file_object= callers continue to use the existing read-into-string path; parse_file cannot mmap a compressed stream or an open file object. Walker logic factored out of _drive_builder_from_xcif into a shared _walk_xcif_doc helper; both parse/parse_file wrappers reuse it. Tests verify file_path + xcif invokes parse_file (not parse) and that the resulting model matches the input_string path. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 9d524bd commit 2e920dc

File tree

2 files changed

+167
-24
lines changed

2 files changed

+167
-24
lines changed

iotbx/cif/__init__.py

Lines changed: 72 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from libtbx.utils import detect_binary_file
2727
from libtbx import smart_open
2828

29+
import os
2930
import sys
3031

3132
distances_as_cif_loop = geometry.distances_as_cif_loop
@@ -42,37 +43,29 @@ class CifParserError(Sorry):
4243
_VALID_ENGINES = ("ucif", "xcif")
4344

4445

45-
def _drive_builder_from_xcif(builder, input_string, strict):
46-
"""Parse input_string with xcif and drive the given cif_model_builder-style
47-
builder via its callback methods (add_data_block / add_data_item / add_loop /
48-
start_save_frame / end_save_frame).
46+
_XCIF_COMPRESSED_SUFFIXES = (".gz", ".Z", ".bz2")
4947

50-
Returns a (possibly empty) list of error message strings. xcif throws on
51-
first parse error rather than accumulating, so the list has at most one
52-
entry. When it does, the builder is left with whatever state it had
53-
before the throw — which is nothing, since xcif_ext.parse runs to
54-
completion before the walker touches the builder.
5548

56-
strict=False relaxes two things: pair items appearing before the first
57-
data_ block are attached to an implicit 'global_' block; and explicit
58-
'global_' block headers (STAR/DDL2 convention, used by the cctbx
59-
monomer library) are accepted. strict=True rejects both.
49+
def _xcif_can_use_parse_file(file_path):
50+
"""True when file_path is safe to hand to xcif_ext.parse_file (which
51+
memory-maps). False for compressed files; smart_open handles those,
52+
parse_file does not."""
53+
if file_path is None:
54+
return False
55+
return not file_path.endswith(_XCIF_COMPRESSED_SUFFIXES)
56+
57+
58+
def _walk_xcif_doc(builder, doc):
59+
"""Drive `builder` (a cif_model_builder-style object with
60+
add_data_block / add_data_item / add_loop / start_save_frame /
61+
end_save_frame) from an already-parsed xcif Document.
6062
6163
The xcif C++ parser stores block and save-frame names with the data_/save_
6264
prefix stripped; the builder's strip-before-first-underscore logic
6365
(iotbx/cif/builders.py add_data_block / start_save_frame) expects raw
6466
tokens, so we prepend the prefix back.
6567
"""
6668
import xcif_ext
67-
try:
68-
doc = xcif_ext.parse(input_string, strict=strict)
69-
except (ValueError, RuntimeError) as e:
70-
# xcif raises ValueError for std::invalid_argument (explicit translator)
71-
# and RuntimeError for CifError (inherits std::runtime_error, default
72-
# boost.python translation). Return the message so the caller can
73-
# either surface it as CifParserError or stash it for error_count().
74-
return [str(e)]
75-
7669
# Use the enum values from xcif_ext directly — no ABI drift.
7770
# int(...) because bp::enum_ wraps values in a Python type;
7871
# the tuple comparisons below want plain ints.
@@ -119,6 +112,45 @@ def _emit_pair_or_loop(kind, idx, pair_tags, pair_values, loops):
119112
builder.end_save_frame()
120113
else:
121114
_emit_pair_or_loop(kind, idx, pair_tags, pair_values, loops)
115+
116+
117+
def _drive_builder_from_xcif(builder, input_string, strict):
118+
"""Parse input_string with xcif and drive the given builder.
119+
120+
Returns a (possibly empty) list of error message strings. xcif throws on
121+
first parse error rather than accumulating, so the list has at most one
122+
entry. When it does, the builder is left with whatever state it had
123+
before the throw — which is nothing, since xcif_ext.parse runs to
124+
completion before the walker touches the builder.
125+
126+
strict=False relaxes two things: pair items appearing before the first
127+
data_ block are attached to an implicit 'global_' block; and explicit
128+
'global_' block headers (STAR/DDL2 convention, used by the cctbx
129+
monomer library) are accepted. strict=True rejects both.
130+
"""
131+
import xcif_ext
132+
try:
133+
doc = xcif_ext.parse(input_string, strict=strict)
134+
except (ValueError, RuntimeError) as e:
135+
# xcif raises ValueError for std::invalid_argument (explicit translator)
136+
# and RuntimeError for CifError (inherits std::runtime_error, default
137+
# boost.python translation). Return the message so the caller can
138+
# either surface it as CifParserError or stash it for error_count().
139+
return [str(e)]
140+
_walk_xcif_doc(builder, doc)
141+
return []
142+
143+
144+
def _drive_builder_from_xcif_file(builder, file_path, strict):
145+
"""Like _drive_builder_from_xcif but dispatches to xcif_ext.parse_file
146+
(memory-mapped, zero-copy). Avoids allocating a Python string copy of
147+
the file contents when the caller supplied a plain uncompressed path."""
148+
import xcif_ext
149+
try:
150+
doc = xcif_ext.parse_file(file_path, strict=strict)
151+
except (ValueError, RuntimeError) as e:
152+
return [str(e)]
153+
_walk_xcif_doc(builder, doc)
122154
return []
123155

124156

@@ -146,6 +178,24 @@ def __init__(self,
146178
else: assert cif_object is None
147179
self.builder = builder
148180
self.original_arrays = None
181+
self._xcif_errors = []
182+
self.parser = None
183+
# Fast path: xcif + plain uncompressed file_path -> memory-mapped
184+
# parse_file, skipping the Python-string copy of the whole file.
185+
# file_object= callers always go through the read-into-string path
186+
# because there is no file descriptor to hand to mmap.
187+
if (engine == "xcif"
188+
and file_object is None
189+
and file_path is not None
190+
and _xcif_can_use_parse_file(file_path)):
191+
resolved = os.path.expanduser(file_path)
192+
if detect_binary_file.from_initial_block(resolved):
193+
raise CifParserError("Binary file detected, aborting parsing.")
194+
self._xcif_errors = _drive_builder_from_xcif_file(
195+
self.builder, resolved, strict)
196+
if raise_if_errors and self._xcif_errors:
197+
raise CifParserError(self._xcif_errors[0])
198+
return
149199
if file_path is not None:
150200
file_object = smart_open.for_reading(file_path)
151201
else:
@@ -159,11 +209,9 @@ def __init__(self,
159209
len(input_string), binary_detector.monitor_initial)
160210
if binary_detector.is_binary_file(block=input_string):
161211
raise CifParserError("Binary file detected, aborting parsing.")
162-
self._xcif_errors = []
163212
if engine == "xcif":
164213
self._xcif_errors = _drive_builder_from_xcif(
165214
self.builder, input_string, strict)
166-
self.parser = None
167215
if raise_if_errors and self._xcif_errors:
168216
raise CifParserError(self._xcif_errors[0])
169217
else:

xcif/regression/tst_iotbx_cif_reader_engine.py

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,16 @@
33
Covers:
44
- Unknown engine values are rejected with ValueError before any file I/O.
55
- Known engines ("ucif", "xcif") are accepted.
6+
- engine="xcif" with file_path dispatches to xcif_ext.parse_file (zero-copy
7+
mmap) rather than reading the whole file into a Python string.
68
"""
79
from __future__ import absolute_import, division, print_function
810

911
import os
1012
import tempfile
1113

1214
import iotbx.cif
15+
import xcif_ext
1316

1417

1518
def exercise_engine_validated_before_io():
@@ -47,9 +50,101 @@ def exercise_known_engines_accepted():
4750
"pair not parsed with engine=%s: got %r" % (engine, model["t"]["_k"])
4851

4952

53+
_SAMPLE_CIF = """\
54+
data_t
55+
_k v
56+
loop_
57+
_a.id
58+
_a.val
59+
1 x
60+
2 y
61+
"""
62+
63+
64+
def _write_sample(tmpdir):
65+
p = os.path.join(tmpdir, "sample.cif")
66+
with open(p, "w") as f:
67+
f.write(_SAMPLE_CIF)
68+
return p
69+
70+
71+
def exercise_file_path_uses_parse_file():
72+
# Record which xcif_ext entry point the reader invokes.
73+
real_parse = xcif_ext.parse
74+
real_parse_file = xcif_ext.parse_file
75+
calls = {"parse": 0, "parse_file": 0}
76+
77+
def _tracking_parse(text, **kw):
78+
calls["parse"] += 1
79+
return real_parse(text, **kw)
80+
81+
def _tracking_parse_file(path, **kw):
82+
calls["parse_file"] += 1
83+
return real_parse_file(path, **kw)
84+
85+
xcif_ext.parse = _tracking_parse
86+
xcif_ext.parse_file = _tracking_parse_file
87+
try:
88+
tmpdir = tempfile.mkdtemp(prefix="xcif_reader_item5_")
89+
try:
90+
path = _write_sample(tmpdir)
91+
92+
# input_string + xcif: parse() is used, parse_file() is NOT.
93+
iotbx.cif.reader(input_string=_SAMPLE_CIF, engine="xcif").model()
94+
assert calls["parse"] == 1, \
95+
"input_string path should call parse: %r" % calls
96+
assert calls["parse_file"] == 0, \
97+
"input_string path must NOT call parse_file: %r" % calls
98+
99+
# file_path + xcif: parse_file() is used, parse() is NOT.
100+
calls["parse"] = 0
101+
calls["parse_file"] = 0
102+
iotbx.cif.reader(file_path=path, engine="xcif").model()
103+
assert calls["parse_file"] == 1, \
104+
"file_path path should call parse_file: %r" % calls
105+
assert calls["parse"] == 0, \
106+
"file_path path must NOT call parse (avoid Python-string copy): %r" % (
107+
calls,)
108+
finally:
109+
import shutil
110+
shutil.rmtree(tmpdir, ignore_errors=True)
111+
finally:
112+
xcif_ext.parse = real_parse
113+
xcif_ext.parse_file = real_parse_file
114+
115+
116+
def exercise_file_path_matches_input_string():
117+
tmpdir = tempfile.mkdtemp(prefix="xcif_reader_item5_")
118+
try:
119+
path = _write_sample(tmpdir)
120+
m_str = iotbx.cif.reader(
121+
input_string=_SAMPLE_CIF, engine="xcif").model()
122+
m_file = iotbx.cif.reader(file_path=path, engine="xcif").model()
123+
# Compare block set, pair items, loop tag+content.
124+
assert list(m_str.keys()) == list(m_file.keys())
125+
for name in m_str:
126+
b_s = m_str[name]
127+
b_f = m_file[name]
128+
assert list(b_s._set) == list(b_f._set), \
129+
"source-order drift between input_string and file_path"
130+
assert dict(b_s._items) == dict(b_f._items)
131+
assert set(b_s.loops.keys()) == set(b_f.loops.keys())
132+
for ln in b_s.loops:
133+
lp_s = b_s.loops[ln]
134+
lp_f = b_f.loops[ln]
135+
assert set(lp_s.keys()) == set(lp_f.keys())
136+
for tag in lp_s.keys():
137+
assert list(lp_s[tag]) == list(lp_f[tag])
138+
finally:
139+
import shutil
140+
shutil.rmtree(tmpdir, ignore_errors=True)
141+
142+
50143
def run():
51144
exercise_engine_validated_before_io()
52145
exercise_known_engines_accepted()
146+
exercise_file_path_uses_parse_file()
147+
exercise_file_path_matches_input_string()
53148
print("OK")
54149

55150

0 commit comments

Comments
 (0)