2626from libtbx .utils import detect_binary_file
2727from libtbx import smart_open
2828
29+ import os
2930import sys
3031
3132distances_as_cif_loop = geometry .distances_as_cif_loop
@@ -42,37 +43,29 @@ class CifParserError(Sorry):
4243_VALID_ENGINES = ("ucif" , "xcif" )
4344
4445
45- def _drive_builder_from_xcif (builder , input_string , strict ):
46- """Parse input_string with xcif and drive the given cif_model_builder-style
47- builder via its callback methods (add_data_block / add_data_item / add_loop /
48- start_save_frame / end_save_frame).
46+ _XCIF_COMPRESSED_SUFFIXES = (".gz" , ".Z" , ".bz2" )
4947
50- Returns a (possibly empty) list of error message strings. xcif throws on
51- first parse error rather than accumulating, so the list has at most one
52- entry. When it does, the builder is left with whatever state it had
53- before the throw — which is nothing, since xcif_ext.parse runs to
54- completion before the walker touches the builder.
5548
56- strict=False relaxes two things: pair items appearing before the first
57- data_ block are attached to an implicit 'global_' block; and explicit
58- 'global_' block headers (STAR/DDL2 convention, used by the cctbx
59- monomer library) are accepted. strict=True rejects both.
def _xcif_can_use_parse_file(file_path):
  """Tell whether file_path may be handed to xcif_ext.parse_file.

  parse_file memory-maps the file, so it cannot read compressed input;
  paths ending in one of _XCIF_COMPRESSED_SUFFIXES must keep going through
  smart_open instead.

  file_path may be None, a str, or any os.PathLike object (for example
  pathlib.Path). PathLike values are converted with os.fspath before the
  suffix test, so they no longer fail with AttributeError on .endswith.

  Returns False when file_path is None or names a compressed file,
  True otherwise.
  """
  if file_path is None:
    return False
  # os.fspath returns a str unchanged and unwraps os.PathLike objects.
  path_text = os.fspath(file_path)
  return not path_text.endswith(_XCIF_COMPRESSED_SUFFIXES)
56+
57+
58+ def _walk_xcif_doc (builder , doc ):
59+ """Drive `builder` (a cif_model_builder-style object with
60+ add_data_block / add_data_item / add_loop / start_save_frame /
61+ end_save_frame) from an already-parsed xcif Document.
6062
6163 The xcif C++ parser stores block and save-frame names with the data_/save_
6264 prefix stripped; the builder's strip-before-first-underscore logic
6365 (iotbx/cif/builders.py add_data_block / start_save_frame) expects raw
6466 tokens, so we prepend the prefix back.
6567 """
6668 import xcif_ext
67- try :
68- doc = xcif_ext .parse (input_string , strict = strict )
69- except (ValueError , RuntimeError ) as e :
70- # xcif raises ValueError for std::invalid_argument (explicit translator)
71- # and RuntimeError for CifError (inherits std::runtime_error, default
72- # boost.python translation). Return the message so the caller can
73- # either surface it as CifParserError or stash it for error_count().
74- return [str (e )]
75-
7669 # Use the enum values from xcif_ext directly — no ABI drift.
7770 # int(...) because bp::enum_ wraps values in a Python type;
7871 # the tuple comparisons below want plain ints.
@@ -119,6 +112,45 @@ def _emit_pair_or_loop(kind, idx, pair_tags, pair_values, loops):
119112 builder .end_save_frame ()
120113 else :
121114 _emit_pair_or_loop (kind , idx , pair_tags , pair_values , loops )
115+
116+
def _drive_builder_from_xcif(builder, input_string, strict):
  """Parse an in-memory CIF string with xcif and feed the result to builder.

  The return value is a list of error message strings, empty on success.
  xcif stops at the first parse error instead of collecting several, so the
  list never holds more than one entry. On failure the builder has not been
  touched: xcif_ext.parse has to finish before the walker is invoked.

  With strict=False two relaxations apply: pair items that appear before
  the first data_ block are attached to an implicit 'global_' block, and
  explicit 'global_' block headers (STAR/DDL2 convention, used by the cctbx
  monomer library) are accepted. strict=True rejects both.
  """
  import xcif_ext
  errors = []
  try:
    parsed = xcif_ext.parse(input_string, strict=strict)
  except (ValueError, RuntimeError) as exc:
    # ValueError comes from std::invalid_argument (explicit translator);
    # RuntimeError comes from CifError, which inherits std::runtime_error
    # and gets the default boost.python translation. The message is handed
    # back so the caller can raise CifParserError or keep it for
    # error_count().
    errors.append(str(exc))
  else:
    _walk_xcif_doc(builder, parsed)
  return errors
142+
143+
def _drive_builder_from_xcif_file(builder, file_path, strict):
  """File-based counterpart of _drive_builder_from_xcif.

  Parsing goes through xcif_ext.parse_file (memory-mapped, zero-copy), so
  no Python string copy of the file contents is allocated when the caller
  supplied a plain uncompressed path.

  Returns a list of error message strings: empty on success, otherwise the
  single message of the ValueError/RuntimeError raised by parse_file. The
  builder is only driven when parsing succeeded.
  """
  import xcif_ext
  try:
    parsed = xcif_ext.parse_file(file_path, strict=strict)
  except (ValueError, RuntimeError) as exc:
    failure = str(exc)
    return [failure]
  else:
    _walk_xcif_doc(builder, parsed)
    return []
123155
124156
@@ -146,6 +178,24 @@ def __init__(self,
146178 else : assert cif_object is None
147179 self .builder = builder
148180 self .original_arrays = None
181+ self ._xcif_errors = []
182+ self .parser = None
183+ # Fast path: xcif + plain uncompressed file_path -> memory-mapped
184+ # parse_file, skipping the Python-string copy of the whole file.
185+ # file_object= callers always go through the read-into-string path
186+ # because there is no file descriptor to hand to mmap.
187+ if (engine == "xcif"
188+ and file_object is None
189+ and file_path is not None
190+ and _xcif_can_use_parse_file (file_path )):
191+ resolved = os .path .expanduser (file_path )
192+ if detect_binary_file .from_initial_block (resolved ):
193+ raise CifParserError ("Binary file detected, aborting parsing." )
194+ self ._xcif_errors = _drive_builder_from_xcif_file (
195+ self .builder , resolved , strict )
196+ if raise_if_errors and self ._xcif_errors :
197+ raise CifParserError (self ._xcif_errors [0 ])
198+ return
149199 if file_path is not None :
150200 file_object = smart_open .for_reading (file_path )
151201 else :
@@ -159,11 +209,9 @@ def __init__(self,
159209 len (input_string ), binary_detector .monitor_initial )
160210 if binary_detector .is_binary_file (block = input_string ):
161211 raise CifParserError ("Binary file detected, aborting parsing." )
162- self ._xcif_errors = []
163212 if engine == "xcif" :
164213 self ._xcif_errors = _drive_builder_from_xcif (
165214 self .builder , input_string , strict )
166- self .parser = None
167215 if raise_if_errors and self ._xcif_errors :
168216 raise CifParserError (self ._xcif_errors [0 ])
169217 else :
0 commit comments