Skip to content

Commit a3d7292

Browse files
committed
I/O: Apache Iceberg (inline comments)
1 parent 64fc69a commit a3d7292

2 files changed

Lines changed: 16 additions & 19 deletions

File tree

cratedb_toolkit/cluster/core.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -640,6 +640,9 @@ def save_table(
640640
"""
641641
Export data from a database table on a standalone CrateDB Server.
642642
643+
Note: The `transformation` parameter is not respected yet, but required by contract.
644+
In this spirit, it is reserved for later use.
645+
643646
Synopsis
644647
--------
645648
export CRATEDB_CLUSTER_URL=crate://crate@localhost:4200/testdrive/demo
@@ -653,11 +656,8 @@ def save_table(
653656
if target_url_obj.scheme.startswith("iceberg") or target_url_obj.scheme.endswith("iceberg"):
654657
from cratedb_toolkit.io.iceberg import to_iceberg
655658

656-
if to_iceberg(source_url, target.url):
657-
self._load_table_result = True
658-
else:
659-
logger.error("Data loading failed or incomplete")
660-
self._load_table_result = False
659+
if not to_iceberg(source_url, target.url):
660+
raise IOError("Data loading failed or incomplete")
661661

662662
else:
663663
raise NotImplementedError(f"Exporting resource not implemented yet: {target_url_obj}")

cratedb_toolkit/io/iceberg.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -172,14 +172,19 @@ def from_iceberg(source_url, target_url, progress: bool = False):
172172
logger.info(f"Running Iceberg copy with chunksize={chunksize}")
173173
engine = sa.create_engine(str(cratedb_url))
174174

175-
# This conversion to pandas is zero-copy,
176-
# so we can utilize their SQL utils for free.
177-
# https://github.com/pola-rs/polars/issues/7852
175+
# Note: The conversion to pandas is zero-copy,
176+
# so we can utilize their SQL utils for free.
177+
# https://github.com/pola-rs/polars/issues/7852
178178
# Note: This code also uses the most efficient `insert_bulk` method with CrateDB.
179-
# https://cratedb.com/docs/sqlalchemy-cratedb/dataframe.html#efficient-insert-operations-with-pandas
179+
# https://cratedb.com/docs/sqlalchemy-cratedb/dataframe.html#efficient-insert-operations-with-pandas
180180
# Note: `collect_batches()` is marked as unstable and slower than native sinks;
181-
# consider native Polars sinks (e.g., write_database) as a faster alternative if available.
182-
# https://github.com/crate/cratedb-toolkit/pull/444#discussion_r2825382887
181+
# consider native Polars sinks (e.g., write_database) as a faster alternative if available.
182+
# https://github.com/crate/cratedb-toolkit/pull/444#discussion_r2825382887
183+
# Note: This variant appeared to be much slower; revisit and investigate why.
184+
# table.to_polars().collect(streaming=True).write_database(
185+
# table_name=cratedb_table.fullname, connection=engine, if_table_exists="replace" # noqa: ERA001
186+
# Note: When `collect_batches` yields more than one batch, the first batch must use the
187+
# user-specified `if_exists`, but subsequent batches must use "append".
183188
with pl.Config(streaming_chunk_size=chunksize):
184189
table = iceberg_address.load_table()
185190
for batch in table.collect_batches(engine="streaming", chunk_size=chunksize):
@@ -192,14 +197,6 @@ def from_iceberg(source_url, target_url, progress: bool = False):
192197
chunksize=chunksize,
193198
method=insert_bulk,
194199
)
195-
196-
# Note: This variant was much slower.
197-
"""
198-
table.to_polars().collect(streaming=True).write_database(
199-
table_name=cratedb_table.fullname, connection=engine, if_table_exists="replace"
200-
)
201-
"""
202-
203200
return True
204201

205202

0 commit comments

Comments (0)