"""
Apache Iceberg integration for CrateDB Toolkit.

This module provides functionality to transfer data between Iceberg tables
and CrateDB databases, supporting both import and export operations.
"""
7+
18import dataclasses
29import logging
310import tempfile
@@ -44,7 +51,7 @@ def from_url(cls, url: str):
4451 if iceberg_url .scheme .endswith ("+iceberg" ):
4552 iceberg_url .scheme = iceberg_url .scheme .replace ("+iceberg" , "" )
4653 u2 = copy (iceberg_url )
47- u2 ._query = ""
54+ u2 .query_params . clear ()
4855 location = str (u2 )
4956 return cls (
5057 url = iceberg_url ,
@@ -55,6 +62,11 @@ def from_url(cls, url: str):
5562 )
5663
def load_catalog(self) -> Catalog:
    """
    Load the Iceberg catalog with appropriate configuration.

    Delegates to the module-level ``load_catalog`` function. Inside a method
    body the bare name resolves to the module scope, not to this method.
    NOTE(review): presumably this is ``pyiceberg.catalog.load_catalog``;
    the import is not visible here, so confirm.

    ``self.catalog`` is passed as the first positional argument and
    ``self.catalog_properties`` is expanded into keyword arguments.

    Returns:
        Whatever the module-level ``load_catalog`` returns (annotated as
        ``Catalog``).
    """
    # TODO: Consider accepting catalog configuration as parameters
    #       to support different catalog types (Hive, REST, etc.).
    return load_catalog(self.catalog, **self.catalog_properties)
5971
6072 @property
@@ -69,29 +81,34 @@ def catalog_properties(self):
6981
@property
def storage_options(self) -> dict:
    """
    Return the S3 storage options given in the URL query parameters.

    Looks up the ``s3.endpoint``, ``s3.region``, ``s3.access-key-id`` and
    ``s3.secret-access-key`` query parameters on ``self.url`` and returns
    only those that are present.

    Returns:
        A dictionary mapping option name to value. Options whose lookup
        yields ``None`` are omitted, so that absent settings are not
        forwarded as explicit ``None`` values.
    """
    option_names = (
        "s3.endpoint",
        "s3.region",
        "s3.access-key-id",
        "s3.secret-access-key",
    )
    params = self.url.query_params
    opts = {name: params.get(name) for name in option_names}
    # Drop unset options instead of passing `None` through to the reader.
    return {k: v for k, v in opts.items() if v is not None}
7891
@property
def identifier(self) -> tuple:
    """
    Return the catalog-table identifier tuple.

    Returns:
        A two-element tuple ``(self.namespace, self.table)``.
    """
    return (self.namespace, self.table)
8298
def load_table(self) -> pl.LazyFrame:
    """
    Load the Iceberg table as a Polars LazyFrame.

    Either load a table from a catalog, or by scanning the filesystem.

    When ``self.catalog`` is set, the catalog is loaded via
    ``self.load_catalog()``, the table is looked up by ``self.identifier``,
    and the result of its ``to_polars()`` method is returned. Otherwise,
    ``pl.scan_iceberg`` is invoked on ``self.location``, passing
    ``self.storage_options`` along.

    Returns:
        The table handle, annotated as ``pl.LazyFrame``.
    """
    if self.catalog is not None:
        catalog = self.load_catalog()
        return catalog.load_table(self.identifier).to_polars()
    # No catalog configured: scan the table location directly.
    return pl.scan_iceberg(self.location, storage_options=self.storage_options)
92109
93110
94- def from_iceberg (source_url , cratedb_url , progress : bool = False ):
111+ def from_iceberg (source_url , target_url , progress : bool = False ):
95112 """
96113 Scan an Iceberg table from local filesystem or object store, and load into CrateDB.
97114 https://docs.pola.rs/api/python/stable/reference/api/polars.scan_iceberg.html
@@ -121,11 +138,11 @@ def from_iceberg(source_url, cratedb_url, progress: bool = False):
121138 # Display parameters.
122139 logger .info (f"Iceberg address: Path: { iceberg_address .location } " )
123140
124- cratedb_address = DatabaseAddress .from_string (cratedb_url )
141+ cratedb_address = DatabaseAddress .from_string (target_url )
125142 cratedb_url , cratedb_table = cratedb_address .decode ()
126143 if cratedb_table .table is None :
127144 raise ValueError ("Table name is missing. Please adjust CrateDB database URL." )
128- logger .info (f "Target address: { cratedb_address } " )
145+ logger .info ("Target address: %s" , cratedb_address )
129146
130147 # Invoke copy operation.
131148 logger .info ("Running Iceberg copy" )
@@ -139,7 +156,7 @@ def from_iceberg(source_url, cratedb_url, progress: bool = False):
139156 # https://github.com/pola-rs/polars/issues/7852
140157 # Note: This code also uses the most efficient `insert_bulk` method with CrateDB.
141158 # https://cratedb.com/docs/sqlalchemy-cratedb/dataframe.html#efficient-insert-operations-with-pandas
142- table .collect (streaming = True ).to_pandas ().to_sql (
159+ table .collect (engine = "streaming" ).to_pandas ().to_sql (
143160 name = cratedb_table .table ,
144161 schema = cratedb_table .schema ,
145162 con = engine ,
@@ -153,8 +170,10 @@ def from_iceberg(source_url, cratedb_url, progress: bool = False):
153170 # table.to_polars().collect(streaming=True) \ # noqa: ERA001
154171 # .write_database(table_name=table_address.fullname, connection=engine, if_table_exists="replace")
155172
173+ return True
156174
157- def to_iceberg (cratedb_url , target_url , progress : bool = False ):
175+
176+ def to_iceberg (source_url , target_url , progress : bool = False ):
158177 """
159178 Synopsis
160179 --------
@@ -167,7 +186,7 @@ def to_iceberg(cratedb_url, target_url, progress: bool = False):
167186 "s3+iceberg://bucket1/?catalog=default&namespace=demo&table=taxi-tiny&s3.access-key-id=<your_access_key_id>&s3.secret-access-key=<your_secret_access_key>&s3.endpoint=<endpoint_url>&s3.region=<s3-region>"
168187 """ # noqa:E501
169188
170- cratedb_address = DatabaseAddress .from_string (cratedb_url )
189+ cratedb_address = DatabaseAddress .from_string (source_url )
171190 cratedb_url , cratedb_table = cratedb_address .decode ()
172191 if cratedb_table .table is None :
173192 raise ValueError ("Table name is missing. Please adjust CrateDB database URL." )
@@ -197,3 +216,5 @@ def to_iceberg(cratedb_url, target_url, progress: bool = False):
197216 catalog_properties = catalog_properties ,
198217 append = False , # TODO: Make available via parameter.
199218 )
219+
220+ return True
0 commit comments