GenXProject · gschivley · Feb 5, 2025 · Jul 11, 2025 · Oct 27, 2025 · Oct 27, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## Unreleased
 
+### Added
+- Support for reading input files in CSV, gzipped CSV (.csv.gz), and Parquet (.parquet) formats with automatic format detection.
+- New setting `TemporalOutputFormat` to allow users to write full time series temporal output files in CSV, gzipped CSV, or Parquet formats for storage reduction.
+
+### Changed
+- Input file loading now uses DuckDB instead of CSV.jl, enabling support for compressed and Parquet formats.
+- Full time series temporal outputs can now be written in compressed formats based on the `TemporalOutputFormat` setting (annual outputs always remain CSV).
+
 ## [0.4.5] - 2025-07-07
 
 ### Added

diff --git a/Project.toml b/Project.toml
@@ -11,6 +11,7 @@ DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
+DuckDB = "d2f5444f-75bc-4fdf-ac35-56f514c445e1"
 HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -31,6 +32,7 @@ DataFrames = "1.3.4"
 DataStructures = "0.18.13"
 Dates = "1"
 Distances = "0.10.7"
+DuckDB = "1, 1.4"
 HiGHS = "1.1.4"
 JuMP = "1.1.1"
 LinearAlgebra = "1"

diff --git a/docs/src/User_Guide/model_input.md b/docs/src/User_Guide/model_input.md
@@ -1,6 +1,6 @@
 # GenX Inputs
 
-All input files are in CSV format. Running the GenX model requires a minimum of four **mandatory input files** and one folder, which consists of CSV files for generating resources:
+Input files are in CSV format by default, but can also be provided as gzipped CSV (.csv.gz) or Parquet (.parquet) files. GenX will automatically detect and read any of these formats. Running the GenX model requires a minimum of four **mandatory input files** and one folder, which consists of CSV files for generating resources:
 
 ```@raw html
 <ol>
@@ -35,6 +35,9 @@ Additionally, the user may need to specify eight more **settings-specific** inpu
 !!! note "Note"
     Names of the input files are case sensitive.
 
+!!! tip "Input File Formats"
+    Input files can be provided in CSV (.csv), gzipped CSV (.csv.gz), or Parquet (.parquet) formats. GenX will automatically detect and read the appropriate format. To use compressed or Parquet formats, simply use the same filename stem (e.g., `Demand_data.csv.gz` or `Demand_data.parquet` instead of `Demand_data.csv`). This can significantly reduce storage requirements for large input files without requiring any changes to settings files.
+
 
 ## 1 Mandatory input data
 

diff --git a/docs/src/User_Guide/model_output.md b/docs/src/User_Guide/model_output.md
@@ -2,6 +2,14 @@
 
 The table below summarizes the units of each output variable reported as part of the various CSV files produced after each model run. The reported units are also provided. If a result file includes time-dependent values, the value will not include the hour weight in it. An annual sum ("AnnualSum") column/row will be provided whenever it is possible (e.g., `emissions.csv`).
 
+!!! tip "Temporal Output File Formats"
+    For full time series temporal outputs (when `WriteOutputs: "full"`), users can specify the output format using the `TemporalOutputFormat` setting in `genx_settings.yml`. Supported formats are:
+    - `"csv"` (default): Standard CSV format
+    - `"gzip"`: Gzipped CSV format (.csv.gz) for 50-70% storage reduction
+    - `"parquet"`: Apache Parquet format (.parquet) for 80-90% storage reduction
+
+    This setting affects files like power.csv, charge.csv, storage.csv, curtailment.csv, and other time series outputs. Annual outputs (when `WriteOutputs: "annual"`) are always written in CSV format.
+
 ## 1 Default output files
 
 ### 1.1 capacity.csv

diff --git a/src/GenX.jl b/src/GenX.jl
@@ -26,6 +26,7 @@ export run_timedomainreduction!
 using JuMP # used for mathematical programming
 using DataFrames #This package allows put together data into a matrix
 using CSV
+using DuckDB
 using StatsBase
 using LinearAlgebra
 using YAML

diff --git a/src/configure_settings/configure_settings.jl b/src/configure_settings/configure_settings.jl
@@ -32,6 +32,7 @@ function default_settings()
         "HydrogenHourlyMatching" => 0,
         "DC_OPF" => 0,
         "WriteOutputs" => "full",
+        "TemporalOutputFormat" => "csv",
         "ComputeConflicts" => 0,
         "StorageVirtualDischarge" => 1,
         "ResourcesFolder" => "resources",
@@ -76,6 +77,14 @@ function validate_settings!(settings::Dict{Any, Any})
     # make WriteOutputs setting lowercase and check for valid value
     settings["WriteOutputs"] = lowercase(settings["WriteOutputs"])
     @assert settings["WriteOutputs"] ∈ ["annual", "full"]
+
+    # make TemporalOutputFormat setting lowercase and check for valid value
+    if haskey(settings, "TemporalOutputFormat")
+        settings["TemporalOutputFormat"] = lowercase(settings["TemporalOutputFormat"])
+        @assert settings["TemporalOutputFormat"] ∈ ["csv", "gzip", "parquet"] "TemporalOutputFormat must be one of: csv, gzip, parquet"
+    else
+        settings["TemporalOutputFormat"] = "csv"  # default to CSV for backward compatibility
+    end
 
     if "OperationWrapping" in keys(settings)
         @warn """The behavior of the TimeDomainReduction and OperationWrapping

diff --git a/src/load_inputs/load_dataframe.jl b/src/load_inputs/load_dataframe.jl
@@ -6,6 +6,7 @@ end
     file_exists(dir::AbstractString, basenames::Vector{String})::Bool
 
 Checks that a file exists in a directory under (at least) one of a list of 'aliases'.
+When the preferred basename ends in `.csv`, `.csv.gz` and `.parquet` alternatives are also checked.
 """
 function file_exists(dir, basenames::Vector{String})::Bool
     if !isdir(dir)
@@ -18,6 +19,15 @@ function file_exists(dir, basenames::Vector{String})::Bool
     end
 
     FILENOTFOUND = filenotfoundconstant()
+
+    # Try to find the file with different extensions if the basename ends with .csv
+    if endswith(best_basename, ".csv")
+        basename_without_ext = best_basename[1:end-4]
+        found_file = find_file_with_extension(dir, basename_without_ext)
+        if found_file != FILENOTFOUND
+            return true
+        end
+    end
 
     for base in basenames
         target = look_for_file_with_alternate_case(dir, base)
@@ -54,15 +64,40 @@ end
 function load_dataframe(dir::AbstractString, basenames::Vector{String})::DataFrame
     best_basename = popfirst!(basenames)
     best_path = joinpath(dir, best_basename)
+
+    # First try exact match
     if isfile(best_path)
         return load_dataframe_from_file(best_path)
     end
-
+    
     FILENOTFOUND = filenotfoundconstant()
-
+
+    # Try to find the file with different extensions if the basename ends with .csv
+    if endswith(best_basename, ".csv")
+        basename_without_ext = best_basename[1:end-4]  # Remove .csv extension
+        found_file = find_file_with_extension(dir, basename_without_ext)
+        if found_file != FILENOTFOUND
+            if found_file != best_basename
+                # Warn if using a different extension
+                @info "Using file '$found_file' instead of '$best_basename'"
+            end
+            return load_dataframe_from_file(joinpath(dir, found_file))
+        end
+    else
+        # If basename doesn't end with .csv, try case-insensitive match
+        target = look_for_file_with_alternate_case(dir, best_basename)
+        if target != FILENOTFOUND
+            Base.depwarn(
+                """The filename '$target' is deprecated. '$best_basename' is preferred.""",
+                :load_dataframe,
+                force = true)
+            return load_dataframe_from_file(joinpath(dir, target))
+        end
+    end
+
+    # Try alternative basenames (deprecated names)
     for base in basenames
         target = look_for_file_with_alternate_case(dir, base)
-        # admonish
         if target != FILENOTFOUND
             Base.depwarn(
                 """The filename '$target' is deprecated. '$best_basename' is preferred.""",
@@ -71,7 +106,7 @@ function load_dataframe(dir::AbstractString, basenames::Vector{String})::DataFra
             return load_dataframe_from_file(joinpath(dir, target))
         end
     end
-
+    
     throw_filenotfound_error(dir, best_basename)
 end
 
@@ -83,6 +118,34 @@ function throw_filenotfound_error(dir, base)
     error(err_str)
 end
 
+"""
+    find_file_with_extension(dir::AbstractString, basename_without_ext::AbstractString)::String
+
+Find a file in the directory with one of the supported extensions (.csv, .csv.gz, .parquet).
+Returns the filename if found, or FILENOTFOUND constant if not found.
+"""
+function find_file_with_extension(dir::AbstractString, basename_without_ext::AbstractString)::String
+    not_found = filenotfoundconstant()
+
+    # Extensions are tried in this order; the first one that yields a file wins.
+    # Within one extension, an exact-case name is preferred over an alternate-case one.
+    for suffix in (".csv", ".csv.gz", ".parquet")
+        filename = basename_without_ext * suffix
+
+        # Exact-case hit: hand back the name exactly as it was constructed
+        isfile(joinpath(dir, filename)) && return filename
+
+        # Otherwise ask the alternate-case lookup, which reports a miss
+        # by returning the not-found constant
+        alternate = look_for_file_with_alternate_case(dir, filename)
+        alternate == not_found && continue
+        return alternate
+    end
+
+    # None of the supported extensions produced a file
+    return not_found
+end
+
 function look_for_file_with_alternate_case(dir, base)::String
     lower_base = lowercase(base)
 
@@ -101,11 +164,38 @@ function look_for_file_with_alternate_case(dir, base)::String
     return target
 end
 
-function csv_header(path::AbstractString)
-    f = open(path, "r")
-    header = readline(f)
-    close(f)
-    header
+"""
+    get_column_names(path::AbstractString)
+
+Get column names from a file using DuckDB's DESCRIBE functionality.
+Supports CSV, gzipped CSV (.csv.gz), and Parquet files.
+
+Note: The path comes from file system operations (not user input) and is 
+validated by Julia's file existence checks before reaching this function.
+"""
+function get_column_names(path::AbstractString)
+    # Fail early with a clear message instead of a DuckDB I/O error
+    if !isfile(path)
+        error("File does not exist: $path")
+    end
+
+    # A fresh DuckDB handle used only for this query; closed in the `finally` below
+    db = DuckDB.DB()
+    try
+        # The path is spliced into SQL text, so double any single quotes in it
+        escaped_path = replace(path, "'" => "''")
+
+        # Lowercase before testing the extension: the file lookup can return names
+        # that differ in case (e.g. `.PARQUET`), which must not be parsed as CSV
+        is_parquet = endswith(lowercase(path), ".parquet")
+        reader = is_parquet ? "read_parquet" : "read_csv_auto"
+        desc_query = "DESCRIBE SELECT * FROM $(reader)('$escaped_path')"
+
+        result = DuckDB.execute(db, desc_query) |> DataFrame
+        return String.(result.column_name)
+    finally
+        DuckDB.close(db)
+    end
 end
 
 function keep_duplicated_entries!(s, uniques)
@@ -116,20 +206,65 @@ function keep_duplicated_entries!(s, uniques)
 end
 
 function check_for_duplicate_keys(path::AbstractString)
-    header = csv_header(path)
-    keys = split(header, ',')
-    uniques = unique(keys)
-    if length(keys) > length(uniques)
-        dupes = keep_duplicated_entries!(keys, uniques)
-        @error """Some duplicate column names detected in the header of $path: $dupes.
+    # Names as DuckDB reports them (e.g., a repeated Name comes back as Name_1)
+    names_seen = get_column_names(path)
+
+    # Treat `<base>_<digits>` as a renamed duplicate when `<base>` is also a column.
+    # NOTE(review): a genuine pair such as `X` and `X_1` is reported as well.
+    function is_renamed_duplicate(name)
+        hit = match(r"(.+)_(\d+)$", name)
+        hit === nothing && return false
+        return hit.captures[1] in names_seen
+    end
+
+    potential_dupes = String[]
+    for name in names_seen
+        if is_renamed_duplicate(name)
+            push!(potential_dupes, name)
+        end
+    end
+
+    if !isempty(potential_dupes)
+        @error """Some duplicate column names detected in the header of $path: $potential_dupes.
+        DuckDB has automatically renamed them by appending _N suffixes.
         Duplicate column names may cause errors, as only the first is used.
         """
     end
 end
 
+"""
+    load_dataframe_from_file(path)::DataFrame
+
+Load a dataframe from a file using DuckDB.
+Supports CSV, gzipped CSV (.csv.gz), and Parquet files.
+
+Note: The path comes from file system operations (not user input) and is
+validated by file existence checks before reaching this function.
+"""
 function load_dataframe_from_file(path)::DataFrame
+    # Fail early with a clear message instead of a DuckDB I/O error
+    if !isfile(path)
+        error("File does not exist: $path")
+    end
+
     check_for_duplicate_keys(path)
-    CSV.read(path, DataFrame, header = 1)
+
+    # A fresh DuckDB handle used only for this query; closed in the `finally` below
+    db = DuckDB.DB()
+    try
+        # The path is spliced into SQL text, so double any single quotes in it
+        escaped_path = replace(path, "'" => "''")
+
+        # Lowercase before testing the extension: the file lookup can return names
+        # that differ in case (e.g. `.PARQUET`), which must not be parsed as CSV
+        is_parquet = endswith(lowercase(path), ".parquet")
+        reader = is_parquet ? "read_parquet" : "read_csv_auto"
+        query = "SELECT * FROM $(reader)('$escaped_path')"
+
+        return DuckDB.execute(db, query) |> DataFrame
+    finally
+        DuckDB.close(db)
+    end
 end
 
 function find_matrix_columns_in_dataframe(df::DataFrame,