Skip to content

Commit 5aa136d

Browse files
committed
Add docs and validation for chunked disk vector formats; update utils, tests, and add vignette about chunking
1 parent ef3f81c commit 5aa136d

File tree

9 files changed

+461
-16
lines changed

9 files changed

+461
-16
lines changed

R/inspire_grid.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
#' This automatic limiting can be overridden by setting `options(gridmaker.tile_multiplier)`.
5555
#' **Note:** Parallel processing is not supported when `output_type = "spatraster"`.
5656
#' Raster output will always run sequentially.
57-
#' @param max_memory_gb A numeric value. Maximum memory in gigabytes to use for grid creation. Default is NULL, in which case there is an automatic limit of available system memory. The available memory detection may fail on certain HPC (High Performance Computing) systems where jobs are allocated a fixed amount of memory that is less than the total system memory of the allocated node.
57+
#' @param max_memory_gb A numeric value. Maximum memory in gigabytes to use for grid creation. Default is `NULL`, in which case there is an automatic limit based on **available free system memory** (not total system RAM). Using this argument allows manual override, which is recommended on certain HPC (High Performance Computing) systems where jobs are allocated a fixed amount of memory that is less than the total free memory of the allocated node.
5858
#' @inheritParams inspire_grid_params
5959
#'
6060
#' @return If \code{dsn} is \code{NULL} (the default), an \code{sf} object, \code{data.frame},

R/inspire_grid_params.R

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,21 @@
3030
#' `"path/to/grid.tif"` for raster data) or a database connection string.
3131
#' If \code{dsn} is provided, the grid is written to the specified location
3232
#' instead of being returned as an object.
33+
#'
34+
#' **Supported vector formats for chunked disk writes:**
35+
#' \itemize{
36+
#' \item `.gpkg` (GeoPackage) - **Recommended** - Best balance of speed, compatibility, and modern features
37+
#' \item `.shp` (Shapefile) - Widely used, fast writes, but has limitations (10-char field names, 2GB limit)
38+
#' \item `.geojson`, `.json` (GeoJSON) - Web-friendly, works but slower for large grids
39+
#' \item `.geojsonl`, `.geojsonseq` (GeoJSONSeq) - Newline-delimited GeoJSON
40+
#' \item `.sqlite` (SQLite/SpatiaLite) - Database format (GeoPackage is built on SQLite)
41+
#' \item `.fgb` (FlatGeobuf) - Cloud-optimized format
42+
#' \item `.gdb` (OpenFileGDB) - ESRI FileGDB format
43+
#' \item `.csv`, `.tsv`, `.txt` (delimited text, requires the `readr` package; geometry is dropped if the output type is spatial)
44+
#' }
45+
#'
46+
#'
47+
#' Formats known not to support appending (`.kml`, `.gml`) raise an error. Other formats not listed have not been tested and will generate a warning.
3348
#' @param layer The name of the grid layer, passed directly to `sf::st_write`.
3449
#' Its interpretation depends on the destination driver. For a GeoPackage
3550
#' file, this will be the layer name. If \code{dsn} is a file path and `layer` is
@@ -39,6 +54,7 @@
3954
#' When writing to spatial files via \code{dsn}, these are passed to \code{\link[sf]{st_write}}.
4055
#' For \code{output_type = "spatraster"} writing, these are passed to \code{\link[terra]{writeRaster}}.
4156
#' For streaming backends (`mirai` or sequential), this can include \code{max_cells_per_chunk} to control memory usage.
57+
#' @param max_memory_gb A numeric value. Maximum memory in gigabytes to use for grid creation. Default is `NULL`, in which case there is an automatic limit based on **available free system memory** (not total system RAM).
4258
#'
4359
#' @name inspire_grid_params
4460
#' @keywords internal

R/utils.R

Lines changed: 60 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -343,7 +343,9 @@ regex_match <- function(text, pattern, i = NULL, ...) {
343343
#' @keywords internal
344344
#' @noRd
345345
validate_disk_compatibility <- function(output_type, dsn) {
346-
if (is.null(dsn)) return(TRUE)
346+
if (is.null(dsn)) {
347+
return(TRUE)
348+
}
347349

348350
ext <- tolower(tools::file_ext(dsn))
349351
is_text <- ext %in% c("csv", "tsv", "txt")
@@ -352,6 +354,29 @@ validate_disk_compatibility <- function(output_type, dsn) {
352354
is_raster <- output_type == "spatraster"
353355
is_raster_format <- ext %in% c("tif", "tiff", "nc", "img", "asc", "grd")
354356

357+
# Vector formats that support append (required for chunked disk writes)
358+
# Empirically tested and confirmed to work:
359+
# - GeoPackage (.gpkg) and SQLite (.sqlite) - excellent append support
360+
# - Shapefile (.shp) - supports append (but has other limitations like field name length)
361+
# - GeoJSON (.geojson, .json) - supports append
362+
# - FlatGeobuf (.fgb) - cloud-optimized, supports append
363+
# - OpenFileGDB (.gdb) - ESRI FileGDB, supports append
364+
# - GeoJSONSeq (.geojsonl, .geojsonseq) - newline-delimited GeoJSON, supports append
365+
append_safe_vector_formats <- c(
366+
"gpkg",
367+
"sqlite",
368+
"shp",
369+
"geojson",
370+
"json",
371+
"fgb",
372+
"gdb",
373+
"geojsonl",
374+
"geojsonseq"
375+
)
376+
377+
# Formats explicitly confirmed to NOT support append
378+
no_append_formats <- c("kml", "gml")
379+
355380
# 1. Prevent Dataframe -> Spatial Vector Format (e.g. gpkg, shp)
356381
if (is_dataframe && !is_text && !is_raster_format) {
357382
stop(
@@ -374,10 +399,42 @@ validate_disk_compatibility <- function(output_type, dsn) {
374399
)
375400
}
376401

377-
# 3. Check for readr availability if text output is requested
402+
# 3. Validate vector format supports append (required for chunked disk writes)
403+
if (is_spatial_vector && !is_text) {
404+
if (ext %in% no_append_formats) {
405+
# Explicitly unsupported formats
406+
stop(
407+
sprintf(
408+
"Output type '%s' cannot be written to '.%s' format.\n The '.%s' format does not support appending to existing files.\n Supported vector formats: %s\n Or generate the grid in memory (dsn = NULL) and save manually.",
409+
output_type,
410+
ext,
411+
ext,
412+
paste0(".", append_safe_vector_formats, collapse = ", ")
413+
),
414+
call. = FALSE
415+
)
416+
} else if (!ext %in% append_safe_vector_formats) {
417+
# Unknown/untested formats - provide a warning but more permissive
418+
warning(
419+
sprintf(
420+
"Output type '%s' with '.%s' format has not been tested for append support.\n Tested formats: %s\n The operation may fail if this format does not support appending.",
421+
output_type,
422+
ext,
423+
paste0(".", append_safe_vector_formats, collapse = ", ")
424+
),
425+
call. = FALSE,
426+
immediate. = TRUE
427+
)
428+
}
429+
}
430+
431+
# 4. Check for readr availability if text output is requested
378432
if (is_text) {
379433
if (!requireNamespace("readr", quietly = TRUE)) {
380-
stop("Package 'readr' is required to write to .csv/.tsv/.txt files. Please install it.", call. = FALSE)
434+
stop(
435+
"Package 'readr' is required to write to .csv/.tsv/.txt files. Please install it.",
436+
call. = FALSE
437+
)
381438
}
382439
}
383440

@@ -405,7 +462,6 @@ write_grid_chunk <- function(chunk, dsn, layer, append, quiet, ...) {
405462

406463
# --- Text/Delimited Output (readr) ---
407464
if (ext %in% c("csv", "tsv", "txt")) {
408-
409465
# Drop geometry if it exists (e.g. user asked for sf_polygons but wrote to .csv)
410466
if (inherits(chunk, "sf")) {
411467
chunk <- sf::st_drop_geometry(chunk)
@@ -435,7 +491,6 @@ write_grid_chunk <- function(chunk, dsn, layer, append, quiet, ...) {
435491
)
436492

437493
do.call(readr::write_delim, call_args)
438-
439494
} else {
440495
# --- Spatial Output (sf) ---
441496
# sf::st_write accepts '...' for driver specific options

_pkgdown.yml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,14 @@ navbar:
3838
home:
3939
title: 'gridmaker: Create INSPIRE-compliant grids with IDs'
4040
description: 'Create INSPIRE-compliant grids with IDs'
41+
42+
articles:
43+
- title: "Get started"
44+
navbar: "Get Started"
45+
contents:
46+
- gridmaker
47+
48+
- title: "Tutorials"
49+
navbar: "Tutorials"
50+
contents:
51+
- grid-to-disk

man/inspire_grid.Rd

Lines changed: 16 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/inspire_grid_params.Rd

Lines changed: 17 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-write_to_disk.R

Lines changed: 58 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,10 @@ test_that("inspire_grid_from_extent streams correctly to disk with mirai backend
3232
# 3. GENERATE REFERENCE GRID (IN-MEMORY) ----
3333
# This is the "ground truth" that we will compare against.
3434
# It is run sequentially to ensure deterministic output.
35-
grid_in_memory <- do.call(inspire_grid_from_extent, c(common_args, list(parallel = FALSE)))
35+
grid_in_memory <- do.call(
36+
inspire_grid_from_extent,
37+
c(common_args, list(parallel = FALSE))
38+
)
3639

3740
# 4. RUN STREAMING GRID CREATION (ON-DISK) ----
3841
# Set up a 2-core mirai backend for the test
@@ -96,7 +99,10 @@ test_that("inspire_grid_from_extent handles `layer` argument correctly for disk
9699
dir.create(temp_dir)
97100
withr::defer(unlink(temp_dir, recursive = TRUE, force = TRUE))
98101

99-
simple_extent <- sf::st_bbox(c(xmin = 0, ymin = 0, xmax = 100, ymax = 100), crs = 3035)
102+
simple_extent <- sf::st_bbox(
103+
c(xmin = 0, ymin = 0, xmax = 100, ymax = 100),
104+
crs = 3035
105+
)
100106

101107
# Test 1: When layer is NULL, it defaults from dsn and gives a message
102108
dsn_default <- file.path(temp_dir, "default.gpkg")
@@ -157,7 +163,7 @@ test_that("inspire_grid_from_extent returns dsn invisibly when writing to disk",
157163
expect_true(file.exists(temp_dsn))
158164
})
159165

160-
test_that("validate_disk_compatibility throws correct errors", {
166+
test_that("validate_disk_compatibility validates formats correctly", {
161167
# Error: Dataframe -> GPKG
162168
expect_error(
163169
validate_disk_compatibility("dataframe", "test.gpkg"),
@@ -168,10 +174,38 @@ test_that("validate_disk_compatibility throws correct errors", {
168174
skip_if_not_installed("readr")
169175
expect_true(validate_disk_compatibility("dataframe", "test.csv"))
170176

171-
# Success: SF -> GPKG
177+
# Success: Tested vector formats that support append
172178
expect_true(validate_disk_compatibility("sf_polygons", "test.gpkg"))
179+
expect_true(validate_disk_compatibility("sf_polygons", "test.sqlite"))
180+
expect_true(validate_disk_compatibility("sf_polygons", "test.shp"))
181+
expect_true(validate_disk_compatibility("sf_polygons", "test.geojson"))
182+
expect_true(validate_disk_compatibility("sf_polygons", "test.json"))
183+
184+
# Error: KML and GML explicitly cannot append
185+
expect_error(
186+
validate_disk_compatibility("sf_polygons", "test.kml"),
187+
"cannot be written to '.kml' format"
188+
)
189+
190+
expect_error(
191+
validate_disk_compatibility("sf_polygons", "test.gml"),
192+
"cannot be written to '.gml' format"
193+
)
194+
195+
# Success: Newly added tested formats
196+
expect_true(validate_disk_compatibility("sf_polygons", "test.fgb"))
197+
expect_true(validate_disk_compatibility("sf_polygons", "test.gdb"))
198+
expect_true(validate_disk_compatibility("sf_polygons", "test.geojsonl"))
199+
expect_true(validate_disk_compatibility("sf_polygons", "test.geojsonseq"))
200+
201+
# Warning: Truly untested format (e.g., MapInfo TAB)
202+
expect_warning(
203+
validate_disk_compatibility("sf_polygons", "test.tab"),
204+
"has not been tested for append support"
205+
)
173206
})
174207

208+
175209
test_that("inspire_grid_from_ids writes dataframe to CSV correctly (with chunking)", {
176210
skip_if_not_installed("readr")
177211
skip_if_not_installed("sf")
@@ -208,7 +242,10 @@ test_that("inspire_grid_from_extent streams to CSV (dropping geometry)", {
208242
# Setup: Create a grid that will definitely be chunked (small RAM limit sim or just standard stream)
209243
# We use standard stream_grid_sequential via inspire_grid_from_extent by not setting parallel
210244

211-
simple_extent <- sf::st_bbox(c(xmin = 0, ymin = 0, xmax = 20000, ymax = 20000), crs = 3035)
245+
simple_extent <- sf::st_bbox(
246+
c(xmin = 0, ymin = 0, xmax = 20000, ymax = 20000),
247+
crs = 3035
248+
)
212249
tmp_csv <- tempfile(fileext = ".csv")
213250
on.exit(unlink(tmp_csv), add = TRUE)
214251

@@ -266,7 +303,14 @@ test_that("CSV writing respects extra arguments (e.g. na string)", {
266303
chunk <- data.frame(a = c(1, NA), b = c("x", "y"))
267304

268305
# Write with custom NA string
269-
write_grid_chunk(chunk, tmp_csv, layer = NULL, append = FALSE, quiet = TRUE, na = "MISSING")
306+
write_grid_chunk(
307+
chunk,
308+
tmp_csv,
309+
layer = NULL,
310+
append = FALSE,
311+
quiet = TRUE,
312+
na = "MISSING"
313+
)
270314

271315
# Read back raw text to verify "MISSING" is there
272316
lines <- readLines(tmp_csv)
@@ -287,7 +331,14 @@ test_that("CSV writing respects extra arguments (e.g. na string)", {
287331
# Verify pass-through of 'quote' arg
288332
tmp_csv3 <- tempfile(fileext = ".csv")
289333
on.exit(unlink(tmp_csv3), add = TRUE)
290-
write_grid_chunk(chunk, tmp_csv3, layer = NULL, append = FALSE, quiet = TRUE, quote = "all")
334+
write_grid_chunk(
335+
chunk,
336+
tmp_csv3,
337+
layer = NULL,
338+
append = FALSE,
339+
quiet = TRUE,
340+
quote = "all"
341+
)
291342
lines3 <- readLines(tmp_csv3)
292343
# Look for quoted character values like "x" (quote = "all" quotes character columns)
293344
expect_true(any(grepl('"x"', lines3)))

vignettes/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,5 @@
22
*.R
33

44
/.quarto/
5+
6+
**/*.quarto_ipynb

0 commit comments

Comments
 (0)