Zarr Operations Cookbook

The hardware and bandwidth for this mirror is donated by dogado GmbH, the Webhosting and Full Service-Cloud Provider. Check out our Wordpress Tutorial.
If you wish to report a bug, or if you are interested in having us mirror your free-software or open-source project, please feel free to contact us at mirror[@]dogado.de.

This vignette covers common Zarr array operations: persistent storage, compression, resizing, appending, filters, and advanced indexing.

Persistent arrays

Create an array on disk, close the session, and reopen it later.

path <- file.path(tempdir(), "example.zarr")

# Create a persistent array backed by a DirectoryStore
z <- zarr_open_array(
  store = path, mode = "w",
  shape = c(5, 10), chunks = c(5, 5), dtype = "<f4"
)

# Write data
z$set_item("...", array(1:50, dim = c(5, 10)))
#> NULL

z$get_shape()
#> [1]  5 10

Reopen the same path in read mode:

z2 <- zarr_open_array(store = path, mode = "r")

z2$get_shape()
#> [1]  5 10

z2$get_item("...")$data
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,]    1    6   11   16   21   26   31   36   41    46
#> [2,]    2    7   12   17   22   27   32   37   42    47
#> [3,]    3    8   13   18   23   28   33   38   43    48
#> [4,]    4    9   14   19   24   29   34   39   44    49
#> [5,]    5   10   15   20   25   30   35   40   45    50

For quick save/load of an existing array:

save_path <- file.path(tempdir(), "saved.zarr")

# Wrap an R matrix in a Zarr array, then save it to disk
zarr_save_array(save_path, zarr_create_array(
  data = volcano, shape = dim(volcano), dtype = "<f8"
))
#> <ZarrArray> /
#>   Shape       : (87, 61)
#>   Chunks      : (87, 61)
#>   Data type   : <f8
#>   Fill value  : 0
#>   Order       : F
#>   Read-only   : FALSE
#>   Compressor  : ZstdCodec
#>   Store type  : DirectoryStore
#>   Zarr format : 2

# Reopen
z3 <- zarr_open_array(save_path, mode = "r")

all.equal(z3$as.array(), volcano)
#> [1] TRUE

Compression

By default, pizzarr uses Zstandard compression. You can choose a different compressor when creating an array.

Zstandard (default)

z_zstd <- zarr_create(
  shape = c(100, 100), dtype = "<f4",
  compressor = ZstdCodec$new(level = 3)
)

z_zstd$get_compressor()$get_config()
#> $id
#> [x] "zstd"
#> 
#> $level
#> [x] 3

Gzip

Gzip compression is interoperable with zarr-python and other implementations, but is slower than Zstandard because R lacks an in-memory gzip API. For best write performance, prefer ZstdCodec.

z_gzip <- zarr_create(
  shape = c(100, 100), dtype = "<f4",
  compressor = GzipCodec$new(level = 5)
)

z_gzip$get_compressor()$get_config()
#> $id
#> [x] "gzip"
#> 
#> $level
#> [x] 5

Blosc (with algorithm selection)

z_blosc <- zarr_create(
  shape = c(100, 100), dtype = "<f4",
  compressor = BloscCodec$new(cname = "lz4", clevel = 5, shuffle = TRUE)
)

z_blosc$get_compressor()$get_config()
#> $id
#> [x] "blosc"
#> 
#> $cname
#> [x] "lz4"
#> 
#> $clevel
#> [x] 5
#> 
#> $shuffle
#> [x] 1
#> 
#> $blocksize
#> [x] 0

No compression

z_none <- zarr_create(
  shape = c(100, 100), dtype = "<f4",
  compressor = NA
)

is.na(z_none$get_compressor())
#> Warning in is.na(z_none$get_compressor()): is.na() applied to non-(list or
#> vector) of type 'environment'
#> [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE

Resizing arrays

Arrays can be resized after creation. Data in the overlapping region is preserved; new regions are filled with the fill value.

z <- zarr_create(
  shape = c(5, 10), chunks = c(5, 5),
  dtype = "<i4", fill_value = 0L,
  compressor = "default"
)

z$set_item("...", array(1:50, dim = c(5, 10)))
#> NULL

z$get_shape()
#> [1]  5 10

# Grow the array
z$resize(10, 20)

z$get_shape()
#> [1] 10 20

# Original data is preserved in the top-left corner
z[1:5, 1:10]$data
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,]    1    6   11   16   21   26   31   36   41    46
#> [2,]    2    7   12   17   22   27   32   37   42    47
#> [3,]    3    8   13   18   23   28   33   38   43    48
#> [4,]    4    9   14   19   24   29   34   39   44    49
#> [5,]    5   10   15   20   25   30   35   40   45    50

# New region is filled with fill_value
z[6:10, 1:5]$data
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]    0    0    0    0    0
#> [2,]    0    0    0    0    0
#> [3,]    0    0    0    0    0
#> [4,]    0    0    0    0    0
#> [5,]    0    0    0    0    0

Shrinking removes chunks that fall outside the new shape:

z$resize(3, 4)

z$get_shape()
#> [1] 3 4

z$get_item("...")$data
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    6   11   16
#> [2,]    2    7   12   17
#> [3,]    3    8   13   18

Appending data

Use append() to grow an array along an axis, adding new data at the end. This is equivalent to zarr-python’s z.append(data, axis=0), but uses R’s 1-based axis indexing (axis 1 = first dimension).

z <- zarr_create(
  shape = c(3, 4), chunks = c(3, 4),
  dtype = "<i4", fill_value = 0L
)

z$set_item("...", array(1:12, dim = c(3, 4)))
#> NULL

z$as.array()
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12

Append new rows (axis 1, the default):

new_rows <- array(13:20, dim = c(2, 4))

z$append(new_rows)
#> NULL

z$get_shape()
#> [1] 5 4

z$as.array()
#>      [,1] [,2] [,3] [,4]
#> [1,]    1    4    7   10
#> [2,]    2    5    8   11
#> [3,]    3    6    9   12
#> [4,]   13   15   17   19
#> [5,]   14   16   18   20

Append new columns (axis 2):

new_cols <- array(21:30, dim = c(5, 2))

z$append(new_cols, axis = 2)
#> NULL

z$get_shape()
#> [1] 5 6

z$as.array()
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    1    4    7   10   21   26
#> [2,]    2    5    8   11   22   27
#> [3,]    3    6    9   12   23   28
#> [4,]   13   15   17   19   24   29
#> [5,]   14   16   18   20   25   30

Filters

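Filters transform chunk data before compression. They are codec instances passed as a list to the filters parameter. A common use case is variable-length UTF-8 string arrays, which require VLenUtf8Codec as a filter. In the example below the codec is supplied through the object_codec argument, and get_filters() then lists it among the array's filters.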

words <- c("alpha", "bravo", "charlie", "delta")

z_str <- zarr_create_array(
  data = array(words, dim = length(words)),
  shape = length(words), dtype = "|O",
  object_codec = VLenUtf8Codec$new()
)

z_str$get_item("...")$data
#> [1] "alpha"   "bravo"   "charlie" "delta"

z_str$get_filters()
#> [[1]]
#> <VLenUtf8Codec>
#>   Inherits from: <Codec>
#>   Public:
#>     clone: function (deep = FALSE) 
#>     decode: function (buf, zarr_arr) 
#>     encode: function (buf, zarr_arr) 
#>     get_config: function ()

Advanced indexing

Beyond basic slicing with slice() or [, pizzarr supports orthogonal indexing for independent selection along each dimension.

Setup

z <- zarr_create_array(
  data = matrix(1:30, nrow = 5, ncol = 6),
  shape = c(5, 6), dtype = "<i4"
)

z$as.array()
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    1    6   11   16   21   26
#> [2,]    2    7   12   17   22   27
#> [3,]    3    8   13   18   23   28
#> [4,]    4    9   14   19   24   29
#> [5,]    5   10   15   20   25   30

Basic slicing with `[`

The bracket operator uses orthogonal indexing internally:

# Select rows 1-3, columns 2-4
z[1:3, 2:4]$data
#>      [,1] [,2] [,3]
#> [1,]    6   11   16
#> [2,]    7   12   17
#> [3,]    8   13   18

Orthogonal selection with integer arrays

Select specific rows and columns independently. Note that get_orthogonal_selection uses zero-based indices (like zarr-python), while the [ operator uses R’s one-based indexing:

z$get_orthogonal_selection(list(c(0L, 2L, 4L), zb_slice(0, 6)))$data
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    1    6   11   16   21   26
#> [2,]    3    8   13   18   23   28
#> [3,]    5   10   15   20   25   30

Boolean (mask) dimension indexing

Select positions along a dimension with a logical vector (TRUE keeps the position, FALSE drops it):

row_mask <- c(TRUE, FALSE, TRUE, FALSE, TRUE)

z$get_orthogonal_selection(list(row_mask, zb_slice(0, 6)))$data
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    1    6   11   16   21   26
#> [2,]    3    8   13   18   23   28
#> [3,]    5   10   15   20   25   30

Using the OIndex object

The $get_oindex() accessor provides the same orthogonal indexing, again with zero-based indices:

oi <- z$get_oindex()

oi$get_item(list(c(0L, 4L), c(1L, 3L, 5L)))$data
#>      [,1] [,2] [,3]
#> [1,]    6   16   26
#> [2,]   10   20   30

Slicing with step

Select every other row, every third column using seq() in bracket notation:

z[seq(1, 5, 2), seq(1, 6, 3)]$data
#>      [,1] [,2]
#> [1,]    1   16
#> [2,]    3   18
#> [3,]    5   20

Ellipsis and colon shorthand

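"..." selects all remaining dimensions; ":" selects all along one dimension. These work with get_item(). Note that integer indices passed to get_item() are zero-based, so 1 refers to the second row or column: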

# All rows, zero-based column index 1 (the second column)
z$get_item(list(":", 1))$data
#>      [,1]
#> [1,]    6
#> [2,]    7
#> [3,]    8
#> [4,]    9
#> [5,]   10

# Zero-based row index 1 (the second row), all columns
z$get_item(list(1, "..."))$data
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    2    7   12   17   22   27

These binaries (installable software) and packages are in development.
They may not be fully stable and should be used with caution. We make no claims about them.
Health stats visible at Monitor.

Zarr Operations Cookbook

Persistent arrays

Compression

Zstandard (default)

Gzip

Blosc (with algorithm selection)

No compression

Resizing arrays

Appending data

Filters

Advanced indexing

Setup

Basic slicing with [

Orthogonal selection with integer arrays

Boolean (mask) dimension indexing

Using the OIndex object

Slicing with step

Ellipsis and colon shorthand

Basic slicing with `[`