zarr-python by davila7/claude-code-templates
npx skills add https://github.com/davila7/claude-code-templates --skill zarr-python
Zarr 是一个用于存储大型 N 维数组的 Python 库,支持分块和压缩。应用此技能可实现高效的并行 I/O、云原生工作流,以及与 NumPy、Dask 和 Xarray 的无缝集成。
uv pip install zarr
需要 Python 3.11+。如需云存储支持,请安装额外的包:
uv pip install s3fs # 用于 S3
uv pip install gcsfs # 用于 Google Cloud Storage
import zarr
import numpy as np
# 创建具有分块和压缩功能的 2D 数组
z = zarr.create_array(
store="data/my_array.zarr",
shape=(10000, 10000),
chunks=(1000, 1000),
dtype="f4"
)
# 使用 NumPy 风格的索引写入数据
z[:, :] = np.random.random((10000, 10000))
# 读取数据
data = z[0:100, 0:100] # 返回 NumPy 数组
Zarr 提供了多种便捷函数用于数组创建:
# 创建空数组
z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000), dtype='f4',
store='data.zarr')
# 创建填充数组
z = zarr.ones((5000, 5000), chunks=(500, 500))
z = zarr.full((1000, 1000), fill_value=42, chunks=(100, 100))
# 从现有数据创建
data = np.arange(10000).reshape(100, 100)
z = zarr.array(data, chunks=(10, 10), store='data.zarr')
# 创建类似数组
z2 = zarr.zeros_like(z) # 匹配 z 的形状、分块和数据类型
广告位招租
在这里展示您的产品或服务
触达数万 AI 开发者,精准高效
# 打开数组(默认为读写模式)
z = zarr.open_array('data.zarr', mode='r+')
# 只读模式
z = zarr.open_array('data.zarr', mode='r')
# open() 函数自动检测数组与组
z = zarr.open('data.zarr') # 返回 Array 或 Group
Zarr 数组支持类似 NumPy 的索引:
# 写入整个数组
z[:] = 42
# 写入切片
z[0, :] = np.arange(100)
z[10:20, 50:60] = np.random.random((10, 10))
# 读取数据(返回 NumPy 数组)
data = z[0:100, 0:100]
row = z[5, :]
# 高级索引
z.vindex[[0, 5, 10], [2, 8, 15]] # 坐标索引
z.oindex[0:10, [5, 10, 15]] # 正交索引
z.blocks[0, 0] # 块/分块索引
# 调整数组大小
z.resize((15000, 15000)) # 扩展或缩小维度(新形状以元组传入)
# 沿轴追加数据
z.append(np.random.random((1000, 10000)), axis=0) # 添加行
分块对性能至关重要。根据访问模式选择分块大小和形状。
最小分块大小 :建议 1 MB 以获得最佳性能
平衡 :更大的分块 = 更少的元数据操作;更小的分块 = 更好的并行访问
内存考虑 :压缩期间整个分块必须能放入内存
z = zarr.zeros(
    shape=(10000, 10000),
    chunks=(512, 512),  # ~1MB 分块
    dtype='f4'
)
关键 :分块形状根据数据访问方式会显著影响性能。
# 如果频繁访问行(第一维度)
z = zarr.zeros((10000, 10000), chunks=(10, 10000)) # 分块跨越列
# 如果频繁访问列(第二维度)
z = zarr.zeros((10000, 10000), chunks=(10000, 10)) # 分块跨越行
# 对于混合访问模式(平衡方法)
z = zarr.zeros((10000, 10000), chunks=(1000, 1000)) # 方形分块
性能示例 :对于 (200, 200, 200) 数组,沿第一维度读取:
当数组有数百万个小分块时,使用分片将分块分组到更大的存储对象中:
from zarr.codecs import ShardingCodec, BytesCodec
from zarr.codecs.blosc import BloscCodec
# 创建带分片的数组
z = zarr.create_array(
store='data.zarr',
shape=(100000, 100000),
chunks=(100, 100), # 用于访问的小分块
shards=(1000, 1000), # 每个分片包含 100 个分块
dtype='f4'
)
优势 :
重要 :写入前整个分片必须能放入内存。
Zarr 对每个分块应用压缩以减少存储空间,同时保持快速访问。
from zarr.codecs.blosc import BloscCodec
from zarr.codecs import GzipCodec, ZstdCodec
# 默认:使用 Zstandard 的 Blosc
z = zarr.zeros((1000, 1000), chunks=(100, 100)) # 使用默认压缩
# 配置 Blosc 编解码器
z = zarr.create_array(
store='data.zarr',
shape=(1000, 1000),
chunks=(100, 100),
dtype='f4',
codecs=[BloscCodec(cname='zstd', clevel=5, shuffle='shuffle')]
)
# 可用的 Blosc 压缩器:'blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd'
# 使用 Gzip 压缩
z = zarr.create_array(
store='data.zarr',
shape=(1000, 1000),
chunks=(100, 100),
dtype='f4',
codecs=[GzipCodec(level=6)]
)
# 禁用压缩
z = zarr.create_array(
store='data.zarr',
shape=(1000, 1000),
chunks=(100, 100),
dtype='f4',
codecs=[BytesCodec()] # 无压缩
)
Blosc(默认):快速压缩/解压缩,适用于交互式工作负载
Zstandard :更好的压缩比,比 LZ4 稍慢
Gzip :最大压缩,性能较慢
LZ4 :最快的压缩,压缩比较低
Shuffle :对数值数据启用 shuffle 过滤器以获得更好的压缩
codecs=[BloscCodec(cname='zstd', clevel=5, shuffle='shuffle')]
codecs=[BloscCodec(cname='lz4', clevel=1)]
codecs=[GzipCodec(level=9)]
Zarr 通过灵活的存储接口支持多种存储后端。
from zarr.storage import LocalStore
# 显式创建存储
store = LocalStore('data/my_array.zarr')
z = zarr.open_array(store=store, mode='w', shape=(1000, 1000), chunks=(100, 100))
# 或使用字符串路径(自动创建 LocalStore)
z = zarr.open_array('data/my_array.zarr', mode='w', shape=(1000, 1000),
chunks=(100, 100))
from zarr.storage import MemoryStore
# 创建内存存储
store = MemoryStore()
z = zarr.open_array(store=store, mode='w', shape=(1000, 1000), chunks=(100, 100))
# 数据仅存在于内存中,不持久化
from zarr.storage import ZipStore
# 写入 ZIP 文件
store = ZipStore('data.zip', mode='w')
z = zarr.open_array(store=store, mode='w', shape=(1000, 1000), chunks=(100, 100))
z[:] = np.random.random((1000, 1000))
store.close() # 重要:必须关闭 ZipStore
# 从 ZIP 文件读取
store = ZipStore('data.zip', mode='r')
z = zarr.open_array(store=store)
data = z[:]
store.close()
import s3fs
import zarr
# S3 存储
s3 = s3fs.S3FileSystem(anon=False) # 使用凭据
store = s3fs.S3Map(root='my-bucket/path/to/array.zarr', s3=s3)
z = zarr.open_array(store=store, mode='w', shape=(1000, 1000), chunks=(100, 100))
z[:] = data
# Google Cloud Storage
import gcsfs
gcs = gcsfs.GCSFileSystem(project='my-project')
store = gcsfs.GCSMap(root='my-bucket/path/to/array.zarr', gcs=gcs)
z = zarr.open_array(store=store, mode='w', shape=(1000, 1000), chunks=(100, 100))
云存储最佳实践 :
zarr.consolidate_metadata(store)
组用于分层组织多个数组,类似于目录或 HDF5 组。
# 创建根组
root = zarr.group(store='data/hierarchy.zarr')
# 创建子组
temperature = root.create_group('temperature')
precipitation = root.create_group('precipitation')
# 在组内创建数组
temp_array = temperature.create_array(
name='t2m',
shape=(365, 720, 1440),
chunks=(1, 720, 1440),
dtype='f4'
)
precip_array = precipitation.create_array(
name='prcp',
shape=(365, 720, 1440),
chunks=(1, 720, 1440),
dtype='f4'
)
# 使用路径访问
array = root['temperature/t2m']
# 可视化层次结构
print(root.tree())
# 输出:
# /
# ├── temperature
# │ └── t2m (365, 720, 1440) f4
# └── precipitation
# └── prcp (365, 720, 1440) f4
Zarr 为熟悉 HDF5 的用户提供了 h5py 兼容的接口:
# 使用 h5py 风格的方法创建组
root = zarr.group('data.zarr')
dataset = root.create_dataset('my_data', shape=(1000, 1000), chunks=(100, 100),
dtype='f4')
# 像 h5py 一样访问
grp = root.require_group('subgroup')
arr = grp.require_dataset('array', shape=(500, 500), chunks=(50, 50), dtype='i4')
使用属性将自定义元数据附加到数组和组:
# 向数组添加属性
z = zarr.zeros((1000, 1000), chunks=(100, 100))
z.attrs['description'] = 'Temperature data in Kelvin'
z.attrs['units'] = 'K'
z.attrs['created'] = '2024-01-15'
z.attrs['processing_version'] = 2.1
# 属性以 JSON 格式存储
print(z.attrs['units']) # 输出:K
# 向组添加属性
root = zarr.group('data.zarr')
root.attrs['project'] = 'Climate Analysis'
root.attrs['institution'] = 'Research Institute'
# 属性随数组/组持久化
z2 = zarr.open('data.zarr')
print(z2.attrs['description'])
重要 :属性必须是 JSON 可序列化的(字符串、数字、列表、字典、布尔值、null)。
Zarr 数组实现了 NumPy 数组接口:
import numpy as np
import zarr
z = zarr.zeros((1000, 1000), chunks=(100, 100))
# 直接使用 NumPy 函数
result = np.sum(z, axis=0) # NumPy 操作 Zarr 数组
mean = np.mean(z[:100, :100])
# 转换为 NumPy 数组
numpy_array = z[:] # 将整个数组加载到内存中
Dask 在 Zarr 数组上提供惰性、并行计算:
import dask.array as da
import zarr
# 创建大型 Zarr 数组
z = zarr.open('data.zarr', mode='w', shape=(100000, 100000),
chunks=(1000, 1000), dtype='f4')
# 作为 Dask 数组加载(惰性,不加载数据)
dask_array = da.from_zarr('data.zarr')
# 执行计算(并行,核外)
result = dask_array.mean(axis=0).compute() # 并行计算
# 将 Dask 数组写入 Zarr
large_array = da.random.random((100000, 100000), chunks=(1000, 1000))
da.to_zarr(large_array, 'output.zarr')
优势 :
Xarray 提供带有 Zarr 后端的带标签的多维数组:
import xarray as xr
import zarr
# 将 Zarr 存储作为 Xarray Dataset 打开(惰性加载)
ds = xr.open_zarr('data.zarr')
# Dataset 包含坐标和元数据
print(ds)
# 访问变量
temperature = ds['temperature']
# 执行带标签的操作
subset = ds.sel(time='2024-01', lat=slice(30, 60))
# 将 Xarray Dataset 写入 Zarr
ds.to_zarr('output.zarr')
# 使用坐标从头创建
ds = xr.Dataset(
{
'temperature': (['time', 'lat', 'lon'], data),
'precipitation': (['time', 'lat', 'lon'], data2)
},
coords={
'time': pd.date_range('2024-01-01', periods=365),
'lat': np.arange(-90, 91, 1),
'lon': np.arange(-180, 180, 1)
}
)
ds.to_zarr('climate_data.zarr')
优势 :
from zarr import ThreadSynchronizer
import zarr
# 用于多线程写入
synchronizer = ThreadSynchronizer()
z = zarr.open_array('data.zarr', mode='r+', shape=(10000, 10000),
chunks=(1000, 1000), synchronizer=synchronizer)
# 支持来自多个线程的并发写入
# (当写入不跨越分块边界时)
from zarr import ProcessSynchronizer
import zarr
# 用于多进程写入
synchronizer = ProcessSynchronizer('sync_data.sync')
z = zarr.open_array('data.zarr', mode='r+', shape=(10000, 10000),
chunks=(1000, 1000), synchronizer=synchronizer)
# 支持来自多个进程的并发写入
注意 :
对于具有许多数组的分层存储,将元数据合并到单个文件中以减少 I/O 操作:
import zarr
# 创建数组/组后
root = zarr.group('data.zarr')
# ... 创建多个数组/组 ...
# 合并元数据
zarr.consolidate_metadata('data.zarr')
# 使用合并的元数据打开(更快,尤其是在云存储上)
root = zarr.open_consolidated('data.zarr')
优势 :
tree() 操作和组遍历
注意事项 :
分块大小 :目标为每个分块 1-10 MB
# 对于 float32:1MB = 262,144 个元素
chunks = (512, 512) # 512×512×4 字节 = ~1MB
分块形状 :与访问模式对齐
# 行式访问 → 分块跨越列:(小, 大)
# 列式访问 → 分块跨越行:(大, 小)
# 随机访问 → 平衡:(中, 中)
压缩 :根据工作负载选择
# 交互式/快速:BloscCodec(cname='lz4')
# 平衡:BloscCodec(cname='zstd', clevel=5)
# 最大压缩:GzipCodec(level=9)
存储后端 :与环境匹配
# 本地:LocalStore(默认)
# 云:S3Map/GCSMap 配合合并元数据
# 临时:MemoryStore
分片 :用于大规模数据集
# 当你有数百万个小分块时
shards=(10*chunk_size, 10*chunk_size)
并行 I/O :对大型操作使用 Dask
import dask.array as da
dask_array = da.from_zarr('data.zarr')
result = dask_array.compute(scheduler='threads', num_workers=8)
# 打印详细的数组信息
print(z.info)
# 输出包括:
# - 类型、形状、分块、数据类型
# - 压缩编解码器和级别
# - 存储大小(压缩 vs 未压缩)
# - 存储位置
# 检查存储大小
print(f"压缩大小:{z.nbytes_stored / 1e6:.2f} MB")
print(f"未压缩大小:{z.nbytes / 1e6:.2f} MB")
print(f"压缩比:{z.nbytes / z.nbytes_stored:.2f}x")
# 将时间序列存储为时间作为第一维度
# 这允许高效追加新的时间步长
z = zarr.open('timeseries.zarr', mode='a',
shape=(0, 720, 1440), # 从 0 个时间步长开始
chunks=(1, 720, 1440), # 每个分块一个时间步长
dtype='f4')
# 追加新的时间步长
new_data = np.random.random((1, 720, 1440))
z.append(new_data, axis=0)
import dask.array as da
# 在 Zarr 中创建大型矩阵
z = zarr.open('matrix.zarr', mode='w',
shape=(100000, 100000),
chunks=(1000, 1000),
dtype='f8')
# 使用 Dask 进行并行计算
dask_z = da.from_zarr('matrix.zarr')
result = (dask_z @ dask_z.T).compute() # 并行矩阵乘法
import s3fs
import zarr
# 写入 S3
s3 = s3fs.S3FileSystem()
store = s3fs.S3Map(root='s3://my-bucket/data.zarr', s3=s3)
# 创建适合云存储的分块数组
z = zarr.open_array(store=store, mode='w',
shape=(10000, 10000),
chunks=(500, 500), # ~1MB 分块
dtype='f4')
z[:] = data
# 合并元数据以加速读取
zarr.consolidate_metadata(store)
# 从 S3 读取(随时随地)
store_read = s3fs.S3Map(root='s3://my-bucket/data.zarr', s3=s3)
z_read = zarr.open_consolidated(store_read)
subset = z_read[0:100, 0:100]
# HDF5 转 Zarr
import h5py
import zarr
with h5py.File('data.h5', 'r') as h5:
dataset = h5['dataset_name']
z = zarr.array(dataset[:],
chunks=(1000, 1000),
store='data.zarr')
# NumPy 转 Zarr
import numpy as np
data = np.load('data.npy')
z = zarr.array(data, chunks='auto', store='data.zarr')
# Zarr 转 NetCDF(通过 Xarray)
import xarray as xr
ds = xr.open_zarr('data.zarr')
ds.to_netcdf('data.nc')
诊断 :检查分块大小和对齐情况
print(z.chunks) # 分块大小是否合适?
print(z.info) # 检查压缩比
解决方案 :
原因 :将整个数组或大分块加载到内存中
解决方案 :
# 不要加载整个数组
# 错误:data = z[:]
# 正确:分块处理
for i in range(0, z.shape[0], 1000):
chunk = z[i:i+1000, :]
process(chunk)
# 或使用 Dask 进行自动分块
import dask.array as da
dask_z = da.from_zarr('data.zarr')
result = dask_z.mean().compute() # 分块处理
解决方案 :
# 1. 合并元数据
zarr.consolidate_metadata(store)
z = zarr.open_consolidated(store)
# 2. 使用合适的分块大小(云存储建议 5-100 MB)
chunks = (2000, 2000) # 云存储使用更大的分块
# 3. 启用分片
shards = (10000, 10000) # 将许多分块分组
解决方案 :使用同步器或确保非重叠写入
from zarr import ProcessSynchronizer
sync = ProcessSynchronizer('sync.sync')
z = zarr.open_array('data.zarr', mode='r+', synchronizer=sync)
# 或设计工作流使每个进程写入独立的分块
如需详细的 API 文档、高级用法和最新更新:
相关库 :
每周安装数
124
仓库
GitHub 星标数
22.6K
首次出现
Jan 21, 2026
安全审计
安装于
claude-code107
opencode99
cursor95
gemini-cli92
antigravity84
codex83
Zarr is a Python library for storing large N-dimensional arrays with chunking and compression. Apply this skill for efficient parallel I/O, cloud-native workflows, and seamless integration with NumPy, Dask, and Xarray.
uv pip install zarr
Requires Python 3.11+. For cloud storage support, install additional packages:
uv pip install s3fs # For S3
uv pip install gcsfs # For Google Cloud Storage
import zarr
import numpy as np
# Create a 2D array with chunking and compression
z = zarr.create_array(
store="data/my_array.zarr",
shape=(10000, 10000),
chunks=(1000, 1000),
dtype="f4"
)
# Write data using NumPy-style indexing
z[:, :] = np.random.random((10000, 10000))
# Read data
data = z[0:100, 0:100] # Returns NumPy array
Zarr provides multiple convenience functions for array creation:
# Create empty array
z = zarr.zeros(shape=(10000, 10000), chunks=(1000, 1000), dtype='f4',
store='data.zarr')
# Create filled arrays
z = zarr.ones((5000, 5000), chunks=(500, 500))
z = zarr.full((1000, 1000), fill_value=42, chunks=(100, 100))
# Create from existing data
data = np.arange(10000).reshape(100, 100)
z = zarr.array(data, chunks=(10, 10), store='data.zarr')
# Create like another array
z2 = zarr.zeros_like(z) # Matches shape, chunks, dtype of z
# Open array (read/write mode by default)
z = zarr.open_array('data.zarr', mode='r+')
# Read-only mode
z = zarr.open_array('data.zarr', mode='r')
# The open() function auto-detects arrays vs groups
z = zarr.open('data.zarr') # Returns Array or Group
Zarr arrays support NumPy-like indexing:
# Write entire array
z[:] = 42
# Write slices
z[0, :] = np.arange(100)
z[10:20, 50:60] = np.random.random((10, 10))
# Read data (returns NumPy array)
data = z[0:100, 0:100]
row = z[5, :]
# Advanced indexing
z.vindex[[0, 5, 10], [2, 8, 15]] # Coordinate indexing
z.oindex[0:10, [5, 10, 15]] # Orthogonal indexing
z.blocks[0, 0] # Block/chunk indexing
# Resize array
z.resize((15000, 15000)) # Expands or shrinks dimensions (new shape passed as a tuple)
# Append data along an axis
z.append(np.random.random((1000, 10000)), axis=0) # Adds rows
Chunking is critical for performance. Choose chunk sizes and shapes based on access patterns.
Minimum chunk size : 1 MB recommended for optimal performance
Balance : Larger chunks = fewer metadata operations; smaller chunks = better parallel access
Memory consideration : Entire chunks must fit in memory during compression
z = zarr.zeros(
    shape=(10000, 10000),
    chunks=(512, 512),  # ~1MB chunks
    dtype='f4'
)
Critical : Chunk shape dramatically affects performance based on how data is accessed.
# If accessing rows frequently (first dimension)
z = zarr.zeros((10000, 10000), chunks=(10, 10000)) # Chunk spans columns
# If accessing columns frequently (second dimension)
z = zarr.zeros((10000, 10000), chunks=(10000, 10)) # Chunk spans rows
# For mixed access patterns (balanced approach)
z = zarr.zeros((10000, 10000), chunks=(1000, 1000)) # Square chunks
Performance example : For a (200, 200, 200) array, reading along the first dimension:
When arrays have millions of small chunks, use sharding to group chunks into larger storage objects:
from zarr.codecs import ShardingCodec, BytesCodec
from zarr.codecs.blosc import BloscCodec
# Create array with sharding
z = zarr.create_array(
store='data.zarr',
shape=(100000, 100000),
chunks=(100, 100), # Small chunks for access
shards=(1000, 1000), # Groups 100 chunks per shard
dtype='f4'
)
Benefits :
Important : Entire shards must fit in memory before writing.
Zarr applies compression per chunk to reduce storage while maintaining fast access.
from zarr.codecs.blosc import BloscCodec
from zarr.codecs import GzipCodec, ZstdCodec
# Default: Blosc with Zstandard
z = zarr.zeros((1000, 1000), chunks=(100, 100)) # Uses default compression
# Configure Blosc codec
z = zarr.create_array(
store='data.zarr',
shape=(1000, 1000),
chunks=(100, 100),
dtype='f4',
codecs=[BloscCodec(cname='zstd', clevel=5, shuffle='shuffle')]
)
# Available Blosc compressors: 'blosclz', 'lz4', 'lz4hc', 'snappy', 'zlib', 'zstd'
# Use Gzip compression
z = zarr.create_array(
store='data.zarr',
shape=(1000, 1000),
chunks=(100, 100),
dtype='f4',
codecs=[GzipCodec(level=6)]
)
# Disable compression
z = zarr.create_array(
store='data.zarr',
shape=(1000, 1000),
chunks=(100, 100),
dtype='f4',
codecs=[BytesCodec()] # No compression
)
Blosc (default): Fast compression/decompression, good for interactive workloads
Zstandard : Better compression ratios, slightly slower than LZ4
Gzip : Maximum compression, slower performance
LZ4 : Fastest compression, lower ratios
Shuffle : Enable shuffle filter for better compression on numeric data
codecs=[BloscCodec(cname='zstd', clevel=5, shuffle='shuffle')]
codecs=[BloscCodec(cname='lz4', clevel=1)]
codecs=[GzipCodec(level=9)]
Zarr supports multiple storage backends through a flexible storage interface.
from zarr.storage import LocalStore
# Explicit store creation
store = LocalStore('data/my_array.zarr')
z = zarr.open_array(store=store, mode='w', shape=(1000, 1000), chunks=(100, 100))
# Or use string path (creates LocalStore automatically)
z = zarr.open_array('data/my_array.zarr', mode='w', shape=(1000, 1000),
chunks=(100, 100))
from zarr.storage import MemoryStore
# Create in-memory store
store = MemoryStore()
z = zarr.open_array(store=store, mode='w', shape=(1000, 1000), chunks=(100, 100))
# Data exists only in memory, not persisted
from zarr.storage import ZipStore
# Write to ZIP file
store = ZipStore('data.zip', mode='w')
z = zarr.open_array(store=store, mode='w', shape=(1000, 1000), chunks=(100, 100))
z[:] = np.random.random((1000, 1000))
store.close() # IMPORTANT: Must close ZipStore
# Read from ZIP file
store = ZipStore('data.zip', mode='r')
z = zarr.open_array(store=store)
data = z[:]
store.close()
import s3fs
import zarr
# S3 storage
s3 = s3fs.S3FileSystem(anon=False) # Use credentials
store = s3fs.S3Map(root='my-bucket/path/to/array.zarr', s3=s3)
z = zarr.open_array(store=store, mode='w', shape=(1000, 1000), chunks=(100, 100))
z[:] = data
# Google Cloud Storage
import gcsfs
gcs = gcsfs.GCSFileSystem(project='my-project')
store = gcsfs.GCSMap(root='my-bucket/path/to/array.zarr', gcs=gcs)
z = zarr.open_array(store=store, mode='w', shape=(1000, 1000), chunks=(100, 100))
Cloud Storage Best Practices :
zarr.consolidate_metadata(store)
Groups organize multiple arrays hierarchically, similar to directories or HDF5 groups.
# Create root group
root = zarr.group(store='data/hierarchy.zarr')
# Create sub-groups
temperature = root.create_group('temperature')
precipitation = root.create_group('precipitation')
# Create arrays within groups
temp_array = temperature.create_array(
name='t2m',
shape=(365, 720, 1440),
chunks=(1, 720, 1440),
dtype='f4'
)
precip_array = precipitation.create_array(
name='prcp',
shape=(365, 720, 1440),
chunks=(1, 720, 1440),
dtype='f4'
)
# Access using paths
array = root['temperature/t2m']
# Visualize hierarchy
print(root.tree())
# Output:
# /
# ├── temperature
# │ └── t2m (365, 720, 1440) f4
# └── precipitation
# └── prcp (365, 720, 1440) f4
Zarr provides an h5py-compatible interface for familiar HDF5 users:
# Create group with h5py-style methods
root = zarr.group('data.zarr')
dataset = root.create_dataset('my_data', shape=(1000, 1000), chunks=(100, 100),
dtype='f4')
# Access like h5py
grp = root.require_group('subgroup')
arr = grp.require_dataset('array', shape=(500, 500), chunks=(50, 50), dtype='i4')
Attach custom metadata to arrays and groups using attributes:
# Add attributes to array
z = zarr.zeros((1000, 1000), chunks=(100, 100))
z.attrs['description'] = 'Temperature data in Kelvin'
z.attrs['units'] = 'K'
z.attrs['created'] = '2024-01-15'
z.attrs['processing_version'] = 2.1
# Attributes are stored as JSON
print(z.attrs['units']) # Output: K
# Add attributes to groups
root = zarr.group('data.zarr')
root.attrs['project'] = 'Climate Analysis'
root.attrs['institution'] = 'Research Institute'
# Attributes persist with the array/group
z2 = zarr.open('data.zarr')
print(z2.attrs['description'])
Important : Attributes must be JSON-serializable (strings, numbers, lists, dicts, booleans, null).
Zarr arrays implement the NumPy array interface:
import numpy as np
import zarr
z = zarr.zeros((1000, 1000), chunks=(100, 100))
# Use NumPy functions directly
result = np.sum(z, axis=0) # NumPy operates on Zarr array
mean = np.mean(z[:100, :100])
# Convert to NumPy array
numpy_array = z[:] # Loads entire array into memory
Dask provides lazy, parallel computation on Zarr arrays:
import dask.array as da
import zarr
# Create large Zarr array
z = zarr.open('data.zarr', mode='w', shape=(100000, 100000),
chunks=(1000, 1000), dtype='f4')
# Load as Dask array (lazy, no data loaded)
dask_array = da.from_zarr('data.zarr')
# Perform computations (parallel, out-of-core)
result = dask_array.mean(axis=0).compute() # Parallel computation
# Write Dask array to Zarr
large_array = da.random.random((100000, 100000), chunks=(1000, 1000))
da.to_zarr(large_array, 'output.zarr')
Benefits :
Xarray provides labeled, multidimensional arrays with Zarr backend:
import xarray as xr
import zarr
# Open Zarr store as Xarray Dataset (lazy loading)
ds = xr.open_zarr('data.zarr')
# Dataset includes coordinates and metadata
print(ds)
# Access variables
temperature = ds['temperature']
# Perform labeled operations
subset = ds.sel(time='2024-01', lat=slice(30, 60))
# Write Xarray Dataset to Zarr
ds.to_zarr('output.zarr')
# Create from scratch with coordinates
ds = xr.Dataset(
{
'temperature': (['time', 'lat', 'lon'], data),
'precipitation': (['time', 'lat', 'lon'], data2)
},
coords={
'time': pd.date_range('2024-01-01', periods=365),
'lat': np.arange(-90, 91, 1),
'lon': np.arange(-180, 180, 1)
}
)
ds.to_zarr('climate_data.zarr')
Benefits :
from zarr import ThreadSynchronizer
import zarr
# For multi-threaded writes
synchronizer = ThreadSynchronizer()
z = zarr.open_array('data.zarr', mode='r+', shape=(10000, 10000),
chunks=(1000, 1000), synchronizer=synchronizer)
# Safe for concurrent writes from multiple threads
# (when writes don't span chunk boundaries)
from zarr import ProcessSynchronizer
import zarr
# For multi-process writes
synchronizer = ProcessSynchronizer('sync_data.sync')
z = zarr.open_array('data.zarr', mode='r+', shape=(10000, 10000),
chunks=(1000, 1000), synchronizer=synchronizer)
# Safe for concurrent writes from multiple processes
Note :
For hierarchical stores with many arrays, consolidate metadata into a single file to reduce I/O operations:
import zarr
# After creating arrays/groups
root = zarr.group('data.zarr')
# ... create multiple arrays/groups ...
# Consolidate metadata
zarr.consolidate_metadata('data.zarr')
# Open with consolidated metadata (faster, especially on cloud storage)
root = zarr.open_consolidated('data.zarr')
Benefits :
tree() operations and group traversal
Cautions :
Chunk Size : Aim for 1-10 MB per chunk
# For float32: 1MB = 262,144 elements
chunks = (512, 512) # 512×512×4 bytes = ~1MB
Chunk Shape : Align with access patterns
# Row-wise access → chunk spans columns: (small, large)
# Column-wise access → chunk spans rows: (large, small)
# Random access → balanced: (medium, medium)
Compression : Choose based on workload
# Interactive/fast: BloscCodec(cname='lz4')
# Balanced: BloscCodec(cname='zstd', clevel=5)
# Maximum compression: GzipCodec(level=9)
Storage Backend : Match to environment
# Local: LocalStore (default)
# Cloud: S3Map/GCSMap with consolidated metadata
# Temporary: MemoryStore
Sharding : Use for large-scale datasets
# When you have millions of small chunks
shards=(10*chunk_size, 10*chunk_size)
# Print detailed array information
print(z.info)
# Output includes:
# - Type, shape, chunks, dtype
# - Compression codec and level
# - Storage size (compressed vs uncompressed)
# - Storage location
# Check storage size
print(f"Compressed size: {z.nbytes_stored / 1e6:.2f} MB")
print(f"Uncompressed size: {z.nbytes / 1e6:.2f} MB")
print(f"Compression ratio: {z.nbytes / z.nbytes_stored:.2f}x")
# Store time series with time as first dimension
# This allows efficient appending of new time steps
z = zarr.open('timeseries.zarr', mode='a',
shape=(0, 720, 1440), # Start with 0 time steps
chunks=(1, 720, 1440), # One time step per chunk
dtype='f4')
# Append new time steps
new_data = np.random.random((1, 720, 1440))
z.append(new_data, axis=0)
import dask.array as da
# Create large matrix in Zarr
z = zarr.open('matrix.zarr', mode='w',
shape=(100000, 100000),
chunks=(1000, 1000),
dtype='f8')
# Use Dask for parallel computation
dask_z = da.from_zarr('matrix.zarr')
result = (dask_z @ dask_z.T).compute() # Parallel matrix multiply
import s3fs
import zarr
# Write to S3
s3 = s3fs.S3FileSystem()
store = s3fs.S3Map(root='s3://my-bucket/data.zarr', s3=s3)
# Create array with appropriate chunking for cloud
z = zarr.open_array(store=store, mode='w',
shape=(10000, 10000),
chunks=(500, 500), # ~1MB chunks
dtype='f4')
z[:] = data
# Consolidate metadata for faster reads
zarr.consolidate_metadata(store)
# Read from S3 (anywhere, anytime)
store_read = s3fs.S3Map(root='s3://my-bucket/data.zarr', s3=s3)
z_read = zarr.open_consolidated(store_read)
subset = z_read[0:100, 0:100]
# HDF5 to Zarr
import h5py
import zarr
with h5py.File('data.h5', 'r') as h5:
dataset = h5['dataset_name']
z = zarr.array(dataset[:],
chunks=(1000, 1000),
store='data.zarr')
# NumPy to Zarr
import numpy as np
data = np.load('data.npy')
z = zarr.array(data, chunks='auto', store='data.zarr')
# Zarr to NetCDF (via Xarray)
import xarray as xr
ds = xr.open_zarr('data.zarr')
ds.to_netcdf('data.nc')
Diagnosis : Check chunk size and alignment
print(z.chunks) # Are chunks appropriate size?
print(z.info) # Check compression ratio
Solutions :
Cause : Loading entire array or large chunks into memory
Solutions :
# Don't load entire array
# Bad: data = z[:]
# Good: Process in chunks
for i in range(0, z.shape[0], 1000):
chunk = z[i:i+1000, :]
process(chunk)
# Or use Dask for automatic chunking
import dask.array as da
dask_z = da.from_zarr('data.zarr')
result = dask_z.mean().compute() # Processes in chunks
Solutions :
# 1. Consolidate metadata
zarr.consolidate_metadata(store)
z = zarr.open_consolidated(store)
# 2. Use appropriate chunk sizes (5-100 MB for cloud)
chunks = (2000, 2000) # Larger chunks for cloud
# 3. Enable sharding
shards = (10000, 10000) # Groups many chunks
Solution : Use synchronizers or ensure non-overlapping writes
from zarr import ProcessSynchronizer
sync = ProcessSynchronizer('sync.sync')
z = zarr.open_array('data.zarr', mode='r+', synchronizer=sync)
# Or design workflow so each process writes to separate chunks
For detailed API documentation, advanced usage, and the latest updates:
Related Libraries :
Weekly Installs
124
Repository
GitHub Stars
22.6K
First Seen
Jan 21, 2026
Security Audits
Gen Agent Trust Hub: Pass · Socket: Pass · Snyk: Pass
Installed on
claude-code107
opencode99
cursor95
gemini-cli92
antigravity84
codex83
FastAPI官方技能:Python Web开发最佳实践与CLI工具使用指南
1,000 周安装
ASP.NET Core 开发指南:Web API、身份验证、中间件与性能优化实战
134 周安装
agent-browser 浏览器自动化工具 - 快速网页交互与测试命令行工具
134 周安装
find-skills技能:AI智能体技能搜索与安装工具,扩展Claude能力
134 周安装
Azure Functions 最佳实践指南:独立工作进程、Node.js/Python 编程模型与反模式详解
134 周安装
gentle-teaching 温和教学框架:AI辅助学习指南,培养独立解决问题能力
134 周安装
Symfony Scheduler 异步任务调度器:实现稳定重试与失败传输的工作流
134 周安装
Parallel I/O : Use Dask for large operations
import dask.array as da
dask_array = da.from_zarr('data.zarr')
result = dask_array.compute(scheduler='threads', num_workers=8)