Convert EarthCARE MSI Level 2 products to HEALPix at depth 29 using the same approach as the Sentinel-3 pipeline in legacy-converters:
CellPointResampler from healpix-resample maps each pixel to a unique depth-29 cell. Dense chunks at depth 19 — each chunk contains all 4^10 = 1,048,576 child cells
No aggregation — bijective mapping, every observation preserved
Output as DGGS-Zarr with the same metadata conventions
This matches the Sentinel-3 OLCI/SLSTR conversion strategy where:
Refinement level = 29 (~12 mm cell edge)
Chunk level = 19 (~1M cells per chunk) — NOTE(review): the "~3.6 km" figure quoted elsewhere looks inconsistent; if level-29 cells are ~12 mm across, level-19 cells are 2^10 larger, i.e. ~12 m (≈3.6 km would correspond to roughly level 11) — confirm the intended chunk level
Indexing scheme = nested
Ellipsoid = WGS84
import earthcarekit as eck
import healpix_geo as hpxg
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import xdggs
import zarr
from healpix_resample import CellPointResampler
from earthcare_dggs.settings import ELLIPSOID
# Conversion parameters for this EarthCARE orbit/frame.
ORBIT = "06109D"
DEPTH = 29 # Cell-point: finest HEALPix depth
# NOTE(review): the "~3.6 km" annotation looks inconsistent — if level-29
# cells are ~12 mm (as printed below), level-19 cells are 2^10 larger,
# i.e. roughly ~12 m, and the chunk count reported later (one chunk per
# pixel, 0.00% fill) agrees with metre-scale chunks. ~3.6 km corresponds
# to roughly level 11. Confirm the intended chunk level against the
# Sentinel-3 pipeline before relying on the dense-chunk layout.
CHUNK_DEPTH = 19 # Dense chunks at this depth (same as Sentinel-3)
CHUNK_SIZE = 4 ** (DEPTH - CHUNK_DEPTH) # 4^10 = 1,048,576 cells per chunk
print(f"Depth: {DEPTH} (~12 mm cells)")
print(f"Chunk depth: {CHUNK_DEPTH} (~3.6 km)")
print(f"Cells per chunk: {CHUNK_SIZE:,}")Depth: 29 (~12 mm cells)
Chunk depth: 19 (~3.6 km)
Cells per chunk: 1,048,576
Load MSI products¶
# Load the three MSI Level-2A products (cloud mask, cloud optical
# properties, aerosol optical thickness) for this orbit/frame and pull
# each dataset fully into memory with ds.load().
products = {}
for file_type in ["MSI_CM__2A", "MSI_COP_2A", "MSI_AOT_2A"]:
    # search_product/read_product come from earthcarekit (eck).
    # filepath[0] assumes the search returned at least one match — TODO confirm.
    result = eck.search_product(file_type=file_type, orbit_and_frame=ORBIT)
    with eck.read_product(result.filepath[0]) as ds:
        products[file_type] = ds.load()
    print(f"{file_type}: loaded")
# The cloud-mask product carries the swath geolocation used below.
msi_cm = products["MSI_CM__2A"]
print(f"\nSwath shape: {msi_cm['latitude_swath'].shape}")
print(f"Total pixels: {msi_cm['latitude_swath'].size:,}")MSI_CM__2A: loaded
MSI_COP_2A: loaded
MSI_AOT_2A: loaded
Swath shape: (11216, 357)
Total pixels: 4,004,112
Cell-point mapping with CellPointResampler¶
This is the same resampler used by legacy-converters for Sentinel-3.
It maps each (lon, lat) sample to the HEALPix cell at depth 29 that contains it.
# Swath geolocation: keep only pixels with finite (lon, lat) pairs.
lat = msi_cm["latitude_swath"].values
lon = msi_cm["longitude_swath"].values
# Boolean mask with the swath's 2-D shape; reused later to subset the
# data variables before resampling.
valid = np.isfinite(lon) & np.isfinite(lat)
lat_valid = np.asarray(lat[valid], dtype=np.float64).ravel()
lon_valid = np.asarray(lon[valid], dtype=np.float64).ravel()
print(f"Valid pixels: {len(lon_valid):,} / {lat.size:,}")
# Create the CellPointResampler (same as Sentinel-3 pipeline)
# NOTE(review): no depth/level argument is passed here even though the
# surrounding text says depth 29 — presumably the resampler's default;
# confirm against healpix_resample's CellPointResampler signature.
resampler = CellPointResampler(
    lon_deg=lon_valid,
    lat_deg=lat_valid,
    nest=True,              # nested HEALPix indexing scheme
    ellipsoid=ELLIPSOID,    # from earthcare_dggs.settings
    verbose=True,
)
# Per the prints below, K is the number of unique target cells and N the
# number of input samples; K == N means the pixel->cell map is one-to-one.
print(f"\nUnique depth-29 cells: {resampler.K:,}")
print(f"Input samples: {resampler.N:,}")
print(f"Bijective: {resampler.K == resampler.N}")Valid pixels: 4,004,112 / 4,004,112
Unique depth-29 cells: 4,004,112
Input samples: 4,004,112
Bijective: True
Resample MSI variables¶
For each variable, call resampler.resample() to place pixel values
into their depth-29 cells. Since the mapping is bijective, there is no
aggregation — each cell gets exactly one value.
from earthcare_dggs.convert import detect_fill_value
from earthcare_dggs.settings import MSI_VARIABLES
# Resample each configured MSI variable onto the depth-29 cells.
# The mapping is bijective, so there is no aggregation: each occupied
# cell receives exactly one pixel value. Results are float32 arrays
# aligned with resampler.cell_ids, keyed by variable name.
resampled = {}
for var_name, config in MSI_VARIABLES.items():
    source_ds = products.get(config["source"])
    if source_ds is None or var_name not in source_ds:
        continue  # variable absent from this product set — skip
    raw = source_ds[var_name].values.ravel()
    raw_valid = raw[valid.ravel()].astype(np.float32)
    # Replace fill values with NaN.
    # BUGFIX: `config.get("fill") or detect_fill_value(raw)` silently
    # discarded a configured fill value of 0 (falsy) and fell back to
    # auto-detection; test explicitly for None instead.
    fill = config.get("fill")
    if fill is None:
        fill = detect_fill_value(raw)
    if fill is not None:
        raw_valid[raw_valid == fill] = np.nan
    elif raw.dtype in (np.float32, np.float64):
        # No known fill value: mask non-finite values and the common
        # "huge negative" float sentinels.
        raw_valid[~np.isfinite(raw_valid) | (raw_valid <= -1e30)] = np.nan
    # Cell-point resample: result.cell_data is aligned with resampler.cell_ids.
    result = resampler.resample(raw_valid.astype(np.float64))
    resampled[var_name] = result.cell_data.astype(np.float32)
    n_valid = np.isfinite(resampled[var_name]).sum()
    print(f" {var_name}: {n_valid:,} valid / {len(resampled[var_name]):,} cells")
# Cell IDs from the resampler (sorted unique depth-29 cells)
cell_ids_d29 = resampler.cell_ids
# Duck-typed check: a .cpu attribute suggests a GPU/torch-style tensor —
# move it to host memory and convert to numpy before use.
if hasattr(cell_ids_d29, 'cpu'):
    cell_ids_d29 = cell_ids_d29.cpu().numpy()
# uint64 is required to hold depth-29 nested HEALPix ids without overflow.
cell_ids_d29 = cell_ids_d29.astype(np.uint64)
print(f"\nTotal variables: {len(resampled)}")
print(f"Cell IDs range: [{cell_ids_d29.min()}, {cell_ids_d29.max()}]") cloud_mask: 3,987,502 valid / 4,004,112 cells
cloud_type: 3,981,555 valid / 4,004,112 cells
cloud_phase: 954,386 valid / 4,004,112 cells
surface_classification: 4,004,112 valid / 4,004,112 cells
quality_status: 4,004,112 valid / 4,004,112 cells
cloud_top_height: 741,865 valid / 4,004,112 cells
cloud_top_temperature: 741,865 valid / 4,004,112 cells
cloud_top_pressure: 741,865 valid / 4,004,112 cells
cloud_optical_thickness: 748,392 valid / 4,004,112 cells
cloud_effective_radius: 748,392 valid / 4,004,112 cells
cloud_water_path: 748,392 valid / 4,004,112 cells
cloud_top_temperature_error: 741,865 valid / 4,004,112 cells
isccp_cloud_type: 4,004,112 valid / 4,004,112 cells
aerosol_optical_thickness_670nm: 684,230 valid / 4,004,112 cells
aerosol_optical_thickness_865nm: 341,654 valid / 4,004,112 cells
angstrom_parameter_670nm_865nm: 341,603 valid / 4,004,112 cells
aerosol_optical_thickness_670nm_error: 684,230 valid / 4,004,112 cells
aerosol_optical_thickness_865nm_error: 341,654 valid / 4,004,112 cells
Total variables: 18
Cell IDs range: [189338797623254494, 1440401429421314188]
Write dense chunks to Zarr (Sentinel-3 style)¶
Following the Sentinel-3 DenseChunkConverter, we write one chunk at a time
to Zarr — never holding more than one chunk in memory. Each depth-19 chunk
contains all 4^10 = 1,048,576 depth-29 child cells. Unoccupied cells are NaN.
# Determine which depth-19 chunks are needed
# zoom_to coarsens each depth-29 cell id to its depth-19 ancestor.
chunk_ids_d19 = hpxg.nested.zoom_to(
    cell_ids_d29, depth=DEPTH, new_depth=CHUNK_DEPTH,
)
unique_chunks = np.unique(chunk_ids_d19)
n_chunks = len(unique_chunks)
# Dense layout: every chunk materializes all CHUNK_SIZE child cells.
total_cells = n_chunks * CHUNK_SIZE
var_names = list(resampled.keys())
# NOTE(review): the recorded output shows n_chunks == number of pixels
# (4,004,112) with 0.00% fill — every pixel falls in its own depth-19
# chunk and the dense output is ~4.2e12 cells. This suggests the chunk
# level is far too fine for this swath; confirm CHUNK_DEPTH against the
# Sentinel-3 pipeline before running the write loop below.
print(f"Depth-19 chunks needed: {n_chunks}")
print(f"Total output cells: {total_cells:,} ({n_chunks} x {CHUNK_SIZE:,})")
print(f"Occupied cells: {len(cell_ids_d29):,} ({100 * len(cell_ids_d29) / total_cells:.2f}% fill)")
print(f"Memory per chunk: {CHUNK_SIZE * 4 * len(var_names) / 1e6:.0f} MB")Depth-19 chunks needed: 4004112
Total output cells: 4,198,615,744,512 (4004112 x 1,048,576)
Occupied cells: 4,004,112 (0.00% fill)
Memory per chunk: 75 MB
output_path = f"earthcare_msi_cellpoint_s3style_{ORBIT}.zarr"
# --- Create Zarr store with empty arrays (like DenseChunkConverter) ---
# mode="w" overwrites any existing store at this path.
root = zarr.open_group(output_path, mode="w")
# Coordinate array: cell_ids
# (fill_value=0 is only a placeholder — the write loop below fills every
# chunk's cell ids explicitly)
root.create_array(
    "cell_ids", shape=(total_cells,), dtype=np.uint64,
    chunks=(CHUNK_SIZE,), fill_value=0,
)
# Data variable arrays: one per MSI variable
# Unoccupied cells keep the NaN fill value and are never written.
for name in var_names:
    root.create_array(
        name, shape=(total_cells,), dtype=np.float32,
        chunks=(CHUNK_SIZE,), fill_value=np.nan,
    )
print(f"Created Zarr store: {output_path}")
print(f" {len(var_names)} variables + cell_ids, {n_chunks} chunks of {CHUNK_SIZE:,}")Created Zarr store: earthcare_msi_cellpoint_s3style_06109D.zarr
18 variables + cell_ids, 4004112 chunks of 1,048,576
Fill chunks one at a time¶
For each depth-19 chunk:
Expand to all depth-29 children (contiguous range in nested scheme)
Find which children have data using vectorized numpy indexing
Write the chunk to Zarr immediately
Free memory before the next chunk
This is exactly how DenseChunkConverter.convert() works in legacy-converters.
# --- Write chunk by chunk to Zarr ---
# Build a sorted index for fast lookup of occupied cells.
# sort_order maps positions in the sorted id array back to positions in
# cell_ids_d29 (and hence into the resampled value arrays).
sort_order = np.argsort(cell_ids_d29)
sorted_cell_ids = cell_ids_d29[sort_order]
for ci, chunk_id in enumerate(unique_chunks):
    # Each chunk occupies a contiguous CHUNK_SIZE slice of the output.
    chunk_slice = slice(ci * CHUNK_SIZE, (ci + 1) * CHUNK_SIZE)
    # All depth-29 children of this depth-19 chunk (contiguous in nested scheme)
    # zoom_to returns (1, N) when given a single cell — use [0] to flatten
    # (same as legacy-converters DenseChunkConverter)
    children = hpxg.nested.zoom_to(
        np.array([chunk_id], dtype=np.uint64),
        depth=CHUNK_DEPTH,
        new_depth=DEPTH,
    )[0]
    # Write cell IDs for this chunk
    root["cell_ids"][chunk_slice] = children
    # Find which children have data (vectorized via searchsorted).
    # clip keeps indices in range; the equality test rejects misses.
    positions = np.searchsorted(sorted_cell_ids, children)
    positions = np.clip(positions, 0, len(sorted_cell_ids) - 1)
    found = sorted_cell_ids[positions] == children
    if found.any():
        src_indices = sort_order[positions[found]]
        for name in var_names:
            # One NaN-initialized chunk buffer per variable, released
            # each pass so only one chunk lives in memory at a time.
            chunk_data = np.full(CHUNK_SIZE, np.nan, dtype=np.float32)
            chunk_data[found] = resampled[name][src_indices]
            root[name][chunk_slice] = chunk_data
    # If no data in this chunk, the fill_value (NaN) is already set
    if (ci + 1) % 500 == 0 or ci == n_chunks - 1:
        print(f" Chunk {ci + 1}/{n_chunks}")
print(f"\nDone: {total_cells:,} cells written in {n_chunks} chunks")# --- Add DGGS Zarr convention metadata ---
# Provenance attributes on the root group.
root.attrs["source"] = "EarthCARE MSI L2A"
root.attrs["orbit_and_frame"] = ORBIT
root.attrs["resampler"] = "cell-point (CellPointResampler)"
# zarr-conventions "dggs" declaration, pinned to the v1 schema.
root.attrs["zarr_conventions"] = [{
    "uuid": "7b255807-140c-42ca-97f6-7a1cfecdbc38",
    "name": "dggs",
    "schema_url": "https://raw.githubusercontent.com/zarr-conventions/dggs/refs/tags/v1/schema.json",
    "spec_url": "https://github.com/zarr-conventions/dggs/blob/v1/README.md",
    "description": "Discrete Global Grid Systems convention for zarr",
}]
# Grid description: nested HEALPix at DEPTH on WGS84, with the on-disk
# chunking itself aligned to HEALPix cells at CHUNK_DEPTH.
root.attrs["dggs"] = {
    "name": "healpix",
    "refinement_level": DEPTH,
    "indexing_scheme": "nested",
    "ellipsoid": {
        "name": "wgs84",
        "semimajor_axis": 6378137.0,
        "inverse_flattening": 298.257223563,
    },
    "spatial_dimension": "cell_ids",
    "coordinate": "cell_ids",
    "compression": "none",
    "chunk_healpix": {
        "refinement_level": CHUNK_DEPTH,
        "indexing_scheme": "nested",
    },
}
print(f"Saved to {output_path}")
print(f"DGGS metadata written")Verify: re-open and decode¶
# Sanity check: re-open the finished store lazily with xarray and report
# its layout; nbytes counts the dense (mostly-NaN) representation.
ds_check = xr.open_zarr(output_path)
print(ds_check)
print(f"\nChunk sizes: {dict(ds_check.chunks)}")
print(f"Total size: {ds_check.nbytes / 1e9:.1f} GB (mostly NaN — sparse swath in dense chunks)")Visualization: coarsen on the fly¶
Since the data is at depth 29, we coarsen to depth 10 for interactive
visualization — same as what xdggs.explore() would do.
from earthcare_dggs.convert import aggregate_to_healpix
VIS_DEPTH = 10
# Coarsen from the resampled data (already in memory) rather than loading
# the full dense Zarr — the dense representation is too large for memory.
# For each selected variable, group the finite depth-29 values by their
# depth-10 ancestor cell and aggregate (method taken from MSI_VARIABLES).
vis_ids = hpxg.nested.zoom_to(cell_ids_d29, depth=DEPTH, new_depth=VIS_DEPTH)
unique_vis = np.unique(vis_ids)
vis_data = {}
for var_name in ["cloud_mask", "cloud_type", "cloud_top_height", "aerosol_optical_thickness_670nm"]:
    if var_name not in resampled:
        continue
    vals = resampled[var_name]
    mask = np.isfinite(vals)
    if mask.sum() == 0:
        continue  # nothing to aggregate for this variable
    var_vis = vis_ids[mask]
    # PERF: map each depth-10 id to its position in unique_vis with a
    # vectorized searchsorted (unique_vis is sorted, as returned by
    # np.unique) instead of a per-element Python dict lookup. Ids not
    # present get -1, matching the previous dict.get(..., -1) behaviour
    # (defensive only — every var_vis value originates from vis_ids).
    group_idx = np.searchsorted(unique_vis, var_vis)
    group_idx = np.clip(group_idx, 0, len(unique_vis) - 1)
    group_idx = np.where(unique_vis[group_idx] == var_vis, group_idx, -1)
    keep = group_idx >= 0
    agg = MSI_VARIABLES.get(var_name, {}).get("agg", "mean")
    vis_data[var_name] = aggregate_to_healpix(
        vals[mask][keep], group_idx[keep], len(unique_vis), method=agg,
    )
# Wrap the aggregated variables in an xarray Dataset indexed by the
# depth-10 cell ids.
vis_ds = xr.Dataset(
    {name: ("cell_ids", data) for name, data in vis_data.items()},
    coords={"cell_ids": unique_vis},
)
# Attach xdggs grid info so the .dggs accessor (explore, etc.) works.
vis_ds = xdggs.decode(
    vis_ds,
    grid_info=xdggs.HealpixInfo(level=VIS_DEPTH, indexing_scheme="nested"),
)
print(f"Visualization at depth {VIS_DEPTH}: {len(unique_vis):,} cells")
print(vis_ds)vis_ds.dggs.explore()Static comparison: full swath¶
import cartopy.crs as ccrs
import cartopy.feature as cfeature
# Static cartopy map of one depth-10 visualization variable: scatter the
# centers of the occupied cells, colored by value.
var_name = "cloud_mask"
cell_values = vis_ds[var_name].values
finite = np.isfinite(cell_values)
# Cell-center coordinates for the occupied cells only.
c_lon, c_lat = hpxg.nested.healpix_to_lonlat(
    vis_ds.cell_ids.values[finite], depth=VIS_DEPTH, ellipsoid=ELLIPSOID,
)
fig, ax = plt.subplots(
    figsize=(8, 10), subplot_kw={"projection": ccrs.PlateCarree()},
)
# Base-map layers.
for feature, style in (
    (cfeature.COASTLINE, {"linewidth": 0.5}),
    (cfeature.BORDERS, {"linewidth": 0.3}),
    (cfeature.LAND, {"alpha": 0.1}),
):
    ax.add_feature(feature, **style)
gl = ax.gridlines(draw_labels=True, linewidth=0.3)
gl.top_labels = gl.right_labels = False
sc = ax.scatter(
    c_lon, c_lat, c=cell_values[finite],
    s=1, cmap="Set1", vmin=0, vmax=3,
    transform=ccrs.PlateCarree(),
)
plt.colorbar(sc, ax=ax, shrink=0.6, label=var_name)
ax.set_title(
    f"EarthCARE MSI {var_name} — depth {DEPTH} stored, shown at depth {VIS_DEPTH}\n"
    f"Orbit {ORBIT} ({ELLIPSOID}) — Sentinel-3 style dense chunks at depth {CHUNK_DEPTH}",
    fontsize=11,
)
plt.tight_layout()
plt.show()