Convert EarthCARE MSI Level 2 products to HEALPix at depth 29 using the same approach as the Sentinel-3 pipeline in legacy-converters:
CellPointResampler from healpix-resample maps each pixel to a unique depth-29 cell. Dense chunks at depth 19 — each chunk contains all 4^10 = 1,048,576 child cells
No aggregation — bijective mapping, every observation preserved
Output as DGGS-Zarr with the same metadata conventions
This matches the Sentinel-3 OLCI/SLSTR conversion strategy where:
Refinement level = 29 (~12 mm cell edge)
Chunk level = 19 (~1M cells per chunk) — NOTE(review): the "~3.6 km" figure quoted elsewhere looks inconsistent; if level-29 cells are ~12 mm across, level-19 cells are 2^10 larger, i.e. ~12 m (≈3.6 km would correspond to roughly level 11) — confirm the intended chunk level
Indexing scheme = nested
Ellipsoid = WGS84
import earthcarekit as eck
import healpix_geo as hpxg
import matplotlib.pyplot as plt
import numpy as np
import xarray as xr
import xdggs
import zarr
from healpix_resample import CellPointResampler
from earthcare_dggs.settings import ELLIPSOID
# Conversion parameters for this EarthCARE orbit/frame.
ORBIT = "06109D"
DEPTH = 29 # Cell-point: finest HEALPix depth
# NOTE(review): the "~3.6 km" annotation looks inconsistent — if level-29
# cells are ~12 mm (as printed below), level-19 cells are 2^10 larger,
# i.e. roughly ~12 m, and the chunk count reported later (one chunk per
# pixel, 0.00% fill) agrees with metre-scale chunks. ~3.6 km corresponds
# to roughly level 11. Confirm the intended chunk level against the
# Sentinel-3 pipeline before relying on the dense-chunk layout.
CHUNK_DEPTH = 19 # Dense chunks at this depth (same as Sentinel-3)
CHUNK_SIZE = 4 ** (DEPTH - CHUNK_DEPTH) # 4^10 = 1,048,576 cells per chunk
print(f"Depth: {DEPTH} (~12 mm cells)")
print(f"Chunk depth: {CHUNK_DEPTH} (~3.6 km)")
print(f"Cells per chunk: {CHUNK_SIZE:,}")Depth: 29 (~12 mm cells)
Chunk depth: 19 (~3.6 km)
Cells per chunk: 1,048,576
Load MSI products¶
# Load the three MSI Level-2A products (cloud mask, cloud optical
# properties, aerosol optical thickness) for this orbit/frame and pull
# each dataset fully into memory with ds.load().
products = {}
for file_type in ["MSI_CM__2A", "MSI_COP_2A", "MSI_AOT_2A"]:
    # search_product/read_product come from earthcarekit (eck).
    # filepath[0] assumes the search returned at least one match — TODO confirm.
    result = eck.search_product(file_type=file_type, orbit_and_frame=ORBIT)
    with eck.read_product(result.filepath[0]) as ds:
        products[file_type] = ds.load()
    print(f"{file_type}: loaded")
# The cloud-mask product carries the swath geolocation used below.
msi_cm = products["MSI_CM__2A"]
print(f"\nSwath shape: {msi_cm['latitude_swath'].shape}")
print(f"Total pixels: {msi_cm['latitude_swath'].size:,}")MSI_CM__2A: loaded
MSI_COP_2A: loaded
MSI_AOT_2A: loaded
Swath shape: (11216, 357)
Total pixels: 4,004,112
Cell-point mapping with CellPointResampler¶
This is the same resampler used by legacy-converters for Sentinel-3.
It maps each (lon, lat) sample to the HEALPix cell at depth 29 that contains it.
# Swath geolocation: keep only pixels with finite (lon, lat) pairs.
lat = msi_cm["latitude_swath"].values
lon = msi_cm["longitude_swath"].values
# Boolean mask with the swath's 2-D shape; reused later to subset the
# data variables before resampling.
valid = np.isfinite(lon) & np.isfinite(lat)
lat_valid = np.asarray(lat[valid], dtype=np.float64).ravel()
lon_valid = np.asarray(lon[valid], dtype=np.float64).ravel()
print(f"Valid pixels: {len(lon_valid):,} / {lat.size:,}")
# Create the CellPointResampler (same as Sentinel-3 pipeline)
# NOTE(review): no depth/level argument is passed here even though the
# surrounding text says depth 29 — presumably the resampler's default;
# confirm against healpix_resample's CellPointResampler signature.
resampler = CellPointResampler(
    lon_deg=lon_valid,
    lat_deg=lat_valid,
    nest=True,              # nested HEALPix indexing scheme
    ellipsoid=ELLIPSOID,    # from earthcare_dggs.settings
    verbose=True,
)
# Per the prints below, K is the number of unique target cells and N the
# number of input samples; K == N means the pixel->cell map is one-to-one.
print(f"\nUnique depth-29 cells: {resampler.K:,}")
print(f"Input samples: {resampler.N:,}")
print(f"Bijective: {resampler.K == resampler.N}")Valid pixels: 4,004,112 / 4,004,112
Unique depth-29 cells: 4,004,112
Input samples: 4,004,112
Bijective: True
Resample MSI variables¶
For each variable, call resampler.resample() to place pixel values
into their depth-29 cells. Since the mapping is bijective, there is no
aggregation — each cell gets exactly one value.
from earthcare_dggs.convert import detect_fill_value
from earthcare_dggs.settings import MSI_VARIABLES
# Resample each configured MSI variable onto the depth-29 cells.
# The mapping is bijective, so there is no aggregation: each occupied
# cell receives exactly one pixel value. Results are float32 arrays
# aligned with resampler.cell_ids, keyed by variable name.
resampled = {}
for var_name, config in MSI_VARIABLES.items():
    source_ds = products.get(config["source"])
    if source_ds is None or var_name not in source_ds:
        continue  # variable absent from this product set — skip
    raw = source_ds[var_name].values.ravel()
    raw_valid = raw[valid.ravel()].astype(np.float32)
    # Replace fill values with NaN.
    # BUGFIX: `config.get("fill") or detect_fill_value(raw)` silently
    # discarded a configured fill value of 0 (falsy) and fell back to
    # auto-detection; test explicitly for None instead.
    fill = config.get("fill")
    if fill is None:
        fill = detect_fill_value(raw)
    if fill is not None:
        raw_valid[raw_valid == fill] = np.nan
    elif raw.dtype in (np.float32, np.float64):
        # No known fill value: mask non-finite values and the common
        # "huge negative" float sentinels.
        raw_valid[~np.isfinite(raw_valid) | (raw_valid <= -1e30)] = np.nan
    # Cell-point resample: result.cell_data is aligned with resampler.cell_ids.
    result = resampler.resample(raw_valid.astype(np.float64))
    resampled[var_name] = result.cell_data.astype(np.float32)
    n_valid = np.isfinite(resampled[var_name]).sum()
    print(f" {var_name}: {n_valid:,} valid / {len(resampled[var_name]):,} cells")
# Cell IDs from the resampler (sorted unique depth-29 cells)
cell_ids_d29 = resampler.cell_ids
# Duck-typed check: a .cpu attribute suggests a GPU/torch-style tensor —
# move it to host memory and convert to numpy before use.
if hasattr(cell_ids_d29, 'cpu'):
    cell_ids_d29 = cell_ids_d29.cpu().numpy()
# uint64 is required to hold depth-29 nested HEALPix ids without overflow.
cell_ids_d29 = cell_ids_d29.astype(np.uint64)
print(f"\nTotal variables: {len(resampled)}")
print(f"Cell IDs range: [{cell_ids_d29.min()}, {cell_ids_d29.max()}]") cloud_mask: 3,987,502 valid / 4,004,112 cells
cloud_type: 3,981,555 valid / 4,004,112 cells
cloud_phase: 954,386 valid / 4,004,112 cells
surface_classification: 4,004,112 valid / 4,004,112 cells
quality_status: 4,004,112 valid / 4,004,112 cells
cloud_top_height: 741,865 valid / 4,004,112 cells
cloud_top_temperature: 741,865 valid / 4,004,112 cells
cloud_top_pressure: 741,865 valid / 4,004,112 cells
cloud_optical_thickness: 748,392 valid / 4,004,112 cells
cloud_effective_radius: 748,392 valid / 4,004,112 cells
cloud_water_path: 748,392 valid / 4,004,112 cells
cloud_top_temperature_error: 741,865 valid / 4,004,112 cells
isccp_cloud_type: 4,004,112 valid / 4,004,112 cells
aerosol_optical_thickness_670nm: 684,230 valid / 4,004,112 cells
aerosol_optical_thickness_865nm: 341,654 valid / 4,004,112 cells
angstrom_parameter_670nm_865nm: 341,603 valid / 4,004,112 cells
aerosol_optical_thickness_670nm_error: 684,230 valid / 4,004,112 cells
aerosol_optical_thickness_865nm_error: 341,654 valid / 4,004,112 cells
Total variables: 18
Cell IDs range: [189338797623254494, 1440401429421314188]
Write dense chunks to Zarr (Sentinel-3 style)¶
Following the Sentinel-3 DenseChunkConverter, we write one chunk at a time
to Zarr — never holding more than one chunk in memory. Each depth-19 chunk
contains all 4^10 = 1,048,576 depth-29 child cells. Unoccupied cells are NaN.
# Determine which depth-19 chunks are needed
# zoom_to coarsens each depth-29 cell id to its depth-19 ancestor.
chunk_ids_d19 = hpxg.nested.zoom_to(
    cell_ids_d29, depth=DEPTH, new_depth=CHUNK_DEPTH,
)
unique_chunks = np.unique(chunk_ids_d19)
n_chunks = len(unique_chunks)
# Dense layout: every chunk materializes all CHUNK_SIZE child cells.
total_cells = n_chunks * CHUNK_SIZE
var_names = list(resampled.keys())
# NOTE(review): the recorded output shows n_chunks == number of pixels
# (4,004,112) with 0.00% fill — every pixel falls in its own depth-19
# chunk and the dense output is ~4.2e12 cells. This suggests the chunk
# level is far too fine for this swath; confirm CHUNK_DEPTH against the
# Sentinel-3 pipeline before running the write loop below.
print(f"Depth-19 chunks needed: {n_chunks}")
print(f"Total output cells: {total_cells:,} ({n_chunks} x {CHUNK_SIZE:,})")
print(f"Occupied cells: {len(cell_ids_d29):,} ({100 * len(cell_ids_d29) / total_cells:.2f}% fill)")
print(f"Memory per chunk: {CHUNK_SIZE * 4 * len(var_names) / 1e6:.0f} MB")Depth-19 chunks needed: 4004112
Total output cells: 4,198,615,744,512 (4004112 x 1,048,576)
Occupied cells: 4,004,112 (0.00% fill)
Memory per chunk: 75 MB
output_path = f"earthcare_msi_cellpoint_s3style_{ORBIT}.zarr"
# --- Create Zarr store with empty arrays (like DenseChunkConverter) ---
# mode="w" overwrites any existing store at this path.
root = zarr.open_group(output_path, mode="w")
# Coordinate array: cell_ids
# (fill_value=0 is only a placeholder — the write loop below fills every
# chunk's cell ids explicitly)
root.create_array(
    "cell_ids", shape=(total_cells,), dtype=np.uint64,
    chunks=(CHUNK_SIZE,), fill_value=0,
)
# Data variable arrays: one per MSI variable
# Unoccupied cells keep the NaN fill value and are never written.
for name in var_names:
    root.create_array(
        name, shape=(total_cells,), dtype=np.float32,
        chunks=(CHUNK_SIZE,), fill_value=np.nan,
    )
print(f"Created Zarr store: {output_path}")
print(f" {len(var_names)} variables + cell_ids, {n_chunks} chunks of {CHUNK_SIZE:,}")Created Zarr store: earthcare_msi_cellpoint_s3style_06109D.zarr
18 variables + cell_ids, 4004112 chunks of 1,048,576
Fill chunks one at a time¶
For each depth-19 chunk:
Expand to all depth-29 children (contiguous range in nested scheme)
Find which children have data using vectorized numpy indexing
Write the chunk to Zarr immediately
Free memory before the next chunk
This is exactly how DenseChunkConverter.convert() works in legacy-converters.
# --- Write chunk by chunk to Zarr ---
# Build a sorted index for fast lookup of occupied cells.
# sort_order maps positions in the sorted id array back to positions in
# cell_ids_d29 (and hence into the resampled value arrays).
sort_order = np.argsort(cell_ids_d29)
sorted_cell_ids = cell_ids_d29[sort_order]
for ci, chunk_id in enumerate(unique_chunks):
    # Each chunk occupies a contiguous CHUNK_SIZE slice of the output.
    chunk_slice = slice(ci * CHUNK_SIZE, (ci + 1) * CHUNK_SIZE)
    # All depth-29 children of this depth-19 chunk (contiguous in nested scheme)
    # zoom_to returns (1, N) when given a single cell — use [0] to flatten
    # (same as legacy-converters DenseChunkConverter)
    children = hpxg.nested.zoom_to(
        np.array([chunk_id], dtype=np.uint64),
        depth=CHUNK_DEPTH,
        new_depth=DEPTH,
    )[0]
    # Write cell IDs for this chunk
    root["cell_ids"][chunk_slice] = children
    # Find which children have data (vectorized via searchsorted).
    # clip keeps indices in range; the equality test rejects misses.
    positions = np.searchsorted(sorted_cell_ids, children)
    positions = np.clip(positions, 0, len(sorted_cell_ids) - 1)
    found = sorted_cell_ids[positions] == children
    if found.any():
        src_indices = sort_order[positions[found]]
        for name in var_names:
            # One NaN-initialized chunk buffer per variable, released
            # each pass so only one chunk lives in memory at a time.
            chunk_data = np.full(CHUNK_SIZE, np.nan, dtype=np.float32)
            chunk_data[found] = resampled[name][src_indices]
            root[name][chunk_slice] = chunk_data
    # If no data in this chunk, the fill_value (NaN) is already set
    if (ci + 1) % 500 == 0 or ci == n_chunks - 1:
        print(f" Chunk {ci + 1}/{n_chunks}")
print(f"\nDone: {total_cells:,} cells written in {n_chunks} chunks")# --- Add DGGS Zarr convention metadata ---
# Provenance attributes on the root group.
root.attrs["source"] = "EarthCARE MSI L2A"
root.attrs["orbit_and_frame"] = ORBIT
root.attrs["resampler"] = "cell-point (CellPointResampler)"
# zarr-conventions "dggs" declaration, pinned to the v1 schema.
root.attrs["zarr_conventions"] = [{
    "uuid": "7b255807-140c-42ca-97f6-7a1cfecdbc38",
    "name": "dggs",
    "schema_url": "https://raw.githubusercontent.com/zarr-conventions/dggs/refs/tags/v1/schema.json",
    "spec_url": "https://github.com/zarr-conventions/dggs/blob/v1/README.md",
    "description": "Discrete Global Grid Systems convention for zarr",
}]
# Grid description: nested HEALPix at DEPTH on WGS84, with the on-disk
# chunking itself aligned to HEALPix cells at CHUNK_DEPTH.
root.attrs["dggs"] = {
    "name": "healpix",
    "refinement_level": DEPTH,
    "indexing_scheme": "nested",
    "ellipsoid": {
        "name": "wgs84",
        "semimajor_axis": 6378137.0,
        "inverse_flattening": 298.257223563,
    },
    "spatial_dimension": "cell_ids",
    "coordinate": "cell_ids",
    "compression": "none",
    "chunk_healpix": {
        "refinement_level": CHUNK_DEPTH,
        "indexing_scheme": "nested",
    },
}
print(f"Saved to {output_path}")
print(f"DGGS metadata written")Verify: re-open and decode¶
# Sanity check: re-open the finished store lazily with xarray and report
# its layout; nbytes counts the dense (mostly-NaN) representation.
ds_check = xr.open_zarr(output_path)
print(ds_check)
print(f"\nChunk sizes: {dict(ds_check.chunks)}")
print(f"Total size: {ds_check.nbytes / 1e9:.1f} GB (mostly NaN — sparse swath in dense chunks)")Visualization: coarsen on the fly¶
Since the data is at depth 29, we coarsen to depth 10 for interactive
visualization — same as what xdggs.explore() would do.
from earthcare_dggs.convert import aggregate_to_healpix
VIS_DEPTH = 10
# Coarsen from the resampled data (already in memory) rather than loading
# the full dense Zarr — the dense representation is too large for memory.
# For each selected variable, group the finite depth-29 values by their
# depth-10 ancestor cell and aggregate (method taken from MSI_VARIABLES).
vis_ids = hpxg.nested.zoom_to(cell_ids_d29, depth=DEPTH, new_depth=VIS_DEPTH)
unique_vis = np.unique(vis_ids)
vis_data = {}
for var_name in ["cloud_mask", "cloud_type", "cloud_top_height", "aerosol_optical_thickness_670nm"]:
    if var_name not in resampled:
        continue
    vals = resampled[var_name]
    mask = np.isfinite(vals)
    if mask.sum() == 0:
        continue  # nothing to aggregate for this variable
    var_vis = vis_ids[mask]
    # PERF: map each depth-10 id to its position in unique_vis with a
    # vectorized searchsorted (unique_vis is sorted, as returned by
    # np.unique) instead of a per-element Python dict lookup. Ids not
    # present get -1, matching the previous dict.get(..., -1) behaviour
    # (defensive only — every var_vis value originates from vis_ids).
    group_idx = np.searchsorted(unique_vis, var_vis)
    group_idx = np.clip(group_idx, 0, len(unique_vis) - 1)
    group_idx = np.where(unique_vis[group_idx] == var_vis, group_idx, -1)
    keep = group_idx >= 0
    agg = MSI_VARIABLES.get(var_name, {}).get("agg", "mean")
    vis_data[var_name] = aggregate_to_healpix(
        vals[mask][keep], group_idx[keep], len(unique_vis), method=agg,
    )
# Wrap the aggregated variables in an xarray Dataset indexed by the
# depth-10 cell ids.
vis_ds = xr.Dataset(
    {name: ("cell_ids", data) for name, data in vis_data.items()},
    coords={"cell_ids": unique_vis},
)
# Attach xdggs grid info so the .dggs accessor (explore, etc.) works.
vis_ds = xdggs.decode(
    vis_ds,
    grid_info=xdggs.HealpixInfo(level=VIS_DEPTH, indexing_scheme="nested"),
)
print(f"Visualization at depth {VIS_DEPTH}: {len(unique_vis):,} cells")
print(vis_ds)vis_ds.dggs.explore()Static comparison: full swath¶
import cartopy.crs as ccrs
import cartopy.feature as cfeature
# Static cartopy map of one depth-10 visualization variable: scatter the
# centers of the occupied cells, colored by value.
var_name = "cloud_mask"
cell_values = vis_ds[var_name].values
finite = np.isfinite(cell_values)
# Cell-center coordinates for the occupied cells only.
c_lon, c_lat = hpxg.nested.healpix_to_lonlat(
    vis_ds.cell_ids.values[finite], depth=VIS_DEPTH, ellipsoid=ELLIPSOID,
)
fig, ax = plt.subplots(
    figsize=(8, 10), subplot_kw={"projection": ccrs.PlateCarree()},
)
# Base-map layers.
for feature, style in (
    (cfeature.COASTLINE, {"linewidth": 0.5}),
    (cfeature.BORDERS, {"linewidth": 0.3}),
    (cfeature.LAND, {"alpha": 0.1}),
):
    ax.add_feature(feature, **style)
gl = ax.gridlines(draw_labels=True, linewidth=0.3)
gl.top_labels = gl.right_labels = False
sc = ax.scatter(
    c_lon, c_lat, c=cell_values[finite],
    s=1, cmap="Set1", vmin=0, vmax=3,
    transform=ccrs.PlateCarree(),
)
plt.colorbar(sc, ax=ax, shrink=0.6, label=var_name)
ax.set_title(
    f"EarthCARE MSI {var_name} — depth {DEPTH} stored, shown at depth {VIS_DEPTH}\n"
    f"Orbit {ORBIT} ({ELLIPSOID}) — Sentinel-3 style dense chunks at depth {CHUNK_DEPTH}",
    fontsize=11,
)
plt.tight_layout()
plt.show()