"""Time coordinate normalisation for ISMIP6 xarray datasets.
ISMIP6 models encode the ``time`` dimension in a wide variety of formats:
``cftime.DatetimeNoLeap``, ``cftime.Datetime360Day``, "days since" numeric
offsets, plain ``numpy.datetime64``, or integer year labels. Before any
spatial or sector-level processing can be performed, all datasets must share
a uniform ``numpy.datetime64`` time axis covering 2015-2100 (86 years).
This module exposes a single function, ``convert_and_subset_times``, that
handles all known ISMIP6 time encodings and edge cases encountered in the
GHub dataset collection, including:
- ``cftime`` calendar types (NoLeap, 360-day) → ``pandas.DatetimeIndex``
- Numeric "days since X" offsets → ``numpy.datetime64``
- VUW PISM "seconds since 0001-01-01" offsets
- UAF every-5-years datasets (assume 2015-2100)
- Datasets with duplicate time stamps (de-duplicated by unique index)
- Datasets shorter than 86 years (padded with forward-fill)
- Datasets longer than 86 years (trimmed to the last 86 steps)
Usage
-----
::
import xarray as xr
from ise.data.utils import convert_and_subset_times
ds = xr.open_dataset("lithk_AIS_NCAR_CISM_exp01.nc", decode_times=False)
ds = convert_and_subset_times(ds)
# ds.time is now numpy.datetime64 with 86 annual steps from 2015 to 2100
This function is called internally by ``ForcingFile.format_timestamps()``,
``ProjectionProcessor._calculate_ivaf_single_file()``, and the sector
aggregation functions in ``ise.data.process``.
"""
import warnings
from datetime import datetime
import cftime
import numpy as np
[docs]
def convert_and_subset_times(dataset):
"""
Converts time variables in an xarray dataset to a uniform format and subsets time to the range 2015-2100.
Args:
dataset (xarray.Dataset): The dataset with time values to be converted and subset.
Returns:
xarray.Dataset: The dataset with standardized time format and subset to the correct time range.
Raises:
ValueError: If time values are not in a recognizable format.
"""
# This part of your code remains the same
if "time" not in dataset and "year" in dataset:
dataset = dataset.rename({"year": "time"})
dataset["time"] = np.array(
[np.datetime64(f"{int(x)}-01-01") for x in dataset.time.values], dtype="datetime64[ns]"
)
# Check if the length is less than 86 and needs padding
if len(dataset.time) < 86:
warnings.warn(
f"Dataset has {len(dataset.time)} time points. Padding to 86 by repeating the last value."
)
# 1. Determine the first year to create a full 86-year index
start_year = dataset.time.dt.year.min().item()
# 2. Create the target 86-year time index
target_years = np.arange(start_year, start_year + 86)
target_time_index = [np.datetime64(f"{year}-01-01", "ns") for year in target_years]
# 3. Reindex the dataset to the new time coordinate, filling missing values
# by carrying forward the last available data point ('ffill').
dataset = dataset.reindex(time=target_time_index, method="ffill")
# Optional: You can also warn if the dataset is unexpectedly long
elif len(dataset.time) > 86:
warnings.warn(
f"Dataset has {len(dataset.time)} time points, which is more than the expected 86."
)
if isinstance(dataset.time.values[0], cftime._cftime.DatetimeNoLeap) or isinstance(
dataset.time.values[0], cftime._cftime.Datetime360Day
):
datetimeindex = dataset.indexes["time"].to_datetimeindex()
dataset["time"] = datetimeindex
elif (
isinstance(dataset.time.values[0], np.float32)
or isinstance(dataset.time.values[0], np.float64)
or isinstance(dataset.time.values[0], np.int32)
or isinstance(dataset.time.values[0], np.int64)
):
try:
units = dataset.time.attrs["units"]
except KeyError:
units = dataset.time.attrs["unit"]
units = units.replace("days since ", "").split(" ")[0]
if units == "2000-1-0": # VUB AISMPALEO
units = "2000-1-1"
elif units == "day": # NCAR CISM exp7 - "day as %Y%m%d.%f"?
units = "2014-1-1"
if units == "seconds": # VUW PISM -- seconds since 1-1-1 00:00:00
start_date = np.datetime64(
datetime.strptime("0001-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")
)
dataset["time"] = np.array(
[start_date + np.timedelta64(int(x), "s") for x in dataset.time.values]
)
elif units == "2008-1-1" and dataset.time[-1] == 157785.0: # UAF?
# every 5 years but still len(time) == 86.. assume we keep them all for 2015-2100
dataset["time"] = np.array(
[
np.datetime64(datetime.strptime(f"{x}-01-01 00:00:00", "%Y-%m-%d %H:%M:%S"))
for x in range(2015, 2101)
]
)
else:
try:
start_date = np.datetime64(
datetime.strptime(units.replace("days since ", ""), "%Y-%m-%d")
)
except ValueError:
start_date = np.datetime64(
datetime.strptime(units.replace("days since ", ""), "%d-%m-%Y")
)
time_values = np.array(
[start_date + np.timedelta64(int(x), "D") for x in dataset.time.values],
dtype="datetime64[ns]",
)
dataset["time"] = time_values
elif isinstance(dataset.time.values[0], np.datetime64):
pass
else:
raise ValueError(f"Time values are not recognized: {type(dataset.time.values[0])}")
if len(dataset.time) > 86:
# make sure the max date is 2100
# dataset = dataset.sel(time=slice(np.datetime64('2014-01-01'), np.datetime64('2101-01-01')))
dataset = dataset.sortby("time")
dataset = dataset.sel(time=slice("2012-01-01", "2101-01-01"))
# if you still have more than 86, take the previous 86 values from 2100
if len(dataset.time) > 86:
# LSCE GRISLI has two 2015 measurements
# dataset = dataset.sel(time=slice(dataset.time.values[len(dataset.time) - 86], dataset.time.values[-1]))
start_idx = len(dataset.time) - 86
dataset = dataset.isel(time=slice(start_idx, len(dataset.time)))
elif len(dataset.time) < 86:
warnings.warn(
f"Dataset has {len(dataset.time)} time points after subsetting. Padding to 86 by repeating the last value."
)
_, unique_indices = np.unique(dataset["time"], return_index=True)
dataset = dataset.isel(time=unique_indices)
# 1. Determine the first year to create a full 86-year index
start_year = dataset.time.dt.year.min().item()
# 2. Create the target 86-year time index
target_years = np.arange(start_year, start_year + 86)
target_time_index = [np.datetime64(f"{year}-01-01", "ns") for year in target_years]
# 3. Reindex the dataset to the new time coordinate, filling missing values
# by carrying forward the last available data point ('ffill').
dataset = dataset.reindex(time=target_time_index, method="ffill")
if len(dataset.time) != 86:
warnings.warn(
"After subsetting there are still not 86 time points. Go back and check logs."
)
print(f"dataset_length={len(dataset.time)} -- {dataset.attrs}")
return dataset