From 7b5f32363385742ada7d900a57caac4e56733efd Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Thu, 10 Oct 2024 16:29:14 +0200 Subject: [PATCH 001/134] implement default_precision_timestamp, refactor coding/times.py and core/variable.py to use any-precision datetime/timedelta with autmatic inferring of resolution --- xarray/coding/cftime_offsets.py | 18 +-- xarray/coding/times.py | 187 ++++++++++++++++++++++++-------- xarray/core/options.py | 11 ++ xarray/core/pdcompat.py | 24 ++-- xarray/core/variable.py | 82 +++----------- 5 files changed, 185 insertions(+), 137 deletions(-) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index c503e8ebcd3..0bf02345404 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -64,7 +64,7 @@ from xarray.core.pdcompat import ( NoDefault, count_not_none, - nanosecond_precision_timestamp, + default_precision_timestamp, no_default, ) from xarray.core.utils import emit_user_level_warning @@ -83,21 +83,13 @@ T_FreqStr = TypeVar("T_FreqStr", str, None) -def _nanosecond_precision_timestamp(*args, **kwargs): - # As of pandas version 3.0, pd.to_datetime(Timestamp(...)) will try to - # infer the appropriate datetime precision. Until xarray supports - # non-nanosecond precision times, we will use this constructor wrapper to - # explicitly create nanosecond-precision Timestamp objects. 
- return pd.Timestamp(*args, **kwargs).as_unit("ns") - - def get_date_type(calendar, use_cftime=True): """Return the cftime date type for a given calendar name.""" if cftime is None: raise ImportError("cftime is required for dates with non-standard calendars") else: if _is_standard_calendar(calendar) and not use_cftime: - return _nanosecond_precision_timestamp + return default_precision_timestamp calendars = { "noleap": cftime.DatetimeNoLeap, @@ -1475,10 +1467,8 @@ def date_range_like(source, calendar, use_cftime=None): if is_np_datetime_like(source.dtype): # We want to use datetime fields (datetime64 object don't have them) source_calendar = "standard" - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. - source_start = nanosecond_precision_timestamp(source_start) - source_end = nanosecond_precision_timestamp(source_end) + source_start = default_precision_timestamp(source_start) + source_end = default_precision_timestamp(source_end) else: if isinstance(source, CFTimeIndex): source_calendar = source.calendar diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 9306bde47a3..aa9566e100e 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -24,7 +24,8 @@ from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like from xarray.core.duck_array_ops import asarray, ravel, reshape from xarray.core.formatting import first_n_items, format_timestamp, last_item -from xarray.core.pdcompat import nanosecond_precision_timestamp +from xarray.core.options import _get_datetime_resolution +from xarray.core.pdcompat import default_precision_timestamp from xarray.core.utils import emit_user_level_warning from xarray.core.variable import Variable from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type @@ -193,9 +194,7 @@ def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]: # same us _unpack_netcdf_time_units but 
finalizes ref_date for # processing in encode_cf_datetime time_units, _ref_date = _unpack_netcdf_time_units(units) - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. - ref_date = nanosecond_precision_timestamp(_ref_date) + ref_date = pd.Timestamp(_ref_date) # If the ref_date Timestamp is timezone-aware, convert to UTC and # make it timezone-naive (GH 2649). if ref_date.tz is not None: @@ -245,6 +244,47 @@ def _decode_datetime_with_cftime( return np.array([], dtype=object) +def _timestamp_as_unit(date: pd.Timestamp, unit: str) -> pd.Timestamp: + return date.as_unit(unit) if hasattr(date, "as_unit") else date._as_unit(unit) + + +def _check_date_for_units_since_refdate( + date, unit: str, ref_date: pd.Timestamp +) -> pd.Timestamp: + delta = date * np.timedelta64(1, unit) + if not np.isnan(delta): + # this will raise on dtype overflow for integer dtypes + if date.dtype.kind in "iu" and not np.int64(delta) == date: + raise OutOfBoundsTimedelta( + "DType overflow in Datetime/Timedelta calculation." 
+ ) + # this will raise on overflow if ref_date + delta + # can't be represented in the current ref_date resolution + ref_date_unit = np.datetime_data(ref_date.asm8)[0] + return _timestamp_as_unit(ref_date + delta, ref_date_unit) + else: + return ref_date + + +def _get_timeunit(time: pd.Timestamp | pd.Timedelta) -> str: + return np.datetime_data(time.asm8)[0] + + +def _align_reference_date_and_unit(ref_date_str: str, unit: str) -> pd.Timestamp: + ref_date = pd.Timestamp(ref_date_str) + # strip tz information + if ref_date.tz is not None: + ref_date = ref_date.tz_convert(None) + # get ref_date and unit delta + ref_date_unit = np.datetime_data(ref_date.asm8)[0] + ref_date_delta = np.timedelta64(1, ref_date_unit) + unit_delta = np.timedelta64(1, unit) + new_unit = ref_date_unit if ref_date_delta < unit_delta else unit + # transform to the highest needed resolution + # this will raise accordingly + return _timestamp_as_unit(ref_date, new_unit) + + def _decode_datetime_with_pandas( flat_num_dates: np.ndarray, units: str, calendar: str ) -> np.ndarray: @@ -266,20 +306,48 @@ def _decode_datetime_with_pandas( time_units, ref_date_str = _unpack_netcdf_time_units(units) time_units = _netcdf_to_numpy_timeunit(time_units) try: - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. 
- ref_date = nanosecond_precision_timestamp(ref_date_str) + ref_date = _align_reference_date_and_unit(ref_date_str, time_units) except ValueError as err: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime from err + if calendar != "proleptic_gregorian": + # if we have gregorian/standard we need to raise + # if we are outside the well defined date range + # proleptic_gregorian and standard/gregorian are only equivalent + # if reference date and date range is >= 1582-10-15 + if ref_date < pd.Timestamp("1582-10-15"): + raise OutOfBoundsDatetime( + f"Dates cannot be decoded using {calendar!r} calendar." + ) + pass + + dunit = _get_timeunit(ref_date) + with warnings.catch_warnings(): warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) if flat_num_dates.size > 0: + # avoid size 0 datetimes GH1329 - pd.to_timedelta(flat_num_dates.min(), time_units) + ref_date - pd.to_timedelta(flat_num_dates.max(), time_units) + ref_date + dec_min = _check_date_for_units_since_refdate( + flat_num_dates.min(), time_units, ref_date + ) + dec_max = _check_date_for_units_since_refdate( + flat_num_dates.max(), time_units, ref_date + ) + # if we have gregorian/standard we need to raise + # if we are outside the well defined date range + # proleptic_gregorian and standard/gregorian are only equivalent + # if reference date and date range is >= 1582-10-15 + # todo: check if test for minimum date is enough + if ( + calendar != "proleptic_gregorian" + and (np.array([dec_min, dec_max]) < pd.Timestamp("1582-10-15")).any() + ): + raise OutOfBoundsTimedelta( + f"Decoded date is out of range for {calendar} calendar." 
+ ) # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer and unsigned integer dtype @@ -292,20 +360,25 @@ def _decode_datetime_with_pandas( elif flat_num_dates.dtype.kind in "f": flat_num_dates = flat_num_dates.astype(np.float64) - # Cast input ordinals to integers of nanoseconds because pd.to_timedelta - # works much faster when dealing with integers (GH 1399). - # properly handle NaN/NaT to prevent casting NaN to int + # keep NaT/nan mask nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min) - flat_num_dates = flat_num_dates * _NS_PER_TIME_DELTA[time_units] - flat_num_dates_ns_int = np.zeros_like(flat_num_dates, dtype=np.int64) - flat_num_dates_ns_int[nan] = np.iinfo(np.int64).min - flat_num_dates_ns_int[~nan] = flat_num_dates[~nan].astype(np.int64) - # Use pd.to_timedelta to safely cast integer values to timedeltas, - # and add those to a Timestamp to safely produce a DatetimeIndex. This - # ensures that we do not encounter integer overflow at any point in the - # process without raising OutOfBoundsDatetime. 
- return (pd.to_timedelta(flat_num_dates_ns_int, "ns") + ref_date).values + # in case we need to change the unit, we fix the numbers here + # this should be safe, as errors would have been raised above + ns_time_unit = _NS_PER_TIME_DELTA[time_units] + ns_dunit = _NS_PER_TIME_DELTA[dunit] + if flat_num_dates.dtype.kind in "iuf" and (ns_time_unit > ns_dunit): + flat_num_dates *= np.int64(ns_time_unit / ns_dunit) + time_units = dunit + + # Cast input ordinals to integers and properly handle NaN/NaT + # to prevent casting NaN to int + flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64) + flat_num_dates_int[nan] = np.iinfo(np.int64).min + flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64) + + # cast to timedelta64[time_units] and add to ref_date + return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_units}]") def decode_cf_datetime( @@ -337,10 +410,14 @@ def decode_cf_datetime( dates = _decode_datetime_with_cftime( flat_num_dates.astype(float), units, calendar ) - + # retrieve cftype + cftype = type(dates[np.nanargmin(num_dates)]) + # create first day of gregorian calendar in current cf calendar type + border = cftype(1582, 10, 15) + # todo: check if test for minimum date is enough if ( - dates[np.nanargmin(num_dates)].year < 1678 - or dates[np.nanargmax(num_dates)].year >= 2262 + dates[np.nanargmin(num_dates)] < border + or dates[np.nanargmax(num_dates)] < border ): if _is_standard_calendar(calendar): warnings.warn( @@ -364,17 +441,18 @@ def decode_cf_datetime( def to_timedelta_unboxed(value, **kwargs): result = pd.to_timedelta(value, **kwargs).to_numpy() - assert result.dtype == "timedelta64[ns]" + assert np.issubdtype(result.dtype, "timedelta64") return result def to_datetime_unboxed(value, **kwargs): result = pd.to_datetime(value, **kwargs).to_numpy() - assert result.dtype == "datetime64[ns]" + assert np.issubdtype(result.dtype, "datetime64") return result def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: + # 
todo: check, if this still returns ns """Given an array of numeric timedeltas in netCDF format, convert it into a numpy timedelta64[ns] array. """ @@ -390,10 +468,16 @@ def _unit_timedelta_cftime(units: str) -> timedelta: def _unit_timedelta_numpy(units: str) -> np.timedelta64: numpy_units = _netcdf_to_numpy_timeunit(units) - return np.timedelta64(_NS_PER_TIME_DELTA[numpy_units], "ns") + default_unit = _get_datetime_resolution() + return np.timedelta64( + int(_NS_PER_TIME_DELTA[numpy_units] / _NS_PER_TIME_DELTA[default_unit]), + default_unit, + ) def _infer_time_units_from_diff(unique_timedeltas) -> str: + # todo: check, if this function works as intended + # especially, if it not just returns "second" unit_timedelta: Callable[[str], timedelta] | Callable[[str], np.timedelta64] zero_timedelta: timedelta | np.timedelta64 if unique_timedeltas.dtype == np.dtype("O"): @@ -411,7 +495,7 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str: def _time_units_to_timedelta64(units: str) -> np.timedelta64: - return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)).astype("timedelta64[ns]") + return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)) def infer_calendar_name(dates) -> CFCalendar: @@ -440,13 +524,11 @@ def infer_datetime_units(dates) -> str: unique time deltas in `dates`) """ dates = ravel(np.asarray(dates)) - if np.asarray(dates).dtype == "datetime64[ns]": + if np.issubdtype(np.asarray(dates).dtype, "datetime64"): dates = to_datetime_unboxed(dates) dates = dates[pd.notnull(dates)] reference_date = dates[0] if len(dates) > 0 else "1970-01-01" - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. 
- reference_date = nanosecond_precision_timestamp(reference_date) + reference_date = pd.Timestamp(reference_date) else: reference_date = dates[0] if len(dates) > 0 else "1970-01-01" reference_date = format_cftime_datetime(reference_date) @@ -479,17 +561,17 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: If raise_on_invalid is True (default), invalid dates trigger a ValueError. Otherwise, the invalid element is replaced by np.NaT.""" times = np.asarray(times) - # TODO: the strict enforcement of nanosecond precision datetime values can - # be relaxed when addressing GitHub issue #7493. - new = np.empty(times.shape, dtype="M8[ns]") + new = [] dt: pd.Timestamp | Literal["NaT"] - for i, t in np.ndenumerate(times): + for _i, t in np.ndenumerate(times): try: + # todo: decide how to work with this + # as pd.Timestamp is defined from year 0001-01-01 to 9999-12-31 # Use pandas.Timestamp in place of datetime.datetime, because # NumPy casts it safely it np.datetime64[ns] for dates outside # 1678 to 2262 (this is not currently the case for # datetime.datetime). - dt = nanosecond_precision_timestamp( + dt = pd.Timestamp( t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond ) except ValueError as e: @@ -500,8 +582,8 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: ) from e else: dt = "NaT" - new[i] = np.datetime64(dt) - return new + new.append(np.datetime64(dt)) + return np.asarray(new).reshape(times.shape) def convert_times(times, date_type, raise_on_invalid: bool = True) -> np.ndarray: @@ -546,10 +628,8 @@ def convert_time_or_go_back(date, date_type): This is meant to convert end-of-month dates into a new calendar. """ - # TODO: the strict enforcement of nanosecond precision Timestamps can be - # relaxed when addressing GitHub issue #7493. 
if date_type == pd.Timestamp: - date_type = nanosecond_precision_timestamp + date_type = default_precision_timestamp try: return date_type( date.year, @@ -742,9 +822,7 @@ def _eagerly_encode_cf_datetime( allow_units_modification: bool = True, ) -> tuple[T_DuckArray, str, str]: dates = asarray(dates) - data_units = infer_datetime_units(dates) - if units is None: units = data_units else: @@ -757,16 +835,25 @@ def _eagerly_encode_cf_datetime( if not _is_standard_calendar(calendar) or dates.dtype.kind == "O": # parse with cftime instead raise OutOfBoundsDatetime - assert dates.dtype == "datetime64[ns]" + assert np.issubdtype(dates.dtype, "datetime64") time_units, ref_date = _unpack_time_units_and_ref_date(units) + # calendar equivalence only for days after the reform + if calendar != "proleptic_gregorian" and ref_date < pd.Timestamp("1582-10-15"): + # out of range reference date + raise OutOfBoundsDatetime time_delta = _time_units_to_timedelta64(time_units) # Wrap the dates in a DatetimeIndex to do the subtraction to ensure # an OverflowError is raised if the ref_date is too far away from # dates to be encoded (GH 2272). 
+ # DatetimeIndex will convert to units of ["s", "ms", "us", "ns"] dates_as_index = pd.DatetimeIndex(ravel(dates)) time_deltas = dates_as_index - ref_date + # get resolution of TimedeltaIndex and align time_delta + deltas_unit = np.datetime_data(time_deltas.dtype)[0] + # todo: check, if this works in any case + time_delta = time_delta.astype(f"=m8[{deltas_unit}]") # retrieve needed units to faithfully encode to int64 needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units) @@ -800,6 +887,7 @@ def _eagerly_encode_cf_datetime( ) units = new_units time_delta = needed_time_delta + time_delta = time_delta.astype(f"=m8[{deltas_unit}]") floor_division = True num = _division(time_deltas, time_delta, floor_division) @@ -812,6 +900,7 @@ def _eagerly_encode_cf_datetime( num = cast_to_int_if_safe(num) if dtype is not None: + # todo: check, if this is really needed for all dtypes num = _cast_to_dtype_if_safe(num, dtype) return num, units, calendar @@ -886,12 +975,15 @@ def _eagerly_encode_cf_timedelta( allow_units_modification: bool = True, ) -> tuple[T_DuckArray, str]: data_units = infer_timedelta_units(timedeltas) - if units is None: units = data_units time_delta = _time_units_to_timedelta64(units) time_deltas = pd.TimedeltaIndex(ravel(timedeltas)) + # get resolution of TimedeltaIndex and align time_delta + deltas_unit = np.datetime_data(time_deltas.dtype)[0] + # todo: check, if this works in any case + time_delta = time_delta.astype(f"=m8[{deltas_unit}]") # retrieve needed units to faithfully encode to int64 needed_units = data_units @@ -920,12 +1012,14 @@ def _eagerly_encode_cf_timedelta( ) units = needed_units time_delta = needed_time_delta + time_delta = time_delta.astype(f"=m8[{deltas_unit}]") floor_division = True num = _division(time_deltas, time_delta, floor_division) num = reshape(num.values, timedeltas.shape) if dtype is not None: + # todo: check, if this is needed for all dtypes num = _cast_to_dtype_if_safe(num, dtype) return num, units @@ -1033,6 
+1127,7 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: units = pop_to(attrs, encoding, "units") transform = partial(decode_cf_timedelta, units=units) + # todo: check, if we can relax this one here, too dtype = np.dtype("timedelta64[ns]") data = lazy_elemwise_func(data, transform, dtype=dtype) diff --git a/xarray/core/options.py b/xarray/core/options.py index 2d69e4b6584..dd6a1620061 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -32,6 +32,7 @@ "use_numbagg", "use_opt_einsum", "use_flox", + "time_resolution", ] class T_Options(TypedDict): @@ -59,6 +60,7 @@ class T_Options(TypedDict): use_flox: bool use_numbagg: bool use_opt_einsum: bool + time_resolution: Literal["s", "ms", "us", "ns"] OPTIONS: T_Options = { @@ -86,10 +88,12 @@ class T_Options(TypedDict): "use_flox": True, "use_numbagg": True, "use_opt_einsum": True, + "time_resolution": "ns", } _JOIN_OPTIONS = frozenset(["inner", "outer", "left", "right", "exact"]) _DISPLAY_OPTIONS = frozenset(["text", "html"]) +_TIME_RESOLUTION_OPTIONS = frozenset(["s", "ms", "us", "ns"]) def _positive_integer(value: Any) -> bool: @@ -117,6 +121,7 @@ def _positive_integer(value: Any) -> bool: "use_opt_einsum": lambda value: isinstance(value, bool), "use_flox": lambda value: isinstance(value, bool), "warn_for_unclosed_files": lambda value: isinstance(value, bool), + "time_resolution": _TIME_RESOLUTION_OPTIONS.__contains__, } @@ -158,6 +163,10 @@ def _get_keep_attrs(default: bool) -> bool: return _get_boolean_with_default("keep_attrs", default) +def _get_datetime_resolution() -> Literal["s", "ms", "us", "ns"]: + return OPTIONS["time_resolution"] + + class set_options: """ Set options for xarray in a controlled context. @@ -258,6 +267,8 @@ class set_options: warn_for_unclosed_files : bool, default: False Whether or not to issue a warning when unclosed files are deallocated. This is mostly useful for debugging. 
+ time_resolution : {"s", "ms", "us", "ns"}, default: "ns" + Time resolution used for CF encoding/decoding. Examples -------- diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index ae4febd6beb..ed03e170f87 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -38,8 +38,10 @@ from enum import Enum from typing import Literal +import numpy as np import pandas as pd -from packaging.version import Version + +from xarray.core.options import _get_datetime_resolution def count_not_none(*args) -> int: @@ -73,13 +75,17 @@ def __repr__(self) -> str: NoDefault = Literal[_NoDefault.no_default] # For typing following pandas -def nanosecond_precision_timestamp(*args, **kwargs) -> pd.Timestamp: - """Return a nanosecond-precision Timestamp object. +def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp: + """Return a Timestamp object with the default precision. - Note this function should no longer be needed after addressing GitHub issue - #7493. + Xarray default is "ns". This can be overridden by setting + set_options(time_resolution="us") or any other resolution + of {"s", "ms", "us", "ns"}. 
""" - if Version(pd.__version__) >= Version("2.0.0"): - return pd.Timestamp(*args, **kwargs).as_unit("ns") - else: - return pd.Timestamp(*args, **kwargs) + dt = pd.Timestamp(*args, **kwargs) + units = ["s", "ms", "us", "ns"] + default = _get_datetime_resolution() + unit = np.datetime_data(dt.asm8)[0] + if units.index(default) > units.index(unit): + dt = dt.as_unit(default) + return dt diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 13053faff58..97d43ba56e0 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -6,7 +6,6 @@ import numbers import warnings from collections.abc import Callable, Hashable, Mapping, Sequence -from datetime import timedelta from functools import partial from types import EllipsisType from typing import TYPE_CHECKING, Any, NoReturn, cast @@ -71,13 +70,11 @@ from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint -NON_NANOSECOND_WARNING = ( - "Converting non-nanosecond precision {case} values to nanosecond precision. " - "This behavior can eventually be relaxed in xarray, as it is an artifact from " - "pandas which is now beginning to support non-nanosecond precision values. " - "This warning is caused by passing non-nanosecond np.datetime64 or " +NON_DEFAULTPRECISION_WARNING = ( + "Converting non-default precision {case} values to default precision. " + "This warning is caused by passing non-default np.datetime64 or " "np.timedelta64 values to the DataArray or Variable constructor; it can be " - "silenced by converting the values to nanosecond precision ahead of time." + "silenced by converting the values to default precision {res!r} ahead of time." 
) @@ -198,45 +195,11 @@ def _maybe_wrap_data(data): return data -def _as_nanosecond_precision(data): - dtype = data.dtype - non_ns_datetime64 = ( - dtype.kind == "M" - and isinstance(dtype, np.dtype) - and dtype != np.dtype("datetime64[ns]") - ) - non_ns_datetime_tz_dtype = ( - isinstance(dtype, pd.DatetimeTZDtype) and dtype.unit != "ns" - ) - if non_ns_datetime64 or non_ns_datetime_tz_dtype: - utils.emit_user_level_warning(NON_NANOSECOND_WARNING.format(case="datetime")) - if isinstance(dtype, pd.DatetimeTZDtype): - nanosecond_precision_dtype = pd.DatetimeTZDtype("ns", dtype.tz) - else: - nanosecond_precision_dtype = "datetime64[ns]" - return duck_array_ops.astype(data, nanosecond_precision_dtype) - elif dtype.kind == "m" and dtype != np.dtype("timedelta64[ns]"): - utils.emit_user_level_warning(NON_NANOSECOND_WARNING.format(case="timedelta")) - return duck_array_ops.astype(data, "timedelta64[ns]") - else: - return data - - def _possibly_convert_objects(values): """Convert arrays of datetime.datetime and datetime.timedelta objects into - datetime64 and timedelta64, according to the pandas convention. For the time - being, convert any non-nanosecond precision DatetimeIndex or TimedeltaIndex - objects to nanosecond precision. While pandas is relaxing this in version - 2.0.0, in xarray we will need to make sure we are ready to handle - non-nanosecond precision datetimes or timedeltas in our code before allowing - such values to pass through unchanged. Converting to nanosecond precision - through pandas.Series objects ensures that datetimes and timedeltas are - within the valid date range for ns precision, as pandas will raise an error - if they are not. + datetime64 and timedelta64, according to the pandas convention. 
""" as_series = pd.Series(values.ravel(), copy=False) - if as_series.dtype.kind in "mM": - as_series = _as_nanosecond_precision(as_series) result = np.asarray(as_series).reshape(values.shape) if not result.flags.writeable: # GH8843, pandas copy-on-write mode creates read-only arrays by default @@ -247,28 +210,13 @@ def _possibly_convert_objects(values): return result -def _possibly_convert_datetime_or_timedelta_index(data): - """For the time being, convert any non-nanosecond precision DatetimeIndex or - TimedeltaIndex objects to nanosecond precision. While pandas is relaxing - this in version 2.0.0, in xarray we will need to make sure we are ready to - handle non-nanosecond precision datetimes or timedeltas in our code - before allowing such values to pass through unchanged.""" - if isinstance(data, PandasIndexingAdapter): - if isinstance(data.array, pd.DatetimeIndex | pd.TimedeltaIndex): - data = PandasIndexingAdapter(_as_nanosecond_precision(data.array)) - elif isinstance(data, pd.DatetimeIndex | pd.TimedeltaIndex): - data = _as_nanosecond_precision(data) - return data - - def as_compatible_data( data: T_DuckArray | ArrayLike, fastpath: bool = False ) -> T_DuckArray: """Prepare and wrap data to put in a Variable. - If data does not have the necessary attributes, convert it to ndarray. - - If data has dtype=datetime64, ensure that it has ns precision. If it's a - pandas.Timestamp, convert it to datetime64. + - If it's a pandas.Timestamp, convert it to datetime64. - If data is already a pandas or xarray object (other than an Index), just use the values. 
@@ -288,7 +236,6 @@ def as_compatible_data( return cast("T_DuckArray", data._variable._data) def convert_non_numpy_type(data): - data = _possibly_convert_datetime_or_timedelta_index(data) return cast("T_DuckArray", _maybe_wrap_data(data)) if isinstance(data, NON_NUMPY_SUPPORTED_ARRAY_TYPES): @@ -298,11 +245,7 @@ def convert_non_numpy_type(data): data = utils.to_0d_object_array(data) if isinstance(data, pd.Timestamp): - # TODO: convert, handle datetime objects, too - data = np.datetime64(data.value, "ns") - - if isinstance(data, timedelta): - data = np.timedelta64(getattr(data, "value", data), "ns") + data = data.to_numpy() # we don't want nested self-described arrays if isinstance(data, pd.Series | pd.DataFrame): @@ -351,10 +294,13 @@ def _as_array_or_item(data): """ data = np.asarray(data) if data.ndim == 0: - if data.dtype.kind == "M": - data = np.datetime64(data, "ns") - elif data.dtype.kind == "m": - data = np.timedelta64(data, "ns") + kind = data.dtype.kind + if kind in "mM": + unit = np.datetime_data(data.dtype)[0] + if kind == "M": + data = np.datetime64(data, unit) + elif kind == "m": + data = np.timedelta64(data, unit) return data From 8784f33a739c873088034020a666385f6949c84a Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Thu, 10 Oct 2024 16:30:31 +0200 Subject: [PATCH 002/134] align tests with new time resolution behaviour --- xarray/tests/__init__.py | 2 +- xarray/tests/test_backends.py | 22 ++- xarray/tests/test_cftime_offsets.py | 2 +- xarray/tests/test_cftimeindex.py | 11 +- xarray/tests/test_coding_times.py | 67 +++++--- xarray/tests/test_concat.py | 2 +- xarray/tests/test_conventions.py | 10 +- xarray/tests/test_dataarray.py | 4 +- xarray/tests/test_dataset.py | 20 +-- xarray/tests/test_groupby.py | 6 +- xarray/tests/test_interp.py | 2 +- xarray/tests/test_plot.py | 2 +- xarray/tests/test_variable.py | 245 ++++++++++++++-------------- 13 files changed, 218 insertions(+), 177 deletions(-) diff --git a/xarray/tests/__init__.py 
b/xarray/tests/__init__.py index bd7ec6297b9..a55b377d2c0 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -323,7 +323,7 @@ def create_test_data( f'Not enough letters for filling this dimension size ({_dims["dim3"]})' ) obj["dim3"] = ("dim3", list(string.ascii_lowercase[0 : _dims["dim3"]])) - obj["time"] = ("time", pd.date_range("2000-01-01", periods=20)) + obj["time"] = ("time", pd.date_range("2000-01-01", periods=20, unit="s")) for v, dims in sorted(_vars.items()): data = rs.normal(size=tuple(_dims[d] for d in dims)) obj[v] = (dims, data) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8ac451dc7c8..b4f1914f983 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -584,6 +584,12 @@ def test_roundtrip_cftime_datetime_data(self) -> None: warnings.filterwarnings("ignore", "Unable to decode time axis") with self.roundtrip(expected, save_kwargs=kwargs) as actual: + # proleptic gregorian will be decoded into numpy datetime64 + # fixing to expectations + if actual.t.dtype.kind == "M": + dtype = f"datetime64[{np.datetime_data(actual.t)[0]}]" + expected_decoded_t = expected_decoded_t.astype(dtype) + expected_decoded_t0 = expected_decoded_t0.astype(dtype) abs_diff = abs(actual.t.values - expected_decoded_t) assert (abs_diff <= np.timedelta64(1, "s")).all() assert ( @@ -598,7 +604,8 @@ def test_roundtrip_cftime_datetime_data(self) -> None: assert actual.t.encoding["calendar"] == expected_calendar def test_roundtrip_timedelta_data(self) -> None: - time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]) # type: ignore[arg-type, unused-ignore] + # todo: check, if default unit "s" is enough + time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore] expected = Dataset({"td": ("td", time_deltas), "td0": time_deltas[0]}) with self.roundtrip(expected) as actual: assert_identical(expected, actual) @@ -1583,7 +1590,8 @@ def test_open_encodings(self) -> None: 
expected = Dataset() - time = pd.date_range("1999-01-05", periods=10) + # todo: check, if specifying "s" is enough + time = pd.date_range("1999-01-05", periods=10, unit="s") encoding = {"units": units, "dtype": np.dtype("int32")} expected["time"] = ("time", time, {}, encoding) @@ -5379,11 +5387,12 @@ def test_use_cftime_standard_calendar_default_in_range(calendar) -> None: @requires_cftime @requires_scipy_or_netCDF4 -@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("units_year", [1500, 2500]) +@pytest.mark.parametrize("calendar", ["standard", "gregorian"]) +@pytest.mark.parametrize("units_year", [1500, 1582]) def test_use_cftime_standard_calendar_default_out_of_range( calendar, units_year ) -> None: + # todo: check, if we still need to test for two dates import cftime x = [0, 1] @@ -5472,9 +5481,10 @@ def test_use_cftime_false_standard_calendar_in_range(calendar) -> None: @requires_scipy_or_netCDF4 -@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("units_year", [1500, 2500]) +@pytest.mark.parametrize("calendar", ["standard", "gregorian"]) +@pytest.mark.parametrize("units_year", [1500, 1582]) def test_use_cftime_false_standard_calendar_out_of_range(calendar, units_year) -> None: + # todo: check, if we still need to check for two dates x = [0, 1] time = [0, 720] units = f"days since {units_year}-01-01" diff --git a/xarray/tests/test_cftime_offsets.py b/xarray/tests/test_cftime_offsets.py index f6f97108c1d..2076121e515 100644 --- a/xarray/tests/test_cftime_offsets.py +++ b/xarray/tests/test_cftime_offsets.py @@ -1496,7 +1496,7 @@ def test_date_range_like_same_calendar(): assert src is out -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") def test_date_range_like_errors(): src = date_range("1899-02-03", periods=20, freq="D", use_cftime=False) src = src[np.arange(20) != 10] # Remove 1 day so the frequency is not inferable. 
diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index 116487e2bcf..9ae75c70557 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -1159,7 +1159,7 @@ def test_strftime_of_cftime_array(calendar): @pytest.mark.parametrize("unsafe", [False, True]) def test_to_datetimeindex(calendar, unsafe): index = xr.cftime_range("2000", periods=5, calendar=calendar) - expected = pd.date_range("2000", periods=5) + expected = pd.date_range("2000", periods=5, unit="us") if calendar in _NON_STANDARD_CALENDARS and not unsafe: with pytest.warns(RuntimeWarning, match="non-standard"): @@ -1176,7 +1176,11 @@ def test_to_datetimeindex(calendar, unsafe): @pytest.mark.parametrize("calendar", _ALL_CALENDARS) def test_to_datetimeindex_out_of_range(calendar): index = xr.cftime_range("0001", periods=5, calendar=calendar) - with pytest.raises(ValueError, match="0001"): + # todo: needs discussion, do we need this test? + if calendar in _NON_STANDARD_CALENDARS: + with pytest.warns(RuntimeWarning, match="non-standard"): + index.to_datetimeindex() + else: index.to_datetimeindex() @@ -1199,7 +1203,8 @@ def test_multiindex(): @pytest.mark.parametrize("freq", ["3663s", "33min", "2h"]) @pytest.mark.parametrize("method", ["floor", "ceil", "round"]) def test_rounding_methods_against_datetimeindex(freq, method): - expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s") + # todo: check, if setting to "us" is enough + expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s", unit="us") expected = getattr(expected, method)(freq) result = xr.cftime_range("2000-01-02T01:03:51", periods=10, freq="1777s") result = getattr(result, method)(freq).to_datetimeindex() diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index bb0dd1dd25c..151efdcaa46 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -129,15 +129,20 @@ def test_cf_datetime(num_dates, 
units, calendar) -> None: expected = cftime.num2date( num_dates, units, calendar, only_use_cftime_datetimes=True ) - min_y = np.ravel(np.atleast_1d(expected))[np.nanargmin(num_dates)].year - max_y = np.ravel(np.atleast_1d(expected))[np.nanargmax(num_dates)].year - if min_y >= 1678 and max_y < 2262: + print("0:", expected) + + min_y = np.ravel(np.atleast_1d(expected))[np.nanargmin(num_dates)] # .year + max_y = np.ravel(np.atleast_1d(expected))[np.nanargmax(num_dates)] # .year + typ = type(min_y) + border = typ(1582, 10, 15) + if calendar == "proleptic_gregorian" or (min_y >= border and max_y >= border): expected = cftime_to_nptime(expected) + print("1:", expected) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") actual = decode_cf_datetime(num_dates, units, calendar) - + print("2:", actual, type(actual), actual.dtype) abs_diff = np.asarray(abs(actual - expected)).ravel() abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() @@ -146,6 +151,8 @@ def test_cf_datetime(num_dates, units, calendar) -> None: # https://github.com/Unidata/netcdf4-python/issues/355 assert (abs_diff <= np.timedelta64(1, "s")).all() encoded1, _, _ = encode_cf_datetime(actual, units, calendar) + + print("1:", encoded1) assert_duckarray_allclose(num_dates, encoded1) if hasattr(num_dates, "ndim") and num_dates.ndim == 1 and "1000" not in units: @@ -211,14 +218,21 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: import cftime units = "days since 0001-01-01" - times = pd.date_range("2001-04-01-00", end="2001-04-30-23", freq="h") + unit = "us" + times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit=unit, freq="h") + print("A:", times) time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values - expected_dtype = np.dtype("M8[ns]") - + if calendar == "proleptic_gregorian": + unit = "s" + expected_dtype = np.dtype(f"M8[{unit}]") + print("0:", time[0], units, calendar) + 
print("1:", expected.astype("int64")[0], expected.dtype) actual = decode_cf_datetime(time, units, calendar=calendar) + print("2:", actual[0], actual.astype("int64")[0], actual.dtype) assert actual.dtype == expected_dtype abs_diff = abs(actual - expected) + print("3:", abs_diff[0], abs_diff.dtype) # once we no longer support versions of netCDF4 older than 1.1.5, # we could do this check with near microsecond accuracy: # https://github.com/Unidata/netcdf4-python/issues/355 @@ -262,6 +276,9 @@ def test_decode_dates_outside_timestamp_range(calendar) -> None: expected = cftime.num2date( time, units, calendar=calendar, only_use_cftime_datetimes=True ) + # special case proleptic_gregorian + if calendar == "proleptic_gregorian": + expected = expected.astype("=M8[us]") expected_date_type = type(expected[0]) with warnings.catch_warnings(): @@ -272,7 +289,7 @@ def test_decode_dates_outside_timestamp_range(calendar) -> None: # once we no longer support versions of netCDF4 older than 1.1.5, # we could do this check with near microsecond accuracy: # https://github.com/Unidata/netcdf4-python/issues/355 - assert (abs_diff <= np.timedelta64(1, "s")).all() + assert (abs_diff <= np.timedelta64(1, "us")).all() @requires_cftime @@ -281,11 +298,13 @@ def test_decode_standard_calendar_single_element_inside_timestamp_range( calendar, ) -> None: units = "days since 0001-01-01" + unit = "s" if calendar == "proleptic_gregorian" else "us" for num_time in [735368, [735368], [[735368]]]: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") actual = decode_cf_datetime(num_time, units, calendar=calendar) - assert actual.dtype == np.dtype("M8[ns]") + + assert actual.dtype == np.dtype(f"M8[{unit}]") @requires_cftime @@ -327,6 +346,7 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( import cftime units = "days since 0001-01-01" + unit = "s" if calendar == "proleptic_gregorian" else "us" times1 = pd.date_range("2001-04-01", 
end="2001-04-05", freq="D") times2 = pd.date_range("2001-05-01", end="2001-05-05", freq="D") time1 = cftime.date2num(times1.to_pydatetime(), units, calendar=calendar) @@ -339,7 +359,7 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( expected2 = times2.values actual = decode_cf_datetime(mdim_time, units, calendar=calendar) - assert actual.dtype == np.dtype("M8[ns]") + assert actual.dtype == np.dtype(f"M8[{unit}]") abs_diff1 = abs(actual[:, 0] - expected1) abs_diff2 = abs(actual[:, 1] - expected2) @@ -414,7 +434,14 @@ def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: warnings.filterwarnings("ignore", "Unable to decode time axis") actual = decode_cf_datetime(mdim_time, units, calendar=calendar) - assert actual.dtype == np.dtype("O") + if calendar == "proleptic_gregorian": + dtype = np.dtype("=M8[s]") + expected1 = expected1.astype(dtype) + expected2 = expected2.astype(dtype) + else: + dtype = np.dtype("O") + + assert actual.dtype == dtype abs_diff1 = abs(actual[:, 0] - expected1) abs_diff2 = abs(actual[:, 1] - expected2) @@ -509,7 +536,7 @@ def test_decoded_cf_datetime_array_2d() -> None: ("x", "y"), np.array([[0, 1], [2, 3]]), {"units": "days since 2000-01-01"} ) result = CFDatetimeCoder().decode(variable) - assert result.dtype == "datetime64[ns]" + assert result.dtype == "datetime64[s]" expected = pd.date_range("2000-01-01", periods=4).values.reshape(2, 2) assert_array_equal(np.asarray(result), expected) @@ -675,7 +702,7 @@ def test_decode_cf(calendar) -> None: if calendar not in _STANDARD_CALENDARS: assert ds.test.dtype == np.dtype("O") else: - assert ds.test.dtype == np.dtype("M8[ns]") + assert ds.test.dtype == np.dtype("M8[s]") def test_decode_cf_time_bounds() -> None: @@ -700,7 +727,7 @@ def test_decode_cf_time_bounds() -> None: "calendar": "standard", } dsc = decode_cf(ds) - assert dsc.time_bnds.dtype == np.dtype("M8[ns]") + assert dsc.time_bnds.dtype == np.dtype("M8[s]") dsc = decode_cf(ds, decode_times=False) 
assert dsc.time_bnds.dtype == np.dtype("int64") @@ -917,8 +944,8 @@ def test_use_cftime_default_standard_calendar_in_range(calendar) -> None: @requires_cftime -@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("units_year", [1500, 2500]) +@pytest.mark.parametrize("calendar", ["standard", "gregorian"]) +@pytest.mark.parametrize("units_year", [1500, 1580]) def test_use_cftime_default_standard_calendar_out_of_range( calendar, units_year ) -> None: @@ -980,8 +1007,8 @@ def test_use_cftime_false_standard_calendar_in_range(calendar) -> None: np.testing.assert_array_equal(result, expected) -@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("units_year", [1500, 2500]) +@pytest.mark.parametrize("calendar", ["standard", "gregorian"]) +@pytest.mark.parametrize("units_year", [1500, 1582]) def test_use_cftime_false_standard_calendar_out_of_range(calendar, units_year) -> None: numerical_dates = [0, 1] units = f"days since {units_year}-01-01" @@ -1266,7 +1293,7 @@ def test_roundtrip_datetime64_nanosecond_precision( encoding = {} var = Variable(["time"], times, encoding=encoding) - assert var.dtype == np.dtype("=M8[ns]") + assert var.dtype == np.dtype(f"=M8[{timeunit}]") encoded_var = conventions.encode_cf_variable(var) assert ( @@ -1277,7 +1304,7 @@ def test_roundtrip_datetime64_nanosecond_precision( assert encoded_var.data.dtype == dtype decoded_var = conventions.decode_cf_variable("foo", encoded_var) - assert decoded_var.dtype == np.dtype("=M8[ns]") + assert decoded_var.dtype == np.dtype(f"=M8[{timeunit}]") assert ( decoded_var.encoding["units"] == f"{_numpy_to_netcdf_timeunit(timeunit)} since 1970-01-01 00:00:00" diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index 226f376b581..e47c389c015 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -319,7 +319,7 @@ def test_concat_multiple_datasets_with_multiple_missing_variables() -> None: assert_identical(actual, 
expected) -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") def test_concat_type_of_missing_fill() -> None: datasets = create_typed_datasets(2, seed=123) expected1 = concat(datasets, dim="day", fill_value=dtypes.NA) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index e6c69fc1ee1..7d86cb7c036 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -213,7 +213,7 @@ def test_deterministic_coords_encoding(self) -> None: vars, attrs = conventions.encode_dataset_coordinates(ds) assert attrs["coordinates"] == "bar baz" - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_emit_coordinates_attribute_in_attrs(self) -> None: orig = Dataset( {"a": 1, "b": 1}, @@ -231,7 +231,7 @@ def test_emit_coordinates_attribute_in_attrs(self) -> None: assert enc["b"].attrs.get("coordinates") == "t" assert "coordinates" not in enc["b"].encoding - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_emit_coordinates_attribute_in_encoding(self) -> None: orig = Dataset( {"a": 1, "b": 1}, @@ -364,7 +364,7 @@ def test_dataset_repr_with_netcdf4_datetimes(self) -> None: attrs = {"units": "days since 1900-01-01"} ds = decode_cf(Dataset({"time": ("time", [0, 1], attrs)})) - assert "(time) datetime64[ns]" in repr(ds) + assert "(time) datetime64[s]" in repr(ds) @requires_cftime def test_decode_cf_datetime_transition_to_invalid(self) -> None: @@ -447,13 +447,13 @@ def test_decode_cf_time_kwargs(self) -> None: dsc = conventions.decode_cf(ds) assert dsc.timedelta.dtype == np.dtype("m8[ns]") - assert dsc.time.dtype == np.dtype("M8[ns]") + assert dsc.time.dtype == np.dtype("M8[s]") dsc = conventions.decode_cf(ds, decode_times=False) assert dsc.timedelta.dtype == np.dtype("int64") assert dsc.time.dtype 
== np.dtype("int64") dsc = conventions.decode_cf(ds, decode_times=True, decode_timedelta=False) assert dsc.timedelta.dtype == np.dtype("int64") - assert dsc.time.dtype == np.dtype("M8[ns]") + assert dsc.time.dtype == np.dtype("M8[s]") dsc = conventions.decode_cf(ds, decode_times=False, decode_timedelta=True) assert dsc.timedelta.dtype == np.dtype("m8[ns]") assert dsc.time.dtype == np.dtype("int64") diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 78db39c194e..50704a7570a 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3657,7 +3657,7 @@ def test_to_and_from_dict( actual_no_data = da.to_dict(data=False, encoding=encoding) assert expected_no_data == actual_no_data - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_to_and_from_dict_with_time_dim(self) -> None: x = np.random.randn(10, 3) t = pd.date_range("20130101", periods=10) @@ -3666,7 +3666,7 @@ def test_to_and_from_dict_with_time_dim(self) -> None: roundtripped = DataArray.from_dict(da.to_dict()) assert_identical(da, roundtripped) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_to_and_from_dict_with_nan_nat(self) -> None: y = np.random.randn(10, 3) y[2] = np.nan diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index c89cfa85622..87c0c724edd 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -122,7 +122,7 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]: bool_var_to_append = np.array([False, True], dtype=bool) with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Converting non-nanosecond") + warnings.filterwarnings("ignore", "Converting non-default") ds = xr.Dataset( data_vars={ "da": xr.DataArray( @@ -289,7 +289,7 @@ def test_repr(self) -> None: Coordinates: * dim2 (dim2) 
float64 72B 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 * dim3 (dim3) {} 40B 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' - * time (time) datetime64[ns] 160B 2000-01-01 2000-01-02 ... 2000-01-20 + * time (time) datetime64[s] 160B 2000-01-01 2000-01-02 ... 2000-01-20 numbers (dim3) int64 80B 0 1 2 0 0 1 1 2 2 3 Dimensions without coordinates: dim1 Data variables: @@ -449,7 +449,7 @@ def test_info(self) -> None: variables: \tfloat64 dim2(dim2) ; - \tdatetime64[ns] time(time) ; + \tdatetime64[s] time(time) ; \tfloat64 var1(dim1, dim2) ; \t\tvar1:foo = variable ; \tfloat64 var2(dim1, dim2) ; @@ -497,7 +497,7 @@ def test_constructor_1d(self) -> None: actual = Dataset({"x": [5, 6, 7, 8, 9]}) assert_identical(expected, actual) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_constructor_0d(self) -> None: expected = Dataset({"x": ([], 1)}) for arg in [1, np.array(1), expected["x"]]: @@ -3547,9 +3547,9 @@ def test_expand_dims_create_index_from_iterable(self): def test_expand_dims_non_nanosecond_conversion(self) -> None: # Regression test for https://github.com/pydata/xarray/issues/7493#issuecomment-1953091000 - with pytest.warns(UserWarning, match="non-nanosecond precision"): - ds = Dataset().expand_dims({"time": [np.datetime64("2018-01-01", "s")]}) - assert ds.time.dtype == np.dtype("datetime64[ns]") + # todo: test still needed? 
+ ds = Dataset().expand_dims({"time": [np.datetime64("2018-01-01", "s")]}) + assert ds.time.dtype == np.dtype("datetime64[s]") def test_set_index(self) -> None: expected = create_test_multiindex() @@ -6068,7 +6068,7 @@ def test_dataset_math_auto_align(self) -> None: expected = ds + other.reindex_like(ds) assert_identical(expected, actual) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_dataset_math_errors(self) -> None: ds = self.make_example_math_dataset() @@ -7180,7 +7180,7 @@ def test_differentiate(dask, edge_order) -> None: da.differentiate("x2d") -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize("dask", [True, False]) def test_differentiate_datetime(dask) -> None: rs = np.random.RandomState(42) @@ -7375,7 +7375,7 @@ def test_cumulative_integrate(dask) -> None: da.cumulative_integrate("x2d") -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize("dask", [True, False]) @pytest.mark.parametrize("which_datetime", ["np", "cftime"]) def test_trapezoid_datetime(dask, which_datetime) -> None: diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index dc869cc3a34..b16c41b79bf 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -607,7 +607,7 @@ def test_groupby_repr_datetime(obj) -> None: assert actual == expected -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.filterwarnings("ignore:invalid value encountered in divide:RuntimeWarning") @pytest.mark.filterwarnings("ignore:No index created for dimension id:UserWarning") def test_groupby_drops_nans() -> None: @@ -2124,9 +2124,9 @@ def test_upsample_interpolate(self) -> None: assert_allclose(expected, 
actual, rtol=1e-16) @requires_scipy - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_upsample_interpolate_bug_2197(self) -> None: - dates = pd.date_range("2007-02-01", "2007-03-01", freq="D") + dates = pd.date_range("2007-02-01", "2007-03-01", freq="D", unit="s") da = xr.DataArray(np.arange(len(dates)), [("time", dates)]) result = da.resample(time="ME").interpolate("linear") expected_times = np.array( diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index 6722e8d9404..59217ed49bd 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -640,7 +640,7 @@ def test_interp_like() -> None: pytest.param("2000-01-01T12:00", 0.5, marks=pytest.mark.xfail), ], ) -@pytest.mark.filterwarnings("ignore:Converting non-nanosecond") +@pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime(x_new, expected) -> None: da = xr.DataArray( np.arange(24), diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 2605e387360..2516d9ec547 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -2963,7 +2963,7 @@ def test_datetime_plot1d(self) -> None: # mpl.dates.AutoDateLocator passes and no other subclasses: assert type(ax.xaxis.get_major_locator()) is mpl.dates.AutoDateLocator - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime_plot2d(self) -> None: # Test that matplotlib-native datetime works: da = DataArray( diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 1d430b6b27e..dd20b1739ef 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -36,7 +36,6 @@ assert_equal, assert_identical, assert_no_warnings, - has_pandas_3, raise_if_dask_computes, requires_bottleneck, requires_cupy, @@ -200,24 +199,25 @@ def test_index_0d_string(self): x = self.cls(["x"], 
[value]) self._assertIndexedLikeNDArray(x, value, dtype) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_index_0d_datetime(self): d = datetime(2000, 1, 1) x = self.cls(["x"], [d]) self._assertIndexedLikeNDArray(x, np.datetime64(d)) x = self.cls(["x"], [np.datetime64(d)]) - self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[ns]") + self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[us]") x = self.cls(["x"], pd.DatetimeIndex([d])) self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[ns]") - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_index_0d_timedelta64(self): td = timedelta(hours=1) - - x = self.cls(["x"], [np.timedelta64(td)]) - self._assertIndexedLikeNDArray(x, np.timedelta64(td), "timedelta64[ns]") + # todo: discussion needed + td64 = np.timedelta64(td, "ns") + x = self.cls(["x"], [td64]) + self._assertIndexedLikeNDArray(x, td64, np.dtype("timedelta64[ns]")) x = self.cls(["x"], pd.to_timedelta([td])) self._assertIndexedLikeNDArray(x, np.timedelta64(td), "timedelta64[ns]") @@ -253,7 +253,7 @@ def test_0d_object_array_with_list(self): assert_array_equal(x[0].data, listarray.squeeze()) assert_array_equal(x.squeeze().data, listarray.squeeze()) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_index_and_concat_datetime(self): # regression test for #125 date_range = pd.date_range("2011-09-01", periods=10) @@ -274,56 +274,52 @@ def test_0d_time_data(self): expected = np.datetime64("2000-01-01", "ns") assert x[0].values == expected - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime64_conversion(self): + # todo: check, if this test is OK times = pd.date_range("2000-01-01", 
periods=3) - for values, preserve_source in [ - (times, True), - (times.values, True), - (times.values.astype("datetime64[s]"), False), - (times.to_pydatetime(), False), + for values, unit in [ + (times, "ns"), + (times.values, "ns"), + (times.values.astype("datetime64[s]"), "s"), + (times.to_pydatetime(), "ns"), ]: v = self.cls(["t"], values) - assert v.dtype == np.dtype("datetime64[ns]") + assert v.dtype == np.dtype(f"datetime64[{unit}]") assert_array_equal(v.values, times.values) - assert v.values.dtype == np.dtype("datetime64[ns]") - same_source = source_ndarray(v.values) is source_ndarray(values) - assert preserve_source == same_source + assert v.values.dtype == np.dtype(f"datetime64[{unit}]") - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_timedelta64_conversion(self): + # todo: check, if this test is OK times = pd.timedelta_range(start=0, periods=3) - for values, preserve_source in [ - (times, True), - (times.values, True), - (times.values.astype("timedelta64[s]"), False), - (times.to_pytimedelta(), False), + for values, unit in [ + (times, "ns"), + (times.values, "ns"), + (times.values.astype("timedelta64[s]"), "s"), + (times.to_pytimedelta(), "ns"), ]: v = self.cls(["t"], values) - assert v.dtype == np.dtype("timedelta64[ns]") + assert v.dtype == np.dtype(f"timedelta64[{unit}]") assert_array_equal(v.values, times.values) - assert v.values.dtype == np.dtype("timedelta64[ns]") - same_source = source_ndarray(v.values) is source_ndarray(values) - assert preserve_source == same_source + assert v.values.dtype == np.dtype(f"timedelta64[{unit}]") def test_object_conversion(self): data = np.arange(5).astype(str).astype(object) actual = self.cls("x", data) assert actual.dtype == data.dtype - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime64_valid_range(self): + # todo: test still 
needed? data = np.datetime64("1250-01-01", "us") - pderror = pd.errors.OutOfBoundsDatetime - with pytest.raises(pderror, match=r"Out of bounds nanosecond"): - self.cls(["t"], [data]) + self.cls(["t"], [data]) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_timedelta64_valid_range(self): + # todo: test still needed? data = np.timedelta64("200000", "D") - pderror = pd.errors.OutOfBoundsTimedelta - with pytest.raises(pderror, match=r"Cannot convert"): - self.cls(["t"], [data]) + self.cls(["t"], [data]) def test_pandas_data(self): v = self.cls(["x"], pd.Series([0, 1, 2], index=[3, 2, 1])) @@ -1076,31 +1072,38 @@ def test_numpy_same_methods(self): v = IndexVariable("x", np.arange(5)) assert 2 == v.searchsorted(2) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") - def test_datetime64_conversion_scalar(self): - expected = np.datetime64("2000-01-01", "ns") - for values in [ - np.datetime64("2000-01-01"), - pd.Timestamp("2000-01-01T00"), - datetime(2000, 1, 1), - ]: - v = Variable([], values) - assert v.dtype == np.dtype("datetime64[ns]") - assert v.values == expected - assert v.values.dtype == np.dtype("datetime64[ns]") - - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") - def test_timedelta64_conversion_scalar(self): - expected = np.timedelta64(24 * 60 * 60 * 10**9, "ns") - for values in [ - np.timedelta64(1, "D"), - pd.Timedelta("1 day"), - timedelta(days=1), - ]: - v = Variable([], values) - assert v.dtype == np.dtype("timedelta64[ns]") - assert v.values == expected - assert v.values.dtype == np.dtype("timedelta64[ns]") + @pytest.mark.filterwarnings("ignore:Converting non-default") + @pytest.mark.parametrize( + "values, unit", + [ + (np.datetime64("2000-01-01"), "s"), + (pd.Timestamp("2000-01-01T00"), "s"), + (datetime(2000, 1, 1), "ns"), + ], + ) + def test_datetime64_conversion_scalar(self, values, unit): + # todo: check, if this test is OK + v = 
Variable([], values) + assert v.dtype == np.dtype(f"datetime64[{unit}]") + assert np.issubdtype(v.values, "datetime64") + assert v.values.dtype == np.dtype(f"datetime64[{unit}]") + + @pytest.mark.filterwarnings("ignore:Converting non-default") + @pytest.mark.parametrize( + "values, unit", + [ + (np.timedelta64(1, "D"), "s"), + (pd.Timedelta("1 day"), "ns"), + (timedelta(days=1), "ns"), + ], + ) + def test_timedelta64_conversion_scalar(self, values, unit): + # todo: discussion needed + # todo: check, if this test is OK + v = Variable([], values) + assert v.dtype == np.dtype(f"timedelta64[{unit}]") + assert np.issubdtype(v.values, "timedelta64") + assert v.values.dtype == np.dtype(f"timedelta64[{unit}]") def test_0d_str(self): v = Variable([], "foo") @@ -1112,16 +1115,20 @@ def test_0d_str(self): assert v.values == "foo".encode("ascii") def test_0d_datetime(self): + # todo: check, if this test is OK v = Variable([], pd.Timestamp("2000-01-01")) - assert v.dtype == np.dtype("datetime64[ns]") - assert v.values == np.datetime64("2000-01-01", "ns") + assert v.dtype == np.dtype("datetime64[s]") + assert v.values == np.datetime64("2000-01-01", "s") - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") - def test_0d_timedelta(self): - for td in [pd.to_timedelta("1s"), np.timedelta64(1, "s")]: - v = Variable([], td) - assert v.dtype == np.dtype("timedelta64[ns]") - assert v.values == np.timedelta64(10**9, "ns") + @pytest.mark.filterwarnings("ignore:Converting non-default") + @pytest.mark.parametrize( + "values, unit", [(pd.to_timedelta("1s"), "ns"), (np.timedelta64(1, "s"), "s")] + ) + def test_0d_timedelta(self, values, unit): + # todo: check, if this test is OK + v = Variable([], values) + assert v.dtype == np.dtype(f"timedelta64[{unit}]") + assert v.values == np.timedelta64(10**9, "ns") def test_equals_and_identical(self): d = np.random.rand(10, 3) @@ -1561,7 +1568,7 @@ def test_transpose(self): v.transpose(..., "not_a_dim", missing_dims="warn") 
assert_identical(expected_ell, actual) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_transpose_0d(self): for value in [ 3.5, @@ -2623,19 +2630,20 @@ def test_masked_array(self): assert_array_equal(expected, actual) assert actual.dtype == expected.dtype - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") + @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime(self): + # todo: check, if this test is OK expected = np.datetime64("2000-01-01") actual = as_compatible_data(expected) assert expected == actual assert np.ndarray is type(actual) - assert np.dtype("datetime64[ns]") == actual.dtype + assert np.dtype("datetime64[s]") == actual.dtype expected = np.array([np.datetime64("2000-01-01")]) actual = as_compatible_data(expected) assert np.asarray(expected) == actual assert np.ndarray is type(actual) - assert np.dtype("datetime64[ns]") == actual.dtype + assert np.dtype("datetime64[s]") == actual.dtype expected = np.array([np.datetime64("2000-01-01", "ns")]) actual = as_compatible_data(expected) @@ -2651,6 +2659,7 @@ def test_datetime(self): assert np.dtype("datetime64[ns]") == actual.dtype def test_tz_datetime(self) -> None: + # todo: check, if this test is OK tz = pytz.timezone("America/New_York") times_ns = pd.date_range("2000", periods=1, tz=tz) @@ -2659,7 +2668,7 @@ def test_tz_datetime(self) -> None: warnings.simplefilter("ignore") actual: T_DuckArray = as_compatible_data(times_s) assert actual.array == times_s - assert actual.array.dtype == pd.DatetimeTZDtype("ns", tz) + assert actual.array.dtype == pd.DatetimeTZDtype("s", tz) series = pd.Series(times_s) with warnings.catch_warnings(): @@ -2667,7 +2676,7 @@ def test_tz_datetime(self) -> None: actual2: T_DuckArray = as_compatible_data(series) np.testing.assert_array_equal(actual2, np.asarray(series.values)) - assert actual2.dtype == np.dtype("datetime64[ns]") + assert actual2.dtype == 
np.dtype("datetime64[s]") def test_full_like(self) -> None: # For more thorough tests, see test_variable.py @@ -2943,37 +2952,32 @@ def test_from_pint_wrapping_dask(self, Var): @pytest.mark.parametrize( - ("values", "warns"), + ("values", "unit"), [ - (np.datetime64("2000-01-01", "ns"), False), - (np.datetime64("2000-01-01", "s"), True), - (np.array([np.datetime64("2000-01-01", "ns")]), False), - (np.array([np.datetime64("2000-01-01", "s")]), True), - (pd.date_range("2000", periods=1), False), - (datetime(2000, 1, 1), has_pandas_3), - (np.array([datetime(2000, 1, 1)]), has_pandas_3), - (pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York")), False), + (np.datetime64("2000-01-01", "ns"), "ns"), + (np.datetime64("2000-01-01", "s"), "s"), + (np.array([np.datetime64("2000-01-01", "ns")]), "ns"), + (np.array([np.datetime64("2000-01-01", "s")]), "s"), + (pd.date_range("2000", periods=1), "ns"), + (datetime(2000, 1, 1), "ns"), + (np.array([datetime(2000, 1, 1)]), "ns"), + (pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York")), "ns"), ( pd.Series( pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York")) ), - False, + "ns", ), ], ids=lambda x: f"{x}", ) -def test_datetime_conversion_warning(values, warns) -> None: +def test_datetime_conversion_warning(values, unit) -> None: + # todo: needs discussion + # todo: check, if this test is OK dims = ["time"] if isinstance(values, np.ndarray | pd.Index | pd.Series) else [] - if warns: - with pytest.warns(UserWarning, match="non-nanosecond precision datetime"): - var = Variable(dims, values) - else: - with warnings.catch_warnings(): - warnings.simplefilter("error") - var = Variable(dims, values) - + var = Variable(dims, values) if var.dtype.kind == "M": - assert var.dtype == np.dtype("datetime64[ns]") + assert var.dtype == np.dtype(f"datetime64[{unit}]") else: # The only case where a non-datetime64 dtype can occur currently is in # the case that the variable is backed by a 
timezone-aware @@ -3011,53 +3015,48 @@ def test_datetime_conversion_warning(values, warns) -> None: def test_pandas_two_only_datetime_conversion_warnings( data: pd.DatetimeIndex | pd.Series, dtype: str | pd.DatetimeTZDtype ) -> None: - with pytest.warns(UserWarning, match="non-nanosecond precision datetime"): - var = Variable(["time"], data.astype(dtype)) # type: ignore[arg-type] + # todo: check, if this test is OK + var = Variable(["time"], data.astype(dtype)) # type: ignore[arg-type] if var.dtype.kind == "M": - assert var.dtype == np.dtype("datetime64[ns]") + assert var.dtype == np.dtype("datetime64[s]") else: # The only case where a non-datetime64 dtype can occur currently is in # the case that the variable is backed by a timezone-aware # DatetimeIndex, and thus is hidden within the PandasIndexingAdapter class. assert isinstance(var._data, PandasIndexingAdapter) - assert var._data.array.dtype == pd.DatetimeTZDtype("ns", tz_ny) + assert var._data.array.dtype == pd.DatetimeTZDtype("s", tz_ny) @pytest.mark.parametrize( - ("values", "warns"), + ("values", "unit"), [ - (np.timedelta64(10, "ns"), False), - (np.timedelta64(10, "s"), True), - (np.array([np.timedelta64(10, "ns")]), False), - (np.array([np.timedelta64(10, "s")]), True), - (pd.timedelta_range("1", periods=1), False), - (timedelta(days=1), False), - (np.array([timedelta(days=1)]), False), + (np.timedelta64(10, "ns"), "ns"), + (np.timedelta64(10, "s"), "s"), + (np.array([np.timedelta64(10, "ns")]), "ns"), + (np.array([np.timedelta64(10, "s")]), "s"), + (pd.timedelta_range("1", periods=1), "ns"), + (timedelta(days=1), "ns"), + (np.array([timedelta(days=1)]), "ns"), ], ids=lambda x: f"{x}", ) -def test_timedelta_conversion_warning(values, warns) -> None: +def test_timedelta_conversion_warning(values, unit) -> None: + # todo: needs discussion + # todo: check, if this test is OK dims = ["time"] if isinstance(values, np.ndarray | pd.Index) else [] - if warns: - with pytest.warns(UserWarning, match="non-nanosecond 
precision timedelta"): - var = Variable(dims, values) - else: - with warnings.catch_warnings(): - warnings.simplefilter("error") - var = Variable(dims, values) - - assert var.dtype == np.dtype("timedelta64[ns]") + var = Variable(dims, values) + assert var.dtype == np.dtype(f"timedelta64[{unit}]") def test_pandas_two_only_timedelta_conversion_warning() -> None: + # todo: test still needed? # Note this test relies on a pandas feature that is only present in pandas # 2.0.0 and above, and so for now cannot be parametrized. data = pd.timedelta_range("1", periods=1).astype("timedelta64[s]") - with pytest.warns(UserWarning, match="non-nanosecond precision timedelta"): - var = Variable(["time"], data) + var = Variable(["time"], data) - assert var.dtype == np.dtype("timedelta64[ns]") + assert var.dtype == np.dtype("timedelta64[s]") @pytest.mark.parametrize( @@ -3069,7 +3068,7 @@ def test_pandas_two_only_timedelta_conversion_warning() -> None: ids=lambda x: f"{x}", ) def test_pandas_indexing_adapter_non_nanosecond_conversion(index, dtype) -> None: + # todo: test still needed? 
data = PandasIndexingAdapter(index.astype(f"{dtype}[s]")) - with pytest.warns(UserWarning, match="non-nanosecond precision"): - var = Variable(["time"], data) - assert var.dtype == np.dtype(f"{dtype}[ns]") + var = Variable(["time"], data) + assert var.dtype == np.dtype(f"{dtype}[s]") From b45ab232ab17a8a99dcb6b9a89f5d884f5c1d237 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Thu, 10 Oct 2024 18:37:56 +0200 Subject: [PATCH 003/134] timedelta decoding, fsspec handling --- xarray/coding/times.py | 17 +++++++++++++---- xarray/tests/test_backends.py | 4 +++- xarray/tests/test_coding_times.py | 6 ++++-- xarray/tests/test_dataset.py | 9 +++++---- 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index aa9566e100e..4e99f3f57b0 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -440,7 +440,13 @@ def decode_cf_datetime( def to_timedelta_unboxed(value, **kwargs): + # todo: check, if the procedure here is correct result = pd.to_timedelta(value, **kwargs).to_numpy() + unique_timedeltas = np.unique(result[pd.notnull(result)]) + unit = _netcdf_to_numpy_timeunit(_infer_time_units_from_diff(unique_timedeltas)) + if unit not in ["s", "ms", "us", "ns"]: + unit = "s" + result = result.astype(f"timedelta64[{unit}]") assert np.issubdtype(result.dtype, "timedelta64") return result @@ -452,13 +458,16 @@ def to_datetime_unboxed(value, **kwargs): def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: - # todo: check, if this still returns ns + # todo: check, if this works as intended """Given an array of numeric timedeltas in netCDF format, convert it into a - numpy timedelta64[ns] array. + numpy timedelta64 ["s", "ms", "us", "ns"] array. 
""" num_timedeltas = np.asarray(num_timedeltas) - units = _netcdf_to_numpy_timeunit(units) - result = to_timedelta_unboxed(ravel(num_timedeltas), unit=units) + unit = _netcdf_to_numpy_timeunit(units) + as_unit = unit + if unit not in ["s", "ms", "us", "ns"]: + as_unit = "s" + result = pd.to_timedelta(ravel(num_timedeltas), unit=unit).as_unit(as_unit).to_numpy() return reshape(result, num_timedeltas.shape) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b4f1914f983..45d56b63e52 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5588,7 +5588,9 @@ def test_open_fsspec() -> None: mm = m.get_mapper("out1.zarr") ds.to_zarr(mm) # old interface ds0 = ds.copy() - ds0["time"] = ds.time + pd.to_timedelta("1 day") + # pd.to_timedelta returns ns-precision, but the example data is in second precision + # so we need to fix this + ds0["time"] = ds.time + pd.to_timedelta("1 day").as_unit("s") mm = m.get_mapper("out2.zarr") ds0.to_zarr(mm) # old interface diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 151efdcaa46..45ac29163bf 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -618,8 +618,9 @@ def test_infer_cftime_datetime_units(calendar, date_args, expected) -> None: ], ) def test_cf_timedelta(timedeltas, units, numbers) -> None: + # todo: check, if this test is OK if timedeltas == "NaT": - timedeltas = np.timedelta64("NaT", "ns") + timedeltas = np.timedelta64("NaT", "s") else: timedeltas = to_timedelta_unboxed(timedeltas) numbers = np.array(numbers) @@ -635,9 +636,10 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None: assert_array_equal(expected, actual) assert expected.dtype == actual.dtype - expected = np.timedelta64("NaT", "ns") + expected = np.timedelta64("NaT", "s") actual = decode_cf_timedelta(np.array(np.nan), "days") assert_array_equal(expected, actual) + assert expected.dtype == actual.dtype def test_cf_timedelta_2d() 
-> None: diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 87c0c724edd..815e65821f4 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -105,18 +105,19 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]: lon = [0, 1, 2] nt1 = 3 nt2 = 2 - time1 = pd.date_range("2000-01-01", periods=nt1) - time2 = pd.date_range("2000-02-01", periods=nt2) + # todo: check, if all changes below are correct + time1 = pd.date_range("2000-01-01", periods=nt1).as_unit("ns") + time2 = pd.date_range("2000-02-01", periods=nt2).as_unit("ns") string_var = np.array(["a", "bc", "def"], dtype=object) string_var_to_append = np.array(["asdf", "asdfg"], dtype=object) string_var_fixed_length = np.array(["aa", "bb", "cc"], dtype="|S2") string_var_fixed_length_to_append = np.array(["dd", "ee"], dtype="|S2") unicode_var = np.array(["áó", "áó", "áó"]) datetime_var = np.array( - ["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[s]" + ["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[ns]" ) datetime_var_to_append = np.array( - ["2019-01-04", "2019-01-05"], dtype="datetime64[s]" + ["2019-01-04", "2019-01-05"], dtype="datetime64[ns]" ) bool_var = np.array([True, False, True], dtype=bool) bool_var_to_append = np.array([False, True], dtype=bool) From 39086ef3ba4366aa561ec052beda36792d4aaa70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 13 Oct 2024 21:29:42 +0200 Subject: [PATCH 004/134] fixes in coding/times.py --- xarray/coding/times.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 4e99f3f57b0..41793c1ab73 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -24,7 +24,6 @@ from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like from xarray.core.duck_array_ops import asarray, ravel, reshape from xarray.core.formatting import first_n_items, 
format_timestamp, last_item -from xarray.core.options import _get_datetime_resolution from xarray.core.pdcompat import default_precision_timestamp from xarray.core.utils import emit_user_level_warning from xarray.core.variable import Variable @@ -274,7 +273,7 @@ def _align_reference_date_and_unit(ref_date_str: str, unit: str) -> pd.Timestamp ref_date = pd.Timestamp(ref_date_str) # strip tz information if ref_date.tz is not None: - ref_date = ref_date.tz_convert(None) + ref_date = ref_date.tz_convert("UTC").tz_convert(None) # get ref_date and unit delta ref_date_unit = np.datetime_data(ref_date.asm8)[0] ref_date_delta = np.timedelta64(1, ref_date_unit) @@ -443,7 +442,7 @@ def to_timedelta_unboxed(value, **kwargs): # todo: check, if the procedure here is correct result = pd.to_timedelta(value, **kwargs).to_numpy() unique_timedeltas = np.unique(result[pd.notnull(result)]) - unit = _netcdf_to_numpy_timeunit(_infer_time_units_from_diff(unique_timedeltas)) + unit = _netcdf_to_numpy_timeunit(_infer_time_units_from_diff(unique_timedeltas)) if unit not in ["s", "ms", "us", "ns"]: unit = "s" result = result.astype(f"timedelta64[{unit}]") @@ -467,7 +466,9 @@ def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: as_unit = unit if unit not in ["s", "ms", "us", "ns"]: as_unit = "s" - result = pd.to_timedelta(ravel(num_timedeltas), unit=unit).as_unit(as_unit).to_numpy() + result = ( + pd.to_timedelta(ravel(num_timedeltas), unit=unit).as_unit(as_unit).to_numpy() + ) return reshape(result, num_timedeltas.shape) @@ -477,16 +478,11 @@ def _unit_timedelta_cftime(units: str) -> timedelta: def _unit_timedelta_numpy(units: str) -> np.timedelta64: numpy_units = _netcdf_to_numpy_timeunit(units) - default_unit = _get_datetime_resolution() - return np.timedelta64( - int(_NS_PER_TIME_DELTA[numpy_units] / _NS_PER_TIME_DELTA[default_unit]), - default_unit, - ) + return np.timedelta64(1, numpy_units) def _infer_time_units_from_diff(unique_timedeltas) -> str: - # todo: check, if this 
function works as intended - # especially, if it not just returns "second" + # todo: check, if this function works correctly wrt np.timedelta64 unit_timedelta: Callable[[str], timedelta] | Callable[[str], np.timedelta64] zero_timedelta: timedelta | np.timedelta64 if unique_timedeltas.dtype == np.dtype("O"): @@ -575,7 +571,7 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: for _i, t in np.ndenumerate(times): try: # todo: decide how to work with this - # as pd.Timestamp is defined from year 0001-01-01 to 9999-12-31 + # as initialized by string pd.Timestamp is defined only from year -9999-01-01 to 9999-12-31 # Use pandas.Timestamp in place of datetime.datetime, because # NumPy casts it safely it np.datetime64[ns] for dates outside # 1678 to 2262 (this is not currently the case for From df49a4084911c272592163822e94adaf6503263d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 13 Oct 2024 21:30:36 +0200 Subject: [PATCH 005/134] add docs on time coding --- doc/internals/index.rst | 1 + doc/internals/time-coding.rst | 434 ++++++++++++++++++++++++++++++++++ 2 files changed, 435 insertions(+) create mode 100644 doc/internals/time-coding.rst diff --git a/doc/internals/index.rst b/doc/internals/index.rst index b2a37900338..4c00376a7b4 100644 --- a/doc/internals/index.rst +++ b/doc/internals/index.rst @@ -26,3 +26,4 @@ The pages in this section are intended for: how-to-add-new-backend how-to-create-custom-index zarr-encoding-spec + time-coding diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst new file mode 100644 index 00000000000..3014c7a6c3d --- /dev/null +++ b/doc/internals/time-coding.rst @@ -0,0 +1,434 @@ +.. ipython:: python + :suppress: + + import numpy as np + import pandas as pd + import xarray as xr + + np.random.seed(123456) + np.set_printoptions(threshold=20) + int64_max = np.iinfo("int64").max + int64_min = np.iinfo("int64").min + 1 + uint64_max = np.iinfo("uint64").max + +.. 
_internals.timecoding: + +Time Coding +=========== + +This page gives an overview of how xarray encodes and decodes times and which conventions and functions are used. + +Pandas functionality +-------------------- + +to_datetime +~~~~~~~~~~~ + +The function :py:func:`pandas.to_datetime` is used within xarray for inferring units and for testing purposes. + +In normal operation :py:func:`pandas.to_datetime` returns :py:class:`pandas.Timestamp` (scalar input) or :py:class:`pandas.DatetimeIndex` (array-like input) which are datetime64 with inherited resolution (from the source). If no resolution can be inherited ``'ns'`` is assumed. That has the implication, that the maximum usable timerange for those cases is +-292 years centered around the epoch. To accommodate for that, we are carefully checking the units/resolution in the encoding and decoding step. + +When args are numeric (no strings) "unit" can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``. + +.. ipython:: python + + f"Maximum datetime range: ({pd.to_datetime(int64_min, unit="ns")}, {pd.to_datetime(int64_max, unit="ns")})" + +For input values which can't be represented in nanosecond resolution :py:class:`pandas.OutOfBoundsDatetime` exception is raised: + +.. ipython:: python + + try: + dtime = pd.to_datetime(int64_max, unit="us") + except Exception as err: + print(err) + try: + dtime = pd.to_datetime(uint64_max, unit="ns") + print("Wrong:", dtime) + dtime = pd.to_datetime([uint64_max], unit="ns") + except Exception as err: + print(err) + +Numpy datetime64 can be extracted with :py:meth:`pandas.Timestamp.to_numpy` and :py:meth:`pandas.DatetimeIndex.to_numpy`. The returned resolution depends on the internal representation. This representation can be changed using :py:meth:`pandas.Timestamp.as_unit` +and :py:meth:`pandas.DatetimeIndex.as_unit` respectively. + +``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as argument.
That means we are able to represent datetimes with second, millisecond, microsecond or nanosecond resolution. + +.. ipython:: python + + time = pd.to_datetime(np.datetime64(0, "D")) + print("Datetime:", time, np.asarray([time.to_numpy()]).dtype) + print("Datetime as_unit('s'):", time.as_unit("s")) + print("Datetime to_numpy():", time.as_unit("s").to_numpy()) + time = pd.to_datetime(np.array([-1000, 1, 2], dtype="datetime64[Y]")) + print("DatetimeIndex:", time) + print("DatetimeIndex as_unit('s'):", time.as_unit("s")) + print("DatetimeIndex to_numpy():", time.as_unit("s").to_numpy()) + +.. warning:: + Input data with resolution higher than ``'ns'`` (eg. ``'ps'``, ``'fs'``, ``'as'``) is truncated (not rounded) at the ``'ns'``-level. This is currently broken for the ``'ps'`` input, where it is interpreted as ``'ns'``. + + .. ipython:: python + + try: + print("Good:", pd.to_datetime([np.datetime64(1901901901901, "as")])) + print("Good:", pd.to_datetime([np.datetime64(1901901901901, "fs")])) + print(" Bad:", pd.to_datetime([np.datetime64(1901901901901, "ps")])) + print("Good:", pd.to_datetime([np.datetime64(1901901901901, "ns")])) + print("Good:", pd.to_datetime([np.datetime64(1901901901901, "us")])) + print("Good:", pd.to_datetime([np.datetime64(1901901901901, "ms")])) + print( + "Good:", pd.to_datetime(np.array([np.datetime64(1901901901901, "s")])) + ) + print("Bad:", pd.to_datetime([np.datetime64(1901901901901, "s")])) + except Exception as err: + print("Raise:", err) + +.. warning:: + Care has to be taken, as some configurations of input data will raise. The following shows, that we are safe to use :py:func:`pandas.to_datetime` when providing :py:class:`numpy.datetime64` as scalar or numpy array as input. + + .. 
ipython:: python + + print( + "Works:", + np.datetime64(1901901901901, "s"), + pd.to_datetime(np.datetime64(1901901901901, "s")), + ) + print( + "Works:", + np.array([np.datetime64(1901901901901, "s")]), + pd.to_datetime(np.array([np.datetime64(1901901901901, "s")])), + ) + try: + pd.to_datetime([np.datetime64(1901901901901, "s")]) + except Exception as err: + print("Raises:", err) + try: + pd.to_datetime(1901901901901, unit="s") + except Exception as err: + print("Raises:", err) + try: + pd.to_datetime([1901901901901], unit="s") + except Exception as err: + print("Raises:", err) + try: + pd.to_datetime(np.array([1901901901901]), unit="s") + except Exception as err: + print("Raises:", err) + + +to_timedelta +~~~~~~~~~~~~ + +The function :py:func:`pandas.to_timedelta` is used within xarray for inferring units and for testing purposes. + +In normal operation :py:func:`pandas.to_timedelta` returns :py:class:`pandas.Timedelta` (scalar input) or :py:class:`pandas.TimedeltaIndex` (array-like input) which are timedelta64 with ``ns`` resolution internally. That has the implication, that the usable timedelta covers only roughly 585 years. To accommodate for that, we are working around that limitation in the encoding and decoding step. + +.. ipython:: python + + f"Maximum timedelta range: ({pd.to_timedelta(int64_min, unit="ns")}, {pd.to_timedelta(int64_max, unit="ns")})" + +For input values which can't be represented in nanosecond resolution :py:class:`pandas.OutOfBoundsTimedelta` exception is raised: + +.. ipython:: python + + try: + delta = pd.to_timedelta(int64_max, unit="us") + except Exception as err: + print("First:", err) + try: + delta = pd.to_timedelta(uint64_max, unit="ns") + except Exception as err: + print("Second:", err) + +When args are numeric (no strings) "unit" can be anything from ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``. 
+ +Numpy timedelta64 can be extracted with :py:meth:`pandas.Timedelta.to_numpy` and :py:meth:`pandas.TimedeltaIndex.to_numpy`. The returned resolution depends on the internal representation. This representation can be changed using :py:meth:`pandas.Timedelta.as_unit` +and :py:meth:`pandas.TimedeltaIndex.as_unit` respectively. + +``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as argument. That means we are able to represent timedeltas with second, millisecond, microsecond or nanosecond resolution. + +.. ipython:: python + + delta = pd.to_timedelta(1, unit="D") + print("Timedelta:", delta) + print("Timedelta as_unit('s'):", delta.as_unit("s")) + print("Timedelta to_numpy():", delta.as_unit("s").to_numpy()) + delta = pd.to_timedelta([0, 1, 2], unit="D") + print("TimedeltaIndex:", delta) + print("TimedeltaIndex as_unit('s'):", delta.as_unit("s")) + print("TimedeltaIndex to_numpy():", delta.as_unit("s").to_numpy()) + +.. note:: + For the functionality in xarray the output resolution is converted from ``'ns'`` to the lowest needed resolution. + +.. warning:: + Care has to be taken, as some configurations of input data will raise. The following shows, that we are safe to use :py:func:`pandas.to_timedelta` when providing :py:class:`numpy.timedelta64` as scalar or numpy array as input. + + ..
ipython:: python + + print( + "Works:", + np.timedelta64(1901901901901, "s"), + pd.to_timedelta(np.timedelta64(1901901901901, "s")), + ) + print( + "Works:", + np.array([np.timedelta64(1901901901901, "s")]), + pd.to_timedelta(np.array([np.timedelta64(1901901901901, "s")])), + ) + try: + pd.to_timedelta([np.timedelta64(1901901901901, "s")]) + except Exception as err: + print("Raises:", err) + try: + pd.to_timedelta(1901901901901, unit="s") + except Exception as err: + print("Raises:", err) + try: + pd.to_timedelta([1901901901901], unit="s") + except Exception as err: + print("Raises:", err) + try: + pd.to_timedelta(np.array([1901901901901]), unit="s") + except Exception as err: + print("Raises:", err) + +Timestamp +~~~~~~~~~ + +:py:class:`pandas.Timestamp` is used within xarray to wrap strings of CF reference times and datetime.datetime. + +When args are numeric (no strings) "unit" can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``. + +In normal operation :py:class:`pandas.Timestamp` holds the timestamp in the provided resolution, but only one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. Lower resolution input is automatically converted to ``'s'``, higher resolution input is truncated to ``'ns'``. + +Same conversion rules apply here as for :py:func:`pandas.to_timedelta` (see above). +Depending on the internal resolution Timestamps can be represented in the range: + +.. ipython:: python + + for unit in ["s", "ms", "us", "ns"]: + print( + f"unit: {unit!r} time range ({pd.Timestamp(int64_min, unit=unit)}, {pd.Timestamp(int64_max, unit=unit)})" + ) + +Relaxing the resolution extends the range to several hundreds of thousands of centuries with microsecond representation. ``NaT`` will be at ``np.iinfo("int64").min`` for all of the different representations. + +.. warning:: + When initialized with a datetime string this is only defined from ``-9999-01-01`` to ``9999-12-31``. + + ..
ipython:: python + + try: + print("Works:", pd.Timestamp("-9999-01-01 00:00:00")) + print("Works, too:", pd.Timestamp("9999-12-31 23:59:59")) + print(pd.Timestamp("10000-01-01 00:00:00")) + except Exception as err: + print("Errors:", err) + +.. note:: + :py:class:`pandas.Timestamp` is the only current possibility to correctly import time reference strings. It handles non-ISO formatted strings, keeps the resolution of the strings (``'s'``, ``'ms'`` etc.) and imports time zones. When initialized with :py:class:`numpy.datetime64` instead of a string it even overcomes the above limitation of the possible time range. + + .. ipython:: python + + try: + print("Handles non-ISO:", pd.Timestamp("92-1-8 151542")) + print( + "Keeps resolution 1:", + pd.Timestamp("1992-10-08 15:15:42"), + pd.Timestamp("1992-10-08 15:15:42").unit, + ) + print( + "Keeps resolution 2:", + pd.Timestamp("1992-10-08 15:15:42.5"), + pd.Timestamp("1992-10-08 15:15:42.5").unit, + ) + print( + "Keeps timezone:", + pd.Timestamp("1992-10-08 15:15:42.5 -6:00"), + pd.Timestamp("1992-10-08 15:15:42.5 -6:00").unit, + ) + print( + "Extends timerange :", + pd.Timestamp(np.datetime64("-10000-10-08 15:15:42.5001")), + pd.Timestamp(np.datetime64("-10000-10-08 15:15:42.5001")).unit, + ) + except Exception as err: + print("Errors:", err) + +DatetimeIndex +~~~~~~~~~~~~~ + +:py:class:`pandas.DatetimeIndex` is used to wrap numpy datetime64 or other datetime-likes, when encoding. The resolution of the DatetimeIndex depends on the input, but can be only one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. Lower resolution input is automatically converted to ``'s'``, higher resolution input is truncated to ``'ns'``. +:py:class:`pandas.DatetimeIndex` will raise :py:class:`pandas.OutOfBoundsDatetime` if the input can't be represented in the given resolution. + +.. note:: + For xarray we assume that all :py:class:`numpy.datetime64` provided to :py:class:`pandas.DatetimeIndex` are up to the specs.
This is especially true, when those values have been decoded upfront. If the data is provided by users, they should handle any issues before. + +.. ipython:: python + + try: + print( + "Works:", + pd.DatetimeIndex( + np.array(["1992-01-08", "1992-01-09"], dtype="datetime64[D]") + ), + ) + print( + "Works:", + pd.DatetimeIndex( + np.array( + ["1992-01-08 15:15:42", "1992-01-09 15:15:42"], + dtype="datetime64[s]", + ) + ), + ) + print( + "Works:", + pd.DatetimeIndex( + np.array( + ["1992-01-08 15:15:42.5", "1992-01-09 15:15:42.0"], + dtype="datetime64[ms]", + ) + ), + ) + print( + "Works:", + pd.DatetimeIndex( + np.array( + ["1970-01-01 00:00:00.401501601701801901", "1970-01-01 00:00:00"], + dtype="datetime64[as]", + ) + ), + ) + print( + "Works:", + pd.DatetimeIndex( + np.array( + ["-10000-01-01 00:00:00.401501", "1970-01-01 00:00:00"], + dtype="datetime64[us]", + ) + ), + ) + except Exception as err: + print("Errors:", err) + +CF Conventions Time Handling +---------------------------- + +Xarray tries to adhere to the latest version of the `CF Conventions`_. Relevant is the section on `Time Coordinate`_ and the `Calendar`_ subsection. + +.. _CF Conventions: https://cfconventions.org +.. _Time Coordinate: https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#time-coordinate +.. _Calendar: https://cfconventions.org/Data/cf-conventions/cf-conventions-1.11/cf-conventions.html#calendar + +CF time decoding +~~~~~~~~~~~~~~~~ + +Decoding of ``values`` with time unit specification like ``seconds since 1992-10-8 15:15:42.5 -6:00`` into datetimes (using CF convention) is a multistage process. + +1. If we have a non-standard calendar (eg. ``noleap``) the decoding is done with ``cftime`` package (which is not covered in this section). For ``standard``/``gregorian`` calendar as well as ``proleptic_gregorian`` the above outlined pandas functionality is used. + +2. 
``standard``/``gregorian`` calendar and ``proleptic_gregorian`` are equivalent for any dates and reference times >= ``1582-10-15``. First the reference time is checked and any timezone information stripped off and in a second step, the minimum and maximum ``values`` are checked if they can be represented in the current reference time resolution. At the same time integer overflow would be caught. For ``standard``/``gregorian`` calendar the dates are checked to be >= ``1582-10-15``. If anything fails, the decoding is done with ``cftime``. + +3. As the time unit (here ``seconds``) and the resolution of the reference time ``1992-10-8 15:15:42.5 -6:00`` (here ``milliseconds``) might be different, this has to be aligned to the higher resolution (retrieve new unit). This is done by multiplying the ``values`` by the ratio of nanoseconds per time unit and nanoseconds per reference time unit. To not break consistency for ``NaT`` a mask is kept and re-introduced after the multiplication. + +4. Finally, the ``values`` (``int64``) are cast to ``datetime64[unit]`` (using the above retrieved unit) and added to the reference time :py:class:`pandas.Timestamp`. + +..
ipython:: python + + calendar = "proleptic_gregorian" + values = np.array([-1000 * 365, 0, 1000 * 365], dtype="int64") + units = "days since 2000-01-01 00:00:00.000001" + dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + print(dt) + assert dt.dtype == "datetime64[us]" + + units = "microseconds since 2000-01-01 00:00:00" + dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + print(dt) + assert dt.dtype == "datetime64[us]" + + values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") + units = "days since 2000-01-01 00:00:00.001" + dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + print(dt) + assert dt.dtype == "datetime64[ms]" + + values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") + units = "hours since 2000-01-01" + dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + print(dt) + assert dt.dtype == "datetime64[s]" + + values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") + units = "hours since 2000-01-01 00:00:00 03:30" + dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + print(dt) + assert dt.dtype == "datetime64[s]" + + values = np.array([-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64") + units = "days since 0001-01-01 00:00:00" + dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + print(dt) + assert dt.dtype == "datetime64[s]" + +CF time encoding +~~~~~~~~~~~~~~~~ + +For encoding the process is more or less a reversal of the above, but we have to make some decisions on default values. + +1. Infer ``data_units`` from the given ``dates``. +2. Infer ``units`` (either cleanup given ``units`` or use ``data_units``) +3. Infer calendar name from given ``dates``. +4. If non standard calendar or object dates (CFTime) encode with ``cftime`` +5. Retrieve ``time_units`` and ``ref_date`` from ``units`` +6. Check ``ref_date`` >= ``1582-10-15``, otherwise -> ``cftime`` +7. Wrap ``dates`` with pd.DatetimeIndex +8.
Subtracting ``ref_date`` (:py:class:`pandas.Timestamp`) from above :py:class:`pandas.DatetimeIndex` will return :py:class:`pandas.TimedeltaIndex` +9. Align resolution of :py:class:`pandas.TimedeltaIndex` with resolution of ``time_units`` +10. Retrieve needed ``units`` and ``delta`` to faithfully encode into int64 +11. Divide ``time_deltas`` by ``delta``, use floor division (integer) or normal division (float) +12. Return result + +.. ipython:: python + :okwarning: + + calendar = "proleptic_gregorian" + dates = np.array( + [ + "-2000-01-01T00:00:00", + "0000-01-01T00:00:00", + "0002-01-01T00:00:00", + "2000-01-01T00:00:00", + ], + dtype="datetime64[s]", + ) + orig_values = np.array( + [-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64" + ) + units = "days since 0001-01-01 00:00:00" + values, _, _ = xr.coding.times.encode_cf_datetime( + dates, units, calendar, dtype=np.dtype("int64") + ) + print(values) + np.testing.assert_array_equal(values, orig_values) + + dates = np.array( + [ + "-2000-01-01T01:00:00", + "0000-01-01T00:00:00", + "0002-01-01T00:00:00", + "2000-01-01T00:00:00", + ], + dtype="datetime64[s]", + ) + orig_values = np.array( + [-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64" + ) + units = "days since 0001-01-01 00:00:00" + values, units, _ = xr.coding.times.encode_cf_datetime( + dates, units, calendar, dtype=np.dtype("int64") + ) + print(values, units) From adb8ca32b5b1933f632ad4dcaea41d87f6649654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 13 Oct 2024 22:33:11 +0200 Subject: [PATCH 006/134] attempt fixing doc tests --- xarray/coding/cftimeindex.py | 2 +- xarray/core/common.py | 4 ++-- xarray/core/dataarray.py | 4 ++-- xarray/core/dataset.py | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index e85fa2736b2..18877defa73 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -646,7 +646,7 
@@ def to_datetimeindex(self, unsafe=False): CFTimeIndex([2000-01-01 00:00:00, 2000-01-02 00:00:00], dtype='object', length=2, calendar='standard', freq=None) >>> times.to_datetimeindex() - DatetimeIndex(['2000-01-01', '2000-01-02'], dtype='datetime64[ns]', freq=None) + DatetimeIndex(['2000-01-01', '2000-01-02'], dtype='datetime64[us]', freq=None) """ if not self._data.size: diff --git a/xarray/core/common.py b/xarray/core/common.py index 9a6807faad2..1c732a8ee64 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -619,7 +619,7 @@ def assign_coords( lon (x, y) float64 32B 260.2 260.7 260.2 260.8 lat (x, y) float64 32B 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 32B 2014-09-06 ... 2014-09-09 - reference_time datetime64[ns] 8B 2014-09-05 + reference_time datetime64[s] 8B 2014-09-05 Dimensions without coordinates: x, y Data variables: temperature (x, y, time) float64 128B 20.0 20.8 21.6 ... 30.4 31.2 32.0 @@ -633,7 +633,7 @@ def assign_coords( lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23 lat (x, y) float64 32B 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 32B 2014-09-06 ... 2014-09-09 - reference_time datetime64[ns] 8B 2014-09-05 + reference_time datetime64[s] 8B 2014-09-05 Dimensions without coordinates: x, y Data variables: temperature (x, y, time) float64 128B 20.0 20.8 21.6 ... 30.4 31.2 32.0 diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 3735bf1099c..d76b2f9f577 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -390,7 +390,7 @@ class DataArray( lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23 lat (x, y) float64 32B 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08 - reference_time datetime64[ns] 8B 2014-09-05 + reference_time datetime64[s] 8B 2014-09-05 Dimensions without coordinates: x, y Attributes: description: Ambient temperature. 
@@ -405,7 +405,7 @@ class DataArray( lon float64 8B -99.32 lat float64 8B 42.21 time datetime64[ns] 8B 2014-09-08 - reference_time datetime64[ns] 8B 2014-09-05 + reference_time datetime64[s] 8B 2014-09-05 Attributes: description: Ambient temperature. units: degC diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index d433cbcec18..3a0b728c57c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -646,7 +646,7 @@ class Dataset( lat (loc) float64 16B 42.25 42.21 * instrument (instrument) Self: lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23 lat (x, y) float64 32B 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08 - reference_time datetime64[ns] 8B 2014-09-05 + reference_time datetime64[s] 8B 2014-09-05 Dimensions without coordinates: x, y Data variables: precipitation (x, y, time) float64 96B 5.68 9.256 0.7104 ... 4.615 7.805 @@ -8826,7 +8826,7 @@ def filter_by_attrs(self, **kwargs) -> Self: lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23 lat (x, y) float64 32B 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08 - reference_time datetime64[ns] 8B 2014-09-05 + reference_time datetime64[s] 8B 2014-09-05 Dimensions without coordinates: x, y Data variables: temperature (x, y, time) float64 96B 29.11 18.2 22.83 ... 
16.15 26.63 From 266b1ed3ae85d98d6e717cedf1e42767ed00d892 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Mon, 14 Oct 2024 08:06:14 +0200 Subject: [PATCH 007/134] fix issue where out-of-bounds floating point values slipped in the processing, raise now early --- xarray/coding/times.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 41793c1ab73..c4f584327a7 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -250,6 +250,11 @@ def _timestamp_as_unit(date: pd.Timestamp, unit: str) -> pd.Timestamp: def _check_date_for_units_since_refdate( date, unit: str, ref_date: pd.Timestamp ) -> pd.Timestamp: + # check for out-of-bounds floats and raise + if date > np.iinfo("int64").max or date < np.iinfo("int64").min: + raise OutOfBoundsTimedelta( + f"Value {date} can't be represented as Datetime/Timedelta." + ) delta = date * np.timedelta64(1, unit) if not np.isnan(delta): # this will raise on dtype overflow for integer dtypes @@ -262,6 +267,8 @@ def _check_date_for_units_since_refdate( ref_date_unit = np.datetime_data(ref_date.asm8)[0] return _timestamp_as_unit(ref_date + delta, ref_date_unit) else: + # if date is exactly NaT (np.iinfo("int64").min) return refdate + # to make follow-up checks work return ref_date From 6d5f13bcea79599870058a31a7d89f6f347d08d8 Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Mon, 14 Oct 2024 08:17:20 +0200 Subject: [PATCH 008/134] convert to UTC first before stripping of tz in _unpack_time_units_and_ref_date --- xarray/coding/times.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index c4f584327a7..fdd20e056b7 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -197,7 +197,7 @@ def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]: # If the ref_date Timestamp is timezone-aware, convert to UTC and # make it timezone-naive (GH 2649). 
if ref_date.tz is not None: - ref_date = ref_date.tz_convert(None) + ref_date = ref_date.tz_convert("UTC").tz_convert(None) return time_units, ref_date From 5d68bfe5d32d647e821d7a11d075d8655a3e8b88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 10:44:14 +0200 Subject: [PATCH 009/134] reorganize pandas compatibility code, remove unneeded code, attempt to fix mypy --- xarray/coding/times.py | 24 +++++++++--------------- xarray/core/pdcompat.py | 21 ++++++++++++++++----- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index fdd20e056b7..e5fe71fd600 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -24,7 +24,7 @@ from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like from xarray.core.duck_array_ops import asarray, ravel, reshape from xarray.core.formatting import first_n_items, format_timestamp, last_item -from xarray.core.pdcompat import default_precision_timestamp +from xarray.core.pdcompat import _timestamp_as_unit, default_precision_timestamp from xarray.core.utils import emit_user_level_warning from xarray.core.variable import Variable from xarray.namedarray.parallelcompat import T_ChunkedArray, get_chunked_array_type @@ -243,10 +243,6 @@ def _decode_datetime_with_cftime( return np.array([], dtype=object) -def _timestamp_as_unit(date: pd.Timestamp, unit: str) -> pd.Timestamp: - return date.as_unit(unit) if hasattr(date, "as_unit") else date._as_unit(unit) - - def _check_date_for_units_since_refdate( date, unit: str, ref_date: pd.Timestamp ) -> pd.Timestamp: @@ -264,7 +260,7 @@ def _check_date_for_units_since_refdate( ) # this will raise on overflow if ref_date + delta # can't be represented in the current ref_date resolution - ref_date_unit = np.datetime_data(ref_date.asm8)[0] + ref_date_unit = ref_date.unit return _timestamp_as_unit(ref_date + delta, ref_date_unit) else: # if date is exactly NaT (np.iinfo("int64").min) 
return refdate @@ -272,17 +268,15 @@ def _check_date_for_units_since_refdate( return ref_date -def _get_timeunit(time: pd.Timestamp | pd.Timedelta) -> str: - return np.datetime_data(time.asm8)[0] - - -def _align_reference_date_and_unit(ref_date_str: str, unit: str) -> pd.Timestamp: +def _align_reference_date_and_unit( + ref_date_str: str, unit: Literal["D", "h", "m", "s", "ms", "us", "ns"] +) -> pd.Timestamp: ref_date = pd.Timestamp(ref_date_str) # strip tz information if ref_date.tz is not None: ref_date = ref_date.tz_convert("UTC").tz_convert(None) # get ref_date and unit delta - ref_date_unit = np.datetime_data(ref_date.asm8)[0] + ref_date_unit = ref_date.unit ref_date_delta = np.timedelta64(1, ref_date_unit) unit_delta = np.timedelta64(1, unit) new_unit = ref_date_unit if ref_date_delta < unit_delta else unit @@ -329,7 +323,7 @@ def _decode_datetime_with_pandas( ) pass - dunit = _get_timeunit(ref_date) + dunit = ref_date.unit with warnings.catch_warnings(): warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) @@ -863,7 +857,7 @@ def _eagerly_encode_cf_datetime( dates_as_index = pd.DatetimeIndex(ravel(dates)) time_deltas = dates_as_index - ref_date # get resolution of TimedeltaIndex and align time_delta - deltas_unit = np.datetime_data(time_deltas.dtype)[0] + deltas_unit = time_deltas.unit # todo: check, if this works in any case time_delta = time_delta.astype(f"=m8[{deltas_unit}]") @@ -993,7 +987,7 @@ def _eagerly_encode_cf_timedelta( time_delta = _time_units_to_timedelta64(units) time_deltas = pd.TimedeltaIndex(ravel(timedeltas)) # get resolution of TimedeltaIndex and align time_delta - deltas_unit = np.datetime_data(time_deltas.dtype)[0] + deltas_unit = time_deltas.unit # todo: check, if this works in any case time_delta = time_delta.astype(f"=m8[{deltas_unit}]") diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index ed03e170f87..3dd6da61d4e 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -36,9 
+36,8 @@ from __future__ import annotations from enum import Enum -from typing import Literal +from typing import Literal, cast -import numpy as np import pandas as pd from xarray.core.options import _get_datetime_resolution @@ -75,6 +74,19 @@ def __repr__(self) -> str: NoDefault = Literal[_NoDefault.no_default] # For typing following pandas +def _timestamp_as_unit(date: pd.Timestamp, unit: str) -> pd.Timestamp: + # compatibility function for pandas issue + # where "as_unit" is not defined for pandas.Timestamp + # in pandas versions < 2.2 + # can be removed minimum pandas version is >= 2.2 + unit = cast(Literal["s", "ms", "us", "ns"], unit) + if hasattr(date, "as_unit"): + date = date.as_unit(unit) + elif hasattr(date, "_as_unit"): + date = date._as_unit(unit) + return date + + def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp: """Return a Timestamp object with the default precision. @@ -85,7 +97,6 @@ def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp: dt = pd.Timestamp(*args, **kwargs) units = ["s", "ms", "us", "ns"] default = _get_datetime_resolution() - unit = np.datetime_data(dt.asm8)[0] - if units.index(default) > units.index(unit): - dt = dt.as_unit(default) + if units.index(default) > units.index(dt.unit): + dt = _timestamp_as_unit(dt, default) return dt From 07bba69fb206692f09f5e62220d3fa29edb59e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 10:58:16 +0200 Subject: [PATCH 010/134] another attempt to finally fix mypy --- xarray/tests/test_coding_times.py | 19 ++++++++----------- xarray/tests/test_variable.py | 2 +- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 45ac29163bf..1661ea73d64 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -3,7 +3,7 @@ import warnings from datetime import timedelta from itertools import product -from typing import Literal +from 
typing import Literal, cast import numpy as np import pandas as pd @@ -218,21 +218,16 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: import cftime units = "days since 0001-01-01" - unit = "us" + unit = cast(Literal["s", "ms", "us", "ns"], "us") times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit=unit, freq="h") - print("A:", times) time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values if calendar == "proleptic_gregorian": unit = "s" expected_dtype = np.dtype(f"M8[{unit}]") - print("0:", time[0], units, calendar) - print("1:", expected.astype("int64")[0], expected.dtype) actual = decode_cf_datetime(time, units, calendar=calendar) - print("2:", actual[0], actual.astype("int64")[0], actual.dtype) assert actual.dtype == expected_dtype abs_diff = abs(actual - expected) - print("3:", abs_diff[0], abs_diff.dtype) # once we no longer support versions of netCDF4 older than 1.1.5, # we could do this check with near microsecond accuracy: # https://github.com/Unidata/netcdf4-python/issues/355 @@ -434,6 +429,7 @@ def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: warnings.filterwarnings("ignore", "Unable to decode time axis") actual = decode_cf_datetime(mdim_time, units, calendar=calendar) + dtype: np.dtype if calendar == "proleptic_gregorian": dtype = np.dtype("=M8[s]") expected1 = expected1.astype(dtype) @@ -657,13 +653,14 @@ def test_cf_timedelta_2d() -> None: @pytest.mark.parametrize( ["deltas", "expected"], [ - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] + (pd.to_timedelta(["1 day", "2 days"]), "days"), + (pd.to_timedelta(["1 day", "2 days"]), "days"), + 
(pd.to_timedelta(["1 day", "2 days"]), "days"), + (pd.to_timedelta(["1 day", "2 days"]), "days"), ], ) def test_infer_timedelta_units(deltas, expected) -> None: + # todo: why testing, the same thing four times? assert expected == infer_timedelta_units(deltas) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index dd20b1739ef..22b94f7a3ae 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -2668,7 +2668,7 @@ def test_tz_datetime(self) -> None: warnings.simplefilter("ignore") actual: T_DuckArray = as_compatible_data(times_s) assert actual.array == times_s - assert actual.array.dtype == pd.DatetimeTZDtype("s", tz) + assert actual.array.dtype == pd.DatetimeTZDtype("s", tz) # type: ignore[arg-type] series = pd.Series(times_s) with warnings.catch_warnings(): From 6e7f0bb81309470cd25fc4ca12309d2d2013e874 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 11:28:33 +0200 Subject: [PATCH 011/134] refactor out _check_date_is_after_shift --- xarray/coding/times.py | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index e5fe71fd600..0d898e6597b 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -285,6 +285,19 @@ def _align_reference_date_and_unit( return _timestamp_as_unit(ref_date, new_unit) +def _check_date_is_after_shift(date: pd.Timestamp, calendar: str) -> None: + # if we have gregorian/standard we need to raise + # if we are outside the well defined date range + # proleptic_gregorian and standard/gregorian are only equivalent + # if reference date and date range is >= 1582-10-15 + if calendar != "proleptic_gregorian": + if date < pd.Timestamp("1582-10-15"): + raise OutOfBoundsDatetime( + f"Dates before 1582-10-15 cannot be decoded " + f"with pandas using {calendar!r} calendar." 
+ ) + + def _decode_datetime_with_pandas( flat_num_dates: np.ndarray, units: str, calendar: str ) -> np.ndarray: @@ -312,17 +325,7 @@ def _decode_datetime_with_pandas( # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime from err - if calendar != "proleptic_gregorian": - # if we have gregorian/standard we need to raise - # if we are outside the well defined date range - # proleptic_gregorian and standard/gregorian are only equivalent - # if reference date and date range is >= 1582-10-15 - if ref_date < pd.Timestamp("1582-10-15"): - raise OutOfBoundsDatetime( - f"Dates cannot be decoded using {calendar!r} calendar." - ) - pass - + _check_date_is_after_shift(ref_date, calendar) dunit = ref_date.unit with warnings.catch_warnings(): @@ -333,21 +336,10 @@ def _decode_datetime_with_pandas( dec_min = _check_date_for_units_since_refdate( flat_num_dates.min(), time_units, ref_date ) - dec_max = _check_date_for_units_since_refdate( + _check_date_for_units_since_refdate( flat_num_dates.max(), time_units, ref_date ) - # if we have gregorian/standard we need to raise - # if we are outside the well defined date range - # proleptic_gregorian and standard/gregorian are only equivalent - # if reference date and date range is >= 1582-10-15 - # todo: check if test for minimum date is enough - if ( - calendar != "proleptic_gregorian" - and (np.array([dec_min, dec_max]) < pd.Timestamp("1582-10-15")).any() - ): - raise OutOfBoundsTimedelta( - f"Decoded date is out of range for {calendar} calendar." 
- ) + _check_date_is_after_shift(dec_min, calendar) # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer and unsigned integer dtype From b4a49bb29e2ef4b6c3ff91224c95c57ad72f9acc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 11:38:09 +0200 Subject: [PATCH 012/134] refactor out _maybe_strip_tz_from_timestamp --- xarray/coding/times.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 0d898e6597b..cf88be61c4c 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -189,15 +189,20 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]: return delta_units, ref_date +def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp: + # If the ref_date Timestamp is timezone-aware, convert to UTC and + # make it timezone-naive (GH 2649). + if date.tz is not None: + date = date.tz_convert("UTC").tz_convert(None) + return date + + def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]: # same us _unpack_netcdf_time_units but finalizes ref_date for # processing in encode_cf_datetime time_units, _ref_date = _unpack_netcdf_time_units(units) ref_date = pd.Timestamp(_ref_date) - # If the ref_date Timestamp is timezone-aware, convert to UTC and - # make it timezone-naive (GH 2649). 
- if ref_date.tz is not None: - ref_date = ref_date.tz_convert("UTC").tz_convert(None) + ref_date = _maybe_strip_tz_from_timestamp(ref_date) return time_units, ref_date @@ -272,9 +277,7 @@ def _align_reference_date_and_unit( ref_date_str: str, unit: Literal["D", "h", "m", "s", "ms", "us", "ns"] ) -> pd.Timestamp: ref_date = pd.Timestamp(ref_date_str) - # strip tz information - if ref_date.tz is not None: - ref_date = ref_date.tz_convert("UTC").tz_convert(None) + ref_date = _maybe_strip_tz_from_timestamp(ref_date) # get ref_date and unit delta ref_date_unit = ref_date.unit ref_date_delta = np.timedelta64(1, ref_date_unit) From 2e1ff4f8cdddb104b4c5cdc24e0efdea363c92d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 13:14:57 +0200 Subject: [PATCH 013/134] more refactoring in coding.times.py --- xarray/coding/times.py | 66 +++++++++++++++++++----------------------- 1 file changed, 29 insertions(+), 37 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index cf88be61c4c..080bebf85fb 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -197,13 +197,16 @@ def _maybe_strip_tz_from_timestamp(date: pd.Timestamp) -> pd.Timestamp: return date -def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]: - # same us _unpack_netcdf_time_units but finalizes ref_date for - # processing in encode_cf_datetime - time_units, _ref_date = _unpack_netcdf_time_units(units) +def _unpack_time_unit_and_ref_date( + units: str, +) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]: + # same us _unpack_netcdf_time_units but finalizes time_unit and ref_date + # for processing in encode_cf_datetime + time_unit, _ref_date = _unpack_netcdf_time_units(units) + time_unit = _netcdf_to_numpy_timeunit(time_unit) ref_date = pd.Timestamp(_ref_date) ref_date = _maybe_strip_tz_from_timestamp(ref_date) - return time_units, ref_date + return time_unit, ref_date def _decode_cf_datetime_dtype( @@ -273,11 +276,7 @@ def 
_check_date_for_units_since_refdate( return ref_date -def _align_reference_date_and_unit( - ref_date_str: str, unit: Literal["D", "h", "m", "s", "ms", "us", "ns"] -) -> pd.Timestamp: - ref_date = pd.Timestamp(ref_date_str) - ref_date = _maybe_strip_tz_from_timestamp(ref_date) +def _align_reference_date_and_unit(ref_date: pd.Timestamp, unit: str) -> pd.Timestamp: # get ref_date and unit delta ref_date_unit = ref_date.unit ref_date_delta = np.timedelta64(1, ref_date_unit) @@ -319,17 +318,15 @@ def _decode_datetime_with_pandas( elif flat_num_dates.dtype.kind == "u": flat_num_dates = flat_num_dates.astype(np.uint64) - time_units, ref_date_str = _unpack_netcdf_time_units(units) - time_units = _netcdf_to_numpy_timeunit(time_units) try: - ref_date = _align_reference_date_and_unit(ref_date_str, time_units) + time_unit, ref_date = _unpack_time_unit_and_ref_date(units) + ref_date = _align_reference_date_and_unit(ref_date, time_unit) except ValueError as err: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime from err _check_date_is_after_shift(ref_date, calendar) - dunit = ref_date.unit with warnings.catch_warnings(): warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) @@ -337,10 +334,10 @@ def _decode_datetime_with_pandas( # avoid size 0 datetimes GH1329 dec_min = _check_date_for_units_since_refdate( - flat_num_dates.min(), time_units, ref_date + flat_num_dates.min(), time_unit, ref_date ) _check_date_for_units_since_refdate( - flat_num_dates.max(), time_units, ref_date + flat_num_dates.max(), time_unit, ref_date ) _check_date_is_after_shift(dec_min, calendar) @@ -360,11 +357,11 @@ def _decode_datetime_with_pandas( # in case we need to change the unit, we fix the numbers here # this should be safe, as errors would have been raised above - ns_time_unit = _NS_PER_TIME_DELTA[time_units] - ns_dunit = _NS_PER_TIME_DELTA[dunit] - if flat_num_dates.dtype.kind in 
"iuf" and (ns_time_unit > ns_dunit): - flat_num_dates *= np.int64(ns_time_unit / ns_dunit) - time_units = dunit + ns_time_unit = _NS_PER_TIME_DELTA[time_unit] + ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_date.unit] + if flat_num_dates.dtype.kind in "iuf" and (ns_time_unit > ns_ref_date_unit): + flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit) + time_unit = ref_date.unit # Cast input ordinals to integers and properly handle NaN/NaT # to prevent casting NaN to int @@ -372,8 +369,8 @@ def _decode_datetime_with_pandas( flat_num_dates_int[nan] = np.iinfo(np.int64).min flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64) - # cast to timedelta64[time_units] and add to ref_date - return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_units}]") + # cast to timedelta64[time_unit] and add to ref_date + return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_unit}]") def decode_cf_datetime( @@ -495,10 +492,6 @@ def _infer_time_units_from_diff(unique_timedeltas) -> str: return "seconds" -def _time_units_to_timedelta64(units: str) -> np.timedelta64: - return np.timedelta64(1, _netcdf_to_numpy_timeunit(units)) - - def infer_calendar_name(dates) -> CFCalendar: """Given an array of datetimes, infer the CF calendar name""" if is_np_datetime_like(dates.dtype): @@ -838,12 +831,10 @@ def _eagerly_encode_cf_datetime( raise OutOfBoundsDatetime assert np.issubdtype(dates.dtype, "datetime64") - time_units, ref_date = _unpack_time_units_and_ref_date(units) + time_unit, ref_date = _unpack_time_unit_and_ref_date(units) # calendar equivalence only for days after the reform - if calendar != "proleptic_gregorian" and ref_date < pd.Timestamp("1582-10-15"): - # out of range reference date - raise OutOfBoundsDatetime - time_delta = _time_units_to_timedelta64(time_units) + _check_date_is_after_shift(ref_date, calendar) + time_delta = np.timedelta64(1, time_unit) # Wrap the dates in a DatetimeIndex to do the subtraction to ensure # an OverflowError is raised if 
the ref_date is too far away from @@ -857,16 +848,17 @@ def _eagerly_encode_cf_datetime( time_delta = time_delta.astype(f"=m8[{deltas_unit}]") # retrieve needed units to faithfully encode to int64 - needed_units, data_ref_date = _unpack_time_units_and_ref_date(data_units) + needed_unit, data_ref_date = _unpack_time_unit_and_ref_date(data_units) + needed_units = _numpy_to_netcdf_timeunit(needed_unit) if data_units != units: # this accounts for differences in the reference times ref_delta = abs(data_ref_date - ref_date).to_timedelta64() - data_delta = _time_units_to_timedelta64(needed_units) + data_delta = np.timedelta64(1, needed_unit) if (ref_delta % data_delta) > np.timedelta64(0, "ns"): needed_units = _infer_time_units_from_diff(ref_delta) # needed time delta to encode faithfully to int64 - needed_time_delta = _time_units_to_timedelta64(needed_units) + needed_time_delta = _unit_timedelta_numpy(needed_units) floor_division = np.issubdtype(dtype, np.integer) or dtype is None if time_delta > needed_time_delta: @@ -979,7 +971,7 @@ def _eagerly_encode_cf_timedelta( if units is None: units = data_units - time_delta = _time_units_to_timedelta64(units) + time_delta = _unit_timedelta_numpy(units) time_deltas = pd.TimedeltaIndex(ravel(timedeltas)) # get resolution of TimedeltaIndex and align time_delta deltas_unit = time_deltas.unit @@ -992,7 +984,7 @@ def _eagerly_encode_cf_timedelta( needed_units = _infer_time_units_from_diff(np.unique(time_deltas.dropna())) # needed time delta to encode faithfully to int64 - needed_time_delta = _time_units_to_timedelta64(needed_units) + needed_time_delta = _unit_timedelta_numpy(needed_units) floor_division = np.issubdtype(dtype, np.integer) or dtype is None if time_delta > needed_time_delta: From d5a7da0ea8e465e4a3777196751f0ec3c68eeffa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 13:42:15 +0200 Subject: [PATCH 014/134] more refactoring in coding.times.py --- xarray/coding/times.py | 32 
++++++++++++++----------------- xarray/tests/test_coding_times.py | 4 ---- 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 080bebf85fb..82e15b6ab35 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -268,8 +268,7 @@ def _check_date_for_units_since_refdate( ) # this will raise on overflow if ref_date + delta # can't be represented in the current ref_date resolution - ref_date_unit = ref_date.unit - return _timestamp_as_unit(ref_date + delta, ref_date_unit) + return _timestamp_as_unit(ref_date + delta, ref_date.unit) else: # if date is exactly NaT (np.iinfo("int64").min) return refdate # to make follow-up checks work @@ -277,19 +276,17 @@ def _check_date_for_units_since_refdate( def _align_reference_date_and_unit(ref_date: pd.Timestamp, unit: str) -> pd.Timestamp: - # get ref_date and unit delta - ref_date_unit = ref_date.unit - ref_date_delta = np.timedelta64(1, ref_date_unit) - unit_delta = np.timedelta64(1, unit) - new_unit = ref_date_unit if ref_date_delta < unit_delta else unit - # transform to the highest needed resolution - # this will raise accordingly - return _timestamp_as_unit(ref_date, new_unit) + # align to the highest needed resolution of ref_date or unit + if np.timedelta64(1, ref_date.unit) > np.timedelta64(1, unit): + # this will raise accordingly + # if data can't be represented in the higher resolution + return _timestamp_as_unit(ref_date, unit) + return ref_date def _check_date_is_after_shift(date: pd.Timestamp, calendar: str) -> None: # if we have gregorian/standard we need to raise - # if we are outside the well defined date range + # if we are outside the well-defined date range # proleptic_gregorian and standard/gregorian are only equivalent # if reference date and date range is >= 1582-10-15 if calendar != "proleptic_gregorian": @@ -842,10 +839,6 @@ def _eagerly_encode_cf_datetime( # DatetimeIndex will convert to units of ["s", "ms", "us", "ns"] 
dates_as_index = pd.DatetimeIndex(ravel(dates)) time_deltas = dates_as_index - ref_date - # get resolution of TimedeltaIndex and align time_delta - deltas_unit = time_deltas.unit - # todo: check, if this works in any case - time_delta = time_delta.astype(f"=m8[{deltas_unit}]") # retrieve needed units to faithfully encode to int64 needed_unit, data_ref_date = _unpack_time_unit_and_ref_date(data_units) @@ -871,6 +864,7 @@ def _eagerly_encode_cf_datetime( f"Set encoding['dtype'] to floating point dtype to silence this warning." ) elif np.issubdtype(dtype, np.integer) and allow_units_modification: + floor_division = True new_units = f"{needed_units} since {format_timestamp(ref_date)}" emit_user_level_warning( f"Times can't be serialized faithfully to int64 with requested units {units!r}. " @@ -880,10 +874,12 @@ def _eagerly_encode_cf_datetime( ) units = new_units time_delta = needed_time_delta - time_delta = time_delta.astype(f"=m8[{deltas_unit}]") - floor_division = True - num = _division(time_deltas, time_delta, floor_division) + # get resolution of TimedeltaIndex and align time_delta + # todo: check, if this works in any case + num = _division( + time_deltas, time_delta.astype(f"=m8[{time_deltas.unit}]"), floor_division + ) num = reshape(num.values, dates.shape) except (OutOfBoundsDatetime, OverflowError, ValueError): diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 1661ea73d64..10de701412e 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -129,7 +129,6 @@ def test_cf_datetime(num_dates, units, calendar) -> None: expected = cftime.num2date( num_dates, units, calendar, only_use_cftime_datetimes=True ) - print("0:", expected) min_y = np.ravel(np.atleast_1d(expected))[np.nanargmin(num_dates)] # .year max_y = np.ravel(np.atleast_1d(expected))[np.nanargmax(num_dates)] # .year @@ -137,12 +136,10 @@ def test_cf_datetime(num_dates, units, calendar) -> None: border = typ(1582, 10, 15) if calendar 
== "proleptic_gregorian" or (min_y >= border and max_y >= border): expected = cftime_to_nptime(expected) - print("1:", expected) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") actual = decode_cf_datetime(num_dates, units, calendar) - print("2:", actual, type(actual), actual.dtype) abs_diff = np.asarray(abs(actual - expected)).ravel() abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() @@ -152,7 +149,6 @@ def test_cf_datetime(num_dates, units, calendar) -> None: assert (abs_diff <= np.timedelta64(1, "s")).all() encoded1, _, _ = encode_cf_datetime(actual, units, calendar) - print("1:", encoded1) assert_duckarray_allclose(num_dates, encoded1) if hasattr(num_dates, "ndim") and num_dates.ndim == 1 and "1000" not in units: From 821b68de7144ec45389df9ca0518cfad6e008399 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 14:18:16 +0200 Subject: [PATCH 015/134] minor fix in time-coding.rst --- doc/internals/time-coding.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index 3014c7a6c3d..9bbd282d271 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -163,7 +163,7 @@ and :py:meth:`pandas.TimedeltaIndex.as_unit` respectively. For the functionality in xarray the output resolution is converted from ``'ns'`` to the lowest needed resolution. .. warning:: - Care has to be taken, as some configurations of input data will raise. The following shows, that we are safe to use :py:func:`pandas.to_datetime` when providing :py:class:`numpy.datetime64` as scalar or numpy array as input. + Care has to be taken, as some configurations of input data will raise. The following shows, that we are safe to use :py:func:`pandas.to_timedelta` when providing :py:class:`numpy.timedelta64` as scalar or numpy array as input. .. 
ipython:: python From d066edfbf44a7989a8f6fbdde53be26068f9659b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 16:03:09 +0200 Subject: [PATCH 016/134] set default resolution to "s", which actually means, use pandas lowest resolution, fix code and tests to allow this --- xarray/coding/times.py | 11 +++++++- xarray/core/options.py | 2 +- xarray/tests/__init__.py | 7 +++-- xarray/tests/test_backends.py | 7 ++--- xarray/tests/test_coding_times.py | 44 ++++++++++++++++++++++--------- xarray/tests/test_conventions.py | 7 ++--- xarray/tests/test_dataset.py | 16 ++++++----- 7 files changed, 65 insertions(+), 29 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 82e15b6ab35..96641a83687 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -24,6 +24,7 @@ from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like from xarray.core.duck_array_ops import asarray, ravel, reshape from xarray.core.formatting import first_n_items, format_timestamp, last_item +from xarray.core.options import _get_datetime_resolution from xarray.core.pdcompat import _timestamp_as_unit, default_precision_timestamp from xarray.core.utils import emit_user_level_warning from xarray.core.variable import Variable @@ -98,6 +99,13 @@ def _is_numpy_compatible_time_range(times): tmin = times.min() tmax = times.max() try: + # before relaxing the nanosecond constrained + # this raised OutOfBoundsDatetime for + # times < 1678 and times > 2262 + # this isn't the case anymore for other resolutions like "s" + # now, we raise for dates before 1582-10-15 + _check_date_is_after_shift(tmin, "standard") + _check_date_is_after_shift(tmax, "standard") convert_time_or_go_back(tmin, pd.Timestamp) convert_time_or_go_back(tmax, pd.Timestamp) except pd.errors.OutOfBoundsDatetime: @@ -290,7 +298,7 @@ def _check_date_is_after_shift(date: pd.Timestamp, calendar: str) -> None: # proleptic_gregorian and standard/gregorian are only 
equivalent # if reference date and date range is >= 1582-10-15 if calendar != "proleptic_gregorian": - if date < pd.Timestamp("1582-10-15"): + if date < type(date)(1582, 10, 15): raise OutOfBoundsDatetime( f"Dates before 1582-10-15 cannot be decoded " f"with pandas using {calendar!r} calendar." @@ -318,6 +326,7 @@ def _decode_datetime_with_pandas( try: time_unit, ref_date = _unpack_time_unit_and_ref_date(units) ref_date = _align_reference_date_and_unit(ref_date, time_unit) + ref_date = _align_reference_date_and_unit(ref_date, _get_datetime_resolution()) except ValueError as err: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime diff --git a/xarray/core/options.py b/xarray/core/options.py index dd6a1620061..f185987a88e 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -267,7 +267,7 @@ class set_options: warn_for_unclosed_files : bool, default: False Whether or not to issue a warning when unclosed files are deallocated. This is mostly useful for debugging. - time_resolution : {"s", "ms", "us", "ns"}, default: "ns" + time_resolution : {"s", "ms", "us", "ns"}, default: "s" Time resolution used for CF encoding/decoding. 
Examples diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index a55b377d2c0..5d17624cc9d 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -18,7 +18,7 @@ from xarray import Dataset from xarray.core.duck_array_ops import allclose_or_equiv # noqa: F401 from xarray.core.extension_array import PandasExtensionArray -from xarray.core.options import set_options +from xarray.core.options import _get_datetime_resolution, set_options from xarray.core.variable import IndexVariable from xarray.testing import ( # noqa: F401 assert_chunks_equal, @@ -323,7 +323,10 @@ def create_test_data( f'Not enough letters for filling this dimension size ({_dims["dim3"]})' ) obj["dim3"] = ("dim3", list(string.ascii_lowercase[0 : _dims["dim3"]])) - obj["time"] = ("time", pd.date_range("2000-01-01", periods=20, unit="s")) + obj["time"] = ( + "time", + pd.date_range("2000-01-01", periods=20, unit=f"{_get_datetime_resolution()}"), + ) for v, dims in sorted(_vars.items()): data = rs.normal(size=tuple(_dims[d] for d in dims)) obj[v] = (dims, data) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 45d56b63e52..5d85e6c04e4 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -53,7 +53,7 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import encode_dataset_coordinates from xarray.core import indexing -from xarray.core.options import set_options +from xarray.core.options import _get_datetime_resolution, set_options from xarray.core.utils import module_available from xarray.namedarray.pycompat import array_type from xarray.tests import ( @@ -1590,8 +1590,9 @@ def test_open_encodings(self) -> None: expected = Dataset() - # todo: check, if specifying "s" is enough - time = pd.date_range("1999-01-05", periods=10, unit="s") + time = pd.date_range( + "1999-01-05", periods=10, unit=f"{_get_datetime_resolution()}" + ) encoding = {"units": units, "dtype": np.dtype("int32")} 
expected["time"] = ("time", time, {}, encoding) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 10de701412e..dbb014c14ff 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -39,6 +39,7 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder from xarray.core.common import contains_cftime_datetimes +from xarray.core.options import _get_datetime_resolution from xarray.core.utils import is_duck_dask_array from xarray.testing import assert_equal, assert_identical from xarray.tests import ( @@ -134,7 +135,9 @@ def test_cf_datetime(num_dates, units, calendar) -> None: max_y = np.ravel(np.atleast_1d(expected))[np.nanargmax(num_dates)] # .year typ = type(min_y) border = typ(1582, 10, 15) - if calendar == "proleptic_gregorian" or (min_y >= border and max_y >= border): + if (calendar == "proleptic_gregorian" and _get_datetime_resolution() != "ns") or ( + min_y >= border and max_y >= border + ): expected = cftime_to_nptime(expected) with warnings.catch_warnings(): @@ -214,12 +217,15 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: import cftime units = "days since 0001-01-01" - unit = cast(Literal["s", "ms", "us", "ns"], "us") + unit = cast(Literal["s", "ms", "us", "ns"], _get_datetime_resolution()) times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit=unit, freq="h") + # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values - if calendar == "proleptic_gregorian": - unit = "s" + # for cftime we get "us" resolution + # ns resolution is handled by cftime, too (OutOfBounds) + if calendar != "proleptic_gregorian" or _get_datetime_resolution() == "ns": + unit = "us" expected_dtype = np.dtype(f"M8[{unit}]") actual = decode_cf_datetime(time, units, calendar=calendar) assert actual.dtype == expected_dtype @@ -268,7 
+274,7 @@ def test_decode_dates_outside_timestamp_range(calendar) -> None: time, units, calendar=calendar, only_use_cftime_datetimes=True ) # special case proleptic_gregorian - if calendar == "proleptic_gregorian": + if calendar == "proleptic_gregorian" and _get_datetime_resolution() != "ns": expected = expected.astype("=M8[us]") expected_date_type = type(expected[0]) @@ -289,7 +295,11 @@ def test_decode_standard_calendar_single_element_inside_timestamp_range( calendar, ) -> None: units = "days since 0001-01-01" - unit = "s" if calendar == "proleptic_gregorian" else "us" + unit = ( + _get_datetime_resolution() + if (calendar == "proleptic_gregorian" and _get_datetime_resolution() != "ns") + else "us" + ) for num_time in [735368, [735368], [[735368]]]: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") @@ -337,7 +347,11 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( import cftime units = "days since 0001-01-01" - unit = "s" if calendar == "proleptic_gregorian" else "us" + unit = ( + _get_datetime_resolution() + if (calendar == "proleptic_gregorian" and _get_datetime_resolution() != "ns") + else "us" + ) times1 = pd.date_range("2001-04-01", end="2001-04-05", freq="D") times2 = pd.date_range("2001-05-01", end="2001-05-05", freq="D") time1 = cftime.date2num(times1.to_pydatetime(), units, calendar=calendar) @@ -426,8 +440,8 @@ def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: actual = decode_cf_datetime(mdim_time, units, calendar=calendar) dtype: np.dtype - if calendar == "proleptic_gregorian": - dtype = np.dtype("=M8[s]") + if calendar == "proleptic_gregorian" and _get_datetime_resolution() != "ns": + dtype = np.dtype(f"=M8[{_get_datetime_resolution()}]") expected1 = expected1.astype(dtype) expected2 = expected2.astype(dtype) else: @@ -528,7 +542,7 @@ def test_decoded_cf_datetime_array_2d() -> None: ("x", "y"), np.array([[0, 1], [2, 3]]), {"units": "days since 2000-01-01"} ) 
result = CFDatetimeCoder().decode(variable) - assert result.dtype == "datetime64[s]" + assert result.dtype == f"datetime64[{_get_datetime_resolution()}]" expected = pd.date_range("2000-01-01", periods=4).values.reshape(2, 2) assert_array_equal(np.asarray(result), expected) @@ -697,7 +711,7 @@ def test_decode_cf(calendar) -> None: if calendar not in _STANDARD_CALENDARS: assert ds.test.dtype == np.dtype("O") else: - assert ds.test.dtype == np.dtype("M8[s]") + assert ds.test.dtype == np.dtype(f"M8[{_get_datetime_resolution()}]") def test_decode_cf_time_bounds() -> None: @@ -722,7 +736,7 @@ def test_decode_cf_time_bounds() -> None: "calendar": "standard", } dsc = decode_cf(ds) - assert dsc.time_bnds.dtype == np.dtype("M8[s]") + assert dsc.time_bnds.dtype == np.dtype(f"M8[{_get_datetime_resolution()}]") dsc = decode_cf(ds, decode_times=False) assert dsc.time_bnds.dtype == np.dtype("int64") @@ -1299,7 +1313,11 @@ def test_roundtrip_datetime64_nanosecond_precision( assert encoded_var.data.dtype == dtype decoded_var = conventions.decode_cf_variable("foo", encoded_var) - assert decoded_var.dtype == np.dtype(f"=M8[{timeunit}]") + if _get_datetime_resolution() == "ns": + dtypeunit = "ns" + else: + dtypeunit = timeunit + assert decoded_var.dtype == np.dtype(f"=M8[{dtypeunit}]") assert ( decoded_var.encoding["units"] == f"{_numpy_to_netcdf_timeunit(timeunit)} since 1970-01-01 00:00:00" diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 7d86cb7c036..bcd49f1b608 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -19,6 +19,7 @@ from xarray.backends.common import WritableCFDataStore from xarray.backends.memory import InMemoryDataStore from xarray.conventions import decode_cf +from xarray.core.options import _get_datetime_resolution from xarray.testing import assert_identical from xarray.tests import ( assert_array_equal, @@ -364,7 +365,7 @@ def test_dataset_repr_with_netcdf4_datetimes(self) -> None: attrs = 
{"units": "days since 1900-01-01"} ds = decode_cf(Dataset({"time": ("time", [0, 1], attrs)})) - assert "(time) datetime64[s]" in repr(ds) + assert f"(time) datetime64[{_get_datetime_resolution()}]" in repr(ds) @requires_cftime def test_decode_cf_datetime_transition_to_invalid(self) -> None: @@ -447,13 +448,13 @@ def test_decode_cf_time_kwargs(self) -> None: dsc = conventions.decode_cf(ds) assert dsc.timedelta.dtype == np.dtype("m8[ns]") - assert dsc.time.dtype == np.dtype("M8[s]") + assert dsc.time.dtype == np.dtype(f"M8[{_get_datetime_resolution()}]") dsc = conventions.decode_cf(ds, decode_times=False) assert dsc.timedelta.dtype == np.dtype("int64") assert dsc.time.dtype == np.dtype("int64") dsc = conventions.decode_cf(ds, decode_times=True, decode_timedelta=False) assert dsc.timedelta.dtype == np.dtype("int64") - assert dsc.time.dtype == np.dtype("M8[s]") + assert dsc.time.dtype == np.dtype(f"M8[{_get_datetime_resolution()}]") dsc = conventions.decode_cf(ds, decode_times=False, decode_timedelta=True) assert dsc.timedelta.dtype == np.dtype("m8[ns]") assert dsc.time.dtype == np.dtype("int64") diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 815e65821f4..d4a19883a54 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -39,6 +39,7 @@ from xarray.core.common import duck_array_ops, full_like from xarray.core.coordinates import Coordinates, DatasetCoordinates from xarray.core.indexes import Index, PandasIndex +from xarray.core.options import _get_datetime_resolution from xarray.core.types import ArrayLike from xarray.core.utils import is_scalar from xarray.groupers import TimeResampler @@ -290,7 +291,7 @@ def test_repr(self) -> None: Coordinates: * dim2 (dim2) float64 72B 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 * dim3 (dim3) {} 40B 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'j' - * time (time) datetime64[s] 160B 2000-01-01 2000-01-02 ... 2000-01-20 + * time (time) datetime64[{}] 160B 2000-01-01 2000-01-02 ... 
2000-01-20 numbers (dim3) int64 80B 0 1 2 0 0 1 1 2 2 3 Dimensions without coordinates: dim1 Data variables: @@ -298,7 +299,10 @@ def test_repr(self) -> None: var2 (dim1, dim2) float64 576B 1.162 -1.097 -2.123 ... 1.267 0.3328 var3 (dim3, dim1) float64 640B 0.5565 -0.2121 0.4563 ... -0.2452 -0.3616 Attributes: - foo: bar""".format(data["dim3"].dtype) + foo: bar""".format( + data["dim3"].dtype, + _get_datetime_resolution(), + ) ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) print(actual) @@ -440,8 +444,8 @@ def test_info(self) -> None: ds.info(buf=buf) expected = dedent( - """\ - xarray.Dataset { + f"""\ + xarray.Dataset {{ dimensions: \tdim2 = 9 ; \ttime = 20 ; @@ -450,7 +454,7 @@ def test_info(self) -> None: variables: \tfloat64 dim2(dim2) ; - \tdatetime64[s] time(time) ; + \tdatetime64[{_get_datetime_resolution()}] time(time) ; \tfloat64 var1(dim1, dim2) ; \t\tvar1:foo = variable ; \tfloat64 var2(dim1, dim2) ; @@ -462,7 +466,7 @@ def test_info(self) -> None: // global attributes: \t:unicode_attr = ba® ; \t:string_attr = bar ; - }""" + }}""" ) actual = buf.getvalue() assert expected == actual From ed22da1af1a822a4994fc58bd21a57564526ab2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 18:08:15 +0200 Subject: [PATCH 017/134] Add section for default units, fix options --- doc/internals/time-coding.rst | 8 ++++++++ xarray/core/options.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index 9bbd282d271..7c249bd2c6b 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -432,3 +432,11 @@ For encoding the process is more or less a reversal of the above, but we have to dates, units, calendar, dtype=np.dtype("int64") ) print(values, units) + + +Default Time Unit +~~~~~~~~~~~~~~~~~ + +The default time unit of xarray is ``'s'``. It aligns well with the lower resolution of pandas. 
For normal operation that has no consequences on the output as all decoded datetimes are already at least in second resolution. Setting the default time unit to ``'ns'`` (the former default) the datetimes will be converted to ``'ns'``-resolution, if possible. Same holds true for ``'us'`` and ``'ms'``. + +If the datetimes are decoded to ``'us'`` resolution, this resolution will be kept, even if the default resolution is set to ``'s'`` or ``'ms'``. diff --git a/xarray/core/options.py b/xarray/core/options.py index f185987a88e..93dec74d7c5 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -88,7 +88,7 @@ class T_Options(TypedDict): "use_flox": True, "use_numbagg": True, "use_opt_einsum": True, - "time_resolution": "ns", + "time_resolution": "s", } _JOIN_OPTIONS = frozenset(["inner", "outer", "left", "right", "exact"]) From 8bf23f426eed28fb9e630784152c1c00cefd9379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 18:29:19 +0200 Subject: [PATCH 018/134] attempt to fix typing --- xarray/core/pdcompat.py | 3 ++- xarray/core/types.py | 1 + xarray/tests/__init__.py | 8 +++++++- xarray/tests/test_backends.py | 3 +-- xarray/tests/test_coding_times.py | 4 ++-- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index 3dd6da61d4e..55deef65b96 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -41,6 +41,7 @@ import pandas as pd from xarray.core.options import _get_datetime_resolution +from xarray.core.types import PDDatetimeUnitOptions def count_not_none(*args) -> int: @@ -79,7 +80,7 @@ def _timestamp_as_unit(date: pd.Timestamp, unit: str) -> pd.Timestamp: # where "as_unit" is not defined for pandas.Timestamp # in pandas versions < 2.2 # can be removed minimum pandas version is >= 2.2 - unit = cast(Literal["s", "ms", "us", "ns"], unit) + unit = cast(PDDatetimeUnitOptions, unit) if hasattr(date, "as_unit"): date = date.as_unit(unit) elif 
hasattr(date, "_as_unit"): diff --git a/xarray/core/types.py b/xarray/core/types.py index 64acc2c4aa4..a5a880575a8 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -239,6 +239,7 @@ def copy( "Y", "M", "W", "D", "h", "m", "s", "ms", "us", "μs", "ns", "ps", "fs", "as", None ] NPDatetimeUnitOptions = Literal["D", "h", "m", "s", "ms", "us", "ns"] +PDDatetimeUnitOptions = Literal["s", "ms", "us", "ns"] QueryEngineOptions = Literal["python", "numexpr", None] QueryParserOptions = Literal["pandas", "python"] diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 5d17624cc9d..613af1c242d 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -5,6 +5,7 @@ import string import warnings from contextlib import contextmanager, nullcontext +from typing import cast from unittest import mock # noqa: F401 import numpy as np @@ -19,6 +20,7 @@ from xarray.core.duck_array_ops import allclose_or_equiv # noqa: F401 from xarray.core.extension_array import PandasExtensionArray from xarray.core.options import _get_datetime_resolution, set_options +from xarray.core.types import PDDatetimeUnitOptions from xarray.core.variable import IndexVariable from xarray.testing import ( # noqa: F401 assert_chunks_equal, @@ -325,7 +327,11 @@ def create_test_data( obj["dim3"] = ("dim3", list(string.ascii_lowercase[0 : _dims["dim3"]])) obj["time"] = ( "time", - pd.date_range("2000-01-01", periods=20, unit=f"{_get_datetime_resolution()}"), + pd.date_range( + "2000-01-01", + periods=20, + unit=f"{cast(PDDatetimeUnitOptions, _get_datetime_resolution())}", + ), ) for v, dims in sorted(_vars.items()): data = rs.normal(size=tuple(_dims[d] for d in dims)) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5d85e6c04e4..65f9c3f23c2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1589,9 +1589,8 @@ def test_open_encodings(self) -> None: ds.variables["time"][:] = np.arange(10) + 4 expected = Dataset() - time 
= pd.date_range( - "1999-01-05", periods=10, unit=f"{_get_datetime_resolution()}" + "1999-01-05", periods=10, unit=_get_datetime_resolution() ) encoding = {"units": units, "dtype": np.dtype("int32")} expected["time"] = ("time", time, {}, encoding) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index dbb014c14ff..08da012efea 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -3,7 +3,7 @@ import warnings from datetime import timedelta from itertools import product -from typing import Literal, cast +from typing import Literal import numpy as np import pandas as pd @@ -217,7 +217,7 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: import cftime units = "days since 0001-01-01" - unit = cast(Literal["s", "ms", "us", "ns"], _get_datetime_resolution()) + unit = _get_datetime_resolution() times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit=unit, freq="h") # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) From c3a2b395da8d2d359ff8681783d40cb130692c5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 14 Oct 2024 18:37:39 +0200 Subject: [PATCH 019/134] attempt to fix typing --- xarray/tests/__init__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 613af1c242d..3d5deead11c 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -5,7 +5,6 @@ import string import warnings from contextlib import contextmanager, nullcontext -from typing import cast from unittest import mock # noqa: F401 import numpy as np @@ -20,7 +19,6 @@ from xarray.core.duck_array_ops import allclose_or_equiv # noqa: F401 from xarray.core.extension_array import PandasExtensionArray from xarray.core.options import _get_datetime_resolution, set_options -from xarray.core.types import PDDatetimeUnitOptions from 
xarray.core.variable import IndexVariable from xarray.testing import ( # noqa: F401 assert_chunks_equal, @@ -330,7 +328,7 @@ def create_test_data( pd.date_range( "2000-01-01", periods=20, - unit=f"{cast(PDDatetimeUnitOptions, _get_datetime_resolution())}", + unit=_get_datetime_resolution(), ), ) for v, dims in sorted(_vars.items()): From 3c44aed5457a323f6e82d410ce48443d3c9e8673 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 15 Oct 2024 15:53:52 +0200 Subject: [PATCH 020/134] fix scalar datetime/timedelta --- xarray/core/variable.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 97d43ba56e0..10732ebf9a7 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -6,6 +6,7 @@ import numbers import warnings from collections.abc import Callable, Hashable, Mapping, Sequence +from datetime import datetime, timedelta from functools import partial from types import EllipsisType from typing import TYPE_CHECKING, Any, NoReturn, cast @@ -199,6 +200,16 @@ def _possibly_convert_objects(values): """Convert arrays of datetime.datetime and datetime.timedelta objects into datetime64 and timedelta64, according to the pandas convention. 
""" + if values.dtype.kind == "O": + typ = type( + values + if values.size == 0 + else (values[0] if values.ndim else values.item()) + ) + if issubclass(typ, datetime): + values = values.astype("datetime64") + elif issubclass(typ, timedelta): + values = values.astype("timedelta64") as_series = pd.Series(values.ravel(), copy=False) result = np.asarray(as_series).reshape(values.shape) if not result.flags.writeable: @@ -247,6 +258,11 @@ def convert_non_numpy_type(data): if isinstance(data, pd.Timestamp): data = data.to_numpy() + if isinstance(data, datetime): + data = np.datetime64(data) + if isinstance(data, timedelta): + data = np.timedelta64(data) + # we don't want nested self-described arrays if isinstance(data, pd.Series | pd.DataFrame): pandas_data = data.values From 48be73acc0c928a23786f12571f4a397b9bfdd64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 15 Oct 2024 15:54:25 +0200 Subject: [PATCH 021/134] fix user docs --- doc/user-guide/time-series.rst | 34 ++++++++++++++++++------------ doc/user-guide/weather-climate.rst | 21 +++++++----------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 82172aa8998..e0f4efc7338 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -21,9 +21,10 @@ core functionality. Creating datetime64 data ------------------------ -Xarray uses the numpy dtypes ``datetime64[ns]`` and ``timedelta64[ns]`` to -represent datetime data, which offer vectorized (if sometimes buggy) operations -with numpy and smooth integration with pandas. +Xarray uses the numpy dtypes ``datetime64[unit]`` and ``timedelta64[unit]`` +(where unit is anything of "s", "ms", "us" and "ns") to represent datetime +data, which offer vectorized (if sometimes buggy) operations with numpy and +smooth integration with pandas. 
To convert to or create regular arrays of ``datetime64`` data, we recommend using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`: @@ -31,10 +32,21 @@ using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`: .. ipython:: python pd.to_datetime(["2000-01-01", "2000-02-02"]) + pd.DatetimeIndex( + ["2000-01-01 00:00:00", "2000-02-02 00:00:00"], dtype="datetime64[s]" + ) pd.date_range("2000-01-01", periods=365) + pd.date_range("2000-01-01", periods=365, unit="s") + +.. note:: + Care has to be taken to create the output with the wanted resolution. + For :py:func:`pandas.date_range` the ``unit``-kwarg has to be specified + and for :py:func:`pandas.to_datetime` the selection of the resolution + isn't possible at all. For that :py:class:`pd.DatetimeIndex` can be used + directly. Alternatively, you can supply arrays of Python ``datetime`` objects. These get -converted automatically when used as arguments in xarray objects: +converted automatically when used as arguments in xarray objects (with us-resolution): .. ipython:: python @@ -51,7 +63,7 @@ attribute like ``'days since 2000-01-01'``). .. note:: When decoding/encoding datetimes for non-standard calendars or for dates - before year 1678 or after year 2262, xarray uses the `cftime`_ library. + before 1582-10-15, xarray uses the `cftime`_ library. It was previously packaged with the ``netcdf4-python`` package under the name ``netcdftime`` but is now distributed separately. ``cftime`` is an :ref:`optional dependency` of xarray. @@ -68,15 +80,9 @@ You can manual decode arrays in this form by passing a dataset to ds = xr.Dataset({"time": ("time", [0, 1, 2, 3], attrs)}) xr.decode_cf(ds) -One unfortunate limitation of using ``datetime64[ns]`` is that it limits the -native representation of dates to those that fall between the years 1678 and -2262. 
When a netCDF file contains dates outside of these bounds, dates will be -returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` -will be used for indexing. :py:class:`~xarray.CFTimeIndex` enables a subset of -the indexing functionality of a :py:class:`pandas.DatetimeIndex` and is only -fully compatible with the standalone version of ``cftime`` (not the version -packaged with earlier versions ``netCDF4``). See :ref:`CFTimeIndex` for more -information. +From xarray 2024.11 the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a netCDF file contains dates outside of these bounds (or dates < 1582-10-15), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. +:py:class:`~xarray.CFTimeIndex` enables a subset of the indexing functionality of a :py:class:`pandas.DatetimeIndex` and is only fully compatible with the standalone version of ``cftime`` (not the version packaged with earlier versions ``netCDF4``). +See :ref:`CFTimeIndex` for more information. Datetime indexing ----------------- diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index 5014f5a8641..a82ad50ae2a 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -10,7 +10,7 @@ Weather and climate data import xarray as xr -Xarray can leverage metadata that follows the `Climate and Forecast (CF) conventions`_ if present. Examples include :ref:`automatic labelling of plots` with descriptive names and units if proper metadata is present and support for non-standard calendars used in climate science through the ``cftime`` module(Explained in the :ref:`CFTimeIndex` section). 
There are also a number of :ref:`geosciences-focused projects that build on xarray`. +Xarray can leverage metadata that follows the `Climate and Forecast (CF) conventions`_ if present. Examples include :ref:`automatic labelling of plots` with descriptive names and units if proper metadata is present and support for non-standard calendars used in climate science through the ``cftime`` module (explained in the :ref:`CFTimeIndex` section). There are also a number of :ref:`geosciences-focused projects that build on xarray`. .. _Climate and Forecast (CF) conventions: https://cfconventions.org @@ -64,8 +64,7 @@ Through the standalone ``cftime`` library and a custom subclass of :py:class:`pandas.Index`, xarray supports a subset of the indexing functionality enabled through the standard :py:class:`pandas.DatetimeIndex` for dates from non-standard calendars commonly used in climate science or dates -using a standard calendar, but outside the `nanosecond-precision range`_ -(approximately between years 1678 and 2262). +using a standard calendar, but outside the `precision range`_ and dates prior 1582-10-15. .. note:: @@ -75,18 +74,14 @@ using a standard calendar, but outside the `nanosecond-precision range`_ any of the following are true: - The dates are from a non-standard calendar - - Any dates are outside the nanosecond-precision range. + - Any dates are outside the nanosecond-precision range (prior xarray version 2024.11) + - Any dates are outside the time span limited by the resolution (from xarray version v2024.11) Otherwise pandas-compatible dates from a standard calendar will be - represented with the ``np.datetime64[ns]`` data type, enabling the use of a - :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[ns]`` - and their full set of associated features. 
+ represented with the ``np.datetime64[unit]`` data type (where unit can be any of ["s", "ms", "us", "ns"], enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features. As of pandas version 2.0.0, pandas supports non-nanosecond precision datetime - values. For the time being, xarray still automatically casts datetime values - to nanosecond-precision for backwards compatibility with older pandas - versions; however, this is something we would like to relax going forward. - See :issue:`7493` for more discussion. + values. From xarray version 2024.11 the relaxed non-nanosecond precision datetime values will be used. For example, you can create a DataArray indexed by a time coordinate with dates from a no-leap calendar and a @@ -115,7 +110,7 @@ instance, we can create the same dates and DataArray we created above using: Mirroring pandas' method with the same name, :py:meth:`~xarray.infer_freq` allows one to infer the sampling frequency of a :py:class:`~xarray.CFTimeIndex` or a 1-D :py:class:`~xarray.DataArray` containing cftime objects. It also works transparently with -``np.datetime64[ns]`` and ``np.timedelta64[ns]`` data. +``np.datetime64`` and ``np.timedelta64`` data (with "s", "ms", "us" or "ns" resolution). .. ipython:: python @@ -137,7 +132,7 @@ Conversion between non-standard calendar and to/from pandas DatetimeIndexes is facilitated with the :py:meth:`xarray.Dataset.convert_calendar` method (also available as :py:meth:`xarray.DataArray.convert_calendar`). Here, like elsewhere in xarray, the ``use_cftime`` argument controls which datetime backend is used in the output. The default (``None``) is to -use `pandas` when possible, i.e. when the calendar is standard and dates are within 1678 and 2262. +use `pandas` when possible, i.e. when the calendar is standard and dates starting with 1582-10-15. .. 
ipython:: python From 7ac9983138f2ca9f3822f65ae13a7d99c0aebc5e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 18 Oct 2024 05:34:26 +0000 Subject: [PATCH 022/134] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/coding/times.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 96641a83687..0737fae2801 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -337,7 +337,6 @@ def _decode_datetime_with_pandas( with warnings.catch_warnings(): warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) if flat_num_dates.size > 0: - # avoid size 0 datetimes GH1329 dec_min = _check_date_for_units_since_refdate( flat_num_dates.min(), time_unit, ref_date From d86ad042fcd3feb7f826d0e5771bc485bd5b7495 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 18 Oct 2024 08:00:56 +0200 Subject: [PATCH 023/134] Fix variable tests, mostly datetime/timedelta is initialized with us-resolution --- xarray/tests/test_variable.py | 76 +++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 34 deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 22b94f7a3ae..5642d898c24 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -274,35 +274,43 @@ def test_0d_time_data(self): expected = np.datetime64("2000-01-01", "ns") assert x[0].values == expected + dt64_data = pd.date_range("2000-01-01", periods=3) + @pytest.mark.filterwarnings("ignore:Converting non-default") - def test_datetime64_conversion(self): + @pytest.mark.parametrize( + "values, unit", + [ + (dt64_data, "ns"), + (dt64_data.values, "ns"), + (dt64_data.values.astype("datetime64[s]"), "s"), + (dt64_data.to_pydatetime(), "us"), + ], + ) + def test_datetime64_conversion(self, values, unit): # todo: check, if this test is 
OK - times = pd.date_range("2000-01-01", periods=3) - for values, unit in [ - (times, "ns"), - (times.values, "ns"), - (times.values.astype("datetime64[s]"), "s"), - (times.to_pydatetime(), "ns"), - ]: - v = self.cls(["t"], values) - assert v.dtype == np.dtype(f"datetime64[{unit}]") - assert_array_equal(v.values, times.values) - assert v.values.dtype == np.dtype(f"datetime64[{unit}]") + v = self.cls(["t"], values) + assert v.dtype == np.dtype(f"datetime64[{unit}]") + assert_array_equal(v.values, self.dt64_data.values) + assert v.values.dtype == np.dtype(f"datetime64[{unit}]") + + td64_data = pd.timedelta_range(start=0, periods=3) @pytest.mark.filterwarnings("ignore:Converting non-default") - def test_timedelta64_conversion(self): + @pytest.mark.parametrize( + "values, unit", + [ + (td64_data, "ns"), + (td64_data.values, "ns"), + (td64_data.values.astype("timedelta64[s]"), "s"), + (td64_data.to_pytimedelta(), "us"), + ], + ) + def test_timedelta64_conversion(self, values, unit): # todo: check, if this test is OK - times = pd.timedelta_range(start=0, periods=3) - for values, unit in [ - (times, "ns"), - (times.values, "ns"), - (times.values.astype("timedelta64[s]"), "s"), - (times.to_pytimedelta(), "ns"), - ]: - v = self.cls(["t"], values) - assert v.dtype == np.dtype(f"timedelta64[{unit}]") - assert_array_equal(v.values, times.values) - assert v.values.dtype == np.dtype(f"timedelta64[{unit}]") + v = self.cls(["t"], values) + assert v.dtype == np.dtype(f"timedelta64[{unit}]") + assert_array_equal(v.values, self.td64_data.values) + assert v.values.dtype == np.dtype(f"timedelta64[{unit}]") def test_object_conversion(self): data = np.arange(5).astype(str).astype(object) @@ -1078,7 +1086,7 @@ def test_numpy_same_methods(self): [ (np.datetime64("2000-01-01"), "s"), (pd.Timestamp("2000-01-01T00"), "s"), - (datetime(2000, 1, 1), "ns"), + (datetime(2000, 1, 1), "us"), ], ) def test_datetime64_conversion_scalar(self, values, unit): @@ -1093,8 +1101,8 @@ def 
test_datetime64_conversion_scalar(self, values, unit): "values, unit", [ (np.timedelta64(1, "D"), "s"), - (pd.Timedelta("1 day"), "ns"), - (timedelta(days=1), "ns"), + (pd.Timedelta("1 day"), "us"), + (timedelta(days=1), "us"), ], ) def test_timedelta64_conversion_scalar(self, values, unit): @@ -1122,7 +1130,7 @@ def test_0d_datetime(self): @pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize( - "values, unit", [(pd.to_timedelta("1s"), "ns"), (np.timedelta64(1, "s"), "s")] + "values, unit", [(pd.to_timedelta("1s"), "us"), (np.timedelta64(1, "s"), "s")] ) def test_0d_timedelta(self, values, unit): # todo: check, if this test is OK @@ -2652,11 +2660,11 @@ def test_datetime(self): assert np.dtype("datetime64[ns]") == actual.dtype assert expected is source_ndarray(np.asarray(actual)) - expected = np.datetime64("2000-01-01", "ns") + expected = np.datetime64("2000-01-01", "us") actual = as_compatible_data(datetime(2000, 1, 1)) assert np.asarray(expected) == actual assert np.ndarray is type(actual) - assert np.dtype("datetime64[ns]") == actual.dtype + assert np.dtype("datetime64[us]") == actual.dtype def test_tz_datetime(self) -> None: # todo: check, if this test is OK @@ -2959,8 +2967,8 @@ def test_from_pint_wrapping_dask(self, Var): (np.array([np.datetime64("2000-01-01", "ns")]), "ns"), (np.array([np.datetime64("2000-01-01", "s")]), "s"), (pd.date_range("2000", periods=1), "ns"), - (datetime(2000, 1, 1), "ns"), - (np.array([datetime(2000, 1, 1)]), "ns"), + (datetime(2000, 1, 1), "us"), + (np.array([datetime(2000, 1, 1)]), "us"), (pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York")), "ns"), ( pd.Series( @@ -3036,8 +3044,8 @@ def test_pandas_two_only_datetime_conversion_warnings( (np.array([np.timedelta64(10, "ns")]), "ns"), (np.array([np.timedelta64(10, "s")]), "s"), (pd.timedelta_range("1", periods=1), "ns"), - (timedelta(days=1), "ns"), - (np.array([timedelta(days=1)]), "ns"), + (timedelta(days=1), "us"), + 
(np.array([timedelta(days=1)]), "us"), ], ids=lambda x: f"{x}", ) From b5d0795067dfb5566d3832c90ef851dbf0c4f8f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 18 Oct 2024 09:36:55 +0200 Subject: [PATCH 024/134] revert changes in _possible_convert_objects, this needs to be checked more carefully, for now using pd.Series to covert `OMm` type datetimes/timedeltas (will result in ns precision) --- xarray/core/variable.py | 10 ---------- xarray/tests/test_variable.py | 8 ++++---- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 10732ebf9a7..79357296012 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -200,16 +200,6 @@ def _possibly_convert_objects(values): """Convert arrays of datetime.datetime and datetime.timedelta objects into datetime64 and timedelta64, according to the pandas convention. """ - if values.dtype.kind == "O": - typ = type( - values - if values.size == 0 - else (values[0] if values.ndim else values.item()) - ) - if issubclass(typ, datetime): - values = values.astype("datetime64") - elif issubclass(typ, timedelta): - values = values.astype("timedelta64") as_series = pd.Series(values.ravel(), copy=False) result = np.asarray(as_series).reshape(values.shape) if not result.flags.writeable: diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 5642d898c24..32a4356acab 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -283,7 +283,7 @@ def test_0d_time_data(self): (dt64_data, "ns"), (dt64_data.values, "ns"), (dt64_data.values.astype("datetime64[s]"), "s"), - (dt64_data.to_pydatetime(), "us"), + (dt64_data.to_pydatetime(), "ns"), ], ) def test_datetime64_conversion(self, values, unit): @@ -302,7 +302,7 @@ def test_datetime64_conversion(self, values, unit): (td64_data, "ns"), (td64_data.values, "ns"), (td64_data.values.astype("timedelta64[s]"), "s"), - (td64_data.to_pytimedelta(), "us"), 
+ (td64_data.to_pytimedelta(), "ns"), ], ) def test_timedelta64_conversion(self, values, unit): @@ -2968,7 +2968,7 @@ def test_from_pint_wrapping_dask(self, Var): (np.array([np.datetime64("2000-01-01", "s")]), "s"), (pd.date_range("2000", periods=1), "ns"), (datetime(2000, 1, 1), "us"), - (np.array([datetime(2000, 1, 1)]), "us"), + (np.array([datetime(2000, 1, 1)]), "ns"), (pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York")), "ns"), ( pd.Series( @@ -3045,7 +3045,7 @@ def test_pandas_two_only_datetime_conversion_warnings( (np.array([np.timedelta64(10, "s")]), "s"), (pd.timedelta_range("1", periods=1), "ns"), (timedelta(days=1), "us"), - (np.array([timedelta(days=1)]), "us"), + (np.array([timedelta(days=1)]), "ns"), ], ids=lambda x: f"{x}", ) From 60324f00363e1710b7adc029cd8645a709704ea3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 18 Oct 2024 09:56:47 +0200 Subject: [PATCH 025/134] fix doc link --- doc/user-guide/weather-climate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index a82ad50ae2a..75c41f34ffe 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -236,6 +236,6 @@ For data indexed by a :py:class:`~xarray.CFTimeIndex` xarray currently supports: da.resample(time="81min", closed="right", label="right", offset="3min").mean() -.. _nanosecond-precision range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations +.. _precision range: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timestamp-limitations .. _ISO 8601 standard: https://en.wikipedia.org/wiki/ISO_8601 .. 
_partial datetime string indexing: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#partial-string-indexing From 1f0750029d4861a65b06f30af14ec8f2dae2b46c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 8 Nov 2024 14:29:24 +0100 Subject: [PATCH 026/134] Apply suggestions from code review Co-authored-by: Stephan Hoyer --- doc/user-guide/time-series.rst | 2 +- xarray/core/variable.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index e0f4efc7338..9d9292ad53c 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -81,7 +81,7 @@ You can manual decode arrays in this form by passing a dataset to xr.decode_cf(ds) From xarray 2024.11 the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a netCDF file contains dates outside of these bounds (or dates < 1582-10-15), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. -:py:class:`~xarray.CFTimeIndex` enables a subset of the indexing functionality of a :py:class:`pandas.DatetimeIndex` and is only fully compatible with the standalone version of ``cftime`` (not the version packaged with earlier versions ``netCDF4``). +:py:class:`~xarray.CFTimeIndex` enables a subset of the indexing functionality of a :py:class:`pandas.DatetimeIndex`. See :ref:`CFTimeIndex` for more information. 
Datetime indexing diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 1b9d5ff826a..e36124072d5 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -302,7 +302,7 @@ def _as_array_or_item(data): if data.ndim == 0: kind = data.dtype.kind if kind in "mM": - unit = np.datetime_data(data.dtype)[0] + unit, _ = np.datetime_data(data.dtype) if kind == "M": data = np.datetime64(data, unit) elif kind == "m": From 20d6c9d0f0f6168685deae19bf96e88af495e158 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 16 Nov 2024 19:29:35 +0000 Subject: [PATCH 027/134] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_variable.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 43330ea27c3..4e26ff2b526 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -37,7 +37,6 @@ assert_identical, assert_no_warnings, has_dask_ge_2024_11_0, - has_pandas_3, raise_if_dask_computes, requires_bottleneck, requires_cupy, From 73919488ecf05561716fddfefce6bce7bf26a011 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 16 Nov 2024 21:36:32 +0100 Subject: [PATCH 028/134] remove outdated description --- doc/user-guide/time-series.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 9d9292ad53c..3f6b1203b70 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -23,8 +23,7 @@ Creating datetime64 data Xarray uses the numpy dtypes ``datetime64[unit]`` and ``timedelta64[unit]`` (where unit is anything of "s", "ms", "us" and "ns") to represent datetime -data, which offer vectorized (if sometimes buggy) operations with numpy and -smooth integration with pandas. 
+data, which offer vectorized operations with numpy and smooth integration with pandas. To convert to or create regular arrays of ``datetime64`` data, we recommend using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`: From 308091cce27c0c1ba7ed1e54ff72f636871a3ba5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 16 Nov 2024 21:46:54 +0100 Subject: [PATCH 029/134] use set instead list --- xarray/coding/times.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 0737fae2801..5fa1ce11104 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -441,7 +441,7 @@ def to_timedelta_unboxed(value, **kwargs): result = pd.to_timedelta(value, **kwargs).to_numpy() unique_timedeltas = np.unique(result[pd.notnull(result)]) unit = _netcdf_to_numpy_timeunit(_infer_time_units_from_diff(unique_timedeltas)) - if unit not in ["s", "ms", "us", "ns"]: + if unit not in {"s", "ms", "us", "ns"}: unit = "s" result = result.astype(f"timedelta64[{unit}]") assert np.issubdtype(result.dtype, "timedelta64") @@ -462,7 +462,7 @@ def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: num_timedeltas = np.asarray(num_timedeltas) unit = _netcdf_to_numpy_timeunit(units) as_unit = unit - if unit not in ["s", "ms", "us", "ns"]: + if unit not in {"s", "ms", "us", "ns"}: as_unit = "s" result = ( pd.to_timedelta(ravel(num_timedeltas), unit=unit).as_unit(as_unit).to_numpy() From 5f40b4e78b67ef7fe06eeacbe1f9cb7a2ca5691c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 16 Nov 2024 23:17:48 +0100 Subject: [PATCH 030/134] remove global option --- xarray/coding/times.py | 3 +- xarray/core/options.py | 9 ----- xarray/core/pdcompat.py | 7 ++-- xarray/tests/__init__.py | 4 +-- xarray/tests/test_backends.py | 6 ++-- xarray/tests/test_coding_times.py | 60 ++++++++++++++----------------- xarray/tests/test_conventions.py | 7 ++-- xarray/tests/test_dataset.py | 
11 +++--- 8 files changed, 41 insertions(+), 66 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 5fa1ce11104..cc0a273dd02 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -24,7 +24,6 @@ from xarray.core.common import contains_cftime_datetimes, is_np_datetime_like from xarray.core.duck_array_ops import asarray, ravel, reshape from xarray.core.formatting import first_n_items, format_timestamp, last_item -from xarray.core.options import _get_datetime_resolution from xarray.core.pdcompat import _timestamp_as_unit, default_precision_timestamp from xarray.core.utils import emit_user_level_warning from xarray.core.variable import Variable @@ -326,7 +325,7 @@ def _decode_datetime_with_pandas( try: time_unit, ref_date = _unpack_time_unit_and_ref_date(units) ref_date = _align_reference_date_and_unit(ref_date, time_unit) - ref_date = _align_reference_date_and_unit(ref_date, _get_datetime_resolution()) + ref_date = _align_reference_date_and_unit(ref_date, "s") except ValueError as err: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime diff --git a/xarray/core/options.py b/xarray/core/options.py index 93dec74d7c5..23ec5bb3f73 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -32,7 +32,6 @@ "use_numbagg", "use_opt_einsum", "use_flox", - "time_resolution", ] class T_Options(TypedDict): @@ -60,7 +59,6 @@ class T_Options(TypedDict): use_flox: bool use_numbagg: bool use_opt_einsum: bool - time_resolution: Literal["s", "ms", "us", "ns"] OPTIONS: T_Options = { @@ -88,12 +86,10 @@ class T_Options(TypedDict): "use_flox": True, "use_numbagg": True, "use_opt_einsum": True, - "time_resolution": "s", } _JOIN_OPTIONS = frozenset(["inner", "outer", "left", "right", "exact"]) _DISPLAY_OPTIONS = frozenset(["text", "html"]) -_TIME_RESOLUTION_OPTIONS = frozenset(["s", "ms", "us", "ns"]) def _positive_integer(value: Any) -> bool: @@ -121,7 +117,6 @@ def 
_positive_integer(value: Any) -> bool: "use_opt_einsum": lambda value: isinstance(value, bool), "use_flox": lambda value: isinstance(value, bool), "warn_for_unclosed_files": lambda value: isinstance(value, bool), - "time_resolution": _TIME_RESOLUTION_OPTIONS.__contains__, } @@ -163,10 +158,6 @@ def _get_keep_attrs(default: bool) -> bool: return _get_boolean_with_default("keep_attrs", default) -def _get_datetime_resolution() -> Literal["s", "ms", "us", "ns"]: - return OPTIONS["time_resolution"] - - class set_options: """ Set options for xarray in a controlled context. diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index 55deef65b96..ac6f6a17a35 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -40,7 +40,6 @@ import pandas as pd -from xarray.core.options import _get_datetime_resolution from xarray.core.types import PDDatetimeUnitOptions @@ -96,8 +95,6 @@ def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp: of {"s", "ms", "us", "ns"}. """ dt = pd.Timestamp(*args, **kwargs) - units = ["s", "ms", "us", "ns"] - default = _get_datetime_resolution() - if units.index(default) > units.index(dt.unit): - dt = _timestamp_as_unit(dt, default) + if dt.unit != "ns": + dt = _timestamp_as_unit(dt, "ns") return dt diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index f692933f739..cbc142845fe 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -18,7 +18,7 @@ from xarray import Dataset from xarray.core.duck_array_ops import allclose_or_equiv # noqa: F401 from xarray.core.extension_array import PandasExtensionArray -from xarray.core.options import _get_datetime_resolution, set_options +from xarray.core.options import set_options from xarray.core.variable import IndexVariable from xarray.testing import ( # noqa: F401 assert_chunks_equal, @@ -331,7 +331,7 @@ def create_test_data( pd.date_range( "2000-01-01", periods=20, - unit=_get_datetime_resolution(), + unit="s", ), ) for v, dims in 
sorted(_vars.items()): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8c4e8224500..88c3f2c0bf6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -53,7 +53,7 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import encode_dataset_coordinates from xarray.core import indexing -from xarray.core.options import _get_datetime_resolution, set_options +from xarray.core.options import set_options from xarray.core.utils import module_available from xarray.namedarray.pycompat import array_type from xarray.tests import ( @@ -1618,9 +1618,7 @@ def test_open_encodings(self) -> None: ds.variables["time"][:] = np.arange(10) + 4 expected = Dataset() - time = pd.date_range( - "1999-01-05", periods=10, unit=_get_datetime_resolution() - ) + time = pd.date_range("1999-01-05", periods=10, unit="s") encoding = {"units": units, "dtype": np.dtype("int32")} expected["time"] = ("time", time, {}, encoding) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 578dd29e533..173b3bed781 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -39,7 +39,6 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder from xarray.core.common import contains_cftime_datetimes -from xarray.core.options import _get_datetime_resolution from xarray.core.utils import is_duck_dask_array from xarray.testing import assert_equal, assert_identical from xarray.tests import ( @@ -135,11 +134,12 @@ def test_cf_datetime(num_dates, units, calendar) -> None: max_y = np.ravel(np.atleast_1d(expected))[np.nanargmax(num_dates)] # .year typ = type(min_y) border = typ(1582, 10, 15) - if (calendar == "proleptic_gregorian" and _get_datetime_resolution() != "ns") or ( - min_y >= border and max_y >= border + if ( + calendar == "proleptic_gregorian" + or calendar in _STANDARD_CALENDARS + and 
(min_y >= border and max_y >= border) ): expected = cftime_to_nptime(expected) - with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") actual = decode_cf_datetime(num_dates, units, calendar) @@ -217,15 +217,15 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: import cftime units = "days since 0001-01-01" - unit = _get_datetime_resolution() + unit = "us" times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit=unit, freq="h") # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values # for cftime we get "us" resolution # ns resolution is handled by cftime, too (OutOfBounds) - if calendar != "proleptic_gregorian" or _get_datetime_resolution() == "ns": - unit = "us" + if calendar == "proleptic_gregorian": + unit = "s" expected_dtype = np.dtype(f"M8[{unit}]") actual = decode_cf_datetime(time, units, calendar=calendar) assert actual.dtype == expected_dtype @@ -273,9 +273,8 @@ def test_decode_dates_outside_timestamp_range(calendar) -> None: expected = cftime.num2date( time, units, calendar=calendar, only_use_cftime_datetimes=True ) - # special case proleptic_gregorian - if calendar == "proleptic_gregorian" and _get_datetime_resolution() != "ns": - expected = expected.astype("=M8[us]") + if calendar == "proleptic_gregorian": + expected = cftime_to_nptime(expected) expected_date_type = type(expected[0]) with warnings.catch_warnings(): @@ -295,11 +294,9 @@ def test_decode_standard_calendar_single_element_inside_timestamp_range( calendar, ) -> None: units = "days since 0001-01-01" - unit = ( - _get_datetime_resolution() - if (calendar == "proleptic_gregorian" and _get_datetime_resolution() != "ns") - else "us" - ) + unit = "us" + if calendar == "proleptic_gregorian": + unit = "s" for num_time in [735368, [735368], [[735368]]]: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time 
axis") @@ -347,11 +344,9 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( import cftime units = "days since 0001-01-01" - unit = ( - _get_datetime_resolution() - if (calendar == "proleptic_gregorian" and _get_datetime_resolution() != "ns") - else "us" - ) + unit = "us" + if calendar == "proleptic_gregorian": + unit = "s" times1 = pd.date_range("2001-04-01", end="2001-04-05", freq="D") times2 = pd.date_range("2001-05-01", end="2001-05-05", freq="D") time1 = cftime.date2num(times1.to_pydatetime(), units, calendar=calendar) @@ -435,17 +430,18 @@ def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: expected1 = cftime.num2date(time1, units, calendar, only_use_cftime_datetimes=True) expected2 = cftime.num2date(time2, units, calendar, only_use_cftime_datetimes=True) + if calendar == "proleptic_gregorian": + expected1 = cftime_to_nptime(expected1) + expected2 = cftime_to_nptime(expected2) + with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") actual = decode_cf_datetime(mdim_time, units, calendar=calendar) dtype: np.dtype - if calendar == "proleptic_gregorian" and _get_datetime_resolution() != "ns": - dtype = np.dtype(f"=M8[{_get_datetime_resolution()}]") - expected1 = expected1.astype(dtype) - expected2 = expected2.astype(dtype) - else: - dtype = np.dtype("O") + dtype = np.dtype("O") + if calendar == "proleptic_gregorian": + dtype = np.dtype("M8[s]") assert actual.dtype == dtype @@ -542,7 +538,7 @@ def test_decoded_cf_datetime_array_2d() -> None: ("x", "y"), np.array([[0, 1], [2, 3]]), {"units": "days since 2000-01-01"} ) result = CFDatetimeCoder().decode(variable) - assert result.dtype == f"datetime64[{_get_datetime_resolution()}]" + assert result.dtype == "datetime64[s]" expected = pd.date_range("2000-01-01", periods=4).values.reshape(2, 2) assert_array_equal(np.asarray(result), expected) @@ -711,7 +707,7 @@ def test_decode_cf(calendar) -> None: if calendar not in 
_STANDARD_CALENDARS: assert ds.test.dtype == np.dtype("O") else: - assert ds.test.dtype == np.dtype(f"M8[{_get_datetime_resolution()}]") + assert ds.test.dtype == np.dtype("M8[s]") def test_decode_cf_time_bounds() -> None: @@ -736,7 +732,7 @@ def test_decode_cf_time_bounds() -> None: "calendar": "standard", } dsc = decode_cf(ds) - assert dsc.time_bnds.dtype == np.dtype(f"M8[{_get_datetime_resolution()}]") + assert dsc.time_bnds.dtype == np.dtype("M8[s]") dsc = decode_cf(ds, decode_times=False) assert dsc.time_bnds.dtype == np.dtype("int64") @@ -1313,11 +1309,7 @@ def test_roundtrip_datetime64_nanosecond_precision( assert encoded_var.data.dtype == dtype decoded_var = conventions.decode_cf_variable("foo", encoded_var) - if _get_datetime_resolution() == "ns": - dtypeunit = "ns" - else: - dtypeunit = timeunit - assert decoded_var.dtype == np.dtype(f"=M8[{dtypeunit}]") + assert decoded_var.dtype == np.dtype(f"=M8[{timeunit}]") assert ( decoded_var.encoding["units"] == f"{_numpy_to_netcdf_timeunit(timeunit)} since 1970-01-01 00:00:00" diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index e43848776f6..abf4c950a9f 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -19,7 +19,6 @@ from xarray.backends.common import WritableCFDataStore from xarray.backends.memory import InMemoryDataStore from xarray.conventions import decode_cf -from xarray.core.options import _get_datetime_resolution from xarray.testing import assert_identical from xarray.tests import ( assert_array_equal, @@ -455,7 +454,7 @@ def test_dataset_repr_with_netcdf4_datetimes(self) -> None: attrs = {"units": "days since 1900-01-01"} ds = decode_cf(Dataset({"time": ("time", [0, 1], attrs)})) - assert f"(time) datetime64[{_get_datetime_resolution()}]" in repr(ds) + assert "(time) datetime64[s]" in repr(ds) @requires_cftime def test_decode_cf_datetime_transition_to_invalid(self) -> None: @@ -538,13 +537,13 @@ def test_decode_cf_time_kwargs(self) -> 
None: dsc = conventions.decode_cf(ds) assert dsc.timedelta.dtype == np.dtype("m8[ns]") - assert dsc.time.dtype == np.dtype(f"M8[{_get_datetime_resolution()}]") + assert dsc.time.dtype == np.dtype("M8[s]") dsc = conventions.decode_cf(ds, decode_times=False) assert dsc.timedelta.dtype == np.dtype("int64") assert dsc.time.dtype == np.dtype("int64") dsc = conventions.decode_cf(ds, decode_times=True, decode_timedelta=False) assert dsc.timedelta.dtype == np.dtype("int64") - assert dsc.time.dtype == np.dtype(f"M8[{_get_datetime_resolution()}]") + assert dsc.time.dtype == np.dtype("M8[s]") dsc = conventions.decode_cf(ds, decode_times=False, decode_timedelta=True) assert dsc.timedelta.dtype == np.dtype("m8[ns]") assert dsc.time.dtype == np.dtype("int64") diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index c5f282c13c7..5bdf530a696 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -39,7 +39,6 @@ from xarray.core.common import duck_array_ops, full_like from xarray.core.coordinates import Coordinates, DatasetCoordinates from xarray.core.indexes import Index, PandasIndex -from xarray.core.options import _get_datetime_resolution from xarray.core.types import ArrayLike from xarray.core.utils import is_scalar from xarray.groupers import TimeResampler @@ -301,7 +300,7 @@ def test_repr(self) -> None: Attributes: foo: bar""".format( data["dim3"].dtype, - _get_datetime_resolution(), + "s", ) ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) @@ -444,8 +443,8 @@ def test_info(self) -> None: ds.info(buf=buf) expected = dedent( - f"""\ - xarray.Dataset {{ + """\ + xarray.Dataset { dimensions: \tdim2 = 9 ; \ttime = 20 ; @@ -454,7 +453,7 @@ def test_info(self) -> None: variables: \tfloat64 dim2(dim2) ; - \tdatetime64[{_get_datetime_resolution()}] time(time) ; + \tdatetime64[s] time(time) ; \tfloat64 var1(dim1, dim2) ; \t\tvar1:foo = variable ; \tfloat64 var2(dim1, dim2) ; @@ -466,7 +465,7 @@ def test_info(self) -> None: 
// global attributes: \t:unicode_attr = ba® ; \t:string_attr = bar ; - }}""" + }""" ) actual = buf.getvalue() assert expected == actual From 2a65d8db05800321390e89cfed9d40da13387890 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 17 Nov 2024 13:32:40 +0100 Subject: [PATCH 031/134] mypy thinks `unit` is Literal, because the pandas-stubs suggest so, but the pandas code tells us otherwise --- xarray/tests/test_coding_times.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 173b3bed781..153ae86643d 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -3,7 +3,7 @@ import warnings from datetime import timedelta from itertools import product -from typing import Literal +from typing import Final, Literal import numpy as np import pandas as pd @@ -217,7 +217,7 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: import cftime units = "days since 0001-01-01" - unit = "us" + unit: Final = "us" times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit=unit, freq="h") # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) From 43f7d61bdeadceb9f08e4ae00807d5a14d8dd6db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 17 Nov 2024 14:06:49 +0100 Subject: [PATCH 032/134] ignore mypy arg-type --- xarray/tests/test_coding_times.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 153ae86643d..6a13abcafd1 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -3,7 +3,7 @@ import warnings from datetime import timedelta from itertools import product -from typing import Final, Literal +from typing import Literal import numpy as np import pandas as pd @@ -217,8 +217,8 @@ def 
test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: import cftime units = "days since 0001-01-01" - unit: Final = "us" - times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit=unit, freq="h") + unit = "us" + times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit=unit, freq="h") # type: ignore[arg-type] # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values From 59934b935139aa1a35037a2394e7110511420297 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 17 Nov 2024 14:17:12 +0100 Subject: [PATCH 033/134] fix docstring of `default_precision_timestamp` --- xarray/core/pdcompat.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/xarray/core/pdcompat.py b/xarray/core/pdcompat.py index ac6f6a17a35..271f4cadcb3 100644 --- a/xarray/core/pdcompat.py +++ b/xarray/core/pdcompat.py @@ -90,9 +90,7 @@ def _timestamp_as_unit(date: pd.Timestamp, unit: str) -> pd.Timestamp: def default_precision_timestamp(*args, **kwargs) -> pd.Timestamp: """Return a Timestamp object with the default precision. - Xarray default is "ns". This can be overridden by setting - set_options(time_resolution="us") or any other resolution - of {"s", "ms", "us", "ns"}. + Xarray default is "ns". """ dt = pd.Timestamp(*args, **kwargs) if dt.unit != "ns": From a01f9f3f2ba651fdb94d80a14e5a5b22bef45533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 17 Nov 2024 21:46:47 +0100 Subject: [PATCH 034/134] add 'time_unit'-kwarg to decode_cf and descendent functions with "ns" as default. 
--- xarray/coding/times.py | 40 ++++++++++--- xarray/conventions.py | 15 ++++- xarray/tests/test_coding_times.py | 98 +++++++++++++++++++------------ 3 files changed, 106 insertions(+), 47 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index cc0a273dd02..59535011793 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -217,7 +217,11 @@ def _unpack_time_unit_and_ref_date( def _decode_cf_datetime_dtype( - data, units: str, calendar: str | None, use_cftime: bool | None + data, + units: str, + calendar: str | None, + use_cftime: bool | None, + time_unit: Literal["s", "ms", "us", "ns"] = "ns", ) -> np.dtype: # Verify that at least the first and last date can be decoded # successfully. Otherwise, tracebacks end up swallowed by @@ -228,7 +232,9 @@ def _decode_cf_datetime_dtype( ) try: - result = decode_cf_datetime(example_value, units, calendar, use_cftime) + result = decode_cf_datetime( + example_value, units, calendar, use_cftime, time_unit + ) except Exception as err: calendar_msg = ( "the default calendar" if calendar is None else f"calendar {calendar!r}" @@ -305,7 +311,10 @@ def _check_date_is_after_shift(date: pd.Timestamp, calendar: str) -> None: def _decode_datetime_with_pandas( - flat_num_dates: np.ndarray, units: str, calendar: str + flat_num_dates: np.ndarray, + units: str, + calendar: str, + time_resolution: Literal["s", "ms", "us", "ns"] = "ns", ) -> np.ndarray: if not _is_standard_calendar(calendar): raise OutOfBoundsDatetime( @@ -325,7 +334,8 @@ def _decode_datetime_with_pandas( try: time_unit, ref_date = _unpack_time_unit_and_ref_date(units) ref_date = _align_reference_date_and_unit(ref_date, time_unit) - ref_date = _align_reference_date_and_unit(ref_date, "s") + # here the highest wanted resolution is set + ref_date = _align_reference_date_and_unit(ref_date, time_resolution) except ValueError as err: # ValueError is raised by pd.Timestamp for non-ISO timestamp # strings, in which case we fall back to using cftime 
@@ -378,7 +388,11 @@ def _decode_datetime_with_pandas( def decode_cf_datetime( - num_dates, units: str, calendar: str | None = None, use_cftime: bool | None = None + num_dates, + units: str, + calendar: str | None = None, + use_cftime: bool | None = None, + time_unit: Literal["s", "ms", "us", "ns"] = "ns", ) -> np.ndarray: """Given an array of numeric dates in netCDF format, convert it into a numpy array of date time objects. @@ -401,7 +415,9 @@ def decode_cf_datetime( if use_cftime is None: try: - dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar) + dates = _decode_datetime_with_pandas( + flat_num_dates, units, calendar, time_unit + ) except (KeyError, OutOfBoundsDatetime, OutOfBoundsTimedelta, OverflowError): dates = _decode_datetime_with_cftime( flat_num_dates.astype(float), units, calendar @@ -1060,8 +1076,13 @@ def _lazily_encode_cf_timedelta( class CFDatetimeCoder(VariableCoder): - def __init__(self, use_cftime: bool | None = None) -> None: + def __init__( + self, + use_cftime: bool | None = None, + time_unit: Literal["s", "ms", "us", "ns"] = "ns", + ) -> None: self.use_cftime = use_cftime + self.time_unit = time_unit def encode(self, variable: Variable, name: T_Name = None) -> Variable: if np.issubdtype( @@ -1088,12 +1109,15 @@ def decode(self, variable: Variable, name: T_Name = None) -> Variable: units = pop_to(attrs, encoding, "units") calendar = pop_to(attrs, encoding, "calendar") - dtype = _decode_cf_datetime_dtype(data, units, calendar, self.use_cftime) + dtype = _decode_cf_datetime_dtype( + data, units, calendar, self.use_cftime, self.time_unit + ) transform = partial( decode_cf_datetime, units=units, calendar=calendar, use_cftime=self.use_cftime, + time_unit=self.time_unit, ) data = lazy_elemwise_func(data, transform, dtype) diff --git a/xarray/conventions.py b/xarray/conventions.py index e4e71a481e8..8d517cda6a5 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -214,6 +214,7 @@ def decode_cf_variable( 
stack_char_dim: bool = True, use_cftime: bool | None = None, decode_timedelta: bool | None = None, + time_unit: Literal["s", "ms", "us", "ns"] = "ns", ) -> Variable: """ Decodes a variable which may hold CF encoded information. @@ -254,6 +255,9 @@ def decode_cf_variable( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. + time_unit : Literal["s", "ms", "us", "ns], optional + Time unit to which resolution cf times should at least be decoded. + Defaults to "ns". Returns ------- @@ -291,7 +295,9 @@ def decode_cf_variable( if decode_timedelta: var = times.CFTimedeltaCoder().decode(var, name=name) if decode_times: - var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name) + var = times.CFDatetimeCoder(use_cftime=use_cftime, time_unit=time_unit).decode( + var, name=name + ) if decode_endianness and not var.dtype.isnative: var = variables.EndianCoder().decode(var) @@ -407,6 +413,7 @@ def decode_cf_variables( drop_variables: T_DropVariables = None, use_cftime: bool | Mapping[str, bool] | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, + time_unit: Literal["s", "ms", "us", "ns"] = "ns", ) -> tuple[T_Variables, T_Attrs, set[Hashable]]: """ Decode several CF encoded variables. @@ -459,6 +466,7 @@ def stackable(dim: Hashable) -> bool: stack_char_dim=stack_char_dim, use_cftime=_item_or_default(use_cftime, k, None), decode_timedelta=_item_or_default(decode_timedelta, k, None), + time_unit=time_unit, ) except Exception as e: raise type(e)(f"Failed to decode variable {k!r}: {e}") from e @@ -544,6 +552,7 @@ def decode_cf( drop_variables: T_DropVariables = None, use_cftime: bool | None = None, decode_timedelta: bool | None = None, + time_unit: Literal["s", "ms", "us", "ns"] = "ns", ) -> Dataset: """Decode the given Dataset or Datastore according to CF conventions into a new Dataset. 
@@ -588,6 +597,9 @@ def decode_cf( {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} into timedelta objects. If False, leave them encoded as numbers. If None (default), assume the same value of decode_time. + time_unit : Literal["s", "ms", "us", "ns], optional + Time unit to which resolution cf times should at least be decoded. + Defaults to "ns". Returns ------- @@ -622,6 +634,7 @@ def decode_cf( drop_variables=drop_variables, use_cftime=use_cftime, decode_timedelta=decode_timedelta, + time_unit=time_unit, ) ds = Dataset(vars, attrs=attrs) ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 6a13abcafd1..ef2eabe9551 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -123,7 +123,8 @@ def _all_cftime_date_types(): @pytest.mark.filterwarnings("ignore:Ambiguous reference date string") @pytest.mark.filterwarnings("ignore:Times can't be serialized faithfully") @pytest.mark.parametrize(["num_dates", "units", "calendar"], _CF_DATETIME_TESTS) -def test_cf_datetime(num_dates, units, calendar) -> None: +@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) +def test_cf_datetime(num_dates, units, calendar, time_unit) -> None: import cftime expected = cftime.num2date( @@ -134,15 +135,13 @@ def test_cf_datetime(num_dates, units, calendar) -> None: max_y = np.ravel(np.atleast_1d(expected))[np.nanargmax(num_dates)] # .year typ = type(min_y) border = typ(1582, 10, 15) - if ( - calendar == "proleptic_gregorian" - or calendar in _STANDARD_CALENDARS - and (min_y >= border and max_y >= border) + if (calendar == "proleptic_gregorian" and time_unit != "ns") or ( + calendar in _STANDARD_CALENDARS and (min_y >= border and max_y >= border) ): expected = cftime_to_nptime(expected) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(num_dates, 
units, calendar) + actual = decode_cf_datetime(num_dates, units, calendar, time_unit=time_unit) abs_diff = np.asarray(abs(actual - expected)).ravel() abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() @@ -213,21 +212,21 @@ def test_decode_cf_datetime_non_iso_strings() -> None: @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -def test_decode_standard_calendar_inside_timestamp_range(calendar) -> None: +@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) +def test_decode_standard_calendar_inside_timestamp_range(calendar, unit) -> None: import cftime units = "days since 0001-01-01" - unit = "us" - times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit=unit, freq="h") # type: ignore[arg-type] + times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit="us", freq="h") # type: ignore[arg-type] # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values # for cftime we get "us" resolution # ns resolution is handled by cftime, too (OutOfBounds) - if calendar == "proleptic_gregorian": - unit = "s" + actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=unit) + if calendar != "proleptic_gregorian" or unit == "ns": + unit = "us" expected_dtype = np.dtype(f"M8[{unit}]") - actual = decode_cf_datetime(time, units, calendar=calendar) assert actual.dtype == expected_dtype abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, @@ -261,7 +260,8 @@ def test_decode_non_standard_calendar_inside_timestamp_range(calendar) -> None: @requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -def test_decode_dates_outside_timestamp_range(calendar) -> None: +@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) +def test_decode_dates_outside_timestamp_range(calendar, time_unit) -> None: from datetime import datetime import cftime @@ -273,13 +273,13 @@ def 
test_decode_dates_outside_timestamp_range(calendar) -> None: expected = cftime.num2date( time, units, calendar=calendar, only_use_cftime_datetimes=True ) - if calendar == "proleptic_gregorian": + if calendar == "proleptic_gregorian" and time_unit != "ns": expected = cftime_to_nptime(expected) expected_date_type = type(expected[0]) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(time, units, calendar=calendar) + actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=time_unit) assert all(isinstance(value, expected_date_type) for value in actual) abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, @@ -290,17 +290,20 @@ def test_decode_dates_outside_timestamp_range(calendar) -> None: @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) +@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) def test_decode_standard_calendar_single_element_inside_timestamp_range( - calendar, + calendar, time_unit ) -> None: units = "days since 0001-01-01" unit = "us" - if calendar == "proleptic_gregorian": - unit = "s" + if calendar == "proleptic_gregorian" and time_unit != "ns": + unit = time_unit for num_time in [735368, [735368], [[735368]]]: with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(num_time, units, calendar=calendar) + actual = decode_cf_datetime( + num_time, units, calendar=calendar, time_unit=time_unit + ) assert actual.dtype == np.dtype(f"M8[{unit}]") @@ -338,15 +341,17 @@ def test_decode_single_element_outside_timestamp_range(calendar) -> None: @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) +@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) def test_decode_standard_calendar_multidim_time_inside_timestamp_range( calendar, + time_unit, ) -> None: import cftime units = "days since 
0001-01-01" unit = "us" - if calendar == "proleptic_gregorian": - unit = "s" + if calendar == "proleptic_gregorian" and time_unit != "ns": + unit = time_unit times1 = pd.date_range("2001-04-01", end="2001-04-05", freq="D") times2 = pd.date_range("2001-05-01", end="2001-05-05", freq="D") time1 = cftime.date2num(times1.to_pydatetime(), units, calendar=calendar) @@ -358,7 +363,9 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( expected1 = times1.values expected2 = times2.values - actual = decode_cf_datetime(mdim_time, units, calendar=calendar) + actual = decode_cf_datetime( + mdim_time, units, calendar=calendar, time_unit=time_unit + ) assert actual.dtype == np.dtype(f"M8[{unit}]") abs_diff1 = abs(actual[:, 0] - expected1) @@ -413,7 +420,8 @@ def test_decode_nonstandard_calendar_multidim_time_inside_timestamp_range( @requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: +@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) +def test_decode_multidim_time_outside_timestamp_range(calendar, time_unit) -> None: from datetime import datetime import cftime @@ -430,18 +438,20 @@ def test_decode_multidim_time_outside_timestamp_range(calendar) -> None: expected1 = cftime.num2date(time1, units, calendar, only_use_cftime_datetimes=True) expected2 = cftime.num2date(time2, units, calendar, only_use_cftime_datetimes=True) - if calendar == "proleptic_gregorian": + if calendar == "proleptic_gregorian" and time_unit != "ns": expected1 = cftime_to_nptime(expected1) expected2 = cftime_to_nptime(expected2) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(mdim_time, units, calendar=calendar) + actual = decode_cf_datetime( + mdim_time, units, calendar=calendar, time_unit=time_unit + ) dtype: np.dtype dtype = np.dtype("O") - if calendar == "proleptic_gregorian": - dtype = np.dtype("M8[s]") + 
if calendar == "proleptic_gregorian" and time_unit != "ns": + dtype = np.dtype(f"M8[{time_unit}]") assert actual.dtype == dtype @@ -532,13 +542,14 @@ def test_cf_datetime_nan(num_dates, units, expected_list) -> None: @requires_cftime -def test_decoded_cf_datetime_array_2d() -> None: +@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) +def test_decoded_cf_datetime_array_2d(time_unit) -> None: # regression test for GH1229 variable = Variable( ("x", "y"), np.array([[0, 1], [2, 3]]), {"units": "days since 2000-01-01"} ) - result = CFDatetimeCoder().decode(variable) - assert result.dtype == "datetime64[s]" + result = CFDatetimeCoder(time_unit=time_unit).decode(variable) + assert result.dtype == f"datetime64[{time_unit}]" expected = pd.date_range("2000-01-01", periods=4).values.reshape(2, 2) assert_array_equal(np.asarray(result), expected) @@ -688,7 +699,8 @@ def test_format_cftime_datetime(date_args, expected) -> None: @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -def test_decode_cf(calendar) -> None: +@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) +def test_decode_cf(calendar, time_unit) -> None: days = [1.0, 2.0, 3.0] # TODO: GH5690 — do we want to allow this type for `coords`? 
da = DataArray(days, coords=[days], dims=["time"], name="test") @@ -702,15 +714,16 @@ def test_decode_cf(calendar) -> None: with pytest.raises(ValueError): ds = decode_cf(ds) else: - ds = decode_cf(ds) + ds = decode_cf(ds, time_unit=time_unit) if calendar not in _STANDARD_CALENDARS: assert ds.test.dtype == np.dtype("O") else: - assert ds.test.dtype == np.dtype("M8[s]") + assert ds.test.dtype == np.dtype(f"M8[{time_unit}]") -def test_decode_cf_time_bounds() -> None: +@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) +def test_decode_cf_time_bounds(time_unit) -> None: da = DataArray( np.arange(6, dtype="int64").reshape((3, 2)), coords={"time": [1, 2, 3]}, @@ -731,8 +744,8 @@ def test_decode_cf_time_bounds() -> None: "units": "days since 2001-01", "calendar": "standard", } - dsc = decode_cf(ds) - assert dsc.time_bnds.dtype == np.dtype("M8[s]") + dsc = decode_cf(ds, time_unit=time_unit) + assert dsc.time_bnds.dtype == np.dtype(f"M8[{time_unit}]") dsc = decode_cf(ds, decode_times=False) assert dsc.time_bnds.dtype == np.dtype("int64") @@ -1281,12 +1294,14 @@ def test_contains_cftime_lazy() -> None: ("1677-09-21T00:21:52.901038080", "ns", np.float32, 20.0, True), ], ) +@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) def test_roundtrip_datetime64_nanosecond_precision( timestr: str, timeunit: Literal["ns", "us"], dtype: np.typing.DTypeLike, fill_value: int | float | None, use_encoding: bool, + time_unit: Literal["s", "ms", "us", "ns"], ) -> None: # test for GH7817 time = np.datetime64(timestr, timeunit) @@ -1307,9 +1322,16 @@ def test_roundtrip_datetime64_nanosecond_precision( ) assert encoded_var.attrs["calendar"] == "proleptic_gregorian" assert encoded_var.data.dtype == dtype + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, time_unit=time_unit + ) - decoded_var = conventions.decode_cf_variable("foo", encoded_var) - assert decoded_var.dtype == np.dtype(f"=M8[{timeunit}]") + result_unit = ( + timeunit + if np.timedelta64(1, 
timeunit) <= np.timedelta64(1, time_unit) + else time_unit + ) + assert decoded_var.dtype == np.dtype(f"=M8[{result_unit}]") assert ( decoded_var.encoding["units"] == f"{_numpy_to_netcdf_timeunit(timeunit)} since 1970-01-01 00:00:00" From 8b9112843aec1727a2d7eb4228b0bc34bcadd318 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 17 Nov 2024 22:07:15 +0100 Subject: [PATCH 035/134] fix tests --- xarray/tests/__init__.py | 2 +- xarray/tests/test_backends.py | 2 +- xarray/tests/test_coding_times.py | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index cbc142845fe..7f6cea693cf 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -331,7 +331,7 @@ def create_test_data( pd.date_range( "2000-01-01", periods=20, - unit="s", + unit="ns", ), ) for v, dims in sorted(_vars.items()): diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 88c3f2c0bf6..e4093213e64 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -1618,7 +1618,7 @@ def test_open_encodings(self) -> None: ds.variables["time"][:] = np.arange(10) + 4 expected = Dataset() - time = pd.date_range("1999-01-05", periods=10, unit="s") + time = pd.date_range("1999-01-05", periods=10, unit="ns") encoding = {"units": units, "dtype": np.dtype("int32")} expected["time"] = ("time", time, {}, encoding) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index ef2eabe9551..b0b2a91e736 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -671,13 +671,12 @@ def test_cf_timedelta_2d() -> None: ["deltas", "expected"], [ (pd.to_timedelta(["1 day", "2 days"]), "days"), - (pd.to_timedelta(["1 day", "2 days"]), "days"), - (pd.to_timedelta(["1 day", "2 days"]), "days"), - (pd.to_timedelta(["1 day", "2 days"]), "days"), + (pd.to_timedelta(["1h", "1 day 1 hour"]), "hours"), + (pd.to_timedelta(["1m", 
"2m", np.nan]), "minutes"), + (pd.to_timedelta(["1m3s", "1m4s"]), "seconds"), ], ) def test_infer_timedelta_units(deltas, expected) -> None: - # todo: why testing, the same thing four times? assert expected == infer_timedelta_units(deltas) From 0e351cadca6551f1a46896c365cf4521732e69f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 17 Nov 2024 22:35:31 +0100 Subject: [PATCH 036/134] fix more tests --- xarray/tests/test_coding_times.py | 2 +- xarray/tests/test_conventions.py | 20 ++++++++++++-------- xarray/tests/test_dataset.py | 4 ++-- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index b0b2a91e736..3392d189db0 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -217,7 +217,7 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar, unit) -> None import cftime units = "days since 0001-01-01" - times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit="us", freq="h") # type: ignore[arg-type] + times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit="us", freq="h") # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index abf4c950a9f..465130b79d4 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -444,7 +444,8 @@ def test_invalid_timedelta_units_do_not_decode(self, decode_times) -> None: assert_identical(expected, decode_cf(ds, decode_times=decode_times)) @requires_cftime - def test_dataset_repr_with_netcdf4_datetimes(self) -> None: + @pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) + def test_dataset_repr_with_netcdf4_datetimes(self, time_unit) -> None: # regression test for #347 attrs = {"units": "days since 0001-01-01", "calendar": "noleap"} with 
warnings.catch_warnings(): @@ -453,8 +454,8 @@ def test_dataset_repr_with_netcdf4_datetimes(self) -> None: assert "(time) object" in repr(ds) attrs = {"units": "days since 1900-01-01"} - ds = decode_cf(Dataset({"time": ("time", [0, 1], attrs)})) - assert "(time) datetime64[s]" in repr(ds) + ds = decode_cf(Dataset({"time": ("time", [0, 1], attrs)}), time_unit=time_unit) + assert f"(time) datetime64[{time_unit}]" in repr(ds) @requires_cftime def test_decode_cf_datetime_transition_to_invalid(self) -> None: @@ -513,7 +514,8 @@ def test_decode_dask_times(self) -> None: conventions.decode_cf(original).chunk(), ) - def test_decode_cf_time_kwargs(self) -> None: + @pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) + def test_decode_cf_time_kwargs(self, time_unit) -> None: ds = Dataset.from_dict( { "coords": { @@ -535,15 +537,17 @@ def test_decode_cf_time_kwargs(self) -> None: } ) - dsc = conventions.decode_cf(ds) + dsc = conventions.decode_cf(ds, time_unit=time_unit) assert dsc.timedelta.dtype == np.dtype("m8[ns]") - assert dsc.time.dtype == np.dtype("M8[s]") + assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]") dsc = conventions.decode_cf(ds, decode_times=False) assert dsc.timedelta.dtype == np.dtype("int64") assert dsc.time.dtype == np.dtype("int64") - dsc = conventions.decode_cf(ds, decode_times=True, decode_timedelta=False) + dsc = conventions.decode_cf( + ds, decode_times=True, time_unit=time_unit, decode_timedelta=False + ) assert dsc.timedelta.dtype == np.dtype("int64") - assert dsc.time.dtype == np.dtype("M8[s]") + assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]") dsc = conventions.decode_cf(ds, decode_times=False, decode_timedelta=True) assert dsc.timedelta.dtype == np.dtype("m8[ns]") assert dsc.time.dtype == np.dtype("int64") diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 5bdf530a696..8890aea85f4 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -300,7 +300,7 @@ def test_repr(self) -> 
None: Attributes: foo: bar""".format( data["dim3"].dtype, - "s", + "ns", ) ) actual = "\n".join(x.rstrip() for x in repr(data).split("\n")) @@ -453,7 +453,7 @@ def test_info(self) -> None: variables: \tfloat64 dim2(dim2) ; - \tdatetime64[s] time(time) ; + \tdatetime64[ns] time(time) ; \tfloat64 var1(dim1, dim2) ; \t\tvar1:foo = variable ; \tfloat64 var2(dim1, dim2) ; From 07a8e9cd299bb0113f364c5faec1e72e9bbf3528 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 17 Nov 2024 23:05:51 +0100 Subject: [PATCH 037/134] fix docstring --- xarray/conventions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 8d517cda6a5..ec31f3f1c9e 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -255,7 +255,7 @@ def decode_cf_variable( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. - time_unit : Literal["s", "ms", "us", "ns], optional + time_unit : Literal["s", "ms", "us", "ns"], optional Time unit to which resolution cf times should at least be decoded. Defaults to "ns". @@ -597,7 +597,7 @@ def decode_cf( {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} into timedelta objects. If False, leave them encoded as numbers. If None (default), assume the same value of decode_time. - time_unit : Literal["s", "ms", "us", "ns], optional + time_unit : Literal["s", "ms", "us", "ns"], optional Time unit to which resolution cf times should at least be decoded. Defaults to "ns". 
From 2be573989f9d2696f4d48012a2dab8bb56ea0836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 17 Nov 2024 23:31:27 +0100 Subject: [PATCH 038/134] use pd.Timestamp(np.datetime64(cftime)) to convert from cftime to numpy --- xarray/coding/times.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 59535011793..a612622ed4d 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -579,15 +579,11 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: dt: pd.Timestamp | Literal["NaT"] for _i, t in np.ndenumerate(times): try: - # todo: decide how to work with this - # as initialized by string pd.Timestamp is defined only from year -9999-01-01 to 9999-12-31 - # Use pandas.Timestamp in place of datetime.datetime, because - # NumPy casts it safely it np.datetime64[ns] for dates outside - # 1678 to 2262 (this is not currently the case for - # datetime.datetime). - dt = pd.Timestamp( - t.year, t.month, t.day, t.hour, t.minute, t.second, t.microsecond - ) + # When initialized by string pd.Timestamp is defined only + # from year -9999-01-01 to 9999-12-31. Therefore we wrap + # the times by np.datetime64 before. This works as long we do + # not overflow (eg. for dates outside 1678 to 2262). 
+ dt = pd.Timestamp(np.datetime64(t)) except ValueError as e: if raise_on_invalid: raise ValueError( From b9d0a8ea72f047c7e496d8a7e260566c6cad634e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 18 Nov 2024 10:23:21 +0100 Subject: [PATCH 039/134] use dt = np.datetime64(cftime.isoformat()) to convert from cftime to numpy datetime64 --- xarray/coding/times.py | 14 ++++++-------- xarray/tests/test_cftimeindex.py | 4 ++-- xarray/tests/test_coding_times.py | 4 ++-- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index a612622ed4d..925d4d38a47 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -576,14 +576,12 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: Otherwise, the invalid element is replaced by np.NaT.""" times = np.asarray(times) new = [] - dt: pd.Timestamp | Literal["NaT"] + dt: np.datetime64 for _i, t in np.ndenumerate(times): try: - # When initialized by string pd.Timestamp is defined only - # from year -9999-01-01 to 9999-12-31. Therefore we wrap - # the times by np.datetime64 before. This works as long we do - # not overflow (eg. for dates outside 1678 to 2262). - dt = pd.Timestamp(np.datetime64(t)) + # We expect either "us" resolution or "s" resolution depending on + # whether 'microseconds' are defined for the input or not. + dt = np.datetime64(t.isoformat()) except ValueError as e: if raise_on_invalid: raise ValueError( @@ -591,8 +589,8 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: f"standard calendar. Reason: {e}." 
) from e else: - dt = "NaT" - new.append(np.datetime64(dt)) + dt = np.datetime64("NaT") + new.append(dt) return np.asarray(new).reshape(times.shape) diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index 9ae75c70557..710a613b572 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -1159,7 +1159,7 @@ def test_strftime_of_cftime_array(calendar): @pytest.mark.parametrize("unsafe", [False, True]) def test_to_datetimeindex(calendar, unsafe): index = xr.cftime_range("2000", periods=5, calendar=calendar) - expected = pd.date_range("2000", periods=5, unit="us") + expected = pd.date_range("2000", periods=5, unit="s") if calendar in _NON_STANDARD_CALENDARS and not unsafe: with pytest.warns(RuntimeWarning, match="non-standard"): @@ -1204,7 +1204,7 @@ def test_multiindex(): @pytest.mark.parametrize("method", ["floor", "ceil", "round"]) def test_rounding_methods_against_datetimeindex(freq, method): # todo: check, if setting to "us" is enough - expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s", unit="us") + expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s", unit="s") expected = getattr(expected, method)(freq) result = xr.cftime_range("2000-01-02T01:03:51", periods=10, freq="1777s") result = getattr(result, method)(freq).to_datetimeindex() diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 3392d189db0..537ab8e2554 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -295,7 +295,7 @@ def test_decode_standard_calendar_single_element_inside_timestamp_range( calendar, time_unit ) -> None: units = "days since 0001-01-01" - unit = "us" + unit = "s" if calendar == "proleptic_gregorian" and time_unit != "ns": unit = time_unit for num_time in [735368, [735368], [[735368]]]: @@ -349,7 +349,7 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( import cftime units = "days since 
0001-01-01" - unit = "us" + unit = "s" if calendar == "proleptic_gregorian" and time_unit != "ns": unit = time_unit times1 = pd.date_range("2001-04-01", end="2001-04-05", freq="D") From 08afc3bc66eeaa7763249b094771986beb58a6ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 18 Nov 2024 10:45:25 +0100 Subject: [PATCH 040/134] fix time-coding.rst --- doc/internals/time-coding.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index 7c249bd2c6b..36d227d2fb2 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -341,36 +341,36 @@ Decoding of ``values`` with time unit specification like ``seconds since 1992-10 calendar = "proleptic_gregorian" values = np.array([-1000 * 365, 0, 1000 * 365], dtype="int64") units = "days since 2000-01-01 00:00:00.000001" - dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") print(dt) assert dt.dtype == "datetime64[us]" units = "microseconds since 2000-01-01 00:00:00" - dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") print(dt) assert dt.dtype == "datetime64[us]" values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") units = "days since 2000-01-01 00:00:00.001" - dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") print(dt) assert dt.dtype == "datetime64[ms]" values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") units = "hours since 2000-01-01" - dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") print(dt) assert dt.dtype == "datetime64[s]" values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") 
units = "hours since 2000-01-01 00:00:00 03:30" - dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") print(dt) assert dt.dtype == "datetime64[s]" values = np.array([-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64") units = "days since 0001-01-01 00:00:00" - dt = xr.coding.times.decode_cf_datetime(values, units, calendar) + dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") print(dt) assert dt.dtype == "datetime64[s]" From edc55e1f702440a5bea9e79926ca709f4a9386b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 18 Nov 2024 11:16:26 +0100 Subject: [PATCH 041/134] use us in to_datetimeindex --- xarray/coding/cftimeindex.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 18877defa73..18ce2f464ec 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -652,7 +652,8 @@ def to_datetimeindex(self, unsafe=False): if not self._data.size: return pd.DatetimeIndex([]) - nptimes = cftime_to_nptime(self) + # transform to us-resolution is needed for DatetimeIndex + nptimes = cftime_to_nptime(self).astype("=M8[us]") calendar = infer_calendar_name(self) if calendar not in _STANDARD_CALENDARS and not unsafe: warnings.warn( From bffe9190e76830301f5a1adcdf8fadbaa7be2af2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 18 Nov 2024 11:22:01 +0100 Subject: [PATCH 042/134] revert back to us for datetimeindex tests --- xarray/tests/test_cftimeindex.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index 710a613b572..9ae75c70557 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -1159,7 +1159,7 @@ def test_strftime_of_cftime_array(calendar): 
@pytest.mark.parametrize("unsafe", [False, True]) def test_to_datetimeindex(calendar, unsafe): index = xr.cftime_range("2000", periods=5, calendar=calendar) - expected = pd.date_range("2000", periods=5, unit="s") + expected = pd.date_range("2000", periods=5, unit="us") if calendar in _NON_STANDARD_CALENDARS and not unsafe: with pytest.warns(RuntimeWarning, match="non-standard"): @@ -1204,7 +1204,7 @@ def test_multiindex(): @pytest.mark.parametrize("method", ["floor", "ceil", "round"]) def test_rounding_methods_against_datetimeindex(freq, method): # todo: check, if setting to "us" is enough - expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s", unit="s") + expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s", unit="us") expected = getattr(expected, method)(freq) result = xr.cftime_range("2000-01-02T01:03:51", periods=10, freq="1777s") result = getattr(result, method)(freq).to_datetimeindex() From 150b98236fe62fcfcaa067490ffc23c8a81dd3b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 18 Nov 2024 15:52:13 +0100 Subject: [PATCH 043/134] estimate fitting resolution for floating point values, when decoding times --- xarray/coding/times.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 925d4d38a47..bac36388143 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -377,6 +377,23 @@ def _decode_datetime_with_pandas( flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit) time_unit = ref_date.unit + # estimate fitting resolution for floating point values + if flat_num_dates.dtype.kind == "f": + res = ["s", "ms", "us", "ns"] + has_decimal = lambda x: ((x % 1) > 0).any() + while has_decimal(flat_num_dates) and time_unit != "ns": + idx = res.index(time_unit) + new_time_unit = res[idx + 1] + msg = ( + f"Can't decode floating point datetime to {time_unit} without precision loss," + f"decoding to 
{new_time_unit} instead." + ) + emit_user_level_warning(msg, SerializationWarning) + flat_num_dates *= np.int64( + _NS_PER_TIME_DELTA[time_unit] / _NS_PER_TIME_DELTA[new_time_unit] + ) + time_unit = new_time_unit + # Cast input ordinals to integers and properly handle NaN/NaT # to prevent casting NaN to int flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64) @@ -419,6 +436,7 @@ def decode_cf_datetime( flat_num_dates, units, calendar, time_unit ) except (KeyError, OutOfBoundsDatetime, OutOfBoundsTimedelta, OverflowError): + print("decoding with cftime:", flat_num_dates.dtype) dates = _decode_datetime_with_cftime( flat_num_dates.astype(float), units, calendar ) @@ -446,7 +464,7 @@ def decode_cf_datetime( elif use_cftime: dates = _decode_datetime_with_cftime(flat_num_dates, units, calendar) else: - dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar) + dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar, time_unit) return reshape(dates, num_dates.shape) From 7113cebf1d925025035b16d35dcea5ff2c83c9cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 18 Nov 2024 15:58:43 +0100 Subject: [PATCH 044/134] add test --- xarray/tests/test_coding_times.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 537ab8e2554..e437f4237e1 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1253,6 +1253,16 @@ def test_decode_float_datetime(): np.testing.assert_equal(actual, expected) +def test_decode_float_datetime_with_decimals(): + # test resolution enhancement for floats + values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") + units = "seconds since 2000-01-01" + calendar = "standard" + with pytest.warns(SerializationWarning): + actual = decode_cf_datetime(values, units, calendar, time_unit="s") + assert actual.dtype == np.dtype("=M8[ms]") + + @requires_cftime def 
test_scalar_unit() -> None: # test that a scalar units (often NaN when using to_netcdf) does not raise an error From 7f47f0b4d6abd9256b1bfadd7c31e44aef2f4ac9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 18 Nov 2024 17:54:12 +0100 Subject: [PATCH 045/134] refactor floating point decoding --- xarray/coding/times.py | 36 +++++++++++++++++++++---------- xarray/tests/test_coding_times.py | 16 ++++++++++++-- 2 files changed, 39 insertions(+), 13 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index bac36388143..c640c6df5e1 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -377,22 +377,36 @@ def _decode_datetime_with_pandas( flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit) time_unit = ref_date.unit - # estimate fitting resolution for floating point values - if flat_num_dates.dtype.kind == "f": + def _check_higher_resolution( + flat_num_dates: np.ndarray, time_unit="s" + ) -> np.ndarray: res = ["s", "ms", "us", "ns"] - has_decimal = lambda x: ((x % 1) > 0).any() - while has_decimal(flat_num_dates) and time_unit != "ns": + fract = np.unique(flat_num_dates % 1) + old_time_unit = time_unit + if (fract > 0).any() and time_unit != "ns": idx = res.index(time_unit) - new_time_unit = res[idx + 1] + time_unit = res[idx + 1] + flat_num_dates *= 1000 + fract = np.unique(flat_num_dates % 1) + # If the elements can evenly divide the 'units' + # we can stop after this iteration. + # Otherwise we continue until we reach "ns" resolution. + if (np.unique((1 / fract[fract > 0]) % 1) > 0).any(): + flat_num_dates, time_unit = _check_higher_resolution( + flat_num_dates, time_unit + ) + if old_time_unit != time_unit: msg = ( - f"Can't decode floating point datetime to {time_unit} without precision loss," - f"decoding to {new_time_unit} instead." + f"Can't decode floating point datetime to {old_time_unit!r} without precision loss," + f"decoding to {time_unit!r} instead. 
To silence this warning use " + f"time_unit={time_unit!r} in call to decoding function." ) emit_user_level_warning(msg, SerializationWarning) - flat_num_dates *= np.int64( - _NS_PER_TIME_DELTA[time_unit] / _NS_PER_TIME_DELTA[new_time_unit] - ) - time_unit = new_time_unit + return flat_num_dates, time_unit + + # estimate fitting resolution for floating point values + if flat_num_dates.dtype.kind == "f": + flat_num_dates, time_unit = _check_higher_resolution(flat_num_dates, time_unit) # Cast input ordinals to integers and properly handle NaN/NaT # to prevent casting NaN to int diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index e437f4237e1..6e7bba80b58 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1255,12 +1255,24 @@ def test_decode_float_datetime(): def test_decode_float_datetime_with_decimals(): # test resolution enhancement for floats - values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") + values = np.array([0, 0.25, 1 / 3.0, 0.75, 1.0], dtype="float64") + expected = np.array( + [ + "2000-01-01T00:00:00.000000000", + "2000-01-01T00:00:00.250000000", + "2000-01-01T00:00:00.333333333", + "2000-01-01T00:00:00.750000000", + "2000-01-01T00:00:01.000000000", + ], + dtype="=M8[ns]", + ) + units = "seconds since 2000-01-01" calendar = "standard" with pytest.warns(SerializationWarning): actual = decode_cf_datetime(values, units, calendar, time_unit="s") - assert actual.dtype == np.dtype("=M8[ms]") + assert actual.dtype == expected.dtype + np.testing.assert_equal(actual, expected) @requires_cftime From 63c83f4cba1b52dbd865fe7d2f78f74c9a9de0e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 18 Nov 2024 23:32:40 +0100 Subject: [PATCH 046/134] simplify recursive function, update tests --- xarray/coding/times.py | 38 +++++++++++++------------------ xarray/tests/test_coding_times.py | 18 +++++++++------ 2 files changed, 27 insertions(+), 29 deletions(-) diff 
--git a/xarray/coding/times.py b/xarray/coding/times.py index c640c6df5e1..dc359ad7234 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -377,32 +377,26 @@ def _decode_datetime_with_pandas( flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit) time_unit = ref_date.unit + res = {"s": "ms", "ms": "us", "us": "ns"} + def _check_higher_resolution( - flat_num_dates: np.ndarray, time_unit="s" - ) -> np.ndarray: - res = ["s", "ms", "us", "ns"] - fract = np.unique(flat_num_dates % 1) - old_time_unit = time_unit - if (fract > 0).any() and time_unit != "ns": - idx = res.index(time_unit) - time_unit = res[idx + 1] - flat_num_dates *= 1000 - fract = np.unique(flat_num_dates % 1) - # If the elements can evenly divide the 'units' - # we can stop after this iteration. - # Otherwise we continue until we reach "ns" resolution. - if (np.unique((1 / fract[fract > 0]) % 1) > 0).any(): - flat_num_dates, time_unit = _check_higher_resolution( - flat_num_dates, time_unit - ) - if old_time_unit != time_unit: + flat_num_dates: np.ndarray, + new_time_unit: str, + ) -> tuple[np.ndarray, str]: + if (np.unique(flat_num_dates % 1) > 0).any() and new_time_unit != "ns": + flat_num_dates, new_time_unit = _check_higher_resolution( + flat_num_dates * 1000, + new_time_unit=res[new_time_unit], + ) + if time_unit != new_time_unit: msg = ( - f"Can't decode floating point datetime to {old_time_unit!r} without precision loss," - f"decoding to {time_unit!r} instead. To silence this warning use " - f"time_unit={time_unit!r} in call to decoding function." + f"Can't decode floating point datetime to {time_unit!r} without " + f"precision loss, decoding to {new_time_unit!r} instead. " + f"To silence this warning use time_unit={new_time_unit!r} in call to " + f"decoding function." 
) emit_user_level_warning(msg, SerializationWarning) - return flat_num_dates, time_unit + return flat_num_dates, new_time_unit # estimate fitting resolution for floating point values if flat_num_dates.dtype.kind == "f": diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 6e7bba80b58..096af1b88b4 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1255,16 +1255,17 @@ def test_decode_float_datetime(): def test_decode_float_datetime_with_decimals(): # test resolution enhancement for floats - values = np.array([0, 0.25, 1 / 3.0, 0.75, 1.0], dtype="float64") + values = np.array([0, 0.125, 0.25, 0.375, 0.75, 1.0], dtype="float64") expected = np.array( [ - "2000-01-01T00:00:00.000000000", - "2000-01-01T00:00:00.250000000", - "2000-01-01T00:00:00.333333333", - "2000-01-01T00:00:00.750000000", - "2000-01-01T00:00:01.000000000", + "2000-01-01T00:00:00.000", + "2000-01-01T00:00:00.125", + "2000-01-01T00:00:00.250", + "2000-01-01T00:00:00.375", + "2000-01-01T00:00:00.750", + "2000-01-01T00:00:01.000", ], - dtype="=M8[ns]", + dtype="=M8[ms]", ) units = "seconds since 2000-01-01" @@ -1273,6 +1274,9 @@ def test_decode_float_datetime_with_decimals(): actual = decode_cf_datetime(values, units, calendar, time_unit="s") assert actual.dtype == expected.dtype np.testing.assert_equal(actual, expected) + actual = decode_cf_datetime(values, units, calendar, time_unit="ms") + assert actual.dtype == expected.dtype + np.testing.assert_equal(actual, expected) @requires_cftime From 0efbbeb68b32708f603da92de5f2b231ddcab3f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Nov 2024 13:15:32 +0100 Subject: [PATCH 047/134] more refactoring, update tests --- xarray/coding/times.py | 50 +++++++++++++++++++------------ xarray/tests/test_coding_times.py | 32 +++++++++++++++----- 2 files changed, 55 insertions(+), 27 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 
dc359ad7234..33c613ef078 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -2,7 +2,7 @@ import re import warnings -from collections.abc import Callable, Hashable +from collections.abc import Callable, Hashable, Iterator from datetime import datetime, timedelta from functools import partial from typing import Literal, Union, cast @@ -36,7 +36,12 @@ except ImportError: cftime = None -from xarray.core.types import CFCalendar, NPDatetimeUnitOptions, T_DuckArray +from xarray.core.types import ( + CFCalendar, + NPDatetimeUnitOptions, + PDDatetimeUnitOptions, + T_DuckArray, +) T_Name = Union[Hashable, None] @@ -310,11 +315,25 @@ def _check_date_is_after_shift(date: pd.Timestamp, calendar: str) -> None: ) +def _check_higher_resolution( + flat_num_dates: np.ndarray, + iter_unit: Iterator[PDDatetimeUnitOptions], +) -> tuple[np.ndarray, PDDatetimeUnitOptions]: + """Iterate until fitting resolution found.""" + new_time_unit: PDDatetimeUnitOptions = next(iter_unit) + if (np.unique(flat_num_dates % 1) > 0).any() and new_time_unit != "ns": + flat_num_dates, new_time_unit = _check_higher_resolution( + flat_num_dates * 1000, + iter_unit=iter_unit, + ) + return flat_num_dates, new_time_unit + + def _decode_datetime_with_pandas( flat_num_dates: np.ndarray, units: str, calendar: str, - time_resolution: Literal["s", "ms", "us", "ns"] = "ns", + time_resolution: PDDatetimeUnitOptions = "ns", ) -> np.ndarray: if not _is_standard_calendar(calendar): raise OutOfBoundsDatetime( @@ -377,17 +396,14 @@ def _decode_datetime_with_pandas( flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit) time_unit = ref_date.unit - res = {"s": "ms", "ms": "us", "us": "ns"} - - def _check_higher_resolution( - flat_num_dates: np.ndarray, - new_time_unit: str, - ) -> tuple[np.ndarray, str]: - if (np.unique(flat_num_dates % 1) > 0).any() and new_time_unit != "ns": - flat_num_dates, new_time_unit = _check_higher_resolution( - flat_num_dates * 1000, - new_time_unit=res[new_time_unit], - ) + # 
estimate fitting resolution for floating point values + # this iterates until all floats are fraction less or time_unit == "ns" + if flat_num_dates.dtype.kind == "f" and time_unit != "ns": + res: list[PDDatetimeUnitOptions] = ["s", "ms", "us", "ns"] + iter_unit = iter(res[res.index(time_unit) :]) + flat_num_dates, new_time_unit = _check_higher_resolution( + flat_num_dates, iter_unit + ) if time_unit != new_time_unit: msg = ( f"Can't decode floating point datetime to {time_unit!r} without " @@ -396,11 +412,7 @@ def _check_higher_resolution( f"decoding function." ) emit_user_level_warning(msg, SerializationWarning) - return flat_num_dates, new_time_unit - - # estimate fitting resolution for floating point values - if flat_num_dates.dtype.kind == "f": - flat_num_dates, time_unit = _check_higher_resolution(flat_num_dates, time_unit) + time_unit = new_time_unit # Cast input ordinals to integers and properly handle NaN/NaT # to prevent casting NaN to int diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 096af1b88b4..853c6b6c52e 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -39,6 +39,7 @@ from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder from xarray.core.common import contains_cftime_datetimes +from xarray.core.types import PDDatetimeUnitOptions from xarray.core.utils import is_duck_dask_array from xarray.testing import assert_equal, assert_identical from xarray.tests import ( @@ -1253,9 +1254,12 @@ def test_decode_float_datetime(): np.testing.assert_equal(actual, expected) -def test_decode_float_datetime_with_decimals(): +@pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) +def test_decode_float_datetime_with_decimals( + time_unit: Literal["ms", "us", "ns"], +) -> None: # test resolution enhancement for floats - values = np.array([0, 0.125, 0.25, 0.375, 0.75, 1.0], dtype="float64") + values = np.array([0, 
0.125, 0.25, 0.375, 0.75, 1.0], dtype="float32") expected = np.array( [ "2000-01-01T00:00:00.000", @@ -1265,20 +1269,32 @@ def test_decode_float_datetime_with_decimals(): "2000-01-01T00:00:00.750", "2000-01-01T00:00:01.000", ], - dtype="=M8[ms]", + dtype=f"=M8[{time_unit}]", ) units = "seconds since 2000-01-01" calendar = "standard" - with pytest.warns(SerializationWarning): - actual = decode_cf_datetime(values, units, calendar, time_unit="s") - assert actual.dtype == expected.dtype - np.testing.assert_equal(actual, expected) - actual = decode_cf_datetime(values, units, calendar, time_unit="ms") + actual = decode_cf_datetime(values, units, calendar, time_unit=time_unit) assert actual.dtype == expected.dtype np.testing.assert_equal(actual, expected) +@pytest.mark.parametrize( + "time_unit, num", [("s", 0.123), ("ms", 0.1234), ("us", 0.1234567)] +) +def test_coding_float_datetime_warning( + time_unit: PDDatetimeUnitOptions, num: float +) -> None: + units = "seconds since 2000-01-01" + calendar = "standard" + values = np.array([num], dtype="float32") + with pytest.warns( + SerializationWarning, + match=f"Can't decode floating point datetime to {time_unit!r}", + ): + decode_cf_datetime(values, units, calendar, time_unit=time_unit) + + @requires_cftime def test_scalar_unit() -> None: # test that a scalar units (often NaN when using to_netcdf) does not raise an error From 2910250882e0a9356a8f424eb1feac12fd55eb69 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Nov 2024 14:03:02 +0100 Subject: [PATCH 048/134] add fixture, apply fixture to more tests. 
--- xarray/coding/times.py | 2 +- xarray/tests/conftest.py | 5 +++ xarray/tests/test_coding_times.py | 73 +++++++++++++++++++------------ 3 files changed, 50 insertions(+), 30 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 33c613ef078..4b24eb6a71a 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -397,7 +397,7 @@ def _decode_datetime_with_pandas( time_unit = ref_date.unit # estimate fitting resolution for floating point values - # this iterates until all floats are fraction less or time_unit == "ns" + # this iterates until all floats are fractionless or time_unit == "ns" if flat_num_dates.dtype.kind == "f" and time_unit != "ns": res: list[PDDatetimeUnitOptions] = ["s", "ms", "us", "ns"] iter_unit = iter(res[res.index(time_unit) :]) diff --git a/xarray/tests/conftest.py b/xarray/tests/conftest.py index 97de58c4af2..c3f1ccbfe3c 100644 --- a/xarray/tests/conftest.py +++ b/xarray/tests/conftest.py @@ -220,3 +220,8 @@ def simple_datatree(create_test_datatree): Returns a DataTree. 
""" return create_test_datatree() + + +@pytest.fixture(scope="module", params=["s", "ms", "us", "ns"]) +def time_unit(request): + return request.param diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 853c6b6c52e..74eab5aca5b 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -124,8 +124,9 @@ def _all_cftime_date_types(): @pytest.mark.filterwarnings("ignore:Ambiguous reference date string") @pytest.mark.filterwarnings("ignore:Times can't be serialized faithfully") @pytest.mark.parametrize(["num_dates", "units", "calendar"], _CF_DATETIME_TESTS) -@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) -def test_cf_datetime(num_dates, units, calendar, time_unit) -> None: +def test_cf_datetime( + num_dates, units, calendar, time_unit: PDDatetimeUnitOptions +) -> None: import cftime expected = cftime.num2date( @@ -261,8 +262,9 @@ def test_decode_non_standard_calendar_inside_timestamp_range(calendar) -> None: @requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) -def test_decode_dates_outside_timestamp_range(calendar, time_unit) -> None: +def test_decode_dates_outside_timestamp_range( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: from datetime import datetime import cftime @@ -291,9 +293,8 @@ def test_decode_dates_outside_timestamp_range(calendar, time_unit) -> None: @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) def test_decode_standard_calendar_single_element_inside_timestamp_range( - calendar, time_unit + calendar, time_unit: PDDatetimeUnitOptions ) -> None: units = "days since 0001-01-01" unit = "s" @@ -342,10 +343,9 @@ def test_decode_single_element_outside_timestamp_range(calendar) -> None: @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("time_unit", 
["s", "ms", "us", "ns"]) def test_decode_standard_calendar_multidim_time_inside_timestamp_range( calendar, - time_unit, + time_unit: PDDatetimeUnitOptions, ) -> None: import cftime @@ -421,8 +421,9 @@ def test_decode_nonstandard_calendar_multidim_time_inside_timestamp_range( @requires_cftime @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) -def test_decode_multidim_time_outside_timestamp_range(calendar, time_unit) -> None: +def test_decode_multidim_time_outside_timestamp_range( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: from datetime import datetime import cftime @@ -543,8 +544,7 @@ def test_cf_datetime_nan(num_dates, units, expected_list) -> None: @requires_cftime -@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) -def test_decoded_cf_datetime_array_2d(time_unit) -> None: +def test_decoded_cf_datetime_array_2d(time_unit: PDDatetimeUnitOptions) -> None: # regression test for GH1229 variable = Variable( ("x", "y"), np.array([[0, 1], [2, 3]]), {"units": "days since 2000-01-01"} @@ -699,8 +699,7 @@ def test_format_cftime_datetime(date_args, expected) -> None: @pytest.mark.parametrize("calendar", _ALL_CALENDARS) -@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) -def test_decode_cf(calendar, time_unit) -> None: +def test_decode_cf(calendar, time_unit: PDDatetimeUnitOptions) -> None: days = [1.0, 2.0, 3.0] # TODO: GH5690 — do we want to allow this type for `coords`? 
da = DataArray(days, coords=[days], dims=["time"], name="test") @@ -722,8 +721,7 @@ def test_decode_cf(calendar, time_unit) -> None: assert ds.test.dtype == np.dtype(f"M8[{time_unit}]") -@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) -def test_decode_cf_time_bounds(time_unit) -> None: +def test_decode_cf_time_bounds(time_unit: PDDatetimeUnitOptions) -> None: da = DataArray( np.arange(6, dtype="int64").reshape((3, 2)), coords={"time": [1, 2, 3]}, @@ -1097,14 +1095,16 @@ def test_encode_cf_datetime_defaults_to_correct_dtype( @pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys()) -def test_encode_decode_roundtrip_datetime64(freq) -> None: +def test_encode_decode_roundtrip_datetime64( + freq, time_unit: PDDatetimeUnitOptions +) -> None: # See GH 4045. Prior to GH 4684 this test would fail for frequencies of # "s", "ms", "us", and "ns". initial_time = pd.date_range("1678-01-01", periods=1) times = initial_time.append(pd.date_range("1968", periods=2, freq=freq)) variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) - decoded = conventions.decode_cf_variable("time", encoded) + decoded = conventions.decode_cf_variable("time", encoded, time_unit=time_unit) assert_equal(variable, decoded) @@ -1144,13 +1144,15 @@ def test__encode_datetime_with_cftime() -> None: @pytest.mark.parametrize("calendar", ["gregorian", "Gregorian", "GREGORIAN"]) -def test_decode_encode_roundtrip_with_non_lowercase_letters(calendar) -> None: +def test_decode_encode_roundtrip_with_non_lowercase_letters( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: # See GH 5093. 
times = [0, 1] units = "days since 2000-01-01" attrs = {"calendar": calendar, "units": units} variable = Variable(["time"], times, attrs) - decoded = conventions.decode_cf_variable("time", variable) + decoded = conventions.decode_cf_variable("time", variable, time_unit=time_unit) encoded = conventions.encode_cf_variable(decoded) # Previously this would erroneously be an array of cftime.datetime @@ -1256,7 +1258,7 @@ def test_decode_float_datetime(): @pytest.mark.parametrize("time_unit", ["ms", "us", "ns"]) def test_decode_float_datetime_with_decimals( - time_unit: Literal["ms", "us", "ns"], + time_unit: PDDatetimeUnitOptions, ) -> None: # test resolution enhancement for floats values = np.array([0, 0.125, 0.25, 0.375, 0.75, 1.0], dtype="float32") @@ -1335,14 +1337,13 @@ def test_contains_cftime_lazy() -> None: ("1677-09-21T00:21:52.901038080", "ns", np.float32, 20.0, True), ], ) -@pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) def test_roundtrip_datetime64_nanosecond_precision( timestr: str, timeunit: Literal["ns", "us"], dtype: np.typing.DTypeLike, fill_value: int | float | None, use_encoding: bool, - time_unit: Literal["s", "ms", "us", "ns"], + time_unit: PDDatetimeUnitOptions, ) -> None: # test for GH7817 time = np.datetime64(timestr, timeunit) @@ -1382,7 +1383,9 @@ def test_roundtrip_datetime64_nanosecond_precision( assert_identical(var, decoded_var) -def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: +def test_roundtrip_datetime64_nanosecond_precision_warning( + time_unit: PDDatetimeUnitOptions, +) -> None: # test warning if times can't be serialized faithfully times = [ np.datetime64("1970-01-01T00:01:00", "ns"), @@ -1414,7 +1417,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: assert encoded_var.attrs["units"] == new_units assert encoded_var.attrs["_FillValue"] == 20 - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", 
encoded_var, time_unit=time_unit + ) assert_identical(var, decoded_var) encoding = dict(dtype="float64", _FillValue=20, units=units) @@ -1426,7 +1431,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: assert encoded_var.attrs["units"] == units assert encoded_var.attrs["_FillValue"] == 20.0 - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, time_unit=time_unit + ) assert_identical(var, decoded_var) encoding = dict(dtype="int64", _FillValue=20, units=new_units) @@ -1438,7 +1445,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: assert encoded_var.attrs["units"] == new_units assert encoded_var.attrs["_FillValue"] == 20 - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, time_unit=time_unit + ) assert_identical(var, decoded_var) @@ -1447,7 +1456,9 @@ def test_roundtrip_datetime64_nanosecond_precision_warning() -> None: [(np.int64, 20), (np.int64, np.iinfo(np.int64).min), (np.float64, 1e30)], ) def test_roundtrip_timedelta64_nanosecond_precision( - dtype: np.typing.DTypeLike, fill_value: int | float + dtype: np.typing.DTypeLike, + fill_value: int | float, + time_unit: PDDatetimeUnitOptions, ) -> None: # test for GH7942 one_day = np.timedelta64(1, "ns") @@ -1460,7 +1471,9 @@ def test_roundtrip_timedelta64_nanosecond_precision( var = Variable(["time"], timedelta_values, encoding=encoding) encoded_var = conventions.encode_cf_variable(var) - decoded_var = conventions.decode_cf_variable("foo", encoded_var) + decoded_var = conventions.decode_cf_variable( + "foo", encoded_var, time_unit=time_unit + ) assert_identical(var, decoded_var) @@ -1655,7 +1668,9 @@ def test_encode_cf_datetime_casting_value_error(use_cftime, use_dask) -> None: with pytest.warns(UserWarning, match="Times can't be serialized"): encoded = conventions.encode_cf_variable(variable) assert 
encoded.attrs["units"] == "hours since 2000-01-01" + decoded = conventions.decode_cf_variable("name", encoded) + print(decoded.load()) assert_equal(variable, decoded) else: with pytest.raises(ValueError, match="Not possible"): From 57d8d725d364479f63c0779f304c1365bdac106e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Nov 2024 14:40:18 +0100 Subject: [PATCH 049/134] update time-coding.rst --- doc/internals/time-coding.rst | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index 36d227d2fb2..09d73d8e7b8 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -332,9 +332,11 @@ Decoding of ``values`` with time unit specification like ``seconds since 1992-10 2. ``standard``/``gregorian`` calendar and ``proleptic_gregorian`` are equivalent for any dates and reference times >= ``1582-10-15``. First the reference time is checked and any timezone information stripped off and in a second step, the minimum and maximum ``values`` are checked if they can be represented in the current reference time resolution. At the same time integer overflow would be caught. For ``standard``/``gregorian`` calendar the dates are checked to be >= ``1582-10-15``. If anything fails, the decoding is done with ``cftime``). -3. As the time unit (here ``seconds``) and the resolution of the reference time ``1992-10-8 15:15:42.5 -6:00`` (here ``milliseconds``) might be different, this has to be aligned to the higher resolution (retrieve new unit). This is done by multiplying the ``values`` by the ratio of nanoseconds per time unit and nanoseconds per reference time unit. To not break consistency for ``NaT`` a mask is kept and re-introduced after the multiplication. +3. 
As the unit (here ``seconds``) and the resolution of the reference time ``1992-10-8 15:15:42.5 -6:00`` (here ``milliseconds``) might be different, this has to be aligned to the higher resolution (retrieve new unit). User may also specify their wanted target resolution by setting kwarg ``time_unit`` to one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` (default ``'ns'``). This will be included into the alignment process. This is done by multiplying the ``values`` by the ratio of nanoseconds per time unit and nanoseconds per reference time unit. To not break consistency for ``NaT`` a mask is kept and re-introduced after the multiplication. -4. Finally, the ``values`` (``int64``) are cast to ``datetime64[unit]`` (using the above retrieved unit) and added to the reference time :py:class:`pandas.Timestamp`. +4. Times encoded as floating point values are checked for fractional parts and the resolution is enhanced in an iterative process until a fitting resolution (or nansosecond) is found. A ``SerializationWarning`` is issued to make the user aware of the possibly problematic encoding. + +5. Finally, the ``values`` (``int64``) are cast to ``datetime64[unit]`` (using the above retrieved unit) and added to the reference time :py:class:`pandas.Timestamp`. .. 
ipython:: python From 53332401b1929b2e06324c6ecb77dfa4510239fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Nov 2024 14:48:24 +0100 Subject: [PATCH 050/134] fix typing --- xarray/coding/times.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 4b24eb6a71a..539da02b68b 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -400,7 +400,7 @@ def _decode_datetime_with_pandas( # this iterates until all floats are fractionless or time_unit == "ns" if flat_num_dates.dtype.kind == "f" and time_unit != "ns": res: list[PDDatetimeUnitOptions] = ["s", "ms", "us", "ns"] - iter_unit = iter(res[res.index(time_unit) :]) + iter_unit = iter(res[res.index(cast(PDDatetimeUnitOptions, time_unit)) :]) flat_num_dates, new_time_unit = _check_higher_resolution( flat_num_dates, iter_unit ) From 6f35c81ed58981940042540340a0ce2d06f09dd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Nov 2024 15:05:07 +0100 Subject: [PATCH 051/134] try to fix test, remove stale print --- xarray/coding/times.py | 1 - xarray/tests/test_coding_times.py | 10 +++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 539da02b68b..d6bc966f35e 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -456,7 +456,6 @@ def decode_cf_datetime( flat_num_dates, units, calendar, time_unit ) except (KeyError, OutOfBoundsDatetime, OutOfBoundsTimedelta, OverflowError): - print("decoding with cftime:", flat_num_dates.dtype) dates = _decode_datetime_with_cftime( flat_num_dates.astype(float), units, calendar ) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 74eab5aca5b..0f0ca554124 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -214,8 +214,7 @@ def test_decode_cf_datetime_non_iso_strings() -> None: @requires_cftime 
@pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -@pytest.mark.parametrize("unit", ["s", "ms", "us", "ns"]) -def test_decode_standard_calendar_inside_timestamp_range(calendar, unit) -> None: +def test_decode_standard_calendar_inside_timestamp_range(calendar, time_unit) -> None: import cftime units = "days since 0001-01-01" @@ -225,9 +224,11 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar, unit) -> None expected = times.values # for cftime we get "us" resolution # ns resolution is handled by cftime, too (OutOfBounds) - actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=unit) - if calendar != "proleptic_gregorian" or unit == "ns": + actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=time_unit) + if calendar != "proleptic_gregorian" or time_unit == "ns": unit = "us" + else: + unit = time_unit expected_dtype = np.dtype(f"M8[{unit}]") assert actual.dtype == expected_dtype abs_diff = abs(actual - expected) @@ -1670,7 +1671,6 @@ def test_encode_cf_datetime_casting_value_error(use_cftime, use_dask) -> None: assert encoded.attrs["units"] == "hours since 2000-01-01" decoded = conventions.decode_cf_variable("name", encoded) - print(decoded.load()) assert_equal(variable, decoded) else: with pytest.raises(ValueError, match="Not possible"): From d0c17a4d6aecbfb48526cfe836fe2203c00b55fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Nov 2024 15:24:45 +0100 Subject: [PATCH 052/134] another attempt to fix test --- xarray/tests/test_coding_times.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 0f0ca554124..6ff9a00d2f9 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -229,7 +229,7 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar, time_unit) -> unit = "us" else: unit = time_unit - expected_dtype = 
np.dtype(f"M8[{unit}]") + expected_dtype = np.dtype(f"=M8[{unit}]") assert actual.dtype == expected_dtype abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, @@ -308,7 +308,7 @@ def test_decode_standard_calendar_single_element_inside_timestamp_range( num_time, units, calendar=calendar, time_unit=time_unit ) - assert actual.dtype == np.dtype(f"M8[{unit}]") + assert actual.dtype == np.dtype(f"=M8[{unit}]") @requires_cftime @@ -368,7 +368,7 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( actual = decode_cf_datetime( mdim_time, units, calendar=calendar, time_unit=time_unit ) - assert actual.dtype == np.dtype(f"M8[{unit}]") + assert actual.dtype == np.dtype(f"=M8[{unit}]") abs_diff1 = abs(actual[:, 0] - expected1) abs_diff2 = abs(actual[:, 1] - expected2) @@ -454,7 +454,7 @@ def test_decode_multidim_time_outside_timestamp_range( dtype: np.dtype dtype = np.dtype("O") if calendar == "proleptic_gregorian" and time_unit != "ns": - dtype = np.dtype(f"M8[{time_unit}]") + dtype = np.dtype(f"=M8[{time_unit}]") assert actual.dtype == dtype @@ -719,7 +719,7 @@ def test_decode_cf(calendar, time_unit: PDDatetimeUnitOptions) -> None: if calendar not in _STANDARD_CALENDARS: assert ds.test.dtype == np.dtype("O") else: - assert ds.test.dtype == np.dtype(f"M8[{time_unit}]") + assert ds.test.dtype == np.dtype(f"=M8[{time_unit}]") def test_decode_cf_time_bounds(time_unit: PDDatetimeUnitOptions) -> None: @@ -744,7 +744,7 @@ def test_decode_cf_time_bounds(time_unit: PDDatetimeUnitOptions) -> None: "calendar": "standard", } dsc = decode_cf(ds, time_unit=time_unit) - assert dsc.time_bnds.dtype == np.dtype(f"M8[{time_unit}]") + assert dsc.time_bnds.dtype == np.dtype(f"=M8[{time_unit}]") dsc = decode_cf(ds, decode_times=False) assert dsc.time_bnds.dtype == np.dtype("int64") @@ -1230,7 +1230,7 @@ def test_decode_0size_datetime(use_cftime): if use_cftime and not has_cftime: pytest.skip() - dtype = object if use_cftime else 
"M8[ns]" + dtype = object if use_cftime else "=M8[ns]" expected = np.array([], dtype=dtype) actual = decode_cf_datetime( np.zeros(shape=0, dtype=np.int64), From b2b6bb101173528996f0bc6ccb2bf69b4e54b63e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 19 Nov 2024 15:54:33 +0100 Subject: [PATCH 053/134] debug failing test --- xarray/tests/test_coding_times.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 6ff9a00d2f9..7fc78a2d722 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -222,9 +222,11 @@ def test_decode_standard_calendar_inside_timestamp_range(calendar, time_unit) -> # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values + print(expected) # for cftime we get "us" resolution # ns resolution is handled by cftime, too (OutOfBounds) actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=time_unit) + print(actual, actual.dtype) if calendar != "proleptic_gregorian" or time_unit == "ns": unit = "us" else: From 5dbc8a7ff46815e7ff4c9db48d9754f2a6dbae22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 21 Nov 2024 16:05:52 +0100 Subject: [PATCH 054/134] refactor cftime fallback in datetime decoding --- xarray/coding/times.py | 31 ++++++++++-- xarray/tests/test_coding_times.py | 82 +++++++++++++++---------------- 2 files changed, 69 insertions(+), 44 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index d6bc966f35e..7a3d552c8ec 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -463,6 +463,11 @@ def decode_cf_datetime( cftype = type(dates[np.nanargmin(num_dates)]) # create first day of gregorian calendar in current cf calendar type border = cftype(1582, 10, 15) + # "ns" boarders + # between ['1677-09-21T00:12:43.145224193', '2262-04-11T23:47:16.854775807'] + 
lower = cftype(1677, 9, 21, 0, 12, 43, 145224) + upper = cftype(2262, 4, 11, 23, 47, 16, 854775) + # todo: check if test for minimum date is enough if ( dates[np.nanargmin(num_dates)] < border @@ -477,9 +482,27 @@ def decode_cf_datetime( SerializationWarning, stacklevel=3, ) + elif time_unit == "ns" and ( + ( + dates[np.nanargmin(num_dates)] < lower + or dates[np.nanargmin(num_dates)] > upper + ) + or ( + dates[np.nanargmax(num_dates)] < lower + or dates[np.nanargmax(num_dates)] > upper + ) + ): + warnings.warn( + "Unable to decode time axis into full " + "numpy.datetime64 objects, continuing using " + "cftime.datetime objects instead, reason: dates out " + "of range", + SerializationWarning, + stacklevel=3, + ) else: if _is_standard_calendar(calendar): - dates = cftime_to_nptime(dates) + dates = cftime_to_nptime(dates, time_unit=time_unit) elif use_cftime: dates = _decode_datetime_with_cftime(flat_num_dates, units, calendar) else: @@ -605,7 +628,9 @@ def infer_timedelta_units(deltas) -> str: return _infer_time_units_from_diff(unique_timedeltas) -def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: +def cftime_to_nptime( + times, raise_on_invalid: bool = True, time_unit: PDDatetimeUnitOptions = "ns" +) -> np.ndarray: """Given an array of cftime.datetime objects, return an array of numpy.datetime64 objects of the same size @@ -618,7 +643,7 @@ def cftime_to_nptime(times, raise_on_invalid: bool = True) -> np.ndarray: try: # We expect either "us" resolution or "s" resolution depending on # whether 'microseconds' are defined for the input or not. 
- dt = np.datetime64(t.isoformat()) + dt = np.datetime64(t.isoformat()).astype(f"=M8[{time_unit}]") except ValueError as e: if raise_on_invalid: raise ValueError( diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 7fc78a2d722..0197fa10538 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -133,17 +133,13 @@ def test_cf_datetime( num_dates, units, calendar, only_use_cftime_datetimes=True ) - min_y = np.ravel(np.atleast_1d(expected))[np.nanargmin(num_dates)] # .year - max_y = np.ravel(np.atleast_1d(expected))[np.nanargmax(num_dates)] # .year - typ = type(min_y) - border = typ(1582, 10, 15) - if (calendar == "proleptic_gregorian" and time_unit != "ns") or ( - calendar in _STANDARD_CALENDARS and (min_y >= border and max_y >= border) - ): - expected = cftime_to_nptime(expected) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") actual = decode_cf_datetime(num_dates, units, calendar, time_unit=time_unit) + + if actual.dtype.kind != "O": + expected = cftime_to_nptime(expected) + abs_diff = np.asarray(abs(actual - expected)).ravel() abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() @@ -164,7 +160,7 @@ def test_cf_datetime( @requires_cftime -def test_decode_cf_datetime_overflow() -> None: +def test_decode_cf_datetime_overflow(time_unit: PDDatetimeUnitOptions) -> None: # checks for # https://github.com/pydata/pandas/issues/14068 # https://github.com/pydata/xarray/issues/975 @@ -174,13 +170,13 @@ def test_decode_cf_datetime_overflow() -> None: units = "days since 2000-01-01 00:00:00" # date after 2262 and before 1678 - days = (-117608, 95795) - expected = (datetime(1677, 12, 31), datetime(2262, 4, 12)) + days = (-117710, 95795) + expected = (datetime(1677, 9, 20), datetime(2262, 4, 12)) for i, day in enumerate(days): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - result = decode_cf_datetime(day, 
units) + result = decode_cf_datetime(day, units, time_unit=time_unit) assert result == expected[i] @@ -214,25 +210,22 @@ def test_decode_cf_datetime_non_iso_strings() -> None: @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) -def test_decode_standard_calendar_inside_timestamp_range(calendar, time_unit) -> None: +def test_decode_standard_calendar_inside_timestamp_range( + calendar, time_unit: PDDatetimeUnitOptions +) -> None: import cftime units = "days since 0001-01-01" - times = pd.date_range("2001-04-01-00", end="2001-04-30-23", unit="us", freq="h") + times = pd.date_range( + "2001-04-01-00", end="2001-04-30-23", unit=time_unit, freq="h" + ) # to_pydatetime() will return microsecond time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values - print(expected) # for cftime we get "us" resolution # ns resolution is handled by cftime, too (OutOfBounds) actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=time_unit) - print(actual, actual.dtype) - if calendar != "proleptic_gregorian" or time_unit == "ns": - unit = "us" - else: - unit = time_unit - expected_dtype = np.dtype(f"=M8[{unit}]") - assert actual.dtype == expected_dtype + assert actual.dtype == np.dtype(f"=M8[{time_unit}]") abs_diff = abs(actual - expected) # once we no longer support versions of netCDF4 older than 1.1.5, # we could do this check with near microsecond accuracy: @@ -296,21 +289,20 @@ def test_decode_dates_outside_timestamp_range( @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) +@pytest.mark.parametrize("num_time", [735368, [735368], [[735368]]]) def test_decode_standard_calendar_single_element_inside_timestamp_range( - calendar, time_unit: PDDatetimeUnitOptions + calendar, + time_unit: PDDatetimeUnitOptions, + num_time, ) -> None: units = "days since 0001-01-01" - unit = "s" - if calendar == "proleptic_gregorian" and time_unit != "ns": - unit = time_unit - for num_time in [735368, 
[735368], [[735368]]]: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime( - num_time, units, calendar=calendar, time_unit=time_unit - ) + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Unable to decode time axis") + actual = decode_cf_datetime( + num_time, units, calendar=calendar, time_unit=time_unit + ) - assert actual.dtype == np.dtype(f"=M8[{unit}]") + assert actual.dtype == np.dtype(f"=M8[{time_unit}]") @requires_cftime @@ -353,9 +345,6 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( import cftime units = "days since 0001-01-01" - unit = "s" - if calendar == "proleptic_gregorian" and time_unit != "ns": - unit = time_unit times1 = pd.date_range("2001-04-01", end="2001-04-05", freq="D") times2 = pd.date_range("2001-05-01", end="2001-05-05", freq="D") time1 = cftime.date2num(times1.to_pydatetime(), units, calendar=calendar) @@ -370,7 +359,7 @@ def test_decode_standard_calendar_multidim_time_inside_timestamp_range( actual = decode_cf_datetime( mdim_time, units, calendar=calendar, time_unit=time_unit ) - assert actual.dtype == np.dtype(f"=M8[{unit}]") + assert actual.dtype == np.dtype(f"=M8[{time_unit}]") abs_diff1 = abs(actual[:, 0] - expected1) abs_diff2 = abs(actual[:, 1] - expected2) @@ -984,7 +973,9 @@ def test_use_cftime_default_standard_calendar_out_of_range( @requires_cftime @pytest.mark.parametrize("calendar", _NON_STANDARD_CALENDARS) @pytest.mark.parametrize("units_year", [1500, 2000, 2500]) -def test_use_cftime_default_non_standard_calendar(calendar, units_year) -> None: +def test_use_cftime_default_non_standard_calendar( + calendar, units_year, time_unit +) -> None: from cftime import num2date numerical_dates = [0, 1] @@ -993,9 +984,18 @@ def test_use_cftime_default_non_standard_calendar(calendar, units_year) -> None: numerical_dates, units, calendar, only_use_cftime_datetimes=True ) - with assert_no_warnings(): - result = 
decode_cf_datetime(numerical_dates, units, calendar) - np.testing.assert_array_equal(result, expected) + if time_unit == "ns" and units_year == 2500: + with pytest.warns(SerializationWarning, match="Unable to decode time axis"): + result = decode_cf_datetime( + numerical_dates, units, calendar, time_unit=time_unit + ) + else: + with assert_no_warnings(): + result = decode_cf_datetime( + numerical_dates, units, calendar, time_unit=time_unit + ) + + np.testing.assert_array_equal(result, expected) @requires_cftime From f95408aa1426cb2129d3e60f69744f9ea3e5aa12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 21 Nov 2024 16:18:01 +0100 Subject: [PATCH 055/134] fix merge-collission --- xarray/coding/cftime_offsets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 963b89551c9..ad28c7caac3 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -85,7 +85,7 @@ def get_date_type(calendar, use_cftime=True): cftime = attempt_import("cftime") if _is_standard_calendar(calendar) and not use_cftime: - return _nanosecond_precision_timestamp + return default_precision_timestamp calendars = { "noleap": cftime.DatetimeNoLeap, From ec7f1652f918b82fe3884a39851e7fef24a59bcd Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Fri, 22 Nov 2024 11:19:14 +0100 Subject: [PATCH 056/134] use CFDatetimeCoder instance to transport unit/use_cftime --- xarray/backends/api.py | 7 ++++--- xarray/coding/times.py | 2 +- xarray/conventions.py | 27 +++++++++------------------ xarray/tests/test_coding_times.py | 18 +++++++++--------- xarray/tests/test_conventions.py | 6 +++--- 5 files changed, 26 insertions(+), 34 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 785ab3913ef..ca91a83d1e2 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -46,6 +46,7 @@ from xarray.core.treenode import group_subtrees from 
xarray.core.types import NetcdfWriteModes, ZarrWriteModes from xarray.core.utils import is_remote_uri +from xarray.coding.times import CFDatetimeCoder from xarray.namedarray.daskmanager import DaskManager from xarray.namedarray.parallelcompat import guess_chunkmanager @@ -481,7 +482,7 @@ def open_dataset( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool | Mapping[str, bool] | None = None, + decode_times: bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, use_cftime: bool | Mapping[str, bool] | None = None, concat_characters: bool | Mapping[str, bool] | None = None, @@ -543,9 +544,9 @@ def open_dataset( be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_times : bool or dict-like, optional + decode_times : bool or CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. 
diff --git a/xarray/coding/times.py b/xarray/coding/times.py index ba7e2a672a9..061379ddc9c 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -1139,7 +1139,7 @@ class CFDatetimeCoder(VariableCoder): def __init__( self, use_cftime: bool | None = None, - time_unit: Literal["s", "ms", "us", "ns"] = "ns", + time_unit: PDDatetimeUnitOptions = "ns", ) -> None: self.use_cftime = use_cftime self.time_unit = time_unit diff --git a/xarray/conventions.py b/xarray/conventions.py index ec31f3f1c9e..f3b0f8c7b77 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -209,12 +209,11 @@ def decode_cf_variable( var: Variable, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool = True, + decode_times: bool | times.CFDatetimeCoder = True, decode_endianness: bool = True, stack_char_dim: bool = True, use_cftime: bool | None = None, decode_timedelta: bool | None = None, - time_unit: Literal["s", "ms", "us", "ns"] = "ns", ) -> Variable: """ Decodes a variable which may hold CF encoded information. @@ -237,7 +236,7 @@ def decode_cf_variable( Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). If the _Unsigned attribute is present treat integer arrays as unsigned. - decode_times : bool + decode_times : bool | xarray.times.CFDatetimeCoder Decode cf times ("hours since 2000-01-01") to np.datetime64. decode_endianness : bool Decode arrays from non-native to native endianness. @@ -255,9 +254,6 @@ def decode_cf_variable( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. - time_unit : Literal["s", "ms", "us", "ns"], optional - Time unit to which resolution cf times should at least be decoded. - Defaults to "ns". 
Returns ------- @@ -295,7 +291,9 @@ def decode_cf_variable( if decode_timedelta: var = times.CFTimedeltaCoder().decode(var, name=name) if decode_times: - var = times.CFDatetimeCoder(use_cftime=use_cftime, time_unit=time_unit).decode( + if not isinstance(decode_times, times.CFDatetimeCoder): + decode_times = times.CFDatetimeCoder(use_cftime=use_cftime) + var = decode_times.decode( var, name=name ) @@ -413,7 +411,6 @@ def decode_cf_variables( drop_variables: T_DropVariables = None, use_cftime: bool | Mapping[str, bool] | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, - time_unit: Literal["s", "ms", "us", "ns"] = "ns", ) -> tuple[T_Variables, T_Attrs, set[Hashable]]: """ Decode several CF encoded variables. @@ -466,7 +463,6 @@ def stackable(dim: Hashable) -> bool: stack_char_dim=stack_char_dim, use_cftime=_item_or_default(use_cftime, k, None), decode_timedelta=_item_or_default(decode_timedelta, k, None), - time_unit=time_unit, ) except Exception as e: raise type(e)(f"Failed to decode variable {k!r}: {e}") from e @@ -547,12 +543,11 @@ def decode_cf( obj: T_DatasetOrAbstractstore, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool = True, + decode_times: bool | times.CFDatetimeCoder = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, use_cftime: bool | None = None, decode_timedelta: bool | None = None, - time_unit: Literal["s", "ms", "us", "ns"] = "ns", ) -> Dataset: """Decode the given Dataset or Datastore according to CF conventions into a new Dataset. @@ -567,7 +562,7 @@ def decode_cf( mask_and_scale : bool, optional Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). - decode_times : bool, optional + decode_times : bool | xr.times.times.CFDatetimeCoder, optional Decode cf times (e.g., integers since "hours since 2000-01-01") to np.datetime64. 
decode_coords : bool or {"coordinates", "all"}, optional @@ -597,9 +592,6 @@ def decode_cf( {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} into timedelta objects. If False, leave them encoded as numbers. If None (default), assume the same value of decode_time. - time_unit : Literal["s", "ms", "us", "ns"], optional - Time unit to which resolution cf times should at least be decoded. - Defaults to "ns". Returns ------- @@ -634,7 +626,6 @@ def decode_cf( drop_variables=drop_variables, use_cftime=use_cftime, decode_timedelta=decode_timedelta, - time_unit=time_unit, ) ds = Dataset(vars, attrs=attrs) ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars)) @@ -649,7 +640,7 @@ def cf_decoder( attributes: T_Attrs, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool = True, + decode_times: bool | times.CFDatetimeCoder = True, ) -> tuple[T_Variables, T_Attrs]: """ Decode a set of CF encoded variables and attributes. @@ -666,7 +657,7 @@ def cf_decoder( mask_and_scale : bool Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). - decode_times : bool + decode_times : bool | xr.times.times.CFDatetimeCoder Decode cf times ("hours since 2000-01-01") to np.datetime64. 
Returns diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 0197fa10538..6ea5c3e9f44 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -705,7 +705,7 @@ def test_decode_cf(calendar, time_unit: PDDatetimeUnitOptions) -> None: with pytest.raises(ValueError): ds = decode_cf(ds) else: - ds = decode_cf(ds, time_unit=time_unit) + ds = decode_cf(ds, decode_times=CFDatetimeCoder(time_unit=time_unit)) if calendar not in _STANDARD_CALENDARS: assert ds.test.dtype == np.dtype("O") @@ -734,7 +734,7 @@ def test_decode_cf_time_bounds(time_unit: PDDatetimeUnitOptions) -> None: "units": "days since 2001-01", "calendar": "standard", } - dsc = decode_cf(ds, time_unit=time_unit) + dsc = decode_cf(ds, decode_times=CFDatetimeCoder(time_unit=time_unit)) assert dsc.time_bnds.dtype == np.dtype(f"=M8[{time_unit}]") dsc = decode_cf(ds, decode_times=False) assert dsc.time_bnds.dtype == np.dtype("int64") @@ -1107,7 +1107,7 @@ def test_encode_decode_roundtrip_datetime64( times = initial_time.append(pd.date_range("1968", periods=2, freq=freq)) variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) - decoded = conventions.decode_cf_variable("time", encoded, time_unit=time_unit) + decoded = conventions.decode_cf_variable("time", encoded, decode_times=CFDatetimeCoder(time_unit=time_unit)) assert_equal(variable, decoded) @@ -1155,7 +1155,7 @@ def test_decode_encode_roundtrip_with_non_lowercase_letters( units = "days since 2000-01-01" attrs = {"calendar": calendar, "units": units} variable = Variable(["time"], times, attrs) - decoded = conventions.decode_cf_variable("time", variable, time_unit=time_unit) + decoded = conventions.decode_cf_variable("time", variable, decode_times=CFDatetimeCoder(time_unit=time_unit)) encoded = conventions.encode_cf_variable(decoded) # Previously this would erroneously be an array of cftime.datetime @@ -1368,7 +1368,7 @@ def 
test_roundtrip_datetime64_nanosecond_precision( assert encoded_var.attrs["calendar"] == "proleptic_gregorian" assert encoded_var.data.dtype == dtype decoded_var = conventions.decode_cf_variable( - "foo", encoded_var, time_unit=time_unit + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) ) result_unit = ( @@ -1421,7 +1421,7 @@ def test_roundtrip_datetime64_nanosecond_precision_warning( assert encoded_var.attrs["_FillValue"] == 20 decoded_var = conventions.decode_cf_variable( - "foo", encoded_var, time_unit=time_unit + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) ) assert_identical(var, decoded_var) @@ -1435,7 +1435,7 @@ def test_roundtrip_datetime64_nanosecond_precision_warning( assert encoded_var.attrs["_FillValue"] == 20.0 decoded_var = conventions.decode_cf_variable( - "foo", encoded_var, time_unit=time_unit + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) ) assert_identical(var, decoded_var) @@ -1449,7 +1449,7 @@ def test_roundtrip_datetime64_nanosecond_precision_warning( assert encoded_var.attrs["_FillValue"] == 20 decoded_var = conventions.decode_cf_variable( - "foo", encoded_var, time_unit=time_unit + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) ) assert_identical(var, decoded_var) @@ -1475,7 +1475,7 @@ def test_roundtrip_timedelta64_nanosecond_precision( encoded_var = conventions.encode_cf_variable(var) decoded_var = conventions.decode_cf_variable( - "foo", encoded_var, time_unit=time_unit + "foo", encoded_var, decode_times=CFDatetimeCoder(time_unit=time_unit) ) assert_identical(var, decoded_var) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 465130b79d4..6d9e106457f 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -454,7 +454,7 @@ def test_dataset_repr_with_netcdf4_datetimes(self, time_unit) -> None: assert "(time) object" in repr(ds) attrs = {"units": "days since 1900-01-01"} - ds = 
decode_cf(Dataset({"time": ("time", [0, 1], attrs)}), time_unit=time_unit) + ds = decode_cf(Dataset({"time": ("time", [0, 1], attrs)}), decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit)) assert f"(time) datetime64[{time_unit}]" in repr(ds) @requires_cftime @@ -537,14 +537,14 @@ def test_decode_cf_time_kwargs(self, time_unit) -> None: } ) - dsc = conventions.decode_cf(ds, time_unit=time_unit) + dsc = conventions.decode_cf(ds, decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit)) assert dsc.timedelta.dtype == np.dtype("m8[ns]") assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]") dsc = conventions.decode_cf(ds, decode_times=False) assert dsc.timedelta.dtype == np.dtype("int64") assert dsc.time.dtype == np.dtype("int64") dsc = conventions.decode_cf( - ds, decode_times=True, time_unit=time_unit, decode_timedelta=False + ds, decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit), decode_timedelta=False ) assert dsc.timedelta.dtype == np.dtype("int64") assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]") From 1f1cf1cc667d19ad161ed59c61f4fc64f7c3f319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 25 Nov 2024 12:32:42 +0100 Subject: [PATCH 057/134] decode_times with CFDatetimeCoder --- xarray/__init__.py | 2 ++ xarray/backends/api.py | 46 +++++++++++++++++++++++-------- xarray/coding/times.py | 6 ++-- xarray/conventions.py | 36 ++++++++++++++++++++---- xarray/tests/test_backends.py | 15 +++++++--- xarray/tests/test_coding_times.py | 11 ++++++-- 6 files changed, 88 insertions(+), 28 deletions(-) diff --git a/xarray/__init__.py b/xarray/__init__.py index 634f67a61a2..3f22dd62b98 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -15,6 +15,7 @@ from xarray.coding.cftime_offsets import cftime_range, date_range, date_range_like from xarray.coding.cftimeindex import CFTimeIndex from xarray.coding.frequencies import infer_freq +from xarray.coding.times import CFDatetimeCoder from xarray.conventions import 
SerializationWarning, decode_cf from xarray.core.alignment import align, broadcast from xarray.core.combine import combine_by_coords, combine_nested @@ -113,6 +114,7 @@ "where", "zeros_like", # Classes + "CFDatetimeCoder", "CFTimeIndex", "Context", "Coordinates", diff --git a/xarray/backends/api.py b/xarray/backends/api.py index ca91a83d1e2..94634704466 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -33,6 +33,7 @@ _normalize_path, ) from xarray.backends.locks import _get_scheduler +from xarray.coding.times import CFDatetimeCoder from xarray.core import indexing from xarray.core.combine import ( _infer_concat_order_from_positions, @@ -46,7 +47,6 @@ from xarray.core.treenode import group_subtrees from xarray.core.types import NetcdfWriteModes, ZarrWriteModes from xarray.core.utils import is_remote_uri -from xarray.coding.times import CFDatetimeCoder from xarray.namedarray.daskmanager import DaskManager from xarray.namedarray.parallelcompat import guess_chunkmanager @@ -482,7 +482,10 @@ def open_dataset( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, use_cftime: bool | Mapping[str, bool] | None = None, concat_characters: bool | Mapping[str, bool] | None = None, @@ -544,7 +547,7 @@ def open_dataset( be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_times : bool or CFDatetimeCoder or dict-like, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. 
Otherwise, use CFDatetimeCoder or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, @@ -570,6 +573,8 @@ def open_dataset( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with xarray.times.CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -699,7 +704,10 @@ def open_dataarray( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | None = None, - decode_times: bool | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | None = None, use_cftime: bool | None = None, concat_characters: bool | None = None, @@ -762,9 +770,11 @@ def open_dataarray( `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will be replaced by NA. This keyword may not be supported by all the backends. - decode_times : bool, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. This keyword may not be supported by all the backends. decode_timedelta : bool, optional If True, decode variables and coordinates with time units in @@ -782,6 +792,8 @@ def open_dataarray( represented using ``np.datetime64[ns]`` objects. 
If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with xarray.times.CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -904,7 +916,10 @@ def open_datatree( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool | Mapping[str, bool] | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, use_cftime: bool | Mapping[str, bool] | None = None, concat_characters: bool | Mapping[str, bool] | None = None, @@ -962,9 +977,9 @@ def open_datatree( be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_times : bool or dict-like, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -988,6 +1003,8 @@ def open_datatree( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with xarray.times.CFDatetimeCoder and 'decode_times' kwarg. 
concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -1119,7 +1136,10 @@ def open_groups( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool | Mapping[str, bool] | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, use_cftime: bool | Mapping[str, bool] | None = None, concat_characters: bool | Mapping[str, bool] | None = None, @@ -1181,9 +1201,9 @@ def open_groups( be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_times : bool or dict-like, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -1207,6 +1227,8 @@ def open_groups( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with xarray.times.CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. 
Dimensions will only be concatenated over (and diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 061379ddc9c..e79806bbd61 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -5,7 +5,7 @@ from collections.abc import Callable, Hashable, Iterator from datetime import datetime, timedelta from functools import partial -from typing import TYPE_CHECKING, Literal, Union, cast +from typing import TYPE_CHECKING, Union, cast import numpy as np import pandas as pd @@ -226,7 +226,7 @@ def _decode_cf_datetime_dtype( units: str, calendar: str | None, use_cftime: bool | None, - time_unit: Literal["s", "ms", "us", "ns"] = "ns", + time_unit: PDDatetimeUnitOptions = "ns", ) -> np.dtype: # Verify that at least the first and last date can be decoded # successfully. Otherwise, tracebacks end up swallowed by @@ -431,7 +431,7 @@ def decode_cf_datetime( units: str, calendar: str | None = None, use_cftime: bool | None = None, - time_unit: Literal["s", "ms", "us", "ns"] = "ns", + time_unit: PDDatetimeUnitOptions = "ns", ) -> np.ndarray: """Given an array of numeric dates in netCDF format, convert it into a numpy array of date time objects. diff --git a/xarray/conventions.py b/xarray/conventions.py index f3b0f8c7b77..774997bc39f 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -236,7 +236,7 @@ def decode_cf_variable( Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). If the _Unsigned attribute is present treat integer arrays as unsigned. - decode_times : bool | xarray.times.CFDatetimeCoder + decode_times : bool or xarray.times.CFDatetimeCoder Decode cf times ("hours since 2000-01-01") to np.datetime64. decode_endianness : bool Decode arrays from non-native to native endianness. @@ -254,6 +254,8 @@ def decode_cf_variable( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. 
+ Usage of use_cftime as kwarg is deprecated, please initialize it with + xarray.times.CFDatetimeCoder and ``decode_times``. Returns ------- @@ -291,11 +293,28 @@ def decode_cf_variable( if decode_timedelta: var = times.CFTimedeltaCoder().decode(var, name=name) if decode_times: + # remove checks after end of deprecation cycle if not isinstance(decode_times, times.CFDatetimeCoder): + if use_cftime is not None: + from warnings import warn + + warn( + "Usage of 'use_cftime' as kwarg is deprecated. " + "Please initialize it with xarray.times.CFDatetimeCoder and " + "'decode_times' kwarg.", + DeprecationWarning, + stacklevel=2, + ) decode_times = times.CFDatetimeCoder(use_cftime=use_cftime) - var = decode_times.decode( - var, name=name - ) + else: + if use_cftime is not None: + raise TypeError( + "Usage of 'use_cftime' as kwarg is not allowed, " + "if 'decode_times' is initialized with " + "xarray.times.CFDatetimeCoder. Please add 'use_cftime' " + "when initializing xarray.times.CFDatetimeCoder." + ) + var = decode_times.decode(var, name=name) if decode_endianness and not var.dtype.isnative: var = variables.EndianCoder().decode(var) @@ -406,7 +425,10 @@ def decode_cf_variables( attributes: T_Attrs, concat_characters: bool | Mapping[str, bool] = True, mask_and_scale: bool | Mapping[str, bool] = True, - decode_times: bool | Mapping[str, bool] = True, + decode_times: bool + | times.CFDatetimeCoder + | Mapping[str, bool | times.CFDatetimeCoder] + | None = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, use_cftime: bool | Mapping[str, bool] | None = None, @@ -562,7 +584,7 @@ def decode_cf( mask_and_scale : bool, optional Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). - decode_times : bool | xr.times.times.CFDatetimeCoder, optional + decode_times : bool or xr.times.times.CFDatetimeCoder, optional Decode cf times (e.g., integers since "hours since 2000-01-01") to np.datetime64. 
decode_coords : bool or {"coordinates", "all"}, optional @@ -587,6 +609,8 @@ def decode_cf( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. + Usage of use_cftime as kwarg is deprecated, please initialize it with + xarray.times.CFDatetimeCoder and ``decode_times``. decode_timedelta : bool, optional If True, decode variables and coordinates with time units in {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index bfc4c2fe1e1..d35cacaaae6 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -29,6 +29,7 @@ import xarray as xr from xarray import ( + CFDatetimeCoder, DataArray, Dataset, backends, @@ -3196,7 +3197,10 @@ def test_open_zarr_use_cftime(self) -> None: ds.to_zarr(store_target, **self.version_kwargs) ds_a = xr.open_zarr(store_target, **self.version_kwargs) assert_identical(ds, ds_a) - ds_b = xr.open_zarr(store_target, use_cftime=True, **self.version_kwargs) + decoder = CFDatetimeCoder(use_cftime=True) + ds_b = xr.open_zarr( + store_target, decode_times=decoder, **self.version_kwargs + ) assert xr.coding.times.contains_cftime_datetimes(ds_b.time.variable) def test_write_read_select_write(self) -> None: @@ -5613,7 +5617,8 @@ def test_use_cftime_true(calendar, units_year) -> None: with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) with warnings.catch_warnings(record=True) as record: - with open_dataset(tmp_file, use_cftime=True) as ds: + decoder = CFDatetimeCoder(use_cftime=True) + with open_dataset(tmp_file, decode_times=decoder) as ds: assert_identical(expected_x, ds.x) assert_identical(expected_time, ds.time) _assert_no_dates_out_of_range_warning(record) @@ -5666,7 +5671,8 @@ def test_use_cftime_false_standard_calendar_out_of_range(calendar, units_year) - with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) 
with pytest.raises((OutOfBoundsDatetime, ValueError)): - open_dataset(tmp_file, use_cftime=False) + decoder = CFDatetimeCoder(use_cftime=False) + open_dataset(tmp_file, decode_times=decoder) @requires_scipy_or_netCDF4 @@ -5684,7 +5690,8 @@ def test_use_cftime_false_nonstandard_calendar(calendar, units_year) -> None: with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) with pytest.raises((OutOfBoundsDatetime, ValueError)): - open_dataset(tmp_file, use_cftime=False) + decoder = CFDatetimeCoder(use_cftime=False) + open_dataset(tmp_file, decode_times=decoder) @pytest.mark.parametrize("engine", ["netcdf4", "scipy"]) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 6ea5c3e9f44..ca9a8e2390a 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1107,7 +1107,9 @@ def test_encode_decode_roundtrip_datetime64( times = initial_time.append(pd.date_range("1968", periods=2, freq=freq)) variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) - decoded = conventions.decode_cf_variable("time", encoded, decode_times=CFDatetimeCoder(time_unit=time_unit)) + decoded = conventions.decode_cf_variable( + "time", encoded, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) assert_equal(variable, decoded) @@ -1120,7 +1122,8 @@ def test_encode_decode_roundtrip_cftime(freq) -> None: ) variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) - decoded = conventions.decode_cf_variable("time", encoded, use_cftime=True) + decoder = CFDatetimeCoder(use_cftime=True) + decoded = conventions.decode_cf_variable("time", encoded, decode_times=decoder) assert_equal(variable, decoded) @@ -1155,7 +1158,9 @@ def test_decode_encode_roundtrip_with_non_lowercase_letters( units = "days since 2000-01-01" attrs = {"calendar": calendar, "units": units} variable = Variable(["time"], times, attrs) - decoded = conventions.decode_cf_variable("time", variable, 
decode_times=CFDatetimeCoder(time_unit=time_unit)) + decoded = conventions.decode_cf_variable( + "time", variable, decode_times=CFDatetimeCoder(time_unit=time_unit) + ) encoded = conventions.encode_cf_variable(decoded) # Previously this would erroneously be an array of cftime.datetime From 14b1a88e7b4d24199fe8e267f5a996e1b8f4aaa4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Nov 2024 15:10:16 +0000 Subject: [PATCH 058/134] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_conventions.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 6d9e106457f..5de370e23d2 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -454,7 +454,10 @@ def test_dataset_repr_with_netcdf4_datetimes(self, time_unit) -> None: assert "(time) object" in repr(ds) attrs = {"units": "days since 1900-01-01"} - ds = decode_cf(Dataset({"time": ("time", [0, 1], attrs)}), decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit)) + ds = decode_cf( + Dataset({"time": ("time", [0, 1], attrs)}), + decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit), + ) assert f"(time) datetime64[{time_unit}]" in repr(ds) @requires_cftime @@ -537,14 +540,18 @@ def test_decode_cf_time_kwargs(self, time_unit) -> None: } ) - dsc = conventions.decode_cf(ds, decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit)) + dsc = conventions.decode_cf( + ds, decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit) + ) assert dsc.timedelta.dtype == np.dtype("m8[ns]") assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]") dsc = conventions.decode_cf(ds, decode_times=False) assert dsc.timedelta.dtype == np.dtype("int64") assert dsc.time.dtype == np.dtype("int64") dsc = conventions.decode_cf( - ds, 
decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit), decode_timedelta=False + ds, + decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit), + decode_timedelta=False, ) assert dsc.timedelta.dtype == np.dtype("int64") assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]") From e7cbf3aa19625e9ab9d021e69576d72a4b5f031d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 26 Nov 2024 08:33:56 +0100 Subject: [PATCH 059/134] fix mypy, warning/error --- xarray/conventions.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 774997bc39f..7136cdb3085 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -269,7 +269,7 @@ def decode_cf_variable( original_dtype = var.dtype if decode_timedelta is None: - decode_timedelta = decode_times + decode_timedelta = True if decode_times else False if concat_characters: if stack_char_dim: @@ -300,7 +300,7 @@ def decode_cf_variable( warn( "Usage of 'use_cftime' as kwarg is deprecated. " - "Please initialize it with xarray.times.CFDatetimeCoder and " + "Please initialize it with xarray.CFDatetimeCoder and " "'decode_times' kwarg.", DeprecationWarning, stacklevel=2, @@ -311,8 +311,8 @@ def decode_cf_variable( raise TypeError( "Usage of 'use_cftime' as kwarg is not allowed, " "if 'decode_times' is initialized with " - "xarray.times.CFDatetimeCoder. Please add 'use_cftime' " - "when initializing xarray.times.CFDatetimeCoder." + "xarray.CFDatetimeCoder. Please add 'use_cftime' " + "when initializing xarray.CFDatetimeCoder." 
) var = decode_times.decode(var, name=name) @@ -427,8 +427,7 @@ def decode_cf_variables( mask_and_scale: bool | Mapping[str, bool] = True, decode_times: bool | times.CFDatetimeCoder - | Mapping[str, bool | times.CFDatetimeCoder] - | None = True, + | Mapping[str, bool | times.CFDatetimeCoder] = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, use_cftime: bool | Mapping[str, bool] | None = None, From fc87e046b450e545c09b4989b776d7b40770c072 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 26 Nov 2024 12:40:36 +0100 Subject: [PATCH 060/134] api, docs, docstrings --- doc/api.rst | 1 + doc/user-guide/time-series.rst | 6 ++++-- doc/user-guide/weather-climate.rst | 1 - xarray/backends/api.py | 8 ++++---- xarray/conventions.py | 6 +++--- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 85ef46ca6ba..8bf2cff7e20 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -52,6 +52,7 @@ Creating a dataset Dataset decode_cf + CFDatetimeCoder Attributes ---------- diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 75566724443..bc09e55f382 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -77,9 +77,11 @@ You can manual decode arrays in this form by passing a dataset to attrs = {"units": "hours since 2000-01-01"} ds = xr.Dataset({"time": ("time", [0, 1, 2, 3], attrs)}) - xr.decode_cf(ds) + print("Default decoding to 'ns'-resolution:", xr.decode_cf(ds)) + coder = xr.CFDatetimeCoder(time_unit="s") + print("Decoding to 's'-resolution:", xr.decode_cf(ds, decode_times=coder)) -From xarray 2024.11 the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. 
When a netCDF file contains dates outside of these bounds (or dates < 1582-10-15), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. +From xarray TODO: version the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a netCDF file contains dates outside of these bounds (or dates < 1582-10-15), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. :py:class:`~xarray.CFTimeIndex` enables a subset of the indexing functionality of a :py:class:`pandas.DatetimeIndex`. See :ref:`CFTimeIndex` for more information. diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index 8556d3b3985..06270421dfc 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -133,7 +133,6 @@ facilitated with the :py:meth:`xarray.Dataset.convert_calendar` method (also ava :py:meth:`xarray.DataArray.convert_calendar`). Here, like elsewhere in xarray, the ``use_cftime`` argument controls which datetime backend is used in the output. The default (``None``) is to use ``pandas`` when possible, i.e. when the calendar is standard and dates starting with 1582-10-15. -use ``pandas`` when possible, i.e. when the calendar is standard and dates are within 1678 and 2262. .. ipython:: python diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 94634704466..53151ef503a 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -574,7 +574,7 @@ def open_dataset( to toggle this feature per-variable individually. This keyword may not be supported by all the backends. Usage of 'use_cftime' as kwarg is deprecated. 
Please initialize it - with xarray.times.CFDatetimeCoder and 'decode_times' kwarg. + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -793,7 +793,7 @@ def open_dataarray( decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. This keyword may not be supported by all the backends. Usage of 'use_cftime' as kwarg is deprecated. Please initialize it - with xarray.times.CFDatetimeCoder and 'decode_times' kwarg. + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -1004,7 +1004,7 @@ def open_datatree( to toggle this feature per-variable individually. This keyword may not be supported by all the backends. Usage of 'use_cftime' as kwarg is deprecated. Please initialize it - with xarray.times.CFDatetimeCoder and 'decode_times' kwarg. + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -1228,7 +1228,7 @@ def open_groups( to toggle this feature per-variable individually. This keyword may not be supported by all the backends. Usage of 'use_cftime' as kwarg is deprecated. Please initialize it - with xarray.times.CFDatetimeCoder and 'decode_times' kwarg. + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. 
Dimensions will only be concatenated over (and diff --git a/xarray/conventions.py b/xarray/conventions.py index 7136cdb3085..147acb9e537 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -236,7 +236,7 @@ def decode_cf_variable( Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). If the _Unsigned attribute is present treat integer arrays as unsigned. - decode_times : bool or xarray.times.CFDatetimeCoder + decode_times : bool or CFDatetimeCoder Decode cf times ("hours since 2000-01-01") to np.datetime64. decode_endianness : bool Decode arrays from non-native to native endianness. @@ -255,7 +255,7 @@ def decode_cf_variable( decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. Usage of use_cftime as kwarg is deprecated, please initialize it with - xarray.times.CFDatetimeCoder and ``decode_times``. + CFDatetimeCoder and ``decode_times``. Returns ------- @@ -609,7 +609,7 @@ def decode_cf( decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. Usage of use_cftime as kwarg is deprecated, please initialize it with - xarray.times.CFDatetimeCoder and ``decode_times``. + CFDatetimeCoder and ``decode_times``. 
decode_timedelta : bool, optional If True, decode variables and coordinates with time units in {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} From 277d1c61c4c87ac8969fd4561bb92db43a5a0d50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 27 Nov 2024 09:01:25 +0100 Subject: [PATCH 061/134] docs, whats-new.rst --- doc/user-guide/time-series.rst | 6 ++++-- doc/whats-new.rst | 7 +++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index bc09e55f382..69f8c26d0e0 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -77,9 +77,11 @@ You can manual decode arrays in this form by passing a dataset to attrs = {"units": "hours since 2000-01-01"} ds = xr.Dataset({"time": ("time", [0, 1, 2, 3], attrs)}) - print("Default decoding to 'ns'-resolution:", xr.decode_cf(ds)) + # Default decoding to 'ns'-resolution + xr.decode_cf(ds) + # Decoding to 's'-resolution coder = xr.CFDatetimeCoder(time_unit="s") - print("Decoding to 's'-resolution:", xr.decode_cf(ds, decode_times=coder)) + xr.decode_cf(ds, decode_times=coder) From xarray TODO: version the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a netCDF file contains dates outside of these bounds (or dates < 1582-10-15), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. :py:class:`~xarray.CFTimeIndex` enables a subset of the indexing functionality of a :py:class:`pandas.DatetimeIndex`. 
diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8bd57339180..d953ae55cb5 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,7 +24,8 @@ New Features - Better support wrapping additional array types (e.g. ``cupy`` or ``jax``) by calling generalized duck array operations throughout more xarray methods. (:issue:`7848`, :pull:`9798`). By `Sam Levang `_. - +- Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`). + By `Kai Mühlbauer `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -32,7 +33,9 @@ Breaking changes Deprecations ~~~~~~~~~~~~ - +- Time decoding related kwarg ``use_cftime`` is deprecated. Use keyword argument + ``decode_times=CFDatetimeCoder(use_cftime=True)`` in the respective functions +instead. Bug fixes ~~~~~~~~~ From 81a9d94813f6cca712076f6d0d33f8f38e32f9e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 27 Nov 2024 10:33:38 +0100 Subject: [PATCH 062/134] fix whats-new.rst --- doc/whats-new.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d953ae55cb5..191a55a0285 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,7 +35,7 @@ Deprecations ~~~~~~~~~~~~ - Time decoding related kwarg ``use_cftime`` is deprecated. Use keyword argument ``decode_times=CFDatetimeCoder(use_cftime=True)`` in the respective functions -instead. + instead. 
Bug fixes ~~~~~~~~~ From 9653a012a49ce54e9e1ef25b815ecfad8b960b72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 10 Dec 2024 13:57:33 +0100 Subject: [PATCH 063/134] fix tests after merge --- xarray/tests/test_variable.py | 58 ++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 6c5fc3ac9e1..32c4f8130d6 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -275,33 +275,43 @@ def test_0d_time_data(self): expected = np.datetime64("2000-01-01", "ns") assert x[0].values == expected + dt64_data = pd.date_range("2000-01-01", periods=3) + @pytest.mark.filterwarnings("ignore:Converting non-default") - def test_datetime64_conversion(self): - times = pd.date_range("2000-01-01", periods=3) - for values in [ - times, - times.values, - times.values.astype("datetime64[s]"), - times.to_pydatetime(), - ]: - v = self.cls(["t"], values) - assert v.dtype == np.dtype("datetime64[ns]") - assert_array_equal(v.values, times.values) - assert v.values.dtype == np.dtype("datetime64[ns]") + @pytest.mark.parametrize( + "values, unit", + [ + (dt64_data, "ns"), + (dt64_data.values, "ns"), + (dt64_data.values.astype("datetime64[s]"), "s"), + (dt64_data.to_pydatetime(), "ns"), + ], + ) + def test_datetime64_conversion(self, values, unit): + # todo: check, if this test is OK + v = self.cls(["t"], values) + assert v.dtype == np.dtype(f"datetime64[{unit}]") + assert_array_equal(v.values, self.dt64_data.values) + assert v.values.dtype == np.dtype(f"datetime64[{unit}]") + + td64_data = pd.timedelta_range(start=0, periods=3) @pytest.mark.filterwarnings("ignore:Converting non-default") - def test_timedelta64_conversion(self): - times = pd.timedelta_range(start=0, periods=3) - for values in [ - times, - times.values, - times.values.astype("timedelta64[s]"), - times.to_pytimedelta(), - ]: - v = self.cls(["t"], values) - assert v.dtype == 
np.dtype("timedelta64[ns]") - assert_array_equal(v.values, times.values) - assert v.values.dtype == np.dtype("timedelta64[ns]") + @pytest.mark.parametrize( + "values, unit", + [ + (td64_data, "ns"), + (td64_data.values, "ns"), + (td64_data.values.astype("timedelta64[s]"), "s"), + (td64_data.to_pytimedelta(), "ns"), + ], + ) + def test_timedelta64_conversion(self, values, unit): + # todo: check, if this test is OK + v = self.cls(["t"], values) + assert v.dtype == np.dtype(f"timedelta64[{unit}]") + assert_array_equal(v.values, self.td64_data.values) + assert v.values.dtype == np.dtype(f"timedelta64[{unit}]") def test_object_conversion(self): data = np.arange(5).astype(str).astype(object) From 66c0b9ff03ff8bd6d1a9074e1406ad04d0a0de36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 13 Dec 2024 08:19:30 +0100 Subject: [PATCH 064/134] Apply suggestions from code review Co-authored-by: Deepak Cherian --- doc/user-guide/time-series.rst | 6 +++--- doc/user-guide/weather-climate.rst | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 69f8c26d0e0..9233791249e 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -22,7 +22,7 @@ Creating datetime64 data ------------------------ Xarray uses the numpy dtypes ``datetime64[unit]`` and ``timedelta64[unit]`` -(where unit is anything of "s", "ms", "us" and "ns") to represent datetime +(where unit is one of "s", "ms", "us" and "ns") to represent datetime data, which offer vectorized operations with numpy and smooth integration with pandas. To convert to or create regular arrays of ``datetime64`` data, we recommend @@ -62,7 +62,7 @@ attribute like ``'days since 2000-01-01'``). .. note:: When decoding/encoding datetimes for non-standard calendars or for dates - before 1582-10-15, xarray uses the `cftime`_ library. 
+ before [1582-10-15](https://en.wikipedia.org/wiki/Gregorian_calendar), xarray uses the `cftime`_ library by default. It was previously packaged with the ``netcdf4-python`` package under the name ``netcdftime`` but is now distributed separately. ``cftime`` is an :ref:`optional dependency` of xarray. @@ -80,7 +80,7 @@ You can manual decode arrays in this form by passing a dataset to # Default decoding to 'ns'-resolution xr.decode_cf(ds) # Decoding to 's'-resolution - coder = xr.CFDatetimeCoder(time_unit="s") + coder = xr.coders.CFDatetimeCoder(time_unit="s") xr.decode_cf(ds, decode_times=coder) From xarray TODO: version the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a netCDF file contains dates outside of these bounds (or dates < 1582-10-15), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index 06270421dfc..dace53d8d9f 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -64,7 +64,7 @@ Through the standalone ``cftime`` library and a custom subclass of :py:class:`pandas.Index`, xarray supports a subset of the indexing functionality enabled through the standard :py:class:`pandas.DatetimeIndex` for dates from non-standard calendars commonly used in climate science or dates -using a standard calendar, but outside the `precision range`_ and dates prior 1582-10-15. +using a standard calendar, but outside the `precision range`_ and dates [prior to 1582-10-15](https://en.wikipedia.org/wiki/Gregorian_calendar). .. 
note:: @@ -78,7 +78,7 @@ using a standard calendar, but outside the `precision range`_ and dates prior 15 - Any dates are outside the time span limited by the resolution (from xarray version v2024.11) Otherwise pandas-compatible dates from a standard calendar will be - represented with the ``np.datetime64[unit]`` data type (where unit can be any of ["s", "ms", "us", "ns"], enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features. + represented with the ``np.datetime64[unit]`` data type (where unit can be one of ["s", "ms", "us", "ns"], enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features. As of pandas version 2.0.0, pandas supports non-nanosecond precision datetime values. From xarray version 2024.11 the relaxed non-nanosecond precision datetime values will be used. @@ -132,7 +132,7 @@ Conversion between non-standard calendar and to/from pandas DatetimeIndexes is facilitated with the :py:meth:`xarray.Dataset.convert_calendar` method (also available as :py:meth:`xarray.DataArray.convert_calendar`). Here, like elsewhere in xarray, the ``use_cftime`` argument controls which datetime backend is used in the output. The default (``None``) is to -use ``pandas`` when possible, i.e. when the calendar is standard and dates starting with 1582-10-15. +use ``pandas`` when possible, i.e. when the calendar is standard and dates [starting with 1582-10-15]((https://en.wikipedia.org/wiki/Gregorian_calendar)). .. 
ipython:: python From ba512746bf98e5d4952c89e4d95fa4b5f74504c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 13 Dec 2024 08:45:13 +0100 Subject: [PATCH 065/134] provide CFDatetimeCoder from xarray.coders --- doc/api.rst | 12 +++++++++++- xarray/__init__.py | 5 ++--- xarray/coders.py | 10 ++++++++++ xarray/tests/test_backends.py | 2 +- xarray/tests/test_coding_times.py | 2 +- 5 files changed, 25 insertions(+), 6 deletions(-) create mode 100644 xarray/coders.py diff --git a/doc/api.rst b/doc/api.rst index 8bf2cff7e20..e8523a0a4b1 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -52,7 +52,6 @@ Creating a dataset Dataset decode_cf - CFDatetimeCoder Attributes ---------- @@ -1094,6 +1093,17 @@ DataTree methods .. Missing: .. ``open_mfdatatree`` +Encoding/Decoding +================= + +Coder objects +------------- + +.. autosummary:: + :toctree: generated/ + + coders.CFDatetimeCoder + Coordinates objects =================== diff --git a/xarray/__init__.py b/xarray/__init__.py index 9663bbd2efd..8af936ed27a 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -1,6 +1,6 @@ from importlib.metadata import version as _version -from xarray import groupers, testing, tutorial, ufuncs +from xarray import coders, groupers, testing, tutorial, ufuncs from xarray.backends.api import ( load_dataarray, load_dataset, @@ -15,7 +15,6 @@ from xarray.coding.cftime_offsets import cftime_range, date_range, date_range_like from xarray.coding.cftimeindex import CFTimeIndex from xarray.coding.frequencies import infer_freq -from xarray.coding.times import CFDatetimeCoder from xarray.conventions import SerializationWarning, decode_cf from xarray.core.alignment import align, broadcast from xarray.core.combine import combine_by_coords, combine_nested @@ -67,6 +66,7 @@ # `mypy --strict` running in projects that import xarray. 
__all__ = ( # noqa: RUF022 # Sub-packages + "coders", "groupers", "testing", "tutorial", @@ -114,7 +114,6 @@ "where", "zeros_like", # Classes - "CFDatetimeCoder", "CFTimeIndex", "Context", "Coordinates", diff --git a/xarray/coders.py b/xarray/coders.py new file mode 100644 index 00000000000..238ac714780 --- /dev/null +++ b/xarray/coders.py @@ -0,0 +1,10 @@ +""" +This module provides coder objects that encapsulate the +"encoding/decoding" process. +""" + +from xarray.coding.times import CFDatetimeCoder + +__all__ = [ + "CFDatetimeCoder", +] diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 7d5636930cb..8983b9810f3 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -29,7 +29,6 @@ import xarray as xr from xarray import ( - CFDatetimeCoder, DataArray, Dataset, backends, @@ -49,6 +48,7 @@ ) from xarray.backends.pydap_ import PydapDataStore from xarray.backends.scipy_ import ScipyBackendEntrypoint +from xarray.coders import CFDatetimeCoder from xarray.coding.cftime_offsets import cftime_range from xarray.coding.strings import check_vlen_dtype, create_vlen_dtype from xarray.coding.variables import SerializationWarning diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index ca9a8e2390a..decd870d77c 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -19,9 +19,9 @@ date_range, decode_cf, ) +from xarray.coders import CFDatetimeCoder from xarray.coding.times import _STANDARD_CALENDARS as _STANDARD_CALENDARS_UNSORTED from xarray.coding.times import ( - CFDatetimeCoder, _encode_datetime_with_cftime, _netcdf_to_numpy_timeunit, _numpy_to_netcdf_timeunit, From 3ba3e3f737a823d53124ae46dc6c13199ff6d8cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 13 Dec 2024 08:56:56 +0100 Subject: [PATCH 066/134] provide CFDatetimeCoder from xarray.coders --- xarray/backends/api.py | 2 +- xarray/conventions.py | 10 +++++----- 
xarray/convert.py | 3 ++- xarray/tests/test_conventions.py | 7 ++++--- xarray/tests/test_dataarray.py | 2 +- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 53151ef503a..12abb655e14 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -33,7 +33,7 @@ _normalize_path, ) from xarray.backends.locks import _get_scheduler -from xarray.coding.times import CFDatetimeCoder +from xarray.coders import CFDatetimeCoder from xarray.core import indexing from xarray.core.combine import ( _infer_concat_order_from_positions, diff --git a/xarray/conventions.py b/xarray/conventions.py index b2683e16691..105171878c0 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -200,7 +200,7 @@ def decode_cf_variable( warn( "Usage of 'use_cftime' as kwarg is deprecated. " - "Please initialize it with xarray.CFDatetimeCoder and " + "Please initialize it with xarray.coders.CFDatetimeCoder and " "'decode_times' kwarg.", DeprecationWarning, stacklevel=2, @@ -211,8 +211,8 @@ def decode_cf_variable( raise TypeError( "Usage of 'use_cftime' as kwarg is not allowed, " "if 'decode_times' is initialized with " - "xarray.CFDatetimeCoder. Please add 'use_cftime' " - "when initializing xarray.CFDatetimeCoder." + "xarray.coders.CFDatetimeCoder. Please add 'use_cftime' " + "when initializing xarray.coders.CFDatetimeCoder." ) var = decode_times.decode(var, name=name) @@ -483,7 +483,7 @@ def decode_cf( mask_and_scale : bool, optional Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). - decode_times : bool or xr.times.times.CFDatetimeCoder, optional + decode_times : bool or xr.coders.CFDatetimeCoder, optional Decode cf times (e.g., integers since "hours since 2000-01-01") to np.datetime64. decode_coords : bool or {"coordinates", "all"}, optional @@ -580,7 +580,7 @@ def cf_decoder( mask_and_scale : bool Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). 
- decode_times : bool | xr.times.times.CFDatetimeCoder + decode_times : bool | xr.coders.CFDatetimeCoder Decode cf times ("hours since 2000-01-01") to np.datetime64. Returns diff --git a/xarray/convert.py b/xarray/convert.py index 14df7cadb9b..29d8f9650e3 100644 --- a/xarray/convert.py +++ b/xarray/convert.py @@ -4,7 +4,8 @@ import numpy as np -from xarray.coding.times import CFDatetimeCoder, CFTimedeltaCoder +from xarray.coders import CFDatetimeCoder +from xarray.coding.times import CFTimedeltaCoder from xarray.conventions import decode_cf from xarray.core import duck_array_ops from xarray.core.dataarray import DataArray diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index e76e9a1d346..2886691ce32 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -18,6 +18,7 @@ ) from xarray.backends.common import WritableCFDataStore from xarray.backends.memory import InMemoryDataStore +from xarray.coders import CFDatetimeCoder from xarray.conventions import decode_cf from xarray.testing import assert_identical from xarray.tests import ( @@ -449,7 +450,7 @@ def test_dataset_repr_with_netcdf4_datetimes(self, time_unit) -> None: attrs = {"units": "days since 1900-01-01"} ds = decode_cf( Dataset({"time": ("time", [0, 1], attrs)}), - decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit), + decode_times=CFDatetimeCoder(time_unit=time_unit), ) assert f"(time) datetime64[{time_unit}]" in repr(ds) @@ -534,7 +535,7 @@ def test_decode_cf_time_kwargs(self, time_unit) -> None: ) dsc = conventions.decode_cf( - ds, decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit) + ds, decode_times=CFDatetimeCoder(time_unit=time_unit) ) assert dsc.timedelta.dtype == np.dtype("m8[ns]") assert dsc.time.dtype == np.dtype(f"M8[{time_unit}]") @@ -543,7 +544,7 @@ def test_decode_cf_time_kwargs(self, time_unit) -> None: assert dsc.time.dtype == np.dtype("int64") dsc = conventions.decode_cf( ds, - 
decode_times=coding.times.CFDatetimeCoder(time_unit=time_unit), + decode_times=CFDatetimeCoder(time_unit=time_unit), decode_timedelta=False, ) assert dsc.timedelta.dtype == np.dtype("int64") diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index a118212c981..fb5c5f8c25d 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -30,7 +30,7 @@ broadcast, set_options, ) -from xarray.coding.times import CFDatetimeCoder +from xarray.coders import CFDatetimeCoder from xarray.core import dtypes from xarray.core.common import full_like from xarray.core.coordinates import Coordinates From 1ab43ebc357281fbe266fbd0754425399bb837d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 13 Dec 2024 09:40:13 +0100 Subject: [PATCH 067/134] provide CFDatetimeCoder from xarray.coders --- xarray/conventions.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 105171878c0..042a7f14032 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -7,6 +7,7 @@ import numpy as np +from xarray.coders import CFDatetimeCoder from xarray.coding import strings, times, variables from xarray.coding.variables import SerializationWarning, pop_to from xarray.core import indexing @@ -88,7 +89,7 @@ def encode_cf_variable( ensure_not_multiindex(var, name=name) for coder in [ - times.CFDatetimeCoder(), + CFDatetimeCoder(), times.CFTimedeltaCoder(), variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), @@ -109,7 +110,7 @@ def decode_cf_variable( var: Variable, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool | times.CFDatetimeCoder = True, + decode_times: bool | CFDatetimeCoder = True, decode_endianness: bool = True, stack_char_dim: bool = True, use_cftime: bool | None = None, @@ -194,25 +195,25 @@ def decode_cf_variable( var = times.CFTimedeltaCoder().decode(var, name=name) if decode_times: # 
remove checks after end of deprecation cycle - if not isinstance(decode_times, times.CFDatetimeCoder): + if not isinstance(decode_times, CFDatetimeCoder): if use_cftime is not None: from warnings import warn warn( "Usage of 'use_cftime' as kwarg is deprecated. " - "Please initialize it with xarray.coders.CFDatetimeCoder and " + "Please initialize it with CFDatetimeCoder and " "'decode_times' kwarg.", DeprecationWarning, stacklevel=2, ) - decode_times = times.CFDatetimeCoder(use_cftime=use_cftime) + decode_times = CFDatetimeCoder(use_cftime=use_cftime) else: if use_cftime is not None: raise TypeError( "Usage of 'use_cftime' as kwarg is not allowed, " "if 'decode_times' is initialized with " - "xarray.coders.CFDatetimeCoder. Please add 'use_cftime' " - "when initializing xarray.coders.CFDatetimeCoder." + "CFDatetimeCoder. Please add 'use_cftime' " + "when initializing CFDatetimeCoder." ) var = decode_times.decode(var, name=name) @@ -325,9 +326,7 @@ def decode_cf_variables( attributes: T_Attrs, concat_characters: bool | Mapping[str, bool] = True, mask_and_scale: bool | Mapping[str, bool] = True, - decode_times: bool - | times.CFDatetimeCoder - | Mapping[str, bool | times.CFDatetimeCoder] = True, + decode_times: bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, use_cftime: bool | Mapping[str, bool] | None = None, @@ -464,7 +463,7 @@ def decode_cf( obj: T_DatasetOrAbstractstore, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool | times.CFDatetimeCoder = True, + decode_times: bool | CFDatetimeCoder = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, use_cftime: bool | None = None, @@ -483,7 +482,7 @@ def decode_cf( mask_and_scale : bool, optional Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). 
- decode_times : bool or xr.coders.CFDatetimeCoder, optional + decode_times : bool or CFDatetimeCoder, optional Decode cf times (e.g., integers since "hours since 2000-01-01") to np.datetime64. decode_coords : bool or {"coordinates", "all"}, optional @@ -563,7 +562,7 @@ def cf_decoder( attributes: T_Attrs, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool | times.CFDatetimeCoder = True, + decode_times: bool | CFDatetimeCoder = True, ) -> tuple[T_Variables, T_Attrs]: """ Decode a set of CF encoded variables and attributes. @@ -580,7 +579,7 @@ def cf_decoder( mask_and_scale : bool Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). - decode_times : bool | xr.coders.CFDatetimeCoder + decode_times : bool | CFDatetimeCoder Decode cf times ("hours since 2000-01-01") to np.datetime64. Returns From 45ba9d324c6c0d57594c6b6aa533995ea911dd88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 13 Dec 2024 09:51:01 +0100 Subject: [PATCH 068/134] fix tests as suggested by code review --- xarray/tests/test_variable.py | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 32c4f8130d6..15a45b5d220 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -3081,37 +3081,11 @@ def test_pandas_two_only_datetime_conversion_warnings( (pd.timedelta_range("1", periods=1), "ns"), (timedelta(days=1), "us"), (np.array([timedelta(days=1)]), "ns"), + (pd.timedelta_range("1", periods=1).astype("timedelta64[s]"), "s"), ], ids=lambda x: f"{x}", ) def test_timedelta_conversion_warning(values, unit) -> None: - # todo: needs discussion - # todo: check, if this test is OK dims = ["time"] if isinstance(values, np.ndarray | pd.Index) else [] var = Variable(dims, values) assert var.dtype == np.dtype(f"timedelta64[{unit}]") - - -def test_pandas_two_only_timedelta_conversion_warning() -> None: - 
# todo: test still needed? - # Note this test relies on a pandas feature that is only present in pandas - # 2.0.0 and above, and so for now cannot be parametrized. - data = pd.timedelta_range("1", periods=1).astype("timedelta64[s]") - var = Variable(["time"], data) - - assert var.dtype == np.dtype("timedelta64[s]") - - -@pytest.mark.parametrize( - ("index", "dtype"), - [ - (pd.date_range("2000", periods=1), "datetime64"), - (pd.timedelta_range("1", periods=1), "timedelta64"), - ], - ids=lambda x: f"{x}", -) -def test_pandas_indexing_adapter_non_nanosecond_conversion(index, dtype) -> None: - # todo: test still needed? - data = PandasIndexingAdapter(index.astype(f"{dtype}[s]")) - var = Variable(["time"], data) - assert var.dtype == np.dtype(f"{dtype}[s]") From ab3c9ed347d218fb438a83eb938d29ebdf55ac8a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 14 Dec 2024 16:39:23 +0000 Subject: [PATCH 069/134] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/coding/cftime_offsets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py index 452392d9dca..c800e19e63a 100644 --- a/xarray/coding/cftime_offsets.py +++ b/xarray/coding/cftime_offsets.py @@ -64,7 +64,6 @@ from xarray.core.pdcompat import ( count_not_none, default_precision_timestamp, - no_default, ) from xarray.core.utils import attempt_import, emit_user_level_warning From a16a890579db991b9eb4b3a3d99a3b0e4d746720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Dec 2024 13:14:43 +0100 Subject: [PATCH 070/134] Move scalar handling logic into `_possibly_convert_objects` as suggested by code review, try astype-conversion to "us" resolution when pd.Series can't convert object arrays --- xarray/core/variable.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git 
a/xarray/core/variable.py b/xarray/core/variable.py index 95bc39d31dd..d9f3998c9a6 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -6,7 +6,7 @@ import numbers import warnings from collections.abc import Callable, Hashable, Mapping, Sequence -from datetime import datetime, timedelta +from datetime import datetime from functools import partial from types import EllipsisType from typing import TYPE_CHECKING, Any, NoReturn, cast @@ -205,10 +205,23 @@ def _maybe_wrap_data(data): def _possibly_convert_objects(values): - """Convert arrays of datetime.datetime and datetime.timedelta objects into - datetime64 and timedelta64, according to the pandas convention. + """Convert object arrays into datetime64 and timedelta64 according + to the pandas convention. + + * datetime.datetime + * datetime.timedelta + * pd.Timestamp + * pd.Timedelta """ as_series = pd.Series(values.ravel(), copy=False) + # When receiving objects which pd.Series can't resolve by its own + # we try astype-conversion to "us"-resolution for datetimes and pd.Timestamp. 
+ if ( + values.dtype.kind == "O" + and as_series.dtype.kind == "O" + and isinstance(as_series[0], datetime | pd.Timestamp) + ): + as_series = as_series.astype("=M8[us]") result = np.asarray(as_series).reshape(values.shape) if not result.flags.writeable: # GH8843, pandas copy-on-write mode creates read-only arrays by default @@ -253,14 +266,6 @@ def convert_non_numpy_type(data): if isinstance(data, tuple): data = utils.to_0d_object_array(data) - if isinstance(data, pd.Timestamp): - data = data.to_numpy() - - if isinstance(data, datetime): - data = np.datetime64(data) - if isinstance(data, timedelta): - data = np.timedelta64(data) - # we don't want nested self-described arrays if isinstance(data, pd.Series | pd.DataFrame): pandas_data = data.values From 4283f8ab0f01b41afe30b3fa8be8fcca249652f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Dec 2024 13:34:14 +0100 Subject: [PATCH 071/134] Add note on ``proleptic_gregorian`` calendar --- doc/user-guide/weather-climate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index dace53d8d9f..6a56e3030f0 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -132,7 +132,7 @@ Conversion between non-standard calendar and to/from pandas DatetimeIndexes is facilitated with the :py:meth:`xarray.Dataset.convert_calendar` method (also available as :py:meth:`xarray.DataArray.convert_calendar`). Here, like elsewhere in xarray, the ``use_cftime`` argument controls which datetime backend is used in the output. The default (``None``) is to -use ``pandas`` when possible, i.e. when the calendar is standard and dates [starting with 1582-10-15]((https://en.wikipedia.org/wiki/Gregorian_calendar)). +use ``pandas`` when possible, i.e. when the calendar is ``standard``/``gregorian`` and dates [starting with 1582-10-15]((https://en.wikipedia.org/wiki/Gregorian_calendar)). 
There is no such restriction when converting to ``proleptic_gregorian`` calendar. .. ipython:: python From 0ba848d277ce4a04b5284a000b1e2c7dc121a2a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Dec 2024 13:38:02 +0100 Subject: [PATCH 072/134] remove time_resolution from docstring --- xarray/core/options.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/core/options.py b/xarray/core/options.py index 23ec5bb3f73..2d69e4b6584 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -258,8 +258,6 @@ class set_options: warn_for_unclosed_files : bool, default: False Whether or not to issue a warning when unclosed files are deallocated. This is mostly useful for debugging. - time_resolution : {"s", "ms", "us", "ns"}, default: "s" - Time resolution used for CF encoding/decoding. Examples -------- From 6cb87029f18086120b22a359b80c83edbc3589a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Dec 2024 14:35:08 +0100 Subject: [PATCH 073/134] update time.coding.rst wrt default time unit --- doc/internals/time-coding.rst | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index 09d73d8e7b8..2ad3f11b4d2 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -334,7 +334,7 @@ Decoding of ``values`` with time unit specification like ``seconds since 1992-10 3. As the unit (here ``seconds``) and the resolution of the reference time ``1992-10-8 15:15:42.5 -6:00`` (here ``milliseconds``) might be different, this has to be aligned to the higher resolution (retrieve new unit). User may also specify their wanted target resolution by setting kwarg ``time_unit`` to one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` (default ``'ns'``). This will be included into the alignment process. This is done by multiplying the ``values`` by the ratio of nanoseconds per time unit and nanoseconds per reference time unit. 
To not break consistency for ``NaT`` a mask is kept and re-introduced after the multiplication. -4. Times encoded as floating point values are checked for fractional parts and the resolution is enhanced in an iterative process until a fitting resolution (or nansosecond) is found. A ``SerializationWarning`` is issued to make the user aware of the possibly problematic encoding. +4. Times encoded as floating point values are checked for fractional parts and the resolution is enhanced in an iterative process until a fitting resolution (or ``'ns'``) is found. A ``SerializationWarning`` is issued to make the user aware of the possibly problematic encoding. 5. Finally, the ``values`` (``int64``) are cast to ``datetime64[unit]`` (using the above retrieved unit) and added to the reference time :py:class:`pandas.Timestamp`. @@ -439,6 +439,4 @@ For encoding the process is more or less a reversal of the above, but we have to Default Time Unit ~~~~~~~~~~~~~~~~~ -The default time unit of xarray is ``'s'``. It aligns well with the lower resolution of pandas. For normal operation that has no consequences on the output as all decoded datetimes are already at least in second resolution. Setting the default time unit to ``'ns'`` (the former default) the datetimes will be converted to ``'ns'``-resolution, if possible. Same holds true for ``'us'`` and ``'ms'``. - -If the datetimes are decoded to ``'us'`` resolution, this resolution will be kept, even if the default resolution is set to ``'s'`` or ``'ms'``. +The current default time unit of xarray is ``'ns'``. Setting keyword argument ``time_unit`` unit to ``'s'`` (the lowest resolution pandas allows) datetimes will be converted to at least ``'s'``-resolution, if possible. Same holds true for ``'ms'`` and ``'us'``. 
From 5de8d0dad984c0f4c9c09e566204772eace5b070 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Dec 2024 14:56:01 +0100 Subject: [PATCH 074/134] fix empty array --- xarray/core/variable.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index d9f3998c9a6..496f956990f 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -219,6 +219,7 @@ def _possibly_convert_objects(values): if ( values.dtype.kind == "O" and as_series.dtype.kind == "O" + and as_series.size > 0 and isinstance(as_series[0], datetime | pd.Timestamp) ): as_series = as_series.astype("=M8[us]") From fc985d924c0d300d8d4873cd26a1ece4e125ca0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 16 Dec 2024 15:57:24 +0100 Subject: [PATCH 075/134] revert some tests to align with scalar logic handling --- xarray/core/variable.py | 9 ++++++--- xarray/tests/test_variable.py | 18 +++++++++--------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 496f956990f..53c1fd76f7b 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -215,14 +215,17 @@ def _possibly_convert_objects(values): """ as_series = pd.Series(values.ravel(), copy=False) # When receiving objects which pd.Series can't resolve by its own - # we try astype-conversion to "us"-resolution for datetimes and pd.Timestamp. + # we try astype-conversion to "ns"-resolution for datetimes and pd.Timestamp. 
if ( values.dtype.kind == "O" and as_series.dtype.kind == "O" and as_series.size > 0 - and isinstance(as_series[0], datetime | pd.Timestamp) + and ( + isinstance(as_series[0], datetime | pd.Timestamp) + or pd.api.types.is_datetime64_dtype(as_series[0]) + ) ): - as_series = as_series.astype("=M8[us]") + as_series = as_series.astype("=M8[ns]") result = np.asarray(as_series).reshape(values.shape) if not result.flags.writeable: # GH8843, pandas copy-on-write mode creates read-only arrays by default diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 15a45b5d220..e3c55081d6a 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1086,8 +1086,8 @@ def test_numpy_same_methods(self): "values, unit", [ (np.datetime64("2000-01-01"), "s"), - (pd.Timestamp("2000-01-01T00"), "s"), - (datetime(2000, 1, 1), "us"), + (pd.Timestamp("2000-01-01T00"), "ns"), + (datetime(2000, 1, 1), "ns"), ], ) def test_datetime64_conversion_scalar(self, values, unit): @@ -1102,8 +1102,8 @@ def test_datetime64_conversion_scalar(self, values, unit): "values, unit", [ (np.timedelta64(1, "D"), "s"), - (pd.Timedelta("1 day"), "us"), - (timedelta(days=1), "us"), + (pd.Timedelta("1 day"), "ns"), + (timedelta(days=1), "ns"), ], ) def test_timedelta64_conversion_scalar(self, values, unit): @@ -1126,12 +1126,12 @@ def test_0d_str(self): def test_0d_datetime(self): # todo: check, if this test is OK v = Variable([], pd.Timestamp("2000-01-01")) - assert v.dtype == np.dtype("datetime64[s]") + assert v.dtype == np.dtype("datetime64[ns]") assert v.values == np.datetime64("2000-01-01", "s") @pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize( - "values, unit", [(pd.to_timedelta("1s"), "us"), (np.timedelta64(1, "s"), "s")] + "values, unit", [(pd.to_timedelta("1s"), "ns"), (np.timedelta64(1, "s"), "s")] ) def test_0d_timedelta(self, values, unit): # todo: check, if this test is OK @@ -2679,7 +2679,7 @@ def test_datetime(self): 
actual = as_compatible_data(datetime(2000, 1, 1)) assert np.asarray(expected) == actual assert np.ndarray is type(actual) - assert np.dtype("datetime64[us]") == actual.dtype + assert np.dtype("datetime64[ns]") == actual.dtype def test_tz_datetime(self) -> None: # todo: check, if this test is OK @@ -3002,7 +3002,7 @@ def test_from_pint_wrapping_dask(self, Var): (np.array([np.datetime64("2000-01-01", "ns")]), "ns"), (np.array([np.datetime64("2000-01-01", "s")]), "s"), (pd.date_range("2000", periods=1), "ns"), - (datetime(2000, 1, 1), "us"), + (datetime(2000, 1, 1), "ns"), (np.array([datetime(2000, 1, 1)]), "ns"), (pd.date_range("2000", periods=1, tz=pytz.timezone("America/New_York")), "ns"), ( @@ -3079,7 +3079,7 @@ def test_pandas_two_only_datetime_conversion_warnings( (np.array([np.timedelta64(10, "ns")]), "ns"), (np.array([np.timedelta64(10, "s")]), "s"), (pd.timedelta_range("1", periods=1), "ns"), - (timedelta(days=1), "us"), + (timedelta(days=1), "ns"), (np.array([timedelta(days=1)]), "ns"), (pd.timedelta_range("1", periods=1).astype("timedelta64[s]"), "s"), ], From a2d8e69078805918519b819c8e2d31ac1d649eff Mon Sep 17 00:00:00 2001 From: Kai Muehlbauer Date: Fri, 22 Nov 2024 11:19:14 +0100 Subject: [PATCH 076/134] split out CFDatetimeCoder into coders, deprecate use_cftime as keyword argument --- doc/api.rst | 11 ++++++ doc/whats-new.rst | 3 ++ xarray/__init__.py | 3 +- xarray/backends/api.py | 47 +++++++++++++++++------ xarray/coders.py | 10 +++++ xarray/coding/times.py | 21 +++++++++-- xarray/conventions.py | 46 +++++++++++++++++----- xarray/convert.py | 3 +- xarray/tests/test_backends.py | 15 ++++++-- xarray/tests/test_coding_times.py | 63 ++++++++++++++++++++++--------- xarray/tests/test_conventions.py | 4 +- xarray/tests/test_dataarray.py | 2 +- 12 files changed, 176 insertions(+), 52 deletions(-) create mode 100644 xarray/coders.py diff --git a/doc/api.rst b/doc/api.rst index 342ae08e1a4..f731ac1c59a 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -1096,6 
+1096,17 @@ DataTree methods .. Missing: .. ``open_mfdatatree`` +Encoding/Decoding +================= + +Coder objects +------------- + +.. autosummary:: + :toctree: generated/ + + coders.CFDatetimeCoder + Coordinates objects =================== diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ecf1702c356..324da980261 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -42,6 +42,9 @@ Deprecations - Finalize deprecation of ``closed`` parameters of :py:func:`cftime_range` and :py:func:`date_range` (:pull:`9882`). By `Kai Mühlbauer `_. +- Time decoding related kwarg ``use_cftime`` is deprecated. Use keyword argument + ``decode_times=CFDatetimeCoder(use_cftime=True)`` in the respective functions + instead. Bug fixes ~~~~~~~~~ diff --git a/xarray/__init__.py b/xarray/__init__.py index 622c927b468..8af936ed27a 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -1,6 +1,6 @@ from importlib.metadata import version as _version -from xarray import groupers, testing, tutorial, ufuncs +from xarray import coders, groupers, testing, tutorial, ufuncs from xarray.backends.api import ( load_dataarray, load_dataset, @@ -66,6 +66,7 @@ # `mypy --strict` running in projects that import xarray. 
__all__ = ( # noqa: RUF022 # Sub-packages + "coders", "groupers", "testing", "tutorial", diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 785ab3913ef..12abb655e14 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -33,6 +33,7 @@ _normalize_path, ) from xarray.backends.locks import _get_scheduler +from xarray.coders import CFDatetimeCoder from xarray.core import indexing from xarray.core.combine import ( _infer_concat_order_from_positions, @@ -481,7 +482,10 @@ def open_dataset( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool | Mapping[str, bool] | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, use_cftime: bool | Mapping[str, bool] | None = None, concat_characters: bool | Mapping[str, bool] | None = None, @@ -543,9 +547,9 @@ def open_dataset( be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_times : bool or dict-like, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -569,6 +573,8 @@ def open_dataset( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with CFDatetimeCoder and 'decode_times' kwarg. 
concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -698,7 +704,10 @@ def open_dataarray( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | None = None, - decode_times: bool | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | None = None, use_cftime: bool | None = None, concat_characters: bool | None = None, @@ -761,9 +770,11 @@ def open_dataarray( `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will be replaced by NA. This keyword may not be supported by all the backends. - decode_times : bool, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. + Pass a mapping, e.g. ``{"my_variable": False}``, + to toggle this feature per-variable individually. This keyword may not be supported by all the backends. decode_timedelta : bool, optional If True, decode variables and coordinates with time units in @@ -781,6 +792,8 @@ def open_dataarray( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. 
Dimensions will only be concatenated over (and @@ -903,7 +916,10 @@ def open_datatree( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool | Mapping[str, bool] | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, use_cftime: bool | Mapping[str, bool] | None = None, concat_characters: bool | Mapping[str, bool] | None = None, @@ -961,9 +977,9 @@ def open_datatree( be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_times : bool or dict-like, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -987,6 +1003,8 @@ def open_datatree( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. 
Dimensions will only be concatenated over (and @@ -1118,7 +1136,10 @@ def open_groups( cache: bool | None = None, decode_cf: bool | None = None, mask_and_scale: bool | Mapping[str, bool] | None = None, - decode_times: bool | Mapping[str, bool] | None = None, + decode_times: bool + | CFDatetimeCoder + | Mapping[str, bool | CFDatetimeCoder] + | None = None, decode_timedelta: bool | Mapping[str, bool] | None = None, use_cftime: bool | Mapping[str, bool] | None = None, concat_characters: bool | Mapping[str, bool] | None = None, @@ -1180,9 +1201,9 @@ def open_groups( be replaced by NA. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - decode_times : bool or dict-like, optional + decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. + into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -1206,6 +1227,8 @@ def open_groups( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. + Usage of 'use_cftime' as kwarg is deprecated. Please initialize it + with CFDatetimeCoder and 'decode_times' kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and diff --git a/xarray/coders.py b/xarray/coders.py new file mode 100644 index 00000000000..238ac714780 --- /dev/null +++ b/xarray/coders.py @@ -0,0 +1,10 @@ +""" +This module provides coder objects that encapsulate the +"encoding/decoding" process. 
+""" + +from xarray.coding.times import CFDatetimeCoder + +__all__ = [ + "CFDatetimeCoder", +] diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 4622298e152..b6bd11b4490 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -36,7 +36,11 @@ except ImportError: cftime = None -from xarray.core.types import CFCalendar, NPDatetimeUnitOptions, T_DuckArray +from xarray.core.types import ( + CFCalendar, + NPDatetimeUnitOptions, + T_DuckArray, +) T_Name = Union[Hashable, None] @@ -204,7 +208,10 @@ def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]: def _decode_cf_datetime_dtype( - data, units: str, calendar: str | None, use_cftime: bool | None + data, + units: str, + calendar: str | None, + use_cftime: bool | None, ) -> np.dtype: # Verify that at least the first and last date can be decoded # successfully. Otherwise, tracebacks end up swallowed by @@ -311,7 +318,10 @@ def _decode_datetime_with_pandas( def decode_cf_datetime( - num_dates, units: str, calendar: str | None = None, use_cftime: bool | None = None + num_dates, + units: str, + calendar: str | None = None, + use_cftime: bool | None = None, ) -> np.ndarray: """Given an array of numeric dates in netCDF format, convert it into a numpy array of date time objects. 
@@ -974,7 +984,10 @@ def _lazily_encode_cf_timedelta( class CFDatetimeCoder(VariableCoder): - def __init__(self, use_cftime: bool | None = None) -> None: + def __init__( + self, + use_cftime: bool | None = None, + ) -> None: self.use_cftime = use_cftime def encode(self, variable: Variable, name: T_Name = None) -> Variable: diff --git a/xarray/conventions.py b/xarray/conventions.py index 57407a15f51..042a7f14032 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -7,6 +7,7 @@ import numpy as np +from xarray.coders import CFDatetimeCoder from xarray.coding import strings, times, variables from xarray.coding.variables import SerializationWarning, pop_to from xarray.core import indexing @@ -88,7 +89,7 @@ def encode_cf_variable( ensure_not_multiindex(var, name=name) for coder in [ - times.CFDatetimeCoder(), + CFDatetimeCoder(), times.CFTimedeltaCoder(), variables.CFScaleOffsetCoder(), variables.CFMaskCoder(), @@ -109,7 +110,7 @@ def decode_cf_variable( var: Variable, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool = True, + decode_times: bool | CFDatetimeCoder = True, decode_endianness: bool = True, stack_char_dim: bool = True, use_cftime: bool | None = None, @@ -136,7 +137,7 @@ def decode_cf_variable( Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). If the _Unsigned attribute is present treat integer arrays as unsigned. - decode_times : bool + decode_times : bool or CFDatetimeCoder Decode cf times ("hours since 2000-01-01") to np.datetime64. decode_endianness : bool Decode arrays from non-native to native endianness. @@ -154,6 +155,8 @@ def decode_cf_variable( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. + Usage of use_cftime as kwarg is deprecated, please initialize it with + CFDatetimeCoder and ``decode_times``. 
Returns ------- @@ -167,7 +170,7 @@ def decode_cf_variable( original_dtype = var.dtype if decode_timedelta is None: - decode_timedelta = decode_times + decode_timedelta = True if decode_times else False if concat_characters: if stack_char_dim: @@ -191,7 +194,28 @@ def decode_cf_variable( if decode_timedelta: var = times.CFTimedeltaCoder().decode(var, name=name) if decode_times: - var = times.CFDatetimeCoder(use_cftime=use_cftime).decode(var, name=name) + # remove checks after end of deprecation cycle + if not isinstance(decode_times, CFDatetimeCoder): + if use_cftime is not None: + from warnings import warn + + warn( + "Usage of 'use_cftime' as kwarg is deprecated. " + "Please initialize it with CFDatetimeCoder and " + "'decode_times' kwarg.", + DeprecationWarning, + stacklevel=2, + ) + decode_times = CFDatetimeCoder(use_cftime=use_cftime) + else: + if use_cftime is not None: + raise TypeError( + "Usage of 'use_cftime' as kwarg is not allowed, " + "if 'decode_times' is initialized with " + "CFDatetimeCoder. Please add 'use_cftime' " + "when initializing CFDatetimeCoder." 
+ ) + var = decode_times.decode(var, name=name) if decode_endianness and not var.dtype.isnative: var = variables.EndianCoder().decode(var) @@ -302,7 +326,7 @@ def decode_cf_variables( attributes: T_Attrs, concat_characters: bool | Mapping[str, bool] = True, mask_and_scale: bool | Mapping[str, bool] = True, - decode_times: bool | Mapping[str, bool] = True, + decode_times: bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, use_cftime: bool | Mapping[str, bool] | None = None, @@ -439,7 +463,7 @@ def decode_cf( obj: T_DatasetOrAbstractstore, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool = True, + decode_times: bool | CFDatetimeCoder = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, use_cftime: bool | None = None, @@ -458,7 +482,7 @@ def decode_cf( mask_and_scale : bool, optional Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). - decode_times : bool, optional + decode_times : bool or CFDatetimeCoder, optional Decode cf times (e.g., integers since "hours since 2000-01-01") to np.datetime64. decode_coords : bool or {"coordinates", "all"}, optional @@ -483,6 +507,8 @@ def decode_cf( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. + Usage of use_cftime as kwarg is deprecated, please initialize it with + CFDatetimeCoder and ``decode_times``. 
decode_timedelta : bool, optional If True, decode variables and coordinates with time units in {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} @@ -536,7 +562,7 @@ def cf_decoder( attributes: T_Attrs, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool = True, + decode_times: bool | CFDatetimeCoder = True, ) -> tuple[T_Variables, T_Attrs]: """ Decode a set of CF encoded variables and attributes. @@ -553,7 +579,7 @@ def cf_decoder( mask_and_scale : bool Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). - decode_times : bool + decode_times : bool | CFDatetimeCoder Decode cf times ("hours since 2000-01-01") to np.datetime64. Returns diff --git a/xarray/convert.py b/xarray/convert.py index 14df7cadb9b..29d8f9650e3 100644 --- a/xarray/convert.py +++ b/xarray/convert.py @@ -4,7 +4,8 @@ import numpy as np -from xarray.coding.times import CFDatetimeCoder, CFTimedeltaCoder +from xarray.coders import CFDatetimeCoder +from xarray.coding.times import CFTimedeltaCoder from xarray.conventions import decode_cf from xarray.core import duck_array_ops from xarray.core.dataarray import DataArray diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index ff254225321..31e06e8efd8 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -48,6 +48,7 @@ ) from xarray.backends.pydap_ import PydapDataStore from xarray.backends.scipy_ import ScipyBackendEntrypoint +from xarray.coders import CFDatetimeCoder from xarray.coding.cftime_offsets import cftime_range from xarray.coding.strings import check_vlen_dtype, create_vlen_dtype from xarray.coding.variables import SerializationWarning @@ -3206,7 +3207,10 @@ def test_open_zarr_use_cftime(self) -> None: ds.to_zarr(store_target, **self.version_kwargs) ds_a = xr.open_zarr(store_target, **self.version_kwargs) assert_identical(ds, ds_a) - ds_b = xr.open_zarr(store_target, use_cftime=True, **self.version_kwargs) + decoder = 
CFDatetimeCoder(use_cftime=True) + ds_b = xr.open_zarr( + store_target, decode_times=decoder, **self.version_kwargs + ) assert xr.coding.times.contains_cftime_datetimes(ds_b.time.variable) def test_write_read_select_write(self) -> None: @@ -5622,7 +5626,8 @@ def test_use_cftime_true(calendar, units_year) -> None: with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) with warnings.catch_warnings(record=True) as record: - with open_dataset(tmp_file, use_cftime=True) as ds: + decoder = CFDatetimeCoder(use_cftime=True) + with open_dataset(tmp_file, decode_times=decoder) as ds: assert_identical(expected_x, ds.x) assert_identical(expected_time, ds.time) _assert_no_dates_out_of_range_warning(record) @@ -5674,7 +5679,8 @@ def test_use_cftime_false_standard_calendar_out_of_range(calendar, units_year) - with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) with pytest.raises((OutOfBoundsDatetime, ValueError)): - open_dataset(tmp_file, use_cftime=False) + decoder = CFDatetimeCoder(use_cftime=False) + open_dataset(tmp_file, decode_times=decoder) @requires_scipy_or_netCDF4 @@ -5692,7 +5698,8 @@ def test_use_cftime_false_nonstandard_calendar(calendar, units_year) -> None: with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) with pytest.raises((OutOfBoundsDatetime, ValueError)): - open_dataset(tmp_file, use_cftime=False) + decoder = CFDatetimeCoder(use_cftime=False) + open_dataset(tmp_file, decode_times=decoder) @pytest.mark.parametrize("engine", ["netcdf4", "scipy"]) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 9a51ca40d07..e05d303e17b 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -19,9 +19,9 @@ date_range, decode_cf, ) +from xarray.coders import CFDatetimeCoder from xarray.coding.times import _STANDARD_CALENDARS as _STANDARD_CALENDARS_UNSORTED from xarray.coding.times import ( - CFDatetimeCoder, _encode_datetime_with_cftime, _netcdf_to_numpy_timeunit, 
_numpy_to_netcdf_timeunit, @@ -123,7 +123,11 @@ def _all_cftime_date_types(): @pytest.mark.filterwarnings("ignore:Ambiguous reference date string") @pytest.mark.filterwarnings("ignore:Times can't be serialized faithfully") @pytest.mark.parametrize(["num_dates", "units", "calendar"], _CF_DATETIME_TESTS) -def test_cf_datetime(num_dates, units, calendar) -> None: +def test_cf_datetime( + num_dates, + units, + calendar, +) -> None: import cftime expected = cftime.num2date( @@ -167,8 +171,8 @@ def test_decode_cf_datetime_overflow() -> None: units = "days since 2000-01-01 00:00:00" # date after 2262 and before 1678 - days = (-117608, 95795) - expected = (datetime(1677, 12, 31), datetime(2262, 4, 12)) + days = (-117710, 95795) + expected = (datetime(1677, 9, 20), datetime(2262, 4, 12)) for i, day in enumerate(days): with warnings.catch_warnings(): @@ -277,15 +281,15 @@ def test_decode_dates_outside_timestamp_range(calendar) -> None: @requires_cftime @pytest.mark.parametrize("calendar", _STANDARD_CALENDARS) +@pytest.mark.parametrize("num_time", [735368, [735368], [[735368]]]) def test_decode_standard_calendar_single_element_inside_timestamp_range( - calendar, + calendar, num_time ) -> None: units = "days since 0001-01-01" - for num_time in [735368, [735368], [[735368]]]: - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", "Unable to decode time axis") - actual = decode_cf_datetime(num_time, units, calendar=calendar) - assert actual.dtype == np.dtype("M8[ns]") + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", "Unable to decode time axis") + actual = decode_cf_datetime(num_time, units, calendar=calendar) + assert actual.dtype == np.dtype("M8[ns]") @requires_cftime @@ -628,10 +632,10 @@ def test_cf_timedelta_2d() -> None: @pytest.mark.parametrize( ["deltas", "expected"], [ - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, 
unused-ignore] - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] - (pd.to_timedelta(["1 day", "2 days"]), "days"), # type: ignore[arg-type, unused-ignore] + (pd.to_timedelta(["1 day", "2 days"]), "days"), + (pd.to_timedelta(["1h", "1 day 1 hour"]), "hours"), + (pd.to_timedelta(["1m", "2m", np.nan]), "minutes"), + (pd.to_timedelta(["1m3s", "1m4s"]), "seconds"), ], ) def test_infer_timedelta_units(deltas, expected) -> None: @@ -675,7 +679,7 @@ def test_decode_cf(calendar) -> None: if calendar not in _STANDARD_CALENDARS: assert ds.test.dtype == np.dtype("O") else: - assert ds.test.dtype == np.dtype("M8[ns]") + assert ds.test.dtype == np.dtype("=M8[ns]") def test_decode_cf_time_bounds() -> None: @@ -700,7 +704,7 @@ def test_decode_cf_time_bounds() -> None: "calendar": "standard", } dsc = decode_cf(ds) - assert dsc.time_bnds.dtype == np.dtype("M8[ns]") + assert dsc.time_bnds.dtype == np.dtype("=M8[ns]") dsc = decode_cf(ds, decode_times=False) assert dsc.time_bnds.dtype == np.dtype("int64") @@ -1072,7 +1076,8 @@ def test_encode_decode_roundtrip_cftime(freq) -> None: ) variable = Variable(["time"], times) encoded = conventions.encode_cf_variable(variable) - decoded = conventions.decode_cf_variable("time", encoded, use_cftime=True) + decoder = CFDatetimeCoder(use_cftime=True) + decoded = conventions.decode_cf_variable("time", encoded, decode_times=decoder) assert_equal(variable, decoded) @@ -1182,7 +1187,7 @@ def test_decode_0size_datetime(use_cftime): if use_cftime and not has_cftime: pytest.skip() - dtype = object if use_cftime else "M8[ns]" + dtype = object if use_cftime else "=M8[ns]" expected = np.array([], dtype=dtype) actual = decode_cf_datetime( np.zeros(shape=0, dtype=np.int64), @@ -1209,6 +1214,28 @@ def test_decode_float_datetime(): np.testing.assert_equal(actual, expected) +def test_decode_float_datetime_with_decimals() -> None: + # test resolution enhancement for floats + values = np.array([0, 0.125, 0.25, 0.375, 0.75, 
1.0], dtype="float32") + expected = np.array( + [ + "2000-01-01T00:00:00.000", + "2000-01-01T00:00:00.125", + "2000-01-01T00:00:00.250", + "2000-01-01T00:00:00.375", + "2000-01-01T00:00:00.750", + "2000-01-01T00:00:01.000", + ], + dtype="=M8[ns]", + ) + + units = "seconds since 2000-01-01" + calendar = "standard" + actual = decode_cf_datetime(values, units, calendar) + assert actual.dtype == expected.dtype + np.testing.assert_equal(actual, expected) + + @requires_cftime def test_scalar_unit() -> None: # test that a scalar units (often NaN when using to_netcdf) does not raise an error diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 495d760c534..7616f12957f 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -446,7 +446,9 @@ def test_dataset_repr_with_netcdf4_datetimes(self) -> None: assert "(time) object" in repr(ds) attrs = {"units": "days since 1900-01-01"} - ds = decode_cf(Dataset({"time": ("time", [0, 1], attrs)})) + ds = decode_cf( + Dataset({"time": ("time", [0, 1], attrs)}), + ) assert "(time) datetime64[ns]" in repr(ds) @requires_cftime diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index ea5186e59d0..a14f9990dca 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -30,7 +30,7 @@ broadcast, set_options, ) -from xarray.coding.times import CFDatetimeCoder +from xarray.coders import CFDatetimeCoder from xarray.core import dtypes from xarray.core.common import full_like from xarray.core.coordinates import Coordinates From d6fe95683d980017e6827a64c16f01c1a92faad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Dec 2024 16:53:37 +0100 Subject: [PATCH 077/134] add whats-new.rst entry --- doc/whats-new.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 324da980261..6d4dead1153 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -27,6 +27,9 @@ New 
Features - Add ``unit`` - keyword argument to :py:func:`date_range` and ``microsecond`` parsing to iso8601-parser (:pull:`9885`). By `Kai Mühlbauer `_. +- Split out ``CFDatetimeCoder`` in ``xr.coders``, make ``decode_times`` keyword argument + consume ``CFDatetimeCoder``. + Breaking changes ~~~~~~~~~~~~~~~~ @@ -45,6 +48,7 @@ Deprecations - Time decoding related kwarg ``use_cftime`` is deprecated. Use keyword argument ``decode_times=CFDatetimeCoder(use_cftime=True)`` in the respective functions instead. + By `Kai Mühlbauer `_. Bug fixes ~~~~~~~~~ From bd6a5d1e5403aba71191c8c4013d216f4f3be799 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Dec 2024 17:19:45 +0100 Subject: [PATCH 078/134] Apply suggestions from code review Co-authored-by: Deepak Cherian --- doc/whats-new.rst | 5 ++--- xarray/backends/api.py | 6 +++--- xarray/conventions.py | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6d4dead1153..b64a801e0a3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -27,7 +27,7 @@ New Features - Add ``unit`` - keyword argument to :py:func:`date_range` and ``microsecond`` parsing to iso8601-parser (:pull:`9885`). By `Kai Mühlbauer `_. -- Split out ``CFDatetimeCoder`` in ``xr.coders``, make ``decode_times`` keyword argument +- Split out ``CFDatetimeCoder`` as public API in ``xr.coders``, make ``decode_times`` keyword argument consume ``CFDatetimeCoder``. @@ -46,8 +46,7 @@ Deprecations :py:func:`date_range` (:pull:`9882`). By `Kai Mühlbauer `_. - Time decoding related kwarg ``use_cftime`` is deprecated. Use keyword argument - ``decode_times=CFDatetimeCoder(use_cftime=True)`` in the respective functions - instead. + ``decode_times=CFDatetimeCoder(use_cftime=True)`` in :py:func:`~xarray.open_dataset`, :py:func:`~xarray.open_dataarray`, :py:func:`~xarray.open_datatree`, :py:func:`~xarray.open_groups`, :py:func:`~xarray.open_zarr` and :py:func:`~xarray.decode_cf` instead. 
By `Kai Mühlbauer `_. Bug fixes diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 12abb655e14..4b71c6d2f91 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -549,7 +549,7 @@ def open_dataset( This keyword may not be supported by all the backends. decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. + into datetime objects. Otherwise, use ``CFDatetimeCoder`` or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -573,8 +573,8 @@ def open_dataset( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - Usage of 'use_cftime' as kwarg is deprecated. Please initialize it - with CFDatetimeCoder and 'decode_times' kwarg. + .. deprecated:: 2024.12.0 + Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and diff --git a/xarray/conventions.py b/xarray/conventions.py index 042a7f14032..1c2759cd631 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -199,7 +199,7 @@ def decode_cf_variable( if use_cftime is not None: from warnings import warn - warn( + emit_user_level_warning( "Usage of 'use_cftime' as kwarg is deprecated. 
" "Please initialize it with CFDatetimeCoder and " "'decode_times' kwarg.", From 6557ef97e398840a19291e49a4d474f9b7b8f0a0 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 16:20:05 +0000 Subject: [PATCH 079/134] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/conventions.py | 1 - 1 file changed, 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 1c2759cd631..e213c4258cc 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -197,7 +197,6 @@ def decode_cf_variable( # remove checks after end of deprecation cycle if not isinstance(decode_times, CFDatetimeCoder): if use_cftime is not None: - from warnings import warn emit_user_level_warning( "Usage of 'use_cftime' as kwarg is deprecated. " From 759fb72dc8122049270258fc589ac28313f72aad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Dec 2024 17:25:08 +0100 Subject: [PATCH 080/134] fix warning --- xarray/conventions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index e213c4258cc..7c076cfa3da 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -197,13 +197,11 @@ def decode_cf_variable( # remove checks after end of deprecation cycle if not isinstance(decode_times, CFDatetimeCoder): if use_cftime is not None: - emit_user_level_warning( "Usage of 'use_cftime' as kwarg is deprecated. 
" "Please initialize it with CFDatetimeCoder and " "'decode_times' kwarg.", DeprecationWarning, - stacklevel=2, ) decode_times = CFDatetimeCoder(use_cftime=use_cftime) else: From 21181917d0296cdd26c02aad4a4a4a5857e007ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Dec 2024 17:53:07 +0100 Subject: [PATCH 081/134] fix docstrings --- xarray/backends/api.py | 23 ++++++++++++++++------- xarray/conventions.py | 5 +++-- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 4b71c6d2f91..603db4b8e54 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -549,7 +549,8 @@ def open_dataset( This keyword may not be supported by all the backends. decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use ``CFDatetimeCoder`` or leave them encoded as numbers. + into datetime objects. Otherwise, use ``CFDatetimeCoder`` or leave them + encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -573,8 +574,10 @@ def open_dataset( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. + .. deprecated:: 2024.12.0 Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -792,8 +795,10 @@ def open_dataarray( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. This keyword may not be supported by all the backends. 
- Usage of 'use_cftime' as kwarg is deprecated. Please initialize it - with CFDatetimeCoder and 'decode_times' kwarg. + + .. deprecated:: 2024.12.0 + Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -1003,8 +1008,10 @@ def open_datatree( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - Usage of 'use_cftime' as kwarg is deprecated. Please initialize it - with CFDatetimeCoder and 'decode_times' kwarg. + + .. deprecated:: 2024.12.0 + Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and @@ -1227,8 +1234,10 @@ def open_groups( raise an error. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - Usage of 'use_cftime' as kwarg is deprecated. Please initialize it - with CFDatetimeCoder and 'decode_times' kwarg. + + .. deprecated:: 2024.12.0 + Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and diff --git a/xarray/conventions.py b/xarray/conventions.py index 7c076cfa3da..e7bb1041a99 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -155,8 +155,9 @@ def decode_cf_variable( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. 
- Usage of use_cftime as kwarg is deprecated, please initialize it with - CFDatetimeCoder and ``decode_times``. + + .. deprecated:: 2024.12.0 + Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. Returns ------- From 262295a3490fb2420ea9098ccec44d6d4466346b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 17 Dec 2024 18:28:34 +0100 Subject: [PATCH 082/134] try fix typing --- xarray/conventions.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index e7bb1041a99..e90902b3f47 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -310,9 +310,10 @@ def _update_bounds_encoding(variables: T_Variables) -> None: T = TypeVar("T") +U = TypeVar("U") -def _item_or_default(obj: Mapping[Any, T] | T, key: Hashable, default: T) -> T: +def _item_or_default(obj: Mapping[Any, T | U] | T, key: Hashable, default: T) -> T | U: """ Return item by key if obj is mapping and key is present, else return default value. """ @@ -461,7 +462,7 @@ def decode_cf( obj: T_DatasetOrAbstractstore, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool | CFDatetimeCoder = True, + decode_times: bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] = True, decode_coords: bool | Literal["coordinates", "all"] = True, drop_variables: T_DropVariables = None, use_cftime: bool | None = None, @@ -480,7 +481,7 @@ def decode_cf( mask_and_scale : bool, optional Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). - decode_times : bool or CFDatetimeCoder, optional + decode_times : bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder], optional Decode cf times (e.g., integers since "hours since 2000-01-01") to np.datetime64. decode_coords : bool or {"coordinates", "all"}, optional @@ -505,8 +506,10 @@ def decode_cf( represented using ``np.datetime64[ns]`` objects. 
If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. - Usage of use_cftime as kwarg is deprecated, please initialize it with - CFDatetimeCoder and ``decode_times``. + + .. deprecated:: 2024.12.0 + Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + decode_timedelta : bool, optional If True, decode variables and coordinates with time units in {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} @@ -560,7 +563,7 @@ def cf_decoder( attributes: T_Attrs, concat_characters: bool = True, mask_and_scale: bool = True, - decode_times: bool | CFDatetimeCoder = True, + decode_times: bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] = True, ) -> tuple[T_Variables, T_Attrs]: """ Decode a set of CF encoded variables and attributes. @@ -577,7 +580,7 @@ def cf_decoder( mask_and_scale : bool Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). - decode_times : bool | CFDatetimeCoder + decode_times : bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] Decode cf times ("hours since 2000-01-01") to np.datetime64. Returns From adebafa0cecd331de501da2c3c3b35173be53f71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 30 Dec 2024 12:06:37 +0100 Subject: [PATCH 083/134] Apply suggestions from code review Co-authored-by: Spencer Clark --- doc/whats-new.rst | 4 ++-- xarray/backends/api.py | 14 +++++++------- xarray/conventions.py | 17 ++++++++++------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 17910753bdd..c16283b68ae 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -35,8 +35,8 @@ New Features - Add ``unit`` - keyword argument to :py:func:`date_range` and ``microsecond`` parsing to iso8601-parser (:pull:`9885`). By `Kai Mühlbauer `_. 
-- Split out ``CFDatetimeCoder`` as public API in ``xr.coders``, make ``decode_times`` keyword argument - consume ``CFDatetimeCoder``. +- Split out :py:class:`coders.CFDatetimeCoder` as public API in ``xr.coders``, make ``decode_times`` keyword argument + consume :py:class:`coders.CFDatetimeCoder`. Breaking changes diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 603db4b8e54..675e891fe68 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -549,7 +549,7 @@ def open_dataset( This keyword may not be supported by all the backends. decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use ``CFDatetimeCoder`` or leave them + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. @@ -576,7 +576,7 @@ def open_dataset( This keyword may not be supported by all the backends. .. deprecated:: 2024.12.0 - Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to @@ -775,7 +775,7 @@ def open_dataarray( be replaced by NA. This keyword may not be supported by all the backends. decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. 
This keyword may not be supported by all the backends. @@ -797,7 +797,7 @@ def open_dataarray( raise an error. This keyword may not be supported by all the backends. .. deprecated:: 2024.12.0 - Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to @@ -984,7 +984,7 @@ def open_datatree( This keyword may not be supported by all the backends. decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -1210,7 +1210,7 @@ def open_groups( This keyword may not be supported by all the backends. decode_times : bool, CFDatetimeCoder or dict-like, optional If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, use CFDatetimeCoder or leave them encoded as numbers. + into datetime objects. Otherwise, use :py:class:`coders.CFDatetimeCoder` or leave them encoded as numbers. Pass a mapping, e.g. ``{"my_variable": False}``, to toggle this feature per-variable individually. This keyword may not be supported by all the backends. @@ -1236,7 +1236,7 @@ def open_groups( This keyword may not be supported by all the backends. .. deprecated:: 2024.12.0 - Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. 
concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to diff --git a/xarray/conventions.py b/xarray/conventions.py index e90902b3f47..b8b969ce34d 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -157,7 +157,7 @@ def decode_cf_variable( raise an error. .. deprecated:: 2024.12.0 - Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. Returns ------- @@ -201,17 +201,20 @@ def decode_cf_variable( emit_user_level_warning( "Usage of 'use_cftime' as kwarg is deprecated. " "Please initialize it with CFDatetimeCoder and " - "'decode_times' kwarg.", + "'decode_times' kwarg.\n", + "Example usage:\n", + " time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)\n", + " ds = xr.open_dataset(decode_times=time_coder)\n", DeprecationWarning, ) decode_times = CFDatetimeCoder(use_cftime=use_cftime) else: if use_cftime is not None: raise TypeError( - "Usage of 'use_cftime' as kwarg is not allowed, " - "if 'decode_times' is initialized with " - "CFDatetimeCoder. Please add 'use_cftime' " - "when initializing CFDatetimeCoder." + "Usage of 'use_cftime' as a kwarg is not allowed " + "if a CFDatetimeCoder instance is passed to " + "decode_times. Please set use_cftime " + "when initializing CFDatetimeCoder instead." ) var = decode_times.decode(var, name=name) @@ -508,7 +511,7 @@ def decode_cf( raise an error. .. deprecated:: 2024.12.0 - Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. 
decode_timedelta : bool, optional If True, decode variables and coordinates with time units in From 6cd81e57c01cea046272b658154ab54e7fdda63a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 30 Dec 2024 12:10:09 +0100 Subject: [PATCH 084/134] Apply suggestions from code review Co-authored-by: Spencer Clark --- xarray/backends/api.py | 2 +- xarray/conventions.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 675e891fe68..40b3ba1ffba 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1010,7 +1010,7 @@ def open_datatree( This keyword may not be supported by all the backends. .. deprecated:: 2024.12.0 - Please initialize it with ``CFDatetimeCoder`` and ``decode_times`` kwarg. + Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. concat_characters : bool or dict-like, optional If True, concatenate along the last dimension of character arrays to diff --git a/xarray/conventions.py b/xarray/conventions.py index b8b969ce34d..01a50fa9a9b 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -199,9 +199,9 @@ def decode_cf_variable( if not isinstance(decode_times, CFDatetimeCoder): if use_cftime is not None: emit_user_level_warning( - "Usage of 'use_cftime' as kwarg is deprecated. " - "Please initialize it with CFDatetimeCoder and " - "'decode_times' kwarg.\n", + "Usage of 'use_cftime' as a kwarg is deprecated. 
" + "Please pass a CFDatetimeCoder instance initialized " + "with use_cftime to the decode_times kwarg instead.\n", "Example usage:\n", " time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)\n", " ds = xr.open_dataset(decode_times=time_coder)\n", From 1cec644710372ad09717df5f326b80f0da45d0bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 30 Dec 2024 12:16:06 +0100 Subject: [PATCH 085/134] Update xarray/conventions.py --- xarray/conventions.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 01a50fa9a9b..2d9d0e11d6d 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -201,9 +201,9 @@ def decode_cf_variable( emit_user_level_warning( "Usage of 'use_cftime' as a kwarg is deprecated. " "Please pass a CFDatetimeCoder instance initialized " - "with use_cftime to the decode_times kwarg instead.\n", - "Example usage:\n", - " time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)\n", + "with use_cftime to the decode_times kwarg instead.\n" + "Example usage:\n" + " time_coder = xr.coders.CFDatetimeCoder(use_cftime=True)\n" " ds = xr.open_dataset(decode_times=time_coder)\n", DeprecationWarning, ) From 225c5b3871f8b7c7be8fa7aab9b6e5a5a6343be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 1 Jan 2025 16:38:46 +0100 Subject: [PATCH 086/134] remove duplicate function (introduced when merging main) --- xarray/coding/times.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 16e7c9b76b7..6df0d881771 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -291,18 +291,6 @@ def _unpack_time_unit_and_ref_date( return time_unit, ref_date -def _unpack_time_unit_and_ref_date( - units: str, -) -> tuple[NPDatetimeUnitOptions, pd.Timestamp]: - # same us _unpack_netcdf_time_units but finalizes time_unit and ref_date - # for processing in encode_cf_datetime - time_unit, 
_ref_date = _unpack_netcdf_time_units(units) - time_unit = _netcdf_to_numpy_timeunit(time_unit) - ref_date = pd.Timestamp(_ref_date) - ref_date = _maybe_strip_tz_from_timestamp(ref_date) - return time_unit, ref_date - - def _decode_cf_datetime_dtype( data, units: str, From 33a1563f7c2b017985aa7eba07175038dc295fdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 2 Jan 2025 08:02:44 +0100 Subject: [PATCH 087/134] Update deprecated directive --- xarray/backends/api.py | 8 ++++---- xarray/conventions.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 089d215b371..d8e3729528c 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -575,7 +575,7 @@ def open_dataset( to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - .. deprecated:: 2024.12.0 + .. deprecated:: 2025.01.0 Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. concat_characters : bool or dict-like, optional @@ -796,7 +796,7 @@ def open_dataarray( decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. This keyword may not be supported by all the backends. - .. deprecated:: 2024.12.0 + .. deprecated:: 2025.01.0 Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. concat_characters : bool, optional @@ -1009,7 +1009,7 @@ def open_datatree( to toggle this feature per-variable individually. This keyword may not be supported by all the backends. - .. deprecated:: 2024.12.0 + .. deprecated:: 2025.01.0 Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. concat_characters : bool or dict-like, optional @@ -1235,7 +1235,7 @@ def open_groups( to toggle this feature per-variable individually. 
This keyword may not be supported by all the backends. - .. deprecated:: 2024.12.0 + .. deprecated:: 2025.01.0 Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. concat_characters : bool or dict-like, optional diff --git a/xarray/conventions.py b/xarray/conventions.py index 2d9d0e11d6d..f0fef471c26 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -156,7 +156,7 @@ def decode_cf_variable( decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. - .. deprecated:: 2024.12.0 + .. deprecated:: 2025.01.0 Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. Returns @@ -510,7 +510,7 @@ def decode_cf( decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. - .. deprecated:: 2024.12.0 + .. deprecated:: 2025.01.0 Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. 
decode_timedelta : bool, optional From 1145f4b2a8826c4232b1032d8936d8736d8fd5df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 3 Jan 2025 09:57:29 +0100 Subject: [PATCH 088/134] fix typing --- xarray/coding/times.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 12efeb4a762..41dc5dc2e1a 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -365,12 +365,14 @@ def _check_date_for_units_since_refdate( return pd.Timestamp("NaT") -def _align_reference_date_and_unit(ref_date: pd.Timestamp, unit: str) -> pd.Timestamp: +def _align_reference_date_and_unit( + ref_date: pd.Timestamp, unit: NPDatetimeUnitOptions +) -> pd.Timestamp: # align to the highest needed resolution of ref_date or unit if np.timedelta64(1, ref_date.unit) > np.timedelta64(1, unit): # this will raise accordingly # if data can't be represented in the higher resolution - return timestamp_as_unit(ref_date, unit) + return timestamp_as_unit(ref_date, cast(PDDatetimeUnitOptions, unit)) return ref_date From a9990cf24bf2d991218231eb4f3aa9964afcce1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 3 Jan 2025 10:17:09 +0100 Subject: [PATCH 089/134] re-fix doctests --- xarray/core/common.py | 4 ++-- xarray/core/dataarray.py | 4 ++-- xarray/core/dataset.py | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/xarray/core/common.py b/xarray/core/common.py index 8c220aab423..3a70c9ec585 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -622,7 +622,7 @@ def assign_coords( lon (x, y) float64 32B 260.2 260.7 260.2 260.8 lat (x, y) float64 32B 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 32B 2014-09-06 ... 2014-09-09 - reference_time datetime64[s] 8B 2014-09-05 + reference_time datetime64[ns] 8B 2014-09-05 Dimensions without coordinates: x, y Data variables: temperature (x, y, time) float64 128B 20.0 20.8 21.6 ... 
30.4 31.2 32.0 @@ -636,7 +636,7 @@ def assign_coords( lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23 lat (x, y) float64 32B 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 32B 2014-09-06 ... 2014-09-09 - reference_time datetime64[s] 8B 2014-09-05 + reference_time datetime64[ns] 8B 2014-09-05 Dimensions without coordinates: x, y Data variables: temperature (x, y, time) float64 128B 20.0 20.8 21.6 ... 30.4 31.2 32.0 diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index cd0428e73ca..d287564cfe5 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -394,7 +394,7 @@ class DataArray( lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23 lat (x, y) float64 32B 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08 - reference_time datetime64[s] 8B 2014-09-05 + reference_time datetime64[ns] 8B 2014-09-05 Dimensions without coordinates: x, y Attributes: description: Ambient temperature. @@ -409,7 +409,7 @@ class DataArray( lon float64 8B -99.32 lat float64 8B 42.21 time datetime64[ns] 8B 2014-09-08 - reference_time datetime64[s] 8B 2014-09-05 + reference_time datetime64[ns] 8B 2014-09-05 Attributes: description: Ambient temperature. units: degC diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index e909b63a5cc..d6ffa7308a3 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -673,7 +673,7 @@ class Dataset( lat (loc) float64 16B 42.25 42.21 * instrument (instrument) Self: lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23 lat (x, y) float64 32B 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08 - reference_time datetime64[s] 8B 2014-09-05 + reference_time datetime64[ns] 8B 2014-09-05 Dimensions without coordinates: x, y Data variables: precipitation (x, y, time) float64 96B 5.68 9.256 0.7104 ... 
4.615 7.805 @@ -8908,7 +8908,7 @@ def filter_by_attrs(self, **kwargs) -> Self: lon (x, y) float64 32B -99.83 -99.32 -99.79 -99.23 lat (x, y) float64 32B 42.25 42.21 42.63 42.59 * time (time) datetime64[ns] 24B 2014-09-06 2014-09-07 2014-09-08 - reference_time datetime64[s] 8B 2014-09-05 + reference_time datetime64[ns] 8B 2014-09-05 Dimensions without coordinates: x, y Data variables: temperature (x, y, time) float64 96B 29.11 18.2 22.83 ... 16.15 26.63 From 43c85d19f1751b3356dd1ef799bf1579177eb4eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 4 Jan 2025 16:53:16 +0100 Subject: [PATCH 090/134] fix whats-new.rst after merging main --- doc/whats-new.rst | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 047c5cf59a3..a36b44206c0 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -25,6 +25,8 @@ New Features - Split out :py:class:`coders.CFDatetimeCoder` as public API in ``xr.coders``, make ``decode_times`` keyword argument consume :py:class:`coders.CFDatetimeCoder` (:pull:`9901`). By `Kai Mühlbauer `_. +- Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`). + By `Kai Mühlbauer `_. Breaking changes ~~~~~~~~~~~~~~~~ @@ -72,10 +74,7 @@ New Features latency. (:issue:`9853`, :pull:`9861`). By `Davis Bennett `_. - Add ``unit`` - keyword argument to :py:func:`date_range` and ``microsecond`` parsing to iso8601-parser (:pull:`9885`). - By `Kai Mühlbauer `_. -- Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`). - By `Kai Mühlbauer `_. 
- + By `Kai Mühlbauer `_ Breaking changes From a4702d611a9c5dd1cd587b1740d034ddbb4c992e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 4 Jan 2025 17:40:48 +0100 Subject: [PATCH 091/134] Apply suggestions from code review --- doc/user-guide/time-series.rst | 2 +- doc/user-guide/weather-climate.rst | 8 ++++---- doc/whats-new.rst | 6 +----- 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 9233791249e..aaec37bb9ef 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -83,7 +83,7 @@ You can manual decode arrays in this form by passing a dataset to coder = xr.coders.CFDatetimeCoder(time_unit="s") xr.decode_cf(ds, decode_times=coder) -From xarray TODO: version the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a netCDF file contains dates outside of these bounds (or dates < 1582-10-15), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. +From xarray TODO: version the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a netCDF file contains dates outside of these bounds (or dates < 1582-10-15) and no ``proleptic_gregorian`` calendar is used, dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. 
:py:class:`~xarray.CFTimeIndex` enables a subset of the indexing functionality of a :py:class:`pandas.DatetimeIndex`. See :ref:`CFTimeIndex` for more information. diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index 6a56e3030f0..0890b56e0b7 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -74,14 +74,14 @@ using a standard calendar, but outside the `precision range`_ and dates [prior t any of the following are true: - The dates are from a non-standard calendar - - Any dates are outside the nanosecond-precision range (prior xarray version 2024.11) - - Any dates are outside the time span limited by the resolution (from xarray version v2024.11) + - Any dates are outside the nanosecond-precision range (prior xarray version 2025.01.1) + - Any dates are outside the time span limited by the resolution (from xarray version v2025.01.1) Otherwise pandas-compatible dates from a standard calendar will be - represented with the ``np.datetime64[unit]`` data type (where unit can be one of ["s", "ms", "us", "ns"], enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features. + represented with the ``np.datetime64[unit]`` data type (where unit can be one of ["s", "ms", "us", "ns"]), enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features. As of pandas version 2.0.0, pandas supports non-nanosecond precision datetime - values. From xarray version 2024.11 the relaxed non-nanosecond precision datetime values will be used. + values. From xarray version 2025.01.1 relaxed non-nanosecond precision datetime values can be parameterized via :py:class:`coders.CFDatetimeCoder` and ``decode_times` kwarg. 
For example, you can create a DataArray indexed by a time coordinate with dates from a no-leap calendar and a diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a36b44206c0..d6a4be78b9e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -74,7 +74,7 @@ New Features latency. (:issue:`9853`, :pull:`9861`). By `Davis Bennett `_. - Add ``unit`` - keyword argument to :py:func:`date_range` and ``microsecond`` parsing to iso8601-parser (:pull:`9885`). - By `Kai Mühlbauer `_ + By `Kai Mühlbauer `_. Breaking changes @@ -88,10 +88,6 @@ Breaking changes Deprecations ~~~~~~~~~~~~ -- Time decoding related kwarg ``use_cftime`` is deprecated. Use keyword argument - ``decode_times=CFDatetimeCoder(use_cftime=True)`` in the respective functions - instead (:pull:`9618`). - By `Kai Mühlbauer `_. - Finalize deprecation of ``closed`` parameters of :py:func:`cftime_range` and :py:func:`date_range` (:pull:`9882`). By `Kai Mühlbauer `_. From 9bd292a0d77c54febd31819cff54e3d0a8166391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sat, 4 Jan 2025 17:44:30 +0100 Subject: [PATCH 092/134] Apply suggestions from code review --- xarray/conventions.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 54f118dac04..485c9ac0c71 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -155,11 +155,6 @@ def decode_cf_variable( represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible raise an error. - Usage of use_cftime as kwarg is deprecated, please initialize it with - CFDatetimeCoder and ``decode_times``. - - .. deprecated:: 2025.01.0 - Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. .. deprecated:: 2025.01.1 Please pass a :py:class:`coders.CFDatetimeCoder` instance initialized with ``use_cftime`` to the ``decode_times`` kwarg instead. 
From 25b797eae5a3d94ef6f0732614f832ec482ba5d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 5 Jan 2025 17:47:28 +0100 Subject: [PATCH 093/134] rewrite recursive function using for-loop --- xarray/coding/times.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 41dc5dc2e1a..ea16f6eb3cd 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -2,7 +2,7 @@ import re import warnings -from collections.abc import Callable, Hashable, Iterator +from collections.abc import Callable, Hashable from datetime import datetime, timedelta from functools import partial from typing import TYPE_CHECKING, Union, cast @@ -391,15 +391,15 @@ def _check_date_is_after_shift(date: pd.Timestamp, calendar: str) -> None: def _check_higher_resolution( flat_num_dates: np.ndarray, - iter_unit: Iterator[PDDatetimeUnitOptions], + time_unit: PDDatetimeUnitOptions, ) -> tuple[np.ndarray, PDDatetimeUnitOptions]: """Iterate until fitting resolution found.""" - new_time_unit: PDDatetimeUnitOptions = next(iter_unit) - if (np.unique(flat_num_dates % 1) > 0).any() and new_time_unit != "ns": - flat_num_dates, new_time_unit = _check_higher_resolution( - flat_num_dates * 1000, - iter_unit=iter_unit, - ) + res: list[PDDatetimeUnitOptions] = ["s", "ms", "us", "ns"] + new_units = res[res.index(cast(PDDatetimeUnitOptions, time_unit)) :] + for new_time_unit in new_units: + if not ((np.unique(flat_num_dates % 1) > 0).any() and new_time_unit != "ns"): + break + flat_num_dates *= 1000 return flat_num_dates, new_time_unit @@ -472,10 +472,8 @@ def _decode_datetime_with_pandas( # estimate fitting resolution for floating point values # this iterates until all floats are fractionless or time_unit == "ns" if flat_num_dates.dtype.kind == "f" and time_unit != "ns": - res: list[PDDatetimeUnitOptions] = ["s", "ms", "us", "ns"] - iter_unit = iter(res[res.index(cast(PDDatetimeUnitOptions, 
time_unit)) :]) flat_num_dates, new_time_unit = _check_higher_resolution( - flat_num_dates, iter_unit + flat_num_dates, time_unit ) if time_unit != new_time_unit: msg = ( From 3bd8cf4c7e991e976bd186b2c6e659c3c1466f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 5 Jan 2025 18:02:59 +0100 Subject: [PATCH 094/134] remove astype-construct in _possibly_convert_objects --- xarray/core/variable.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 53c1fd76f7b..35c6802d60e 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -6,7 +6,6 @@ import numbers import warnings from collections.abc import Callable, Hashable, Mapping, Sequence -from datetime import datetime from functools import partial from types import EllipsisType from typing import TYPE_CHECKING, Any, NoReturn, cast @@ -214,18 +213,6 @@ def _possibly_convert_objects(values): * pd.Timedelta """ as_series = pd.Series(values.ravel(), copy=False) - # When receiving objects which pd.Series can't resolve by its own - # we try astype-conversion to "ns"-resolution for datetimes and pd.Timestamp. 
- if ( - values.dtype.kind == "O" - and as_series.dtype.kind == "O" - and as_series.size > 0 - and ( - isinstance(as_series[0], datetime | pd.Timestamp) - or pd.api.types.is_datetime64_dtype(as_series[0]) - ) - ): - as_series = as_series.astype("=M8[ns]") result = np.asarray(as_series).reshape(values.shape) if not result.flags.writeable: # GH8843, pandas copy-on-write mode creates read-only arrays by default From 8b9c85a168ff6cbe26f92525c625e712a5f4428e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Sun, 5 Jan 2025 18:06:22 +0100 Subject: [PATCH 095/134] Update xarray/coding/times.py Co-authored-by: Stephan Hoyer --- xarray/coding/times.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index ea16f6eb3cd..a6215492aca 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -385,7 +385,7 @@ def _check_date_is_after_shift(date: pd.Timestamp, calendar: str) -> None: if date < type(date)(1582, 10, 15): raise OutOfBoundsDatetime( f"Dates before 1582-10-15 cannot be decoded " - f"with pandas using {calendar!r} calendar." + f"with pandas using {calendar!r} calendar: {date}" ) From 3b2d861f9b71ecc925426251eea16f991059a276 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 08:08:30 +0100 Subject: [PATCH 096/134] add suggestions from code review --- doc/user-guide/time-series.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index aaec37bb9ef..596e51ecef5 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -83,8 +83,8 @@ You can manual decode arrays in this form by passing a dataset to coder = xr.coders.CFDatetimeCoder(time_unit="s") xr.decode_cf(ds, decode_times=coder) -From xarray TODO: version the resolution of the dates can be tuned between "s", "ms", "us" and "ns". 
One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a netCDF file contains dates outside of these bounds (or dates < 1582-10-15) and no ``proleptic_gregorian`` calendar is used, dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. -:py:class:`~xarray.CFTimeIndex` enables a subset of the indexing functionality of a :py:class:`pandas.DatetimeIndex`. +From xarray 2025.01.1 the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a store contains dates outside of these bounds (or dates < 1582-10-15 with a Gregorian, also known as standard, calendar), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. +:py:class:`~xarray.CFTimeIndex` enables most of the indexing functionality of a :py:class:`pandas.DatetimeIndex`. See :ref:`CFTimeIndex` for more information. 
Datetime indexing From 66e181cf2c22b7f319aff356f21bbae3a40df0ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 08:13:48 +0100 Subject: [PATCH 097/134] rephrase per suggestion --- doc/user-guide/weather-climate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index 0890b56e0b7..a1ad8e0ee6e 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -81,7 +81,7 @@ using a standard calendar, but outside the `precision range`_ and dates [prior t represented with the ``np.datetime64[unit]`` data type (where unit can be one of ["s", "ms", "us", "ns"]), enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features. As of pandas version 2.0.0, pandas supports non-nanosecond precision datetime - values. From xarray version 2025.01.1 relaxed non-nanosecond precision datetime values can be parameterized via :py:class:`coders.CFDatetimeCoder` and ``decode_times` kwarg. + values. From xarray version 2025.01.1 on, non-nanosecond precision datetime values are also supported in xarray (this can be parameterized via :py:class:`coders.CFDatetimeCoder` and ``decode_times` kwarg). 
For example, you can create a DataArray indexed by a time coordinate with dates from a no-leap calendar and a From e3809683433a0a8ade3053c0b27314b62f2856fb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 08:15:21 +0100 Subject: [PATCH 098/134] add article per suggestion --- doc/user-guide/weather-climate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index a1ad8e0ee6e..d80cb9e8e7d 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -132,7 +132,7 @@ Conversion between non-standard calendar and to/from pandas DatetimeIndexes is facilitated with the :py:meth:`xarray.Dataset.convert_calendar` method (also available as :py:meth:`xarray.DataArray.convert_calendar`). Here, like elsewhere in xarray, the ``use_cftime`` argument controls which datetime backend is used in the output. The default (``None``) is to -use ``pandas`` when possible, i.e. when the calendar is ``standard``/``gregorian`` and dates [starting with 1582-10-15]((https://en.wikipedia.org/wiki/Gregorian_calendar)). There is no such restriction when converting to ``proleptic_gregorian`` calendar. +use ``pandas`` when possible, i.e. when the calendar is ``standard``/``gregorian`` and dates [starting with 1582-10-15]((https://en.wikipedia.org/wiki/Gregorian_calendar)). There is no such restriction when converting to a ``proleptic_gregorian`` calendar. .. ipython:: python From 305938c50bf047156d3cca000b54af9770424e87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 09:37:43 +0100 Subject: [PATCH 099/134] Apply suggestions from code review Rephrasing and additions to doc string, some test changes. 
Co-authored-by: Spencer Clark --- doc/internals/time-coding.rst | 46 +++++++++++++++---------------- xarray/coding/times.py | 3 +- xarray/tests/test_backends.py | 2 +- xarray/tests/test_coding_times.py | 4 ++- xarray/tests/test_concat.py | 1 - xarray/tests/test_dataset.py | 2 +- xarray/tests/test_variable.py | 13 ++++----- 7 files changed, 35 insertions(+), 36 deletions(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index 2ad3f11b4d2..8aa15a80011 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -26,15 +26,15 @@ to_datetime The function :py:func:`pandas.to_datetime` is used within xarray for inferring units and for testing purposes. -In normal operation :py:func:`pandas.to_datetime` returns :py:class:`pandas.Timestamp` (scalar input) or :py:class:`pandas.DatetimeIndex` (array-like input) which are datetime64 with inherited resolution (from the source). If no resolution can be inherited ``'ns'`` is assumed. That has the implication, that the maximum usable timerange for those cases is +-292 years centered around the epoch. To accommodate for that, we are carefully checking the units/resolution in the encoding and decoding step. +In normal operation :py:func:`pandas.to_datetime` returns a :py:class:`pandas.Timestamp` (for scalar input) or :py:class:`pandas.DatetimeIndex` (for array-like input) which are related to ``np.datetime64`` values with a resolution inherited from the input. If no resolution can be inherited ``'ns'`` is assumed. That has the implication that the maximum usable time range for those cases is approximately +/- 292 years centered around the Unix epoch (1970-01-01). To accommodate that, we carefully check the units/resolution in the encoding and decoding step. -When args are numeric (no strings) "unit" can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``. 
+When the arguments are numeric (not strings or ``np.datetime64`` values) ``"unit"`` can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``, though the returned resolution will be ``"ns"``. .. ipython:: python f"Maximum datetime range: ({pd.to_datetime(int64_min, unit="ns")}, {pd.to_datetime(int64_max, unit="ns")})" -For input values which can't be represented in nanosecond resolution :py:class:`pandas.OutOfBoundsDatetime` exception is raised: +For input values which can't be represented in nanosecond resolution an :py:class:`pandas.OutOfBoundsDatetime` exception is raised: .. ipython:: python @@ -49,10 +49,10 @@ For input values which can't be represented in nanosecond resolution :py:class:` except Exception as err: print(err) -Numpy datetime64 can be extracted with :py:meth:`pandas.Datetime.to_numpy` and :py:meth:`pandas.DatetimeIndex.to_numpy`. The returned resolution depends on the internal representation. This representation can be changed using :py:meth:`pandas.Datetime.as_unit` +``np.datetime64`` values can be extracted with :py:meth:`pandas.Timestamp.to_numpy` and :py:meth:`pandas.DatetimeIndex.to_numpy`. The returned resolution depends on the internal representation. This representation can be changed using :py:meth:`pandas.Timestamp.as_unit` and :py:meth:`pandas.DatetimeIndex.as_unit` respectively. -``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as argument. That means we are able to represent datetimes with second, millisecond, microsecond or nanosecond resolution. +``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as an argument. That means we are able to represent datetimes with second, millisecond, microsecond or nanosecond resolution. .. ipython:: python @@ -122,13 +122,13 @@ to_timedelta The function :py:func:`pandas.to_timedelta` is used within xarray for inferring units and for testing purposes. 
-In normal operation :py:func:`pandas.to_timedelta` returns :py:class:`pandas.Timedelta` (scalar input) or :py:class:`pandas.TimedeltaIndex` (array-like input) which are timedelta64 with ``ns`` resolution internally. That has the implication, that the usable timedelta covers only roughly 585 years. To accommodate for that, we are working around that limitation in the encoding and decoding step. +In normal operation :py:func:`pandas.to_timedelta` returns a :py:class:`pandas.Timedelta` (for scalar input) or :py:class:`pandas.TimedeltaIndex` (for array-like input) which are ``np.timedelta64`` values with ``ns`` resolution internally. That has the implication, that the usable timedelta covers only roughly 585 years. To accommodate for that, we are working around that limitation in the encoding and decoding step. .. ipython:: python f"Maximum timedelta range: ({pd.to_timedelta(int64_min, unit="ns")}, {pd.to_timedelta(int64_max, unit="ns")})" -For input values which can't be represented in nanosecond resolution :py:class:`pandas.OutOfBoundsTimedelta` exception is raised: +For input values which can't be represented in nanosecond resolution an :py:class:`pandas.OutOfBoundsTimedelta` exception is raised: .. ipython:: python @@ -141,12 +141,12 @@ For input values which can't be represented in nanosecond resolution :py:class:` except Exception as err: print("Second:", err) -When args are numeric (no strings) "unit" can be anything from ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``. +When arguments are numeric (not strings or ``np.timedelta64`` values) "unit" can be anything from ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``, though the returned resolution will be ``"ns"``. -Numpy timedelta64 can be extracted with :py:meth:`pandas.Timedelta.to_numpy` and :py:meth:`pandas.TimedeltaIndex.to_numpy`. The returned resolution depends on the internal representation. 
This representation can be changed using :py:meth:`pandas.Timedelta.as_unit` +``np.timedelta64`` values can be extracted with :py:meth:`pandas.Timedelta.to_numpy` and :py:meth:`pandas.TimedeltaIndex.to_numpy`. The returned resolution depends on the internal representation. This representation can be changed using :py:meth:`pandas.Timedelta.as_unit` and :py:meth:`pandas.TimedeltaIndex.as_unit` respectively. -``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as argument. That means we are able to represent timedeltas with second, millisecond, microsecond or nanosecond resolution. +``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as an argument. That means we are able to represent timedeltas with second, millisecond, microsecond or nanosecond resolution. .. ipython:: python @@ -197,13 +197,13 @@ and :py:meth:`pandas.TimedeltaIndex.as_unit` respectively. Timestamp ~~~~~~~~~ -:py:class:`pandas.Timestamp` is used within xarray to wrap strings of CF reference times and datetime.datetime. +:py:class:`pandas.Timestamp` is used within xarray to wrap strings of CF encoding reference times and datetime.datetime. -When args are numeric (no strings) "unit" can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``. +When arguments are numeric (not strings) "unit" can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``, though the returned resolution will be ``"ns"``. In normal operation :py:class:`pandas.Timestamp` holds the timestamp in the provided resolution, but only one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. Lower resolution input is automatically converted to ``'s'``, higher resolution input is cutted to ``'ns'``. -Same conversion rules apply here as for :py:func:`pandas.to_timedelta` (see above). +The same conversion rules apply here as for :py:func:`pandas.to_timedelta` (see above). 
Depending on the internal resolution Timestamps can be represented in the range: .. ipython:: python @@ -213,7 +213,7 @@ Depending on the internal resolution Timestamps can be represented in the range: f"unit: {unit!r} time range ({pd.Timestamp(int64_min, unit=unit)}, {pd.Timestamp(int64_max, unit=unit)})" ) -Since relaxing the resolution this enhances the range to several hundreds of thousands of centuries with microsecond representation. ``NaT`` will be at ``np.iinfo("int64").min`` for all of the different representations. +Since relaxing the resolution, this enhances the range to several hundreds of thousands of centuries with microsecond representation. ``NaT`` will be at ``np.iinfo("int64").min`` for all of the different representations. .. warning:: When initialized with a datetime string this is only defined from ``-9999-01-01`` to ``9999-12-31``. @@ -260,7 +260,7 @@ Since relaxing the resolution this enhances the range to several hundreds of tho DatetimeIndex ~~~~~~~~~~~~~ -:py:class:`pandas.DatetimeIndex` is used to wrap numpy datetime64 or other datetime-likes, when encoding. The resolution of the DatetimeIndex depends on the input, but can be only one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. Lower resolution input is automatically converted to ``'s'``, higher resolution input is cutted to ``'ns'``. +:py:class:`pandas.DatetimeIndex` is used to wrap ``np.datetime64`` values or other datetime-likes when encoding. The resolution of the DatetimeIndex depends on the input, but can be only one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. Lower resolution input is automatically converted to ``'s'``, higher resolution input is cut to ``'ns'``. :py:class:`pandas.DatetimeIndex` will raise :py:class:`pandas.OutOfBoundsDatetime` if the input can't be represented in the given resolution. .. note:: @@ -326,17 +326,17 @@ Xarray tries to adhere to the latest version of the `CF Conventions`_. 
Relevant CF time decoding ~~~~~~~~~~~~~~~~ -Decoding of ``values`` with time unit specification like ``seconds since 1992-10-8 15:15:42.5 -6:00`` into datetimes (using CF convention) is a multistage process. +Decoding of ``values`` with a time unit specification like ``"seconds since 1992-10-8 15:15:42.5 -6:00"`` into datetimes using the CF conventions is a multistage process. -1. If we have a non-standard calendar (eg. ``noleap``) the decoding is done with ``cftime`` package (which is not covered in this section). For ``standard``/``gregorian`` calendar as well as ``proleptic_gregorian`` the above outlined pandas functionality is used. +1. If we have a non-standard calendar (e.g. ``"noleap"``) decoding is done with the ``cftime`` package, which is not covered in this section. For the ``"standard"``/``"gregorian"`` calendar as well as the ``"proleptic_gregorian"`` calendar the above outlined pandas functionality is used. -2. ``standard``/``gregorian`` calendar and ``proleptic_gregorian`` are equivalent for any dates and reference times >= ``1582-10-15``. First the reference time is checked and any timezone information stripped off and in a second step, the minimum and maximum ``values`` are checked if they can be represented in the current reference time resolution. At the same time integer overflow would be caught. For ``standard``/``gregorian`` calendar the dates are checked to be >= ``1582-10-15``. If anything fails, the decoding is done with ``cftime``). +2. The ``"standard"``/``"gregorian"`` calendar and the ``"proleptic_gregorian"`` calendar are equivalent for any dates and reference times >= ``"1582-10-15"``. First the reference time is checked and any timezone information stripped off. In a second step, the minimum and maximum ``values`` are checked if they can be represented in the current reference time resolution. At the same time integer overflow would be caught. For the ``"standard"``/``"gregorian"`` calendar the dates are checked to be >= ``"1582-10-15"``.
If anything fails, the decoding is attempted with ``cftime``. -3. As the unit (here ``seconds``) and the resolution of the reference time ``1992-10-8 15:15:42.5 -6:00`` (here ``milliseconds``) might be different, this has to be aligned to the higher resolution (retrieve new unit). User may also specify their wanted target resolution by setting kwarg ``time_unit`` to one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` (default ``'ns'``). This will be included into the alignment process. This is done by multiplying the ``values`` by the ratio of nanoseconds per time unit and nanoseconds per reference time unit. To not break consistency for ``NaT`` a mask is kept and re-introduced after the multiplication. +3. As the unit (here ``"seconds"``) and the resolution of the reference time ``"1992-10-8 15:15:42.5 -6:00"`` (here ``"milliseconds"``) might be different, the decoding resolution is aligned to the higher resolution of the two. Users may also specify their wanted target resolution by setting the ``time_unit`` keyword argument to one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` (default ``'ns'``). This will be included in the alignment process. This is done by multiplying the ``values`` by the ratio of nanoseconds per time unit and nanoseconds per reference time unit. To retain consistency for ``NaT`` values a mask is kept and re-introduced after the multiplication. 4. Times encoded as floating point values are checked for fractional parts and the resolution is enhanced in an iterative process until a fitting resolution (or ``'ns'``) is found. A ``SerializationWarning`` is issued to make the user aware of the possibly problematic encoding. -5. Finally, the ``values`` (``int64``) are cast to ``datetime64[unit]`` (using the above retrieved unit) and added to the reference time :py:class:`pandas.Timestamp`. +5. 
Finally, the ``values`` (at this point converted to ``int64`` values) are cast to ``datetime64[unit]`` (using the above retrieved unit) and added to the reference time :py:class:`pandas.Timestamp`. .. ipython:: python @@ -383,8 +383,8 @@ For encoding the process is more or less a reversal of the above, but we have to 1. Infer ``data_units`` from the given ``dates``. 2. Infer ``units`` (either cleanup given ``units`` or use ``data_units`` -3. Infer calendar name from given ``dates``. -4. If non standard calendar or object dates (CFTime) encode with ``cftime`` +3. Infer the calendar name from the given ``dates``. +4. If dates are :py:class:`cftime.datetime` objects then encode with ``cftime.date2num`` 5. Retrieve ``time_units`` and ``ref_date`` from ``units`` 6. Check ``ref_date`` >= ``1582-10-15``, otherwise -> ``cftime`` 7. Wrap ``dates`` with pd.DatetimeIndex @@ -439,4 +439,4 @@ For encoding the process is more or less a reversal of the above, but we have to Default Time Unit ~~~~~~~~~~~~~~~~~ -The current default time unit of xarray is ``'ns'``. Setting keyword argument ``time_unit`` unit to ``'s'`` (the lowest resolution pandas allows) datetimes will be converted to at least ``'s'``-resolution, if possible. Same holds true for ``'ms'`` and ``'us'``. +The current default time unit of xarray is ``'ns'``. When setting the keyword argument ``time_unit`` to ``'s'`` (the lowest resolution pandas allows), datetimes will be converted to at least ``'s'``-resolution, if possible. The same holds true for ``'ms'`` and ``'us'``.
diff --git a/xarray/coding/times.py b/xarray/coding/times.py index a6215492aca..3412aa88582 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -541,8 +541,7 @@ def decode_cf_datetime( lower = cftype(1677, 9, 21, 0, 12, 43, 145224) upper = cftype(2262, 4, 11, 23, 47, 16, 854775) - # todo: check if test for minimum date is enough - if dates_min < border or dates_max < border: + if dates_min < border: if _is_standard_calendar(calendar): warnings.warn( "Unable to decode time axis into full " diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index e27336ad230..cac23a52b44 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5831,7 +5831,7 @@ def test_open_fsspec() -> None: ds0 = ds.copy() # pd.to_timedelta returns ns-precision, but the example data is in second precision # so we need to fix this - ds0["time"] = ds.time + pd.to_timedelta("1 day").as_unit("s") + ds0["time"] = ds.time + np.timedelta64(1, "D") mm = m.get_mapper("out2.zarr") ds0.to_zarr(mm) # old interface diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index ca60b99b8ff..0d813dd94bd 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -223,7 +223,9 @@ def test_decode_standard_calendar_inside_timestamp_range( time = cftime.date2num(times.to_pydatetime(), units, calendar=calendar) expected = times.values # for cftime we get "us" resolution - # ns resolution is handled by cftime, too (OutOfBounds) + # ns resolution is handled by cftime due to the reference date + # being out of bounds, but the times themselves are + # representable with nanosecond resolution. 
actual = decode_cf_datetime(time, units, calendar=calendar, time_unit=time_unit) assert actual.dtype == np.dtype(f"=M8[{time_unit}]") abs_diff = abs(actual - expected) diff --git a/xarray/tests/test_concat.py b/xarray/tests/test_concat.py index c5b817d3401..9e8e06fc1ee 100644 --- a/xarray/tests/test_concat.py +++ b/xarray/tests/test_concat.py @@ -317,7 +317,6 @@ def test_concat_multiple_datasets_with_multiple_missing_variables() -> None: assert_identical(actual, expected) -@pytest.mark.filterwarnings("ignore:Converting non-default") def test_concat_type_of_missing_fill() -> None: datasets = create_typed_datasets(2, seed=123) expected1 = concat(datasets, dim="day", fill_value=dtypes.NA) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 257c61ae60f..cf10de13aee 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3551,7 +3551,7 @@ def test_expand_dims_create_index_from_iterable(self): def test_expand_dims_non_nanosecond_conversion(self) -> None: # Regression test for https://github.com/pydata/xarray/issues/7493#issuecomment-1953091000 # todo: test still needed? 
- ds = Dataset().expand_dims({"time": [np.datetime64("2018-01-01", "s")]}) + ds = Dataset().expand_dims({"time": [np.datetime64("2018-01-01", "m")]}) assert ds.time.dtype == np.dtype("datetime64[s]") def test_set_index(self) -> None: diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 6132b7fa19d..d470eb60698 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -216,9 +216,8 @@ def test_index_0d_datetime(self): def test_index_0d_timedelta64(self): td = timedelta(hours=1) # todo: discussion needed - td64 = np.timedelta64(td, "ns") - x = self.cls(["x"], [td64]) - self._assertIndexedLikeNDArray(x, td64, np.dtype("timedelta64[ns]")) + x = self.cls(["x"], [np.timedelta64(td)]) + self._assertIndexedLikeNDArray(x, np.timedelta64(td), np.dtype("timedelta64[us]")) x = self.cls(["x"], pd.to_timedelta([td])) self._assertIndexedLikeNDArray(x, np.timedelta64(td), "timedelta64[ns]") @@ -1128,7 +1127,7 @@ def test_0d_datetime(self): # todo: check, if this test is OK v = Variable([], pd.Timestamp("2000-01-01")) assert v.dtype == np.dtype("datetime64[ns]") - assert v.values == np.datetime64("2000-01-01", "s") + assert v.values == np.datetime64("2000-01-01", "ns") @pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize( @@ -2677,7 +2676,7 @@ def test_datetime(self): assert np.dtype("datetime64[ns]") == actual.dtype assert expected is source_ndarray(np.asarray(actual)) - expected = np.datetime64("2000-01-01", "us") + expected = np.datetime64("2000-01-01", "ns") actual = as_compatible_data(datetime(2000, 1, 1)) assert np.asarray(expected) == actual assert np.ndarray is type(actual) @@ -3016,7 +3015,7 @@ def test_from_pint_wrapping_dask(self, Var): ], ids=lambda x: f"{x}", ) -def test_datetime_conversion_warning(values, unit) -> None: +def test_datetime_conversion(values, unit) -> None: # todo: needs discussion # todo: check, if this test is OK dims = ["time"] if isinstance(values, np.ndarray | 
pd.Index | pd.Series) else [] @@ -3087,7 +3086,7 @@ def test_pandas_two_only_datetime_conversion_warnings( ], ids=lambda x: f"{x}", ) -def test_timedelta_conversion_warning(values, unit) -> None: +def test_timedelta_conversion(values, unit) -> None: dims = ["time"] if isinstance(values, np.ndarray | pd.Index) else [] var = Variable(dims, values) assert var.dtype == np.dtype(f"timedelta64[{unit}]") From b32b02c88c3eaa6f717b66bd09948ec6e46c8899 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 08:38:34 +0000 Subject: [PATCH 100/134] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- xarray/tests/test_variable.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d470eb60698..1691c3df14a 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -217,7 +217,9 @@ def test_index_0d_timedelta64(self): td = timedelta(hours=1) # todo: discussion needed x = self.cls(["x"], [np.timedelta64(td)]) - self._assertIndexedLikeNDArray(x, np.timedelta64(td), np.dtype("timedelta64[us]")) + self._assertIndexedLikeNDArray( + x, np.timedelta64(td), np.dtype("timedelta64[us]") + ) x = self.cls(["x"], pd.to_timedelta([td])) self._assertIndexedLikeNDArray(x, np.timedelta64(td), "timedelta64[ns]") From a2c46b1d2dd414f998f9f0d2e94a7c7952eedee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 12:20:41 +0100 Subject: [PATCH 101/134] fix scalar handling for timedelta based indexer --- xarray/core/indexing.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index f185a05c2b9..66c84f6b8db 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1754,7 +1754,10 @@ def _convert_scalar(self, item): # (for now) item = np.datetime64("NaT", 
"ns") elif isinstance(item, timedelta): - item = np.timedelta64(getattr(item, "value", item), "ns") + # from xarray 2025.01.1 xarray allows non-nanosecond resolution + # so we just convert to_numpy if possible + if hasattr(item, "to_numpy"): + item = item.to_numpy() elif isinstance(item, pd.Timestamp): # Work around for GH: pydata/xarray#1932 and numpy/numpy#10668 # numpy fails to convert pd.Timestamp to np.datetime64[ns] From fa2c4b6579e6bef889e1936cc4d128d06b1a94ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 12:29:29 +0100 Subject: [PATCH 102/134] remove stale error message and "ignore:Converting non-default" in testsuite --- xarray/core/variable.py | 8 -------- xarray/tests/test_conventions.py | 2 -- xarray/tests/test_dataarray.py | 2 -- xarray/tests/test_dataset.py | 2 -- xarray/tests/test_groupby.py | 1 - xarray/tests/test_plot.py | 1 - xarray/tests/test_variable.py | 12 ------------ 7 files changed, 28 deletions(-) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 35c6802d60e..088c5f405ef 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -78,14 +78,6 @@ from xarray.namedarray.parallelcompat import ChunkManagerEntrypoint -NON_DEFAULTPRECISION_WARNING = ( - "Converting non-default precision {case} values to default precision. " - "This warning is caused by passing non-default np.datetime64 or " - "np.timedelta64 values to the DataArray or Variable constructor; it can be " - "silenced by converting the values to default precision {res!r} ahead of time." 
-) - - class MissingDimensionsError(ValueError): """Error class used when we can't safely guess a dimension name.""" diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 2886691ce32..b1bf9a762ea 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -214,7 +214,6 @@ def test_deterministic_coords_encoding(self) -> None: vars, attrs = conventions.encode_dataset_coordinates(ds) assert attrs["coordinates"] == "bar baz" - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_emit_coordinates_attribute_in_attrs(self) -> None: orig = Dataset( {"a": 1, "b": 1}, @@ -232,7 +231,6 @@ def test_emit_coordinates_attribute_in_attrs(self) -> None: assert enc["b"].attrs.get("coordinates") == "t" assert "coordinates" not in enc["b"].encoding - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_emit_coordinates_attribute_in_encoding(self) -> None: orig = Dataset( {"a": 1, "b": 1}, diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index d6bd7d63b0a..c94eefd74ea 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -3661,7 +3661,6 @@ def test_to_and_from_dict( actual_no_data = da.to_dict(data=False, encoding=encoding) assert expected_no_data == actual_no_data - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_to_and_from_dict_with_time_dim(self) -> None: x = np.random.randn(10, 3) t = pd.date_range("20130101", periods=10) @@ -3670,7 +3669,6 @@ def test_to_and_from_dict_with_time_dim(self) -> None: roundtripped = DataArray.from_dict(da.to_dict()) assert_identical(da, roundtripped) - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_to_and_from_dict_with_nan_nat(self) -> None: y = np.random.randn(10, 3) y[2] = np.nan diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index cf10de13aee..125cbd1e221 100644 --- a/xarray/tests/test_dataset.py +++ 
b/xarray/tests/test_dataset.py @@ -500,7 +500,6 @@ def test_constructor_1d(self) -> None: actual = Dataset({"x": [5, 6, 7, 8, 9]}) assert_identical(expected, actual) - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_constructor_0d(self) -> None: expected = Dataset({"x": ([], 1)}) for arg in [1, np.array(1), expected["x"]]: @@ -6071,7 +6070,6 @@ def test_dataset_math_auto_align(self) -> None: expected = ds + other.reindex_like(ds) assert_identical(expected, actual) - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_dataset_math_errors(self) -> None: ds = self.make_example_math_dataset() diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 512b3e8523d..8ea3b618c7e 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -2198,7 +2198,6 @@ def test_upsample_interpolate(self) -> None: assert_allclose(expected, actual, rtol=1e-16) @requires_scipy - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_upsample_interpolate_bug_2197(self) -> None: dates = pd.date_range("2007-02-01", "2007-03-01", freq="D", unit="s") da = xr.DataArray(np.arange(len(dates)), [("time", dates)]) diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 05200f62ce8..0a05451cb85 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -2962,7 +2962,6 @@ def test_datetime_plot1d(self) -> None: # mpl.dates.AutoDateLocator passes and no other subclasses: assert type(ax.xaxis.get_major_locator()) is mpl.dates.AutoDateLocator - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime_plot2d(self) -> None: # Test that matplotlib-native datetime works: da = DataArray( diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 1691c3df14a..de618eff6a2 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -200,7 +200,6 @@ def test_index_0d_string(self): x = self.cls(["x"], [value]) 
self._assertIndexedLikeNDArray(x, value, dtype) - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_index_0d_datetime(self): d = datetime(2000, 1, 1) x = self.cls(["x"], [d]) @@ -212,7 +211,6 @@ def test_index_0d_datetime(self): x = self.cls(["x"], pd.DatetimeIndex([d])) self._assertIndexedLikeNDArray(x, np.datetime64(d), "datetime64[ns]") - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_index_0d_timedelta64(self): td = timedelta(hours=1) # todo: discussion needed @@ -255,7 +253,6 @@ def test_0d_object_array_with_list(self): assert_array_equal(x[0].data, listarray.squeeze()) assert_array_equal(x.squeeze().data, listarray.squeeze()) - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_index_and_concat_datetime(self): # regression test for #125 date_range = pd.date_range("2011-09-01", periods=10) @@ -278,7 +275,6 @@ def test_0d_time_data(self): dt64_data = pd.date_range("2000-01-01", periods=3) - @pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize( "values, unit", [ @@ -297,7 +293,6 @@ def test_datetime64_conversion(self, values, unit): td64_data = pd.timedelta_range(start=0, periods=3) - @pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize( "values, unit", [ @@ -319,13 +314,11 @@ def test_object_conversion(self): actual = self.cls("x", data) assert actual.dtype == data.dtype - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime64_valid_range(self): # todo: test still needed? data = np.datetime64("1250-01-01", "us") self.cls(["t"], [data]) - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_timedelta64_valid_range(self): # todo: test still needed? 
data = np.timedelta64("200000", "D") @@ -1082,7 +1075,6 @@ def test_numpy_same_methods(self): v = IndexVariable("x", np.arange(5)) assert 2 == v.searchsorted(2) - @pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize( "values, unit", [ @@ -1098,7 +1090,6 @@ def test_datetime64_conversion_scalar(self, values, unit): assert np.issubdtype(v.values, "datetime64") assert v.values.dtype == np.dtype(f"datetime64[{unit}]") - @pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize( "values, unit", [ @@ -1131,7 +1122,6 @@ def test_0d_datetime(self): assert v.dtype == np.dtype("datetime64[ns]") assert v.values == np.datetime64("2000-01-01", "ns") - @pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize( "values, unit", [(pd.to_timedelta("1s"), "ns"), (np.timedelta64(1, "s"), "s")] ) @@ -1579,7 +1569,6 @@ def test_transpose(self): v.transpose(..., "not_a_dim", missing_dims="warn") assert_identical(expected_ell, actual) - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_transpose_0d(self): for value in [ 3.5, @@ -2656,7 +2645,6 @@ def test_masked_array(self): assert_array_equal(expected, actual) assert actual.dtype == expected.dtype - @pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime(self): # todo: check, if this test is OK expected = np.datetime64("2000-01-01") From c65c9afadc343319d7752d2129cf6705e1419005 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 13:08:53 +0100 Subject: [PATCH 103/134] add per review suggestions --- xarray/tests/test_variable.py | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index de618eff6a2..7286c90f82c 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -273,19 +273,20 @@ def test_0d_time_data(self): expected = 
np.datetime64("2000-01-01", "ns") assert x[0].values == expected - dt64_data = pd.date_range("2000-01-01", periods=3) + dt64_data = pd.date_range("1970-01-01", periods=3) @pytest.mark.parametrize( "values, unit", [ (dt64_data, "ns"), (dt64_data.values, "ns"), + (dt64_data.values.astype("datetime64[m]"), "s"), (dt64_data.values.astype("datetime64[s]"), "s"), + (dt64_data.values.astype("datetime64[ps]"), "ns"), (dt64_data.to_pydatetime(), "ns"), ], ) def test_datetime64_conversion(self, values, unit): - # todo: check, if this test is OK v = self.cls(["t"], values) assert v.dtype == np.dtype(f"datetime64[{unit}]") assert_array_equal(v.values, self.dt64_data.values) @@ -298,12 +299,13 @@ def test_datetime64_conversion(self, values, unit): [ (td64_data, "ns"), (td64_data.values, "ns"), + (td64_data.values.astype("timedelta64[m]"), "s"), (td64_data.values.astype("timedelta64[s]"), "s"), + (td64_data.values.astype("timedelta64[ps]"), "ns"), (td64_data.to_pytimedelta(), "ns"), ], ) def test_timedelta64_conversion(self, values, unit): - # todo: check, if this test is OK v = self.cls(["t"], values) assert v.dtype == np.dtype(f"timedelta64[{unit}]") assert_array_equal(v.values, self.td64_data.values) @@ -314,16 +316,6 @@ def test_object_conversion(self): actual = self.cls("x", data) assert actual.dtype == data.dtype - def test_datetime64_valid_range(self): - # todo: test still needed? - data = np.datetime64("1250-01-01", "us") - self.cls(["t"], [data]) - - def test_timedelta64_valid_range(self): - # todo: test still needed? 
- data = np.timedelta64("200000", "D") - self.cls(["t"], [data]) - def test_pandas_data(self): v = self.cls(["x"], pd.Series([0, 1, 2], index=[3, 2, 1])) assert_identical(v, v[[0, 1, 2]]) @@ -1081,10 +1073,10 @@ def test_numpy_same_methods(self): (np.datetime64("2000-01-01"), "s"), (pd.Timestamp("2000-01-01T00"), "ns"), (datetime(2000, 1, 1), "ns"), + (np.datetime64("2000-01-01T00:00:00.1234567891"), "ns"), ], ) def test_datetime64_conversion_scalar(self, values, unit): - # todo: check, if this test is OK v = Variable([], values) assert v.dtype == np.dtype(f"datetime64[{unit}]") assert np.issubdtype(v.values, "datetime64") @@ -1093,14 +1085,14 @@ def test_datetime64_conversion_scalar(self, values, unit): @pytest.mark.parametrize( "values, unit", [ + (np.timedelta64(1, "m"), "s"), (np.timedelta64(1, "D"), "s"), + (np.timedelta64(1001, "ps"), "ns"), (pd.Timedelta("1 day"), "ns"), (timedelta(days=1), "ns"), ], ) def test_timedelta64_conversion_scalar(self, values, unit): - # todo: discussion needed - # todo: check, if this test is OK v = Variable([], values) assert v.dtype == np.dtype(f"timedelta64[{unit}]") assert np.issubdtype(v.values, "timedelta64") @@ -1115,9 +1107,7 @@ def test_0d_str(self): assert v.dtype == np.dtype("S3") assert v.values == "foo".encode("ascii") - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_0d_datetime(self): - # todo: check, if this test is OK v = Variable([], pd.Timestamp("2000-01-01")) assert v.dtype == np.dtype("datetime64[ns]") assert v.values == np.datetime64("2000-01-01", "ns") @@ -1964,7 +1954,6 @@ def test_big_endian_reduce(self): expected = Variable([], 5) assert_identical(expected, v.sum()) - @pytest.mark.filterwarnings("ignore:Converting non-nanosecond") def test_reduce_funcs(self): v = Variable("x", np.array([1, np.nan, 2, 3])) assert_identical(v.mean(), Variable([], 2)) @@ -2646,7 +2635,6 @@ def test_masked_array(self): assert actual.dtype == expected.dtype def test_datetime(self): - # todo: check, if 
this test is OK expected = np.datetime64("2000-01-01") actual = as_compatible_data(expected) assert expected == actual @@ -2673,7 +2661,6 @@ def test_datetime(self): assert np.dtype("datetime64[ns]") == actual.dtype def test_tz_datetime(self) -> None: - # todo: check, if this test is OK tz = pytz.timezone("America/New_York") times_ns = pd.date_range("2000", periods=1, tz=tz) @@ -3006,8 +2993,7 @@ def test_from_pint_wrapping_dask(self, Var): ids=lambda x: f"{x}", ) def test_datetime_conversion(values, unit) -> None: - # todo: needs discussion - # todo: check, if this test is OK + # todo: check for redundancy (suggested per review) dims = ["time"] if isinstance(values, np.ndarray | pd.Index | pd.Series) else [] var = Variable(dims, values) if var.dtype.kind == "M": @@ -3049,7 +3035,7 @@ def test_datetime_conversion(values, unit) -> None: def test_pandas_two_only_datetime_conversion_warnings( data: pd.DatetimeIndex | pd.Series, dtype: str | pd.DatetimeTZDtype ) -> None: - # todo: check, if this test is OK + # todo: check for redundancy (suggested per review) var = Variable(["time"], data.astype(dtype)) # type: ignore[arg-type] if var.dtype.kind == "M": From 21dffc10056c3336adce1b7f530cef943ecda058 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 13:23:02 +0100 Subject: [PATCH 104/134] add/remove todo --- xarray/tests/test_dataset.py | 1 - xarray/tests/test_variable.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 125cbd1e221..7fc487e03bb 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -105,7 +105,6 @@ def create_append_test_data(seed=None) -> tuple[Dataset, Dataset, Dataset]: lon = [0, 1, 2] nt1 = 3 nt2 = 2 - # todo: check, if all changes below are correct time1 = pd.date_range("2000-01-01", periods=nt1).as_unit("ns") time2 = pd.date_range("2000-02-01", periods=nt2).as_unit("ns") string_var = np.array(["a", "bc", 
"def"], dtype=object) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 7286c90f82c..4cf4204649d 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -3063,6 +3063,7 @@ def test_pandas_two_only_datetime_conversion_warnings( ids=lambda x: f"{x}", ) def test_timedelta_conversion(values, unit) -> None: + # todo: check for redundancy dims = ["time"] if isinstance(values, np.ndarray | pd.Index) else [] var = Variable(dims, values) assert var.dtype == np.dtype(f"timedelta64[{unit}]") From 8eeeb78e7de3e43bbd0888f232ee9f3be84e6a32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 13:29:22 +0100 Subject: [PATCH 105/134] rename timeunit -> format --- xarray/tests/test_coding_times.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 0d813dd94bd..f3428a2742b 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1330,7 +1330,7 @@ def test_contains_cftime_lazy() -> None: @pytest.mark.parametrize( - "timestr, timeunit, dtype, fill_value, use_encoding", + "timestr, format, dtype, fill_value, use_encoding", [ ("1677-09-21T00:12:43.145224193", "ns", np.int64, 20, True), ("1970-09-21T00:12:44.145224808", "ns", np.float64, 1e30, True), @@ -1349,15 +1349,15 @@ def test_contains_cftime_lazy() -> None: ) def test_roundtrip_datetime64_nanosecond_precision( timestr: str, - timeunit: Literal["ns", "us"], + format: Literal["ns", "us"], dtype: np.typing.DTypeLike, fill_value: int | float | None, use_encoding: bool, time_unit: PDDatetimeUnitOptions, ) -> None: # test for GH7817 - time = np.datetime64(timestr, timeunit) - times = [np.datetime64("1970-01-01T00:00:00", timeunit), np.datetime64("NaT"), time] + time = np.datetime64(timestr, format) + times = [np.datetime64("1970-01-01T00:00:00", format), np.datetime64("NaT"), time] if use_encoding: encoding 
= dict(dtype=dtype, _FillValue=fill_value) @@ -1365,12 +1365,12 @@ def test_roundtrip_datetime64_nanosecond_precision( encoding = {} var = Variable(["time"], times, encoding=encoding) - assert var.dtype == np.dtype(f"=M8[{timeunit}]") + assert var.dtype == np.dtype(f"=M8[{format}]") encoded_var = conventions.encode_cf_variable(var) assert ( encoded_var.attrs["units"] - == f"{_numpy_to_netcdf_timeunit(timeunit)} since 1970-01-01 00:00:00" + == f"{_numpy_to_netcdf_timeunit(format)} since 1970-01-01 00:00:00" ) assert encoded_var.attrs["calendar"] == "proleptic_gregorian" assert encoded_var.data.dtype == dtype @@ -1379,14 +1379,14 @@ def test_roundtrip_datetime64_nanosecond_precision( ) result_unit = ( - timeunit - if np.timedelta64(1, timeunit) <= np.timedelta64(1, time_unit) + format + if np.timedelta64(1, format) <= np.timedelta64(1, time_unit) else time_unit ) assert decoded_var.dtype == np.dtype(f"=M8[{result_unit}]") assert ( decoded_var.encoding["units"] - == f"{_numpy_to_netcdf_timeunit(timeunit)} since 1970-01-01 00:00:00" + == f"{_numpy_to_netcdf_timeunit(format)} since 1970-01-01 00:00:00" ) assert decoded_var.encoding["dtype"] == dtype assert decoded_var.encoding["calendar"] == "proleptic_gregorian" From 7ad21831c973772cd7c37201e61d87e53fd8b0ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 13:47:51 +0100 Subject: [PATCH 106/134] return "ns" resolution per default for timedeltas, if not specified --- xarray/coding/times.py | 6 ++++-- xarray/tests/test_coding_times.py | 5 ++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 3412aa88582..e04a8b71e8d 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -577,7 +577,8 @@ def to_timedelta_unboxed(value, **kwargs): unique_timedeltas = np.unique(result[pd.notnull(result)]) unit = _netcdf_to_numpy_timeunit(_infer_time_units_from_diff(unique_timedeltas)) if unit not in {"s", "ms", "us", "ns"}: - 
unit = "s" + # default to ns, when not specified + unit = "ns" result = result.astype(f"timedelta64[{unit}]") assert np.issubdtype(result.dtype, "timedelta64") return result @@ -598,7 +599,8 @@ def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: unit = _netcdf_to_numpy_timeunit(units) as_unit = unit if unit not in {"s", "ms", "us", "ns"}: - as_unit = "s" + # default to ns, when not specified + as_unit = "ns" result = ( pd.to_timedelta(ravel(num_timedeltas), unit=unit).as_unit(as_unit).to_numpy() ) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index f3428a2742b..4c148195e26 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -626,9 +626,8 @@ def test_infer_cftime_datetime_units(calendar, date_args, expected) -> None: ], ) def test_cf_timedelta(timedeltas, units, numbers) -> None: - # todo: check, if this test is OK if timedeltas == "NaT": - timedeltas = np.timedelta64("NaT", "s") + timedeltas = np.timedelta64("NaT", "ns") else: timedeltas = to_timedelta_unboxed(timedeltas) numbers = np.array(numbers) @@ -644,7 +643,7 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None: assert_array_equal(expected, actual) assert expected.dtype == actual.dtype - expected = np.timedelta64("NaT", "s") + expected = np.timedelta64("NaT", "ns") actual = decode_cf_timedelta(np.array(np.nan), "days") assert_array_equal(expected, actual) assert expected.dtype == actual.dtype From 9e4cab65d12052f84bdad1f939d2563b0658ed59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 14:02:14 +0100 Subject: [PATCH 107/134] Be specific on types/dtpyes --- xarray/tests/test_coding_times.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 4c148195e26..09de0c14afd 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -172,12 +172,18 @@ def 
test_decode_cf_datetime_overflow(time_unit: PDDatetimeUnitOptions) -> None: # date after 2262 and before 1678 days = (-117710, 95795) expected = (datetime(1677, 9, 20), datetime(2262, 4, 12)) - for i, day in enumerate(days): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") - result = decode_cf_datetime(day, units, time_unit=time_unit) + result = decode_cf_datetime( + day, units, calendar="standard", time_unit=time_unit + ) assert result == expected[i] + # additional check to see if type/dtypes are correct + if time_unit == "ns": + assert isinstance(result.item(), datetime) + else: + assert result.dtype == np.dtype(f"=M8[{time_unit}]") def test_decode_cf_datetime_non_standard_units() -> None: From 5964a9e4aa2fee10f44a8157902808af23fe3268 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 15:47:04 +0100 Subject: [PATCH 108/134] add comment --- xarray/tests/test_cftimeindex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index 95dd7fd5f81..23850652d9c 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -1266,7 +1266,7 @@ def test_multiindex(): @pytest.mark.parametrize("freq", ["3663s", "33min", "2h"]) @pytest.mark.parametrize("method", ["floor", "ceil", "round"]) def test_rounding_methods_against_datetimeindex(freq, method): - # todo: check, if setting to "us" is enough + # for now unit="us" seems good enough expected = pd.date_range("2000-01-02T01:03:51", periods=10, freq="1777s", unit="us") expected = getattr(expected, method)(freq) result = xr.cftime_range("2000-01-02T01:03:51", periods=10, freq="1777s") From 308391d9fa526f9c00b8bc77ae6f1fc869764246 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Tue, 7 Jan 2025 16:15:04 +0100 Subject: [PATCH 109/134] add suggestions from code review --- xarray/coding/times.py | 17 ++++++++--------- 
xarray/tests/test_backends.py | 18 ++++++++---------- xarray/tests/test_cftimeindex.py | 6 +++++- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index e04a8b71e8d..ec1bca46a11 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -543,22 +543,22 @@ def decode_cf_datetime( if dates_min < border: if _is_standard_calendar(calendar): - warnings.warn( + emit_user_level_warning( "Unable to decode time axis into full " "numpy.datetime64 objects, continuing using " - "cftime.datetime objects instead, reason: dates out " - "of range", + "cftime.datetime objects instead, reason: dates prior " + "reform date (1582-10-15). To silence this warning specify " + "'use_cftime=True'.", SerializationWarning, - stacklevel=3, ) elif time_unit == "ns" and (dates_min < lower or dates_max > upper): - warnings.warn( + emit_user_level_warning( "Unable to decode time axis into full " - "numpy.datetime64 objects, continuing using " + "numpy.datetime64[ns] objects, continuing using " "cftime.datetime objects instead, reason: dates out " - "of range", + "of range. 
To silence this warning use a coarser resolution " + "'time_unit' or specify 'use_cftime=True'.", SerializationWarning, - stacklevel=3, ) else: if _is_standard_calendar(calendar): @@ -1114,7 +1114,6 @@ def _eagerly_encode_cf_timedelta( time_deltas = pd.TimedeltaIndex(ravel(timedeltas)) # get resolution of TimedeltaIndex and align time_delta deltas_unit = time_deltas.unit - # todo: check, if this works in any case time_delta = time_delta.astype(f"=m8[{deltas_unit}]") # retrieve needed units to faithfully encode to int64 diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index cac23a52b44..f85564b3fc4 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -633,7 +633,10 @@ def test_roundtrip_cftime_datetime_data(self) -> None: assert actual.t.encoding["calendar"] == expected_calendar def test_roundtrip_timedelta_data(self) -> None: - # todo: check, if default unit "s" is enough + # todo: suggestion from review: + # roundtrip large microsecond or coarser resolution timedeltas, + # though we cannot test that until we fix the timedelta decoding + # to support large ranges time_deltas = pd.to_timedelta(["1h", "2h", "NaT"]).as_unit("s") # type: ignore[arg-type, unused-ignore] expected = Dataset({"td": ("td", time_deltas), "td0": time_deltas[0]}) with self.roundtrip(expected) as actual: @@ -5627,16 +5630,13 @@ def test_use_cftime_standard_calendar_default_in_range(calendar) -> None: @requires_cftime @requires_scipy_or_netCDF4 @pytest.mark.parametrize("calendar", ["standard", "gregorian"]) -@pytest.mark.parametrize("units_year", [1500, 1582]) -def test_use_cftime_standard_calendar_default_out_of_range( - calendar, units_year -) -> None: +def test_use_cftime_standard_calendar_default_out_of_range(calendar) -> None: # todo: check, if we still need to test for two dates import cftime x = [0, 1] time = [0, 720] - units = f"days since {units_year}-01-01" + units = "days since 1582-01-01" original = DataArray(x, [("time", time)], 
name="x").to_dataset() for v in ["x", "time"]: original[v].attrs["units"] = units @@ -5722,12 +5722,10 @@ def test_use_cftime_false_standard_calendar_in_range(calendar) -> None: @requires_scipy_or_netCDF4 @pytest.mark.parametrize("calendar", ["standard", "gregorian"]) -@pytest.mark.parametrize("units_year", [1500, 1582]) -def test_use_cftime_false_standard_calendar_out_of_range(calendar, units_year) -> None: - # todo: check, if we still need to check for two dates +def test_use_cftime_false_standard_calendar_out_of_range(calendar) -> None: x = [0, 1] time = [0, 720] - units = f"days since {units_year}-01-01" + units = "days since 1582-01-01" original = DataArray(x, [("time", time)], name="x").to_dataset() for v in ["x", "time"]: original[v].attrs["units"] = units diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py index 23850652d9c..8fc79a0cc53 100644 --- a/xarray/tests/test_cftimeindex.py +++ b/xarray/tests/test_cftimeindex.py @@ -1238,7 +1238,11 @@ def test_to_datetimeindex(calendar, unsafe): @pytest.mark.parametrize("calendar", _ALL_CALENDARS) def test_to_datetimeindex_out_of_range(calendar): index = xr.cftime_range("0001", periods=5, calendar=calendar) - # todo: needs discussion, do we need this test? 
+ # todo: suggestion from code review: + # - still warn when converting from a non-standard calendar + # to a proleptic Gregorian calendar + # - also warn when converting from a Gregorian calendar + # to a proleptic Gregorian calendar when dates fall before the reform if calendar in _NON_STANDARD_CALENDARS: with pytest.warns(RuntimeWarning, match="non-standard"): index.to_datetimeindex() From d494fe04f4bc9094e557f3891eadd1318eb3d744 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 8 Jan 2025 07:50:09 +0100 Subject: [PATCH 110/134] fix docs --- doc/user-guide/weather-climate.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index d80cb9e8e7d..013e362519e 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -81,7 +81,7 @@ using a standard calendar, but outside the `precision range`_ and dates [prior t represented with the ``np.datetime64[unit]`` data type (where unit can be one of ["s", "ms", "us", "ns"]), enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features. As of pandas version 2.0.0, pandas supports non-nanosecond precision datetime - values. From xarray version 2025.01.1 on, non-nanosecond precision datetime values are also supported in xarray (this can be parameterized via :py:class:`coders.CFDatetimeCoder` and ``decode_times` kwarg). + values. From xarray version 2025.01.1 on, non-nanosecond precision datetime values are also supported in xarray (this can be parameterized via :py:class:`~xarray.coders.CFDatetimeCoder` and ``decode_times`` kwarg). 
For example, you can create a DataArray indexed by a time coordinate with dates from a no-leap calendar and a From ef6f722484e68626b7060ea16e59e1924773c72d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 8 Jan 2025 11:22:41 +0100 Subject: [PATCH 111/134] fix test which isn't run for numpy2 atm --- xarray/tests/test_backends.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 76b0e898514..8596526b2c2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -5707,7 +5707,8 @@ def test_use_cftime_false_standard_calendar_in_range(calendar) -> None: with create_tmp_file() as tmp_file: original.to_netcdf(tmp_file) with warnings.catch_warnings(record=True) as record: - with open_dataset(tmp_file, use_cftime=False) as ds: + coder = xr.coders.CFDatetimeCoder(use_cftime=False) + with open_dataset(tmp_file, decode_times=coder) as ds: assert_identical(expected_x, ds.x) assert_identical(expected_time, ds.time) _assert_no_dates_out_of_range_warning(record) From 4ea5241324a404ae5a9ccf0937b82f810c8080db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 8 Jan 2025 15:39:54 +0100 Subject: [PATCH 112/134] add notes on to_datetime section, update examples showing usage of 'as_unit' --- doc/internals/time-coding.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index 8aa15a80011..47ec9618aa5 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -26,7 +26,7 @@ to_datetime The function :py:func:`pandas.to_datetime` is used within xarray for inferring units and for testing purposes. 
-In normal operation :py:func:`pandas.to_datetime` returns a :py:class:`pandas.Timestamp` (for scalar input) or :py:class:`pandas.DatetimeIndex` (for array-like input) which are related to ``np.datetime64`` values with a resolution inherited from the input. If no resolution can be inherited ``'ns'`` is assumed. That has the implication that the maximum usable time range for those cases is approximately +/- 292 years centered around the Unix epoch (1970-01-01). To accommodate that, we carefully check the units/resolution in the encoding and decoding step. +In normal operation :py:func:`pandas.to_datetime` returns a :py:class:`pandas.Timestamp` (for scalar input) or :py:class:`pandas.DatetimeIndex` (for array-like input) which are related to ``np.datetime64`` values with a resolution inherited from the input (can be one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``). If no resolution can be inherited ``'ns'`` is assumed. That has the implication that the maximum usable time range for those cases is approximately +/- 292 years centered around the Unix epoch (1970-01-01). To accommodate that, we carefully check the units/resolution in the encoding and decoding step. When the arguments are numeric (not strings or ``np.datetime64`` values) ``"unit"`` can be anything from ``'Y'``, ``'W'``, ``'D'``, ``'h'``, ``'m'``, ``'s'``, ``'ms'``, ``'us'`` or ``'ns'``, though the returned resolution will be ``"ns"``. @@ -52,18 +52,19 @@ For input values which can't be represented in nanosecond resolution an :py:clas ``np.datetime64`` values can be extracted with :py:meth:`pandas.Timestamp.to_numpy` and :py:meth:`pandas.DatetimeIndex.to_numpy`. The returned resolution depends on the internal representation. This representation can be changed using :py:meth:`pandas.Timestamp.as_unit` and :py:meth:`pandas.DatetimeIndex.as_unit` respectively. + ``as_unit`` takes one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'`` as an argument. 
That means we are able to represent datetimes with second, millisecond, microsecond or nanosecond resolution. .. ipython:: python time = pd.to_datetime(np.datetime64(0, "D")) print("Datetime:", time, np.asarray([time.to_numpy()]).dtype) - print("Datetime as_unit('s'):", time.as_unit("s")) - print("Datetime to_numpy():", time.as_unit("s").to_numpy()) + print("Datetime as_unit('ms'):", time.as_unit("ms")) + print("Datetime to_numpy():", time.as_unit("ms").to_numpy()) time = pd.to_datetime(np.array([-1000, 1, 2], dtype="datetime64[Y]")) print("DatetimeIndex:", time) - print("DatetimeIndex as_unit('s'):", time.as_unit("s")) - print("DatetimeIndex to_numpy():", time.as_unit("s").to_numpy()) + print("DatetimeIndex as_unit('us'):", time.as_unit("us")) + print("DatetimeIndex to_numpy():", time.as_unit("us").to_numpy()) .. warning:: Input data with resolution higher than ``'ns'`` (eg. ``'ps'``, ``'fs'``, ``'as'``) is truncated (not rounded) at the ``'ns'``-level. This is currently broken for the ``'ps'`` input, where it is interpreted as ``'ns'``. From 151e9cdad0c7e8848325cbcf105ef5bdd9504e5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 8 Jan 2025 15:48:11 +0100 Subject: [PATCH 113/134] use np.timedelta64 for to_timedelta example, update as_unit example, update note --- doc/internals/time-coding.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index 47ec9618aa5..34a3b15b213 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -151,17 +151,17 @@ and :py:meth:`pandas.TimedeltaIndex.as_unit` respectively. .. 
ipython:: python - delta = pd.to_timedelta(1, unit="D") - print("Timedelta:", delta) - print("Timedelta as_unit('s'):", delta.as_unit("s")) - print("Timedelta to_numpy():", delta.as_unit("s").to_numpy()) + delta = pd.to_timedelta(np.timedelta64(1, "D")) + print("Timedelta:", delta, np.asarray([delta.to_numpy()]).dtype) + print("Timedelta as_unit('ms'):", delta.as_unit("ms")) + print("Timedelta to_numpy():", delta.as_unit("ms").to_numpy()) delta = pd.to_timedelta([0, 1, 2], unit="D") print("TimedeltaIndex:", delta) - print("TimedeltaIndex as_unit('s'):", delta.as_unit("s")) - print("TimedeltaIndex to_numpy():", delta.as_unit("s").to_numpy()) + print("TimedeltaIndex as_unit('ms'):", delta.as_unit("ms")) + print("TimedeltaIndex to_numpy():", delta.as_unit("ms").to_numpy()) .. note:: - For the functionality in xarray the output resolution is converted from ``'ns'`` to the lowest needed resolution. + For the functionality in xarray the resolution is converted from ``'ns'`` to the lowest needed resolution when decoding. .. warning:: Care has to be taken, as some configurations of input data will raise. The following shows, that we are safe to use :py:func:`pandas.to_timedelta` when providing :py:class:`numpy.timedelta64` as scalar or numpy array as input. From 8ecda4e287c78e1f5833573d10f7d728ac915a0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 8 Jan 2025 15:50:59 +0100 Subject: [PATCH 114/134] remove note --- doc/internals/time-coding.rst | 3 --- 1 file changed, 3 deletions(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index 34a3b15b213..f3cfe824200 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -264,9 +264,6 @@ DatetimeIndex :py:class:`pandas.DatetimeIndex` is used to wrap ``np.datetime64`` values or other datetime-likes when encoding. The resolution of the DatetimeIndex depends on the input, but can be only one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. 
Lower resolution input is automatically converted to ``'s'``, higher resolution input is cut to ``'ns'``. :py:class:`pandas.DatetimeIndex` will raise :py:class:`pandas.OutOfBoundsDatetime` if the input can't be represented in the given resolution. -.. note:: - For xarray we assume that all :py:class:`numpy.datetime64` provided to :py:class:`pandas.DatetimeIndex` are up to the specs. This is especially true, when those values have been decoded upfront. If the data is provided by users, they should handle any issues before. - .. ipython:: python try: From 2bbf0ff2f43ce5c0e06f7853578b525d03b898ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Wed, 8 Jan 2025 18:32:02 +0100 Subject: [PATCH 115/134] Apply suggestions from code review Co-authored-by: Deepak Cherian --- doc/user-guide/time-series.rst | 2 +- doc/user-guide/weather-climate.rst | 2 +- xarray/tests/test_backends.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 596e51ecef5..8c4d9aa5de7 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -83,7 +83,7 @@ You can manual decode arrays in this form by passing a dataset to coder = xr.coders.CFDatetimeCoder(time_unit="s") xr.decode_cf(ds, decode_times=coder) -From xarray 2025.01.1 the resolution of the dates can be tuned between "s", "ms", "us" and "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a store contains dates outside of these bounds (or dates < 1582-10-15 with a Gregorian, also known as standard, calendar), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. +From xarray 2025.01.1 the resolution of the dates can be one of "s", "ms", "us" or "ns". 
One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a store contains dates outside of these bounds (or dates < 1582-10-15 with a Gregorian, also known as standard, calendar), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. :py:class:`~xarray.CFTimeIndex` enables most of the indexing functionality of a :py:class:`pandas.DatetimeIndex`. See :ref:`CFTimeIndex` for more information. diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index 013e362519e..9f36eab7b81 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -75,7 +75,7 @@ using a standard calendar, but outside the `precision range`_ and dates [prior t - The dates are from a non-standard calendar - Any dates are outside the nanosecond-precision range (prior xarray version 2025.01.1) - - Any dates are outside the time span limited by the resolution (from xarray version v2025.01.1) + - Any dates are outside the time span limited by the resolution (from xarray version 2025.01.1) Otherwise pandas-compatible dates from a standard calendar will be represented with the ``np.datetime64[unit]`` data type (where unit can be one of ["s", "ms", "us", "ns"]), enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features. 
diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 8596526b2c2..72078da11b9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -616,7 +616,7 @@ def test_roundtrip_cftime_datetime_data(self) -> None: # proleptic gregorian will be decoded into numpy datetime64 # fixing to expectations if actual.t.dtype.kind == "M": - dtype = f"datetime64[{np.datetime_data(actual.t)[0]}]" + dtype = actual.t.dtype expected_decoded_t = expected_decoded_t.astype(dtype) expected_decoded_t0 = expected_decoded_t0.astype(dtype) abs_diff = abs(actual.t.values - expected_decoded_t) From 03086723dd026970660f8d671575bd54bbef619a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 09:24:08 +0100 Subject: [PATCH 116/134] refactor timedelta decoding to _numbers_to_timedelta and res-use it within decode_cf_timedelta --- xarray/coding/times.py | 91 ++++++++++++++++++++++++------------------ 1 file changed, 53 insertions(+), 38 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index ec1bca46a11..579585850c4 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -458,41 +458,12 @@ def _decode_datetime_with_pandas( elif flat_num_dates.dtype.kind in "f": flat_num_dates = flat_num_dates.astype(np.float64) - # keep NaT/nan mask - nan = np.isnan(flat_num_dates) | (flat_num_dates == np.iinfo(np.int64).min) - - # in case we need to change the unit, we fix the numbers here - # this should be safe, as errors would have been raised above - ns_time_unit = _NS_PER_TIME_DELTA[time_unit] - ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_date.unit] - if ns_time_unit > ns_ref_date_unit: - flat_num_dates *= np.int64(ns_time_unit / ns_ref_date_unit) - time_unit = ref_date.unit - - # estimate fitting resolution for floating point values - # this iterates until all floats are fractionless or time_unit == "ns" - if flat_num_dates.dtype.kind == "f" and time_unit != "ns": - flat_num_dates, 
new_time_unit = _check_higher_resolution( - flat_num_dates, time_unit - ) - if time_unit != new_time_unit: - msg = ( - f"Can't decode floating point datetime to {time_unit!r} without " - f"precision loss, decoding to {new_time_unit!r} instead. " - f"To silence this warning use time_unit={new_time_unit!r} in call to " - f"decoding function." - ) - emit_user_level_warning(msg, SerializationWarning) - time_unit = new_time_unit - - # Cast input ordinals to integers and properly handle NaN/NaT - # to prevent casting NaN to int - flat_num_dates_int = np.zeros_like(flat_num_dates, dtype=np.int64) - flat_num_dates_int[nan] = np.iinfo(np.int64).min - flat_num_dates_int[~nan] = flat_num_dates[~nan].astype(np.int64) + timedeltas = _numbers_to_timedelta( + flat_num_dates, time_unit, ref_date.unit, "datetime" + ) - # cast to timedelta64[time_unit] and add to ref_date - return ref_date + flat_num_dates_int.astype(f"timedelta64[{time_unit}]") + # add timedeltas to ref_date + return ref_date + timedeltas def decode_cf_datetime( @@ -590,6 +561,49 @@ def to_datetime_unboxed(value, **kwargs): return result +def _numbers_to_timedelta( + flat_num: np.ndarray, + time_unit: NPDatetimeUnitOptions, + ref_unit: PDDatetimeUnitOptions, + datatype: str, +) -> np.ndarray: + """Transform numbers to np.timedelta64.""" + # keep NaT/nan mask + nan = np.isnan(flat_num) | (flat_num == np.iinfo(np.int64).min) + + # in case we need to change the unit, we fix the numbers here + # this should be safe, as errors would have been raised above + ns_time_unit = _NS_PER_TIME_DELTA[time_unit] + ns_ref_date_unit = _NS_PER_TIME_DELTA[ref_unit] + if ns_time_unit > ns_ref_date_unit: + flat_num *= np.int64(ns_time_unit / ns_ref_date_unit) + time_unit = ref_unit + + # estimate fitting resolution for floating point values + # this iterates until all floats are fractionless or time_unit == "ns" + if flat_num.dtype.kind == "f" and time_unit != "ns": + flat_num_dates, new_time_unit = _check_higher_resolution(flat_num, 
time_unit) + if time_unit != new_time_unit: + msg = ( + f"Can't decode floating point {datatype} to {time_unit!r} without " + f"precision loss, decoding to {new_time_unit!r} instead. " + f"To silence this warning use time_unit={new_time_unit!r} in call to " + f"decoding function." + ) + emit_user_level_warning(msg, SerializationWarning) + time_unit = new_time_unit + + # Cast input ordinals to integers and properly handle NaN/NaT + # to prevent casting NaN to int + with warnings.catch_warnings(): + warnings.simplefilter("ignore", RuntimeWarning) + flat_num = flat_num.astype(np.int64) + flat_num[nan] = np.iinfo(np.int64).min + + # cast to wanted type + return flat_num.astype(f"timedelta64[{time_unit}]") + + def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: # todo: check, if this works as intended """Given an array of numeric timedeltas in netCDF format, convert it into a @@ -597,14 +611,15 @@ def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: """ num_timedeltas = np.asarray(num_timedeltas) unit = _netcdf_to_numpy_timeunit(units) + + timedeltas = _numbers_to_timedelta(num_timedeltas, unit, "s", "timedelta") + as_unit = unit if unit not in {"s", "ms", "us", "ns"}: # default to ns, when not specified as_unit = "ns" - result = ( - pd.to_timedelta(ravel(num_timedeltas), unit=unit).as_unit(as_unit).to_numpy() - ) - return reshape(result, num_timedeltas.shape) + result = pd.to_timedelta(ravel(timedeltas)).as_unit(as_unit).to_numpy() + return reshape(result, timedeltas.shape) def _unit_timedelta_cftime(units: str) -> timedelta: From b043020e64e44ec42760c10d1bb74cc7c63f6575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 10:42:22 +0100 Subject: [PATCH 117/134] fix conventions test, add todo --- xarray/coding/times.py | 4 ++-- xarray/tests/test_conventions.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 579585850c4..2ec53d96606 
100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -548,7 +548,7 @@ def to_timedelta_unboxed(value, **kwargs): unique_timedeltas = np.unique(result[pd.notnull(result)]) unit = _netcdf_to_numpy_timeunit(_infer_time_units_from_diff(unique_timedeltas)) if unit not in {"s", "ms", "us", "ns"}: - # default to ns, when not specified + # default to "ns", when not specified unit = "ns" result = result.astype(f"timedelta64[{unit}]") assert np.issubdtype(result.dtype, "timedelta64") @@ -616,7 +616,7 @@ def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: as_unit = unit if unit not in {"s", "ms", "us", "ns"}: - # default to ns, when not specified + # default to "ns", when not specified as_unit = "ns" result = pd.to_timedelta(ravel(timedeltas)).as_unit(as_unit).to_numpy() return reshape(result, timedeltas.shape) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index b1bf9a762ea..346ad1c908b 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -511,13 +511,16 @@ def test_decode_dask_times(self) -> None: @pytest.mark.parametrize("time_unit", ["s", "ms", "us", "ns"]) def test_decode_cf_time_kwargs(self, time_unit) -> None: + # todo: if we set timedelta attrs "units": "days" + # this errors on the last decode_cf wrt to the lazy_elemwise_func + # trying to convert twice ds = Dataset.from_dict( { "coords": { "timedelta": { "data": np.array([1, 2, 3], dtype="int64"), "dims": "timedelta", - "attrs": {"units": "days"}, + "attrs": {"units": "seconds"}, }, "time": { "data": np.array([1, 2, 3], dtype="int64"), From 7182ce2ae7ec0859fbae75ef16f3528ff9259bf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 11:29:41 +0100 Subject: [PATCH 118/134] run times through pd.Timestamp to catch possible overflows --- xarray/coding/times.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 
2ec53d96606..37c88089397 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -720,7 +720,9 @@ def cftime_to_nptime( try: # We expect either "us" resolution or "s" resolution depending on # whether 'microseconds' are defined for the input or not. - dt = np.datetime64(t.isoformat()).astype(f"=M8[{time_unit}]") + dt = ( + pd.Timestamp(np.datetime64(t.isoformat())).as_unit(time_unit).to_numpy() + ) except ValueError as e: if raise_on_invalid: raise ValueError( From 470235e1d0cddbbc5a7d2110916b06c423ee8a56 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 11:46:03 +0100 Subject: [PATCH 119/134] fix tests for cftime_to_nptime --- xarray/tests/test_coding_times.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 09de0c14afd..9d819688e72 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -138,7 +138,7 @@ def test_cf_datetime( actual = decode_cf_datetime(num_dates, units, calendar, time_unit=time_unit) if actual.dtype.kind != "O": - expected = cftime_to_nptime(expected) + expected = cftime_to_nptime(expected, time_unit=time_unit) abs_diff = np.asarray(abs(actual - expected)).ravel() abs_diff = pd.to_timedelta(abs_diff.tolist()).to_numpy() @@ -281,7 +281,7 @@ def test_decode_dates_outside_timestamp_range( time, units, calendar=calendar, only_use_cftime_datetimes=True ) if calendar == "proleptic_gregorian" and time_unit != "ns": - expected = cftime_to_nptime(expected) + expected = cftime_to_nptime(expected, time_unit=time_unit) expected_date_type = type(expected[0]) with warnings.catch_warnings(): @@ -441,8 +441,8 @@ def test_decode_multidim_time_outside_timestamp_range( expected2 = cftime.num2date(time2, units, calendar, only_use_cftime_datetimes=True) if calendar == "proleptic_gregorian" and time_unit != "ns": - expected1 = cftime_to_nptime(expected1) - expected2 = 
cftime_to_nptime(expected2) + expected1 = cftime_to_nptime(expected1, time_unit=time_unit) + expected2 = cftime_to_nptime(expected2, time_unit=time_unit) with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unable to decode time axis") From e619a4c96446f5dad39702bec4702abdf93cd230 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 11:54:58 +0100 Subject: [PATCH 120/134] fix cftime_to_nptime in cftimeindex --- xarray/coding/cftimeindex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py index 5eb5f6dce12..bd5f51551c7 100644 --- a/xarray/coding/cftimeindex.py +++ b/xarray/coding/cftimeindex.py @@ -588,7 +588,7 @@ def to_datetimeindex(self, unsafe=False): return pd.DatetimeIndex([]) # transform to us-resolution is needed for DatetimeIndex - nptimes = cftime_to_nptime(self).astype("=M8[us]") + nptimes = cftime_to_nptime(self, time_unit="us") calendar = infer_calendar_name(self) if calendar not in _STANDARD_CALENDARS and not unsafe: warnings.warn( From 700e78d291e88ad74f87de8dac8ade6a3e4f2d19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 12:00:02 +0100 Subject: [PATCH 121/134] introduce pd.Timestamp instance check --- xarray/core/indexing.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 66c84f6b8db..51fc4a00421 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -1753,11 +1753,10 @@ def _convert_scalar(self, item): # pd.Timestamp rather np.than datetime64 but this is easier # (for now) item = np.datetime64("NaT", "ns") + elif isinstance(item, pd.Timedelta): + item = item.to_numpy() elif isinstance(item, timedelta): - # from xarray 2025.01.1 xarray allows non-nanosecond resolution - # so we just convert to_numpy if possible - if hasattr(item, "to_numpy"): - item = item.to_numpy() + item = np.timedelta64(item) 
elif isinstance(item, pd.Timestamp): # Work around for GH: pydata/xarray#1932 and numpy/numpy#10668 # numpy fails to convert pd.Timestamp to np.datetime64[ns] From 4525ea192058482a2943a711a2ff34d18f43bb8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 15:02:54 +0100 Subject: [PATCH 122/134] warn if out-of-bound datetimes are encoded with standard calendar, fall back to cftime encoding, add fix for cftime issue where python datetimes are not encoded correctly with date2num. --- xarray/coding/times.py | 33 +++++++++++++++++++++++++++++-- xarray/tests/test_coding_times.py | 26 +++++++++++++++++++++++- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 37c88089397..bcfd0f8cfbd 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -872,6 +872,22 @@ def _encode_datetime_with_cftime(dates, units: str, calendar: str) -> np.ndarray # numpy's broken datetime conversion only works for us precision dates = dates.astype("M8[us]").astype(datetime) + def wrap_dt(dt): + # convert to cftime proleptic gregorian in case of datetime.datetime + # needed because of https://github.com/Unidata/cftime/issues/354 + if isinstance(dt, datetime) and not isinstance(dt, cftime.datetime): + dt = cftime.datetime( + dt.year, + dt.month, + dt.day, + dt.hour, + dt.minute, + dt.second, + dt.microsecond, + calendar="proleptic_gregorian", + ) + return dt + def encode_datetime(d): # Since netCDF files do not support storing float128 values, we ensure # that float64 values are used by setting longdouble=False in num2date. 
@@ -881,10 +897,10 @@ def encode_datetime(d): return ( np.nan if d is None - else cftime.date2num(d, units, calendar, longdouble=False) + else cftime.date2num(wrap_dt(d), units, calendar, longdouble=False) ) except TypeError: - return np.nan if d is None else cftime.date2num(d, units, calendar) + return np.nan if d is None else cftime.date2num(wrap_dt(d), units, calendar) return reshape(np.array([encode_datetime(d) for d in ravel(dates)]), dates.shape) @@ -987,6 +1003,19 @@ def _eagerly_encode_cf_datetime( # parse with cftime instead raise OutOfBoundsDatetime assert np.issubdtype(dates.dtype, "datetime64") + if calendar in ["standard", "gregorian"] and np.nanmin(dates).astype( + "=M8[us]" + ).astype(datetime) < datetime(1582, 10, 15): + # if we use standard calendar and for dates before the reform + # we need to use cftime instead + emit_user_level_warning( + f"Unable to encode numpy.datetime64 objects with {calendar} calendar." + "Using cftime.datetime objects instead, reason: dates prior " + "reform date (1582-10-15). 
To silence this warning transform " + "numpy.datetime64 to corresponding cftime.datetime beforehand.", + SerializationWarning, + ) + raise OutOfBoundsDatetime time_unit, ref_date = _unpack_time_unit_and_ref_date(units) # calendar equivalence only for days after the reform diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 9d819688e72..6be9b1c475a 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -1,7 +1,7 @@ from __future__ import annotations import warnings -from datetime import timedelta +from datetime import datetime, timedelta from itertools import product from typing import Literal @@ -1156,6 +1156,30 @@ def test__encode_datetime_with_cftime() -> None: np.testing.assert_equal(result, expected) +@requires_cftime +def test_encode_decode_cf_datetime_outofbounds_warnings( + time_unit: PDDatetimeUnitOptions, +) -> None: + import cftime + + if time_unit == "ns": + pytest.skip("does not work work out of bounds datetimes") + dates = np.array(["0001-01-01", "2001-01-01"], dtype=f"datetime64[{time_unit}]") + cfdates = np.array( + [ + cftime.datetime(t0.year, t0.month, t0.day, calendar="gregorian") + for t0 in dates.astype(datetime) + ] + ) + with pytest.warns( + SerializationWarning, match="Unable to encode numpy.datetime64 objects" + ): + encoded = encode_cf_datetime(dates, "seconds since 2000-01-01", "standard") + with pytest.warns(SerializationWarning, match="Unable to decode time axis"): + decoded = decode_cf_datetime(*encoded) + np.testing.assert_equal(decoded, cfdates) + + @pytest.mark.parametrize("calendar", ["gregorian", "Gregorian", "GREGORIAN"]) def test_decode_encode_roundtrip_with_non_lowercase_letters( calendar, time_unit: PDDatetimeUnitOptions From 0b93dbd83b1d0cce401bf2bd61bd34c6b1103a84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 15:23:05 +0100 Subject: [PATCH 123/134] fix time-coding.rst, add reference to time-series.rst. 
--- doc/internals/time-coding.rst | 2 +- doc/user-guide/time-series.rst | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index f3cfe824200..da025dea1e2 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -326,7 +326,7 @@ CF time decoding Decoding of ``values`` with a time unit specification like ``"seconds since 1992-10-8 15:15:42.5 -6:00"`` into datetimes using the CF conventions is a multistage process. -1. If we have a non-standard calendar (e.g. ``"noleap"``) decoding is done with the ``cftime`` package, which is not covered in this section. For the``"standard"``/``"gregorian"`` calendar as well as the ``"proleptic_gregorian"`` calendar the above outlined pandas functionality is used. +1. If we have a non-standard calendar (e.g. ``"noleap"``) decoding is done with the ``cftime`` package, which is not covered in this section. For the ``"standard"``/``"gregorian"`` calendar as well as the ``"proleptic_gregorian"`` calendar the above outlined pandas functionality is used. 2. The ``"standard"``/``"gregorian"`` calendar and the ``"proleptic_gregorian"`` are equivalent for any dates and reference times >= ``"1582-10-15"``. First the reference time is checked and any timezone information stripped off. In a second step, the minimum and maximum ``values`` are checked if they can be represented in the current reference time resolution. At the same time integer overflow would be caught. For the ``"standard"``/``"gregorian"`` calendar the dates are checked to be >= ``"1582-10-15"``. If anything fails, the decoding is attempted with ``cftime``. 
diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 8c4d9aa5de7..8205f19bc93 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -42,7 +42,8 @@ using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`: For :py:func:`pandas.date_range` the ``unit``-kwarg has to be specified and for :py:func:`pandas.to_datetime` the selection of the resolution isn't possible at all. For that :py:class:`pd.DatetimeIndex` can be used - directly. + directly. There is more in-depth information in section + :ref:``. Alternatively, you can supply arrays of Python ``datetime`` objects. These get converted automatically when used as arguments in xarray objects (with us-resolution): From b38cd7e1197d82ef8da939ae22e227c305a3d144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 15:38:41 +0100 Subject: [PATCH 124/134] try to fix typing, ignore one --- xarray/coding/times.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index bcfd0f8cfbd..0200f8867ac 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -395,7 +395,7 @@ def _check_higher_resolution( ) -> tuple[np.ndarray, PDDatetimeUnitOptions]: """Iterate until fitting resolution found.""" res: list[PDDatetimeUnitOptions] = ["s", "ms", "us", "ns"] - new_units = res[res.index(cast(PDDatetimeUnitOptions, time_unit)) :] + new_units = res[res.index(time_unit) :] for new_time_unit in new_units: if not ((np.unique(flat_num_dates % 1) > 0).any() and new_time_unit != "ns"): break @@ -582,7 +582,7 @@ def _numbers_to_timedelta( # estimate fitting resolution for floating point values # this iterates until all floats are fractionless or time_unit == "ns" if flat_num.dtype.kind == "f" and time_unit != "ns": - flat_num_dates, new_time_unit = _check_higher_resolution(flat_num, time_unit) + flat_num_dates, new_time_unit = _check_higher_resolution(flat_num, time_unit) 
# type: ignore[arg-type] if time_unit != new_time_unit: msg = ( f"Can't decode floating point {datatype} to {time_unit!r} without " From a2d1c96adf03f5baf6c02f5a3f1ab6abc94299e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 15:40:30 +0100 Subject: [PATCH 125/134] try to fix docs --- doc/user-guide/time-series.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 8205f19bc93..711c01610aa 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -43,7 +43,7 @@ using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`: and for :py:func:`pandas.to_datetime` the selection of the resolution isn't possible at all. For that :py:class:`pd.DatetimeIndex` can be used directly. There is more in-depth information in section - :ref:``. + :ref:``. Alternatively, you can supply arrays of Python ``datetime`` objects. These get converted automatically when used as arguments in xarray objects (with us-resolution): From c4b2af31da19290067f49e9a47bb9a58de3de3b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Thu, 9 Jan 2025 15:54:14 +0100 Subject: [PATCH 126/134] revert doc-changes --- doc/user-guide/time-series.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 711c01610aa..8c4d9aa5de7 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -42,8 +42,7 @@ using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`: For :py:func:`pandas.date_range` the ``unit``-kwarg has to be specified and for :py:func:`pandas.to_datetime` the selection of the resolution isn't possible at all. For that :py:class:`pd.DatetimeIndex` can be used - directly. There is more in-depth information in section - :ref:``. + directly. Alternatively, you can supply arrays of Python ``datetime`` objects. 
These get converted automatically when used as arguments in xarray objects (with us-resolution): From 45a0d565e00f311952823f68483a2ae50ee4a52b Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Thu, 9 Jan 2025 09:49:40 -0700 Subject: [PATCH 127/134] Add a non-ns test for polyval, polyfit --- xarray/tests/test_computation.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/xarray/tests/test_computation.py b/xarray/tests/test_computation.py index fd9f6ef41ea..1d80d874df0 100644 --- a/xarray/tests/test_computation.py +++ b/xarray/tests/test_computation.py @@ -2343,6 +2343,20 @@ def test_where_attrs() -> None: ), id="datetime", ), + pytest.param( + # Force a non-ns unit for the coordinate, make sure we convert to `ns` + # for backwards compatibility at the moment. This can be relaxed in the future. + xr.DataArray( + pd.date_range("1970-01-01", freq="s", periods=3, unit="s"), dims="x" + ), + xr.DataArray([0, 1], dims="degree", coords={"degree": [0, 1]}), + xr.DataArray( + [0, 1e9, 2e9], + dims="x", + coords={"x": pd.date_range("1970-01-01", freq="s", periods=3)}, + ), + id="datetime-non-ns", + ), pytest.param( xr.DataArray( np.array([1000, 2000, 3000], dtype="timedelta64[ns]"), dims="x" @@ -2457,6 +2471,14 @@ def test_polyval_degree_dim_checks() -> None: xr.DataArray(pd.date_range("1970-01-01", freq="ns", periods=3), dims="x"), id="datetime", ), + # Force a non-ns unit for the coordinate, make sure we convert to `ns` in both polyfit & polval + # for backwards compatibility at the moment. This can be relaxed in the future. 
+ pytest.param( + xr.DataArray( + pd.date_range("1970-01-01", freq="s", unit="s", periods=3), dims="x" + ), + id="datetime-non-ns", + ), pytest.param( xr.DataArray(np.array([0, 1, 2], dtype="timedelta64[ns]"), dims="x"), id="timedelta", From ac719e8d1581823ea7bd9d60ea3fb0372c904519 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 10 Jan 2025 11:46:07 +0100 Subject: [PATCH 128/134] more doc cosmetics --- doc/internals/time-coding.rst | 89 +++++++++++++++++++++--------- doc/user-guide/io.rst | 4 +- doc/user-guide/time-series.rst | 10 ++-- doc/user-guide/weather-climate.rst | 18 +++--- 4 files changed, 80 insertions(+), 41 deletions(-) diff --git a/doc/internals/time-coding.rst b/doc/internals/time-coding.rst index da025dea1e2..a7e0d5de23d 100644 --- a/doc/internals/time-coding.rst +++ b/doc/internals/time-coding.rst @@ -11,7 +11,7 @@ int64_min = np.iinfo("int64").min + 1 uint64_max = np.iinfo("uint64").max -.. internals.timecoding: +.. _internals.timecoding: Time Coding =========== @@ -32,7 +32,8 @@ When the arguments are numeric (not strings or ``np.datetime64`` values) ``"unit .. ipython:: python - f"Maximum datetime range: ({pd.to_datetime(int64_min, unit="ns")}, {pd.to_datetime(int64_max, unit="ns")})" + f"Minimum datetime: {pd.to_datetime(int64_min, unit="ns")}" + f"Maximum datetime: {pd.to_datetime(int64_max, unit="ns")}" For input values which can't be represented in nanosecond resolution an :py:class:`pandas.OutOfBoundsDatetime` exception is raised: @@ -67,23 +68,16 @@ and :py:meth:`pandas.DatetimeIndex.as_unit` respectively. print("DatetimeIndex to_numpy():", time.as_unit("us").to_numpy()) .. warning:: - Input data with resolution higher than ``'ns'`` (eg. ``'ps'``, ``'fs'``, ``'as'``) is truncated (not rounded) at the ``'ns'``-level. This is currently broken for the ``'ps'`` input, where it is interpreted as ``'ns'``. + Input data with resolution higher than ``'ns'`` (eg. 
``'ps'``, ``'fs'``, ``'as'``) is truncated (not rounded) at the ``'ns'``-level. This is `currently broken `_ for the ``'ps'`` input, where it is interpreted as ``'ns'``. .. ipython:: python - try: - print("Good:", pd.to_datetime([np.datetime64(1901901901901, "as")])) - print("Good:", pd.to_datetime([np.datetime64(1901901901901, "fs")])) - print(" Bad:", pd.to_datetime([np.datetime64(1901901901901, "ps")])) - print("Good:", pd.to_datetime([np.datetime64(1901901901901, "ns")])) - print("Good:", pd.to_datetime([np.datetime64(1901901901901, "us")])) - print("Good:", pd.to_datetime([np.datetime64(1901901901901, "ms")])) - print( - "Good:", pd.to_datetime(np.array([np.datetime64(1901901901901, "s")])) - ) - print("Bad:", pd.to_datetime([np.datetime64(1901901901901, "s")])) - except Exception as err: - print("Raise:", err) + print("Good:", pd.to_datetime([np.datetime64(1901901901901, "as")])) + print("Good:", pd.to_datetime([np.datetime64(1901901901901, "fs")])) + print(" Bad:", pd.to_datetime([np.datetime64(1901901901901, "ps")])) + print("Good:", pd.to_datetime([np.datetime64(1901901901901, "ns")])) + print("Good:", pd.to_datetime([np.datetime64(1901901901901, "us")])) + print("Good:", pd.to_datetime([np.datetime64(1901901901901, "ms")])) .. warning:: Care has to be taken, as some configurations of input data will raise. The following shows, that we are safe to use :py:func:`pandas.to_datetime` when providing :py:class:`numpy.datetime64` as scalar or numpy array as input. @@ -160,9 +154,6 @@ and :py:meth:`pandas.TimedeltaIndex.as_unit` respectively. print("TimedeltaIndex as_unit('ms'):", delta.as_unit("ms")) print("TimedeltaIndex to_numpy():", delta.as_unit("ms").to_numpy()) -.. note:: - For the functionality in xarray the resolution is converted from ``'ns'`` to the lowest needed resolution when decoding. - .. warning:: Care has to be taken, as some configurations of input data will raise. 
The following shows, that we are safe to use :py:func:`pandas.to_timedelta` when providing :py:class:`numpy.timedelta64` as scalar or numpy array as input. @@ -204,7 +195,7 @@ When arguments are numeric (not strings) "unit" can be anything from ``'Y'``, `` In normal operation :py:class:`pandas.Timestamp` holds the timestamp in the provided resolution, but only one of ``'s'``, ``'ms'``, ``'us'``, ``'ns'``. Lower resolution input is automatically converted to ``'s'``, higher resolution input is cutted to ``'ns'``. -The same conversion rules apply here as for :py:func:`pandas.to_timedelta` (see above). +The same conversion rules apply here as for :py:func:`pandas.to_timedelta` (see `to_timedelta`_). Depending on the internal resolution Timestamps can be represented in the range: .. ipython:: python @@ -229,7 +220,7 @@ Since relaxing the resolution, this enhances the range to several hundreds of th print("Errors:", err) .. note:: - :py:class:`pandas.Timestamp` is the only current possibility to correctly import time reference strings. It handles non-ISO formatted strings, keeps the resolution of the strings (``'s'``, ``''ms''`` etc.) and imports time zones. When initialized with :py:class:`numpy.datetime64` instead of a string it even overcomes the above limitation of the possible time range. + :py:class:`pandas.Timestamp` is the only current possibility to correctly import time reference strings. It handles non-ISO formatted strings, keeps the resolution of the strings (``'s'``, ``'ms'`` etc.) and imports time zones. When initialized with :py:class:`numpy.datetime64` instead of a string it even overcomes the above limitation of the possible time range. .. 
ipython:: python @@ -342,37 +333,47 @@ Decoding of ``values`` with a time unit specification like ``"seconds since 1992 values = np.array([-1000 * 365, 0, 1000 * 365], dtype="int64") units = "days since 2000-01-01 00:00:00.000001" dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") - print(dt) assert dt.dtype == "datetime64[us]" + dt + +.. ipython:: python units = "microseconds since 2000-01-01 00:00:00" dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") - print(dt) assert dt.dtype == "datetime64[us]" + dt + +.. ipython:: python values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") units = "days since 2000-01-01 00:00:00.001" dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") - print(dt) assert dt.dtype == "datetime64[ms]" + dt + +.. ipython:: python values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") units = "hours since 2000-01-01" dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") - print(dt) assert dt.dtype == "datetime64[s]" + dt + +.. ipython:: python values = np.array([0, 0.25, 0.5, 0.75, 1.0], dtype="float64") units = "hours since 2000-01-01 00:00:00 03:30" dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") - print(dt) assert dt.dtype == "datetime64[s]" + dt + +.. ipython:: python values = np.array([-2002 * 365 - 121, -366, 365, 2000 * 365 + 119], dtype="int64") units = "days since 0001-01-01 00:00:00" dt = xr.coding.times.decode_cf_datetime(values, units, calendar, time_unit="s") - print(dt) assert dt.dtype == "datetime64[s]" + dt CF time encoding ~~~~~~~~~~~~~~~~ @@ -433,8 +434,42 @@ For encoding the process is more or less a reversal of the above, but we have to ) print(values, units) +.. _internals.default_timeunit: Default Time Unit ~~~~~~~~~~~~~~~~~ The current default time unit of xarray is ``'ns'``. 
When setting keyword argument ``time_unit`` unit to ``'s'`` (the lowest resolution pandas allows) datetimes will be converted to at least ``'s'``-resolution, if possible. The same holds true for ``'ms'`` and ``'us'``. + +.. ipython:: python + + attrs = {"units": "hours since 2000-01-01"} + ds = xr.Dataset({"time": ("time", [0, 1, 2, 3], attrs)}) + ds.to_netcdf("test-datetimes1.nc") + +.. ipython:: python + + xr.open_dataset("test-datetimes1.nc") + +.. ipython:: python + + coder = xr.coders.CFDatetimeCoder(time_unit="s") + xr.open_dataset("test-datetimes1.nc", decode_times=coder) + +If a coarser unit is requested the datetimes are decoded into their native +on-disk resolution, if possible. + +.. ipython:: python + + attrs = {"units": "milliseconds since 2000-01-01"} + ds = xr.Dataset({"time": ("time", [0, 1, 2, 3], attrs)}) + ds.to_netcdf("test-datetimes2.nc") + +.. ipython:: python + + xr.open_dataset("test-datetimes2.nc") + +.. ipython:: python + + coder = xr.coders.CFDatetimeCoder(time_unit="s") + xr.open_dataset("test-datetimes2.nc", decode_times=coder) diff --git a/doc/user-guide/io.rst b/doc/user-guide/io.rst index 60ab1720ecf..986d43ce4b7 100644 --- a/doc/user-guide/io.rst +++ b/doc/user-guide/io.rst @@ -540,8 +540,8 @@ The ``units`` and ``calendar`` attributes control how xarray serializes ``dateti ``timedelta64`` arrays to datasets on disk as numeric values. The ``units`` encoding should be a string like ``'days since 1900-01-01'`` for ``datetime64`` data or a string like ``'days'`` for ``timedelta64`` data. ``calendar`` should be one of the calendar types -supported by netCDF4-python: 'standard', 'gregorian', 'proleptic_gregorian' 'noleap', -'365_day', '360_day', 'julian', 'all_leap', '366_day'. +supported by netCDF4-python: ``'standard'``, ``'gregorian'``, ``'proleptic_gregorian'``, ``'noleap'``, +``'365_day'``, ``'360_day'``, ``'julian'``, ``'all_leap'``, ``'366_day'``. 
By default, xarray uses the ``'proleptic_gregorian'`` calendar and units of the smallest time difference between values, with a reference time of the first time value. diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 8c4d9aa5de7..6686951b15e 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -22,7 +22,7 @@ Creating datetime64 data ------------------------ Xarray uses the numpy dtypes ``datetime64[unit]`` and ``timedelta64[unit]`` -(where unit is one of "s", "ms", "us" and "ns") to represent datetime +(where unit is one of ``"s"``, ``"ms"``, ``"us"`` and ``"ns"``) to represent datetime data, which offer vectorized operations with numpy and smooth integration with pandas. To convert to or create regular arrays of ``datetime64`` data, we recommend @@ -42,7 +42,8 @@ using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`: For :py:func:`pandas.date_range` the ``unit``-kwarg has to be specified and for :py:func:`pandas.to_datetime` the selection of the resolution isn't possible at all. For that :py:class:`pd.DatetimeIndex` can be used - directly. + directly. There is more in-depth information in section + :ref:`internals.timecoding`. Alternatively, you can supply arrays of Python ``datetime`` objects. These get converted automatically when used as arguments in xarray objects (with us-resolution): @@ -62,12 +63,13 @@ attribute like ``'days since 2000-01-01'``). .. note:: When decoding/encoding datetimes for non-standard calendars or for dates - before [1582-10-15](https://en.wikipedia.org/wiki/Gregorian_calendar), xarray uses the `cftime`_ library by default. + before `1582-10-15`_, xarray uses the `cftime`_ library by default. It was previously packaged with the ``netcdf4-python`` package under the name ``netcdftime`` but is now distributed separately. ``cftime`` is an :ref:`optional dependency` of xarray. .. _cftime: https://unidata.github.io/cftime +.. 
_1582-10-15: https://en.wikipedia.org/wiki/Gregorian_calendar You can manual decode arrays in this form by passing a dataset to @@ -83,7 +85,7 @@ You can manual decode arrays in this form by passing a dataset to coder = xr.coders.CFDatetimeCoder(time_unit="s") xr.decode_cf(ds, decode_times=coder) -From xarray 2025.01.1 the resolution of the dates can be one of "s", "ms", "us" or "ns". One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a store contains dates outside of these bounds (or dates < 1582-10-15 with a Gregorian, also known as standard, calendar), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. +From xarray 2025.01.2 the resolution of the dates can be one of ``"s"``, ``"ms"``, ``"us"`` or ``"ns"``. One limitation of using ``datetime64[ns]`` is that it limits the native representation of dates to those that fall between the years 1678 and 2262, which gets increased significantly with lower resolutions. When a store contains dates outside of these bounds (or dates < `1582-10-15`_ with a Gregorian, also known as standard, calendar), dates will be returned as arrays of :py:class:`cftime.datetime` objects and a :py:class:`~xarray.CFTimeIndex` will be used for indexing. :py:class:`~xarray.CFTimeIndex` enables most of the indexing functionality of a :py:class:`pandas.DatetimeIndex`. See :ref:`CFTimeIndex` for more information. diff --git a/doc/user-guide/weather-climate.rst b/doc/user-guide/weather-climate.rst index 9f36eab7b81..ac50c27d233 100644 --- a/doc/user-guide/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -57,14 +57,14 @@ CF-compliant coordinate variables .. 
_CFTimeIndex: -Non-standard calendars and dates outside the nanosecond-precision range ------------------------------------------------------------------------ +Non-standard calendars and dates outside the precision range +------------------------------------------------------------ Through the standalone ``cftime`` library and a custom subclass of :py:class:`pandas.Index`, xarray supports a subset of the indexing functionality enabled through the standard :py:class:`pandas.DatetimeIndex` for dates from non-standard calendars commonly used in climate science or dates -using a standard calendar, but outside the `precision range`_ and dates [prior to 1582-10-15](https://en.wikipedia.org/wiki/Gregorian_calendar). +using a standard calendar, but outside the `precision range`_ and dates prior to `1582-10-15`_. .. note:: @@ -74,14 +74,14 @@ using a standard calendar, but outside the `precision range`_ and dates [prior t any of the following are true: - The dates are from a non-standard calendar - - Any dates are outside the nanosecond-precision range (prior xarray version 2025.01.1) - - Any dates are outside the time span limited by the resolution (from xarray version 2025.01.1) + - Any dates are outside the nanosecond-precision range (prior xarray version 2025.01.2) + - Any dates are outside the time span limited by the resolution (from xarray version 2025.01.2) Otherwise pandas-compatible dates from a standard calendar will be - represented with the ``np.datetime64[unit]`` data type (where unit can be one of ["s", "ms", "us", "ns"]), enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features. + represented with the ``np.datetime64[unit]`` data type (where unit can be one of ``"s"``, ``"ms"``, ``"us"``, ``"ns"``), enabling the use of a :py:class:`pandas.DatetimeIndex` or arrays with dtype ``np.datetime64[unit]`` and their full set of associated features. 
As of pandas version 2.0.0, pandas supports non-nanosecond precision datetime - values. From xarray version 2025.01.1 on, non-nanosecond precision datetime values are also supported in xarray (this can be parameterized via :py:class:`~xarray.coders.CFDatetimeCoder` and ``decode_times`` kwarg). + values. From xarray version 2025.01.2 on, non-nanosecond precision datetime values are also supported in xarray (this can be parameterized via :py:class:`~xarray.coders.CFDatetimeCoder` and ``decode_times`` kwarg). See also :ref:`internals.timecoding`. For example, you can create a DataArray indexed by a time coordinate with dates from a no-leap calendar and a @@ -132,7 +132,9 @@ Conversion between non-standard calendar and to/from pandas DatetimeIndexes is facilitated with the :py:meth:`xarray.Dataset.convert_calendar` method (also available as :py:meth:`xarray.DataArray.convert_calendar`). Here, like elsewhere in xarray, the ``use_cftime`` argument controls which datetime backend is used in the output. The default (``None``) is to -use ``pandas`` when possible, i.e. when the calendar is ``standard``/``gregorian`` and dates [starting with 1582-10-15]((https://en.wikipedia.org/wiki/Gregorian_calendar)). There is no such restriction when converting to a ``proleptic_gregorian`` calendar. +use ``pandas`` when possible, i.e. when the calendar is ``standard``/``gregorian`` and dates starting with `1582-10-15`_. There is no such restriction when converting to a ``proleptic_gregorian`` calendar. + +.. _1582-10-15: https://en.wikipedia.org/wiki/Gregorian_calendar .. 
ipython:: python From 5292569937f4c9f89e3900cfbf5a8d7451e930f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 10 Jan 2025 12:25:37 +0100 Subject: [PATCH 129/134] add whats-new.rst entry --- doc/whats-new.rst | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index c639ebed209..f54d61feff3 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -19,6 +19,35 @@ What's New v2025.01.2 (unreleased) ----------------------- +This release brings non-nanosecond datetime resolution to xarray. In the +last couple of releases xarray has been prepared for that change. The code had +to be changed and adapted in numerous places, affecting especially the test suite. +The documentation has been updated accordingly and a new internal chapter +on :ref:`internals.timecoding` has been added. + +To make the transition as smooth as possible this is designed to be fully backwards +compatible, keeping the current default of ``'ns'`` resolution on decoding. +To opt-in decoding into other resolutions (``'us'``, ``'ms'`` or ``'s'``) the +new :py:class:`coders.CFDatetimeCoder` is used as parameter to ``decode_times`` +kwarg (see also :ref:`internals.default_timeunit`): + +.. code-block:: python + + coder = xr.coders.CFDatetimeCoder(time_unit="s") + ds = xr.open_dataset(filename, decode_times=coder) + +There might be slight changes when encoding/decoding times as some warning and +error messages have been removed or rewritten. Xarray will now also allow +non-nanosecond datetimes (with ``'us'``, ``'ms'`` or ``'s'`` resolution) when +creating DataArrays from scratch, picking the lowest possible resolution: + +.. ipython:: python + + xr.DataArray(data=[np.datetime64("2000-01-01", "D")], dims=("time",)) + +In a future release the current default of ``'ns'`` resolution on decoding will +eventually be deprecated.
+ New Features ~~~~~~~~~~~~ - Relax nanosecond datetime restriction in CF time decoding (:issue:`7493`, :pull:`9618`). @@ -38,7 +67,8 @@ Bug fixes Documentation ~~~~~~~~~~~~~ - +- A chapter on :ref:`internals.timecoding` is added to the internal section (:pull:`9618`). + By `Kai Mühlbauer `_. Internal Changes ~~~~~~~~~~~~~~~~ From ecd603b4504516451456b2a5258277f2fbe73f49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 10 Jan 2025 14:45:17 +0100 Subject: [PATCH 130/134] add/fix coder docstring --- xarray/coding/times.py | 19 +++++++++++++++++++ xarray/coding/variables.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 0200f8867ac..e23aa793ebd 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -1241,6 +1241,25 @@ def _lazily_encode_cf_timedelta( class CFDatetimeCoder(VariableCoder): + """Coder for CF Datetime coding. + + Parameters + ---------- + use_cftime : bool, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64`` objects. If False, always + decode times to ``np.datetime64`` objects; if this is not possible + raise an error. + May not be supported by all the backends. + time_unit : PDDatetimeUnitOptions + Target resolution when decoding dates. Defaults to "ns". 
+ """ + def __init__( self, use_cftime: bool | None = None, diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 8154f044332..83112628dbb 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -46,7 +46,7 @@ def encode(self, variable: Variable, name: T_Name = None) -> Variable: raise NotImplementedError() def decode(self, variable: Variable, name: T_Name = None) -> Variable: - """Convert an decoded variable to a encoded variable""" + """Convert a decoded variable to an encoded variable""" raise NotImplementedError() From f6716dc24c6f0305868befef70b7bcd8db3de486 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Fri, 10 Jan 2025 14:51:22 +0100 Subject: [PATCH 131/134] add xr.date_range example as suggested per review --- doc/user-guide/time-series.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/doc/user-guide/time-series.rst b/doc/user-guide/time-series.rst index 6686951b15e..d131ae74b9f 100644 --- a/doc/user-guide/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -37,6 +37,14 @@ using :py:func:`pandas.to_datetime` and :py:func:`pandas.date_range`: pd.date_range("2000-01-01", periods=365) pd.date_range("2000-01-01", periods=365, unit="s") +It is also possible to use corresponding :py:func:`xarray.date_range`: + +.. ipython:: python + + xr.date_range("2000-01-01", periods=365) + xr.date_range("2000-01-01", periods=365, unit="s") + + .. note:: Care has to be taken to create the output with the wanted resolution. 
For :py:func:`pandas.date_range` the ``unit``-kwarg has to be specified From 055637611d0669fbc533345e3c61f4cc0da8bf34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 13 Jan 2025 07:39:55 +0100 Subject: [PATCH 132/134] Apply suggestions from code review Co-authored-by: Spencer Clark --- xarray/tests/test_dataset.py | 2 -- xarray/tests/test_groupby.py | 1 - xarray/tests/test_interp.py | 1 - 3 files changed, 4 deletions(-) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 7fc487e03bb..8a90a05a4e3 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -7208,7 +7208,6 @@ def test_differentiate(dask, edge_order) -> None: da.differentiate("x2d") -@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize("dask", [True, False]) def test_differentiate_datetime(dask) -> None: rs = np.random.default_rng(42) @@ -7403,7 +7402,6 @@ def test_cumulative_integrate(dask) -> None: da.cumulative_integrate("x2d") -@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.parametrize("dask", [True, False]) @pytest.mark.parametrize("which_datetime", ["np", "cftime"]) def test_trapezoid_datetime(dask, which_datetime) -> None: diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 8ea3b618c7e..7a86ed2f512 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -636,7 +636,6 @@ def test_groupby_repr_datetime(obj) -> None: @pytest.mark.filterwarnings("ignore:No index created for dimension id:UserWarning") -@pytest.mark.filterwarnings("ignore:Converting non-default") @pytest.mark.filterwarnings("ignore:invalid value encountered in divide:RuntimeWarning") @pytest.mark.parametrize("shuffle", [True, False]) @pytest.mark.parametrize( diff --git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index 15bb50cb143..b2171f31c33 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -718,7 +718,6 @@ 
def test_interp_like() -> None: pytest.param("2000-01-01T12:00", 0.5, marks=pytest.mark.xfail), ], ) -@pytest.mark.filterwarnings("ignore:Converting non-default") def test_datetime(x_new, expected) -> None: da = xr.DataArray( np.arange(24), From ffc1828d9b352a06b2b598e230d7af3b30541f10 Mon Sep 17 00:00:00 2001 From: Spencer Clark Date: Mon, 13 Jan 2025 02:20:41 -0500 Subject: [PATCH 133/134] Implement `time_unit` option for `decode_cf_timedelta` (#3) * Fix timedelta encoding overflow issue; always decode to ns resolution * Implement time_unit for decode_cf_timedelta * Reduce diff --- xarray/coding/times.py | 60 +++++++++++++++++++------------ xarray/tests/test_coding_times.py | 37 +++++++++++++++++-- 2 files changed, 72 insertions(+), 25 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index e23aa793ebd..39e7c94c366 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -365,6 +365,21 @@ def _check_date_for_units_since_refdate( return pd.Timestamp("NaT") +def _check_timedelta_range(value, data_unit, time_unit): + if value > np.iinfo("int64").max or value < np.iinfo("int64").min: + OutOfBoundsTimedelta(f"Value {value} can't be represented as Timedelta.") + delta = value * np.timedelta64(1, data_unit) + if not np.isnan(delta): + # this will raise on dtype overflow for integer dtypes + if value.dtype.kind in "u" and not np.int64(delta) == value: + raise OutOfBoundsTimedelta( + "DType overflow in Datetime/Timedelta calculation." + ) + # this will raise on overflow if delta cannot be represented with the + # resolutions supported by pandas. 
+ pd.to_timedelta(delta) + + def _align_reference_date_and_unit( ref_date: pd.Timestamp, unit: NPDatetimeUnitOptions ) -> pd.Timestamp: @@ -542,19 +557,6 @@ def decode_cf_datetime( return reshape(dates, num_dates.shape) -def to_timedelta_unboxed(value, **kwargs): - # todo: check, if the procedure here is correct - result = pd.to_timedelta(value, **kwargs).to_numpy() - unique_timedeltas = np.unique(result[pd.notnull(result)]) - unit = _netcdf_to_numpy_timeunit(_infer_time_units_from_diff(unique_timedeltas)) - if unit not in {"s", "ms", "us", "ns"}: - # default to "ns", when not specified - unit = "ns" - result = result.astype(f"timedelta64[{unit}]") - assert np.issubdtype(result.dtype, "timedelta64") - return result - - def to_datetime_unboxed(value, **kwargs): result = pd.to_datetime(value, **kwargs).to_numpy() assert np.issubdtype(result.dtype, "datetime64") @@ -604,22 +606,36 @@ def _numbers_to_timedelta( return flat_num.astype(f"timedelta64[{time_unit}]") -def decode_cf_timedelta(num_timedeltas, units: str) -> np.ndarray: - # todo: check, if this works as intended +def decode_cf_timedelta( + num_timedeltas, units: str, time_unit: str = "ns" +) -> np.ndarray: """Given an array of numeric timedeltas in netCDF format, convert it into a numpy timedelta64 ["s", "ms", "us", "ns"] array. """ num_timedeltas = np.asarray(num_timedeltas) unit = _netcdf_to_numpy_timeunit(units) + _check_timedelta_range(num_timedeltas.min(), unit, time_unit) + _check_timedelta_range(num_timedeltas.max(), unit, time_unit) + timedeltas = _numbers_to_timedelta(num_timedeltas, unit, "s", "timedelta") + timedeltas = pd.to_timedelta(ravel(timedeltas)) + + if np.isnat(timedeltas).all(): + empirical_unit = time_unit + else: + empirical_unit = timedeltas.unit + + if np.timedelta64(1, time_unit) > np.timedelta64(1, empirical_unit): + time_unit = empirical_unit + + if time_unit not in {"s", "ms", "us", "ns"}: + raise ValueError( + f"time_unit must be one of 's', 'ms', 'us', or 'ns'. 
Got: {time_unit}" + ) - as_unit = unit - if unit not in {"s", "ms", "us", "ns"}: - # default to "ns", when not specified - as_unit = "ns" - result = pd.to_timedelta(ravel(timedeltas)).as_unit(as_unit).to_numpy() - return reshape(result, timedeltas.shape) + result = timedeltas.as_unit(time_unit).to_numpy() + return reshape(result, num_timedeltas.shape) def _unit_timedelta_cftime(units: str) -> timedelta: @@ -700,7 +716,7 @@ def infer_timedelta_units(deltas) -> str: {'days', 'hours', 'minutes' 'seconds'} (the first one that can evenly divide all unique time deltas in `deltas`) """ - deltas = to_timedelta_unboxed(ravel(np.asarray(deltas))) + deltas = ravel(deltas) unique_timedeltas = np.unique(deltas[pd.notnull(deltas)]) return _infer_time_units_from_diff(unique_timedeltas) diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index 6be9b1c475a..ed387e22d60 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -34,7 +34,6 @@ format_cftime_datetime, infer_datetime_units, infer_timedelta_units, - to_timedelta_unboxed, ) from xarray.coding.variables import SerializationWarning from xarray.conventions import _update_bounds_attributes, cf_encoder @@ -635,7 +634,7 @@ def test_cf_timedelta(timedeltas, units, numbers) -> None: if timedeltas == "NaT": timedeltas = np.timedelta64("NaT", "ns") else: - timedeltas = to_timedelta_unboxed(timedeltas) + timedeltas = pd.to_timedelta(timedeltas).to_numpy() numbers = np.array(numbers) expected = numbers @@ -659,7 +658,7 @@ def test_cf_timedelta_2d() -> None: units = "days" numbers = np.atleast_2d([1, 2, 3]) - timedeltas = np.atleast_2d(to_timedelta_unboxed(["1D", "2D", "3D"])) + timedeltas = np.atleast_2d(pd.to_timedelta(["1D", "2D", "3D"]).to_numpy()) expected = timedeltas actual = decode_cf_timedelta(numbers, units) @@ -667,6 +666,38 @@ def test_cf_timedelta_2d() -> None: assert expected.dtype == actual.dtype +@pytest.mark.parametrize("encoding_unit", 
FREQUENCIES_TO_ENCODING_UNITS.values()) +def test_decode_cf_timedelta_time_unit(time_unit, encoding_unit) -> None: + encoded = 1 + encoding_unit_as_numpy = _netcdf_to_numpy_timeunit(encoding_unit) + if np.timedelta64(1, time_unit) > np.timedelta64(1, encoding_unit_as_numpy): + expected = np.timedelta64(encoded, encoding_unit_as_numpy) + else: + expected = np.timedelta64(encoded, encoding_unit_as_numpy).astype( + f"timedelta64[{time_unit}]" + ) + result = decode_cf_timedelta(encoded, encoding_unit, time_unit) + assert result == expected + assert result.dtype == expected.dtype + + +def test_decode_cf_timedelta_time_unit_out_of_bounds(time_unit): + # Define a scale factor that will guarantee overflow with the given + # time_unit. + scale_factor = np.timedelta64(1, time_unit) // np.timedelta64(1, "ns") + encoded = scale_factor * 300 * 365 + with pytest.raises(OutOfBoundsTimedelta): + decode_cf_timedelta(encoded, "days", time_unit) + + +def test_cf_timedelta_roundtrip_large_value(time_unit): + value = np.timedelta64(np.iinfo(np.int64).max, time_unit) + encoded, units = encode_cf_timedelta(value) + decoded = decode_cf_timedelta(encoded, units, time_unit=time_unit) + assert value == decoded + assert value.dtype == decoded.dtype + + @pytest.mark.parametrize( ["deltas", "expected"], [ From eaf3c731cb8b69b329fe677241544a1337290014 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=BChlbauer?= Date: Mon, 13 Jan 2025 09:21:07 +0100 Subject: [PATCH 134/134] fix typing --- xarray/coding/times.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/xarray/coding/times.py b/xarray/coding/times.py index 39e7c94c366..d000e1703f3 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -38,6 +38,7 @@ from xarray.core.types import ( CFCalendar, + CFTimeDatetime, NPDatetimeUnitOptions, PDDatetimeUnitOptions, T_DuckArray, @@ -342,7 +343,7 @@ def _decode_datetime_with_cftime( def _check_date_for_units_since_refdate( - date, unit: str, ref_date: 
pd.Timestamp + date, unit: NPDatetimeUnitOptions, ref_date: pd.Timestamp ) -> pd.Timestamp: # check for out-of-bounds floats and raise if date > np.iinfo("int64").max or date < np.iinfo("int64").min: @@ -391,7 +392,9 @@ def _align_reference_date_and_unit( return ref_date -def _check_date_is_after_shift(date: pd.Timestamp, calendar: str) -> None: +def _check_date_is_after_shift( + date: pd.Timestamp | datetime | CFTimeDatetime, calendar: str +) -> None: # if we have gregorian/standard we need to raise # if we are outside the well-defined date range # proleptic_gregorian and standard/gregorian are only equivalent @@ -607,7 +610,7 @@ def _numbers_to_timedelta( def decode_cf_timedelta( - num_timedeltas, units: str, time_unit: str = "ns" + num_timedeltas, units: str, time_unit: PDDatetimeUnitOptions = "ns" ) -> np.ndarray: """Given an array of numeric timedeltas in netCDF format, convert it into a numpy timedelta64 ["s", "ms", "us", "ns"] array. @@ -619,12 +622,12 @@ def decode_cf_timedelta( _check_timedelta_range(num_timedeltas.max(), unit, time_unit) timedeltas = _numbers_to_timedelta(num_timedeltas, unit, "s", "timedelta") - timedeltas = pd.to_timedelta(ravel(timedeltas)) + pd_timedeltas = pd.to_timedelta(ravel(timedeltas)) if np.isnat(timedeltas).all(): empirical_unit = time_unit else: - empirical_unit = timedeltas.unit + empirical_unit = pd_timedeltas.unit if np.timedelta64(1, time_unit) > np.timedelta64(1, empirical_unit): time_unit = empirical_unit @@ -634,7 +637,7 @@ def decode_cf_timedelta( f"time_unit must be one of 's', 'ms', 'us', or 'ns'. Got: {time_unit}" ) - result = timedeltas.as_unit(time_unit).to_numpy() + result = pd_timedeltas.as_unit(time_unit).to_numpy() return reshape(result, num_timedeltas.shape)