"""Average module."""
import warnings
from pathlib import Path
from typing import Sequence, Tuple, Union
import cf_xarray # noqa
import geopandas as gpd
import numpy as np
import xarray as xr
from roocs_utils.exceptions import InvalidParameterValue
from roocs_utils.xarray_utils.xarray_utils import (
get_coord_by_type,
get_coord_type,
known_coord_types,
)
from clisops.utils.time_utils import create_time_bounds
from .regrid import XESMF_MINIMUM_VERSION
from .subset import shape_bbox_indexer
# Public API of this module.
__all__ = ["average_over_dims", "average_shape", "average_time"]
# Maps user-facing frequency names to pandas resampling offset aliases
# ("1D" = daily, "1MS" = month start, "1AS" = year start).
# NOTE(review): "AS" is deprecated in newer pandas in favour of "YS" —
# confirm the supported pandas version before changing.
# see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects
freqs = {"day": "1D", "month": "1MS", "year": "1AS"}
# [docs]
def average_shape(
    ds: xr.Dataset,
    shape: Union[str, Path, gpd.GeoDataFrame],
    variable: Union[str, Sequence[str], None] = None,
) -> Union[xr.DataArray, xr.Dataset]:
    """Average a DataArray or Dataset spatially using vector shapes.

    Return a DataArray or Dataset averaged over each Polygon given. Requires xESMF.

    Parameters
    ----------
    ds : xarray.Dataset
        Input values, coordinate attributes must be CF-compliant.
    shape : Union[str, Path, gpd.GeoDataFrame]
        Path to shape file, or directly a GeoDataFrame. Supports formats compatible with geopandas.
        Will be converted to EPSG:4326 if needed.
    variable : Union[str, Sequence[str], None]
        Variables to average. If None, average over all data variables.

    Returns
    -------
    Union[xarray.DataArray, xarray.Dataset]
        `ds` spatially-averaged over the polygon(s) in `shape`.
        Has a new `geom` dimension corresponding to the index of the input GeoDataFrame.
        Non-geometry columns of the GeoDataFrame are copied as auxiliary coordinates.

    Notes
    -----
    The spatial weights are computed with ESMF, which uses corners given in lat/lon format (EPSG:4326),
    the input dataset `ds` must provide those. In opposition to `subset.subset_shape`, the
    weights computed here take partial overlaps and holes into account.

    As xESMF computes the weight masks only once, skipping missing values is not really feasible. Thus,
    all NaNs propagate when performing the average.

    Examples
    --------
    .. code-block:: python

        import xarray as xr  # doctest: +SKIP
        from clisops.core.average import average_shape

        pr = xr.open_dataset(path_to_pr_file).pr

        # Average data array over shape
        prAvg = average_shape(pr, shape=path_to_shape_file)

        # Average multiple variables in a single dataset
        ds = xr.open_mfdataset([path_to_tasmin_file, path_to_tasmax_file])
        dsAvg = average_shape(ds, shape=path_to_shape_file)
    """
    try:
        from xesmf import SpatialAverager
    except ImportError as err:
        # Chain the original ImportError so the root cause remains visible.
        raise ValueError(
            f"Package xesmf {XESMF_MINIMUM_VERSION} is required to use `average_shape`."
        ) from err

    if isinstance(ds, xr.DataArray):
        warnings.warn(
            "Pass a Dataset object instead of a DataArray.", DeprecationWarning
        )
        ds_copy = ds.to_dataset(name=ds.name)
    else:
        ds_copy = ds.copy()

    if isinstance(shape, gpd.GeoDataFrame):
        poly = shape.copy()
    else:
        poly = gpd.GeoDataFrame.from_file(shape)

    # Reproject to lat/lon; if no CRS is set we assume the data is already EPSG:4326.
    if poly.crs is not None:
        poly = poly.to_crs(4326)

    # First subset to bounding box to reduce memory usage.
    indexer = shape_bbox_indexer(ds_copy, poly)
    ds_sub = ds_copy.isel(indexer)

    # Compute the weights
    savger = SpatialAverager(ds_sub, poly.geometry)

    # Check that some weights are not null. Handle both sparse and scipy weights.
    nonnull = (
        savger.weights.data.nnz
        if isinstance(savger.weights, xr.DataArray)
        else savger.weights.nnz
    )
    if nonnull == 0:
        raise ValueError(
            "There were no valid data points found in the requested averaging region. Verify objects overlap."
        )

    # Select variables to average
    if variable is not None:
        ds_sub = ds_sub[variable]

    # Apply the weights to the actual data -> spatial average.
    # We transfer the global and variable attributes of the input to the output.
    ds_out = savger(ds_sub, keep_attrs=True)

    # Set geom coords to poly's index
    ds_out["geom"] = poly.index

    # Add polygon attributes to Dataset output as auxiliary coordinates,
    # indexed along the new "geom" dimension.
    ds_meta = (
        poly.drop("geometry", axis=1)
        .to_xarray()
        .rename(**{poly.index.name or "index": "geom"})
    )
    ds_meta = ds_meta.set_coords(ds_meta.data_vars)
    ds_out = xr.merge([ds_out, ds_meta])

    # Maybe returning a DataArray should be deprecated.
    if isinstance(ds, xr.DataArray):
        return ds_out[ds.name]
    return ds_out
# [docs]
def average_over_dims(
    ds: Union[xr.DataArray, xr.Dataset],
    dims: Union[Sequence[str], None] = None,
    ignore_undetected_dims: bool = False,
) -> Union[xr.DataArray, xr.Dataset]:
    """Average a DataArray or Dataset over the dimensions specified.

    Parameters
    ----------
    ds : Union[xr.DataArray, xr.Dataset]
        Input values.
    dims : Sequence[{"time", "level", "latitude", "longitude"}], optional
        The dimensions over which to apply the average. Must be a non-empty subset
        of ["time", "level", "latitude", "longitude"]; otherwise an
        InvalidParameterValue is raised.
    ignore_undetected_dims : bool
        If False (the default), an InvalidParameterValue is raised when any of the
        requested dimensions cannot be detected in the dataset. If True, undetected
        dimensions are silently dropped and the remaining ones are averaged over.

    Returns
    -------
    Union[xr.DataArray, xr.Dataset]
        New Dataset or DataArray object averaged over the indicated dimensions.
        The indicated dimensions will have been removed.

    Examples
    --------
    .. code-block:: python

        from clisops.core.average import average_over_dims

        pr = xr.open_dataset(path_to_pr_file).pr

        # Average data array over latitude and longitude
        prAvg = average_over_dims(
            pr, dims=["latitude", "longitude"], ignore_undetected_dims=True
        )
    """
    if not dims:
        raise InvalidParameterValue(
            "At least one dimension for averaging must be provided"
        )
    if not set(dims).issubset(set(known_coord_types)):
        raise InvalidParameterValue(
            f"Dimensions for averaging must be one of {known_coord_types}"
        )

    # Map each recognised coordinate *type* (e.g. "time") to the dimension name
    # the dataset actually uses for it.
    found_dims = {}
    for name in ds.coords:
        coord = ds.coords[name]
        coord_type = get_coord_type(coord)
        # Only keep coordinates that are also dimensions (not auxiliary coords).
        if coord_type and coord.name in ds.dims:
            found_dims[coord_type] = coord.name

    # Requested dimension types that could not be detected in the dataset.
    undetected_dims = set(dims) - set(found_dims)
    if undetected_dims:
        if not ignore_undetected_dims:
            raise InvalidParameterValue(
                f"Requested dimensions were not found in input dataset: {undetected_dims}."
            )
        # Drop undetected dims so the remaining ones can still be averaged.
        dims = [dim for dim in dims if dim not in undetected_dims]

    # Translate requested dimension types into the dataset's own dimension names.
    dims_to_average = [found_dims[dim] for dim in dims]

    # Calculate the mean, skip missing values and retain original attributes.
    # Variables that have none of the averaged dims are split off and merged back
    # afterwards; this works around "NotImplementedError: Computing the mean of an
    # array containing cftime.datetime objects is not yet implemented on dask arrays."
    # See GITHUB ISSUE: https://github.com/roocs/clisops/issues/185
    if isinstance(ds, xr.Dataset):
        untouched_ds = ds.drop_dims(dims_to_average)
        ds = ds.drop_vars(untouched_ds.data_vars.keys())

    ds_averaged_over_dims = ds.mean(dim=dims_to_average, skipna=True, keep_attrs=True)

    if isinstance(ds, xr.Dataset):
        return xr.merge((ds_averaged_over_dims, untouched_ds))
    return ds_averaged_over_dims
# [docs]
def average_time(
    ds: Union[xr.DataArray, xr.Dataset],
    freq: str,
) -> Union[xr.DataArray, xr.Dataset]:
    """Average a DataArray or Dataset over the time frequency specified.

    Parameters
    ----------
    ds : Union[xr.DataArray, xr.Dataset]
        Input values.
    freq : str
        The frequency to average over. One of "day", "month", "year".

    Returns
    -------
    Union[xr.DataArray, xr.Dataset]
        New Dataset or DataArray object averaged over the indicated time frequency.

    Examples
    --------
    .. code-block:: python

        from clisops.core.average import average_time

        pr = xr.open_dataset(path_to_pr_file).pr

        # Average data array over each month
        prAvg = average_time(pr, freq="month")
    """
    if not freq:
        raise InvalidParameterValue(
            "At least one frequency for averaging must be provided"
        )
    if freq not in freqs:
        raise InvalidParameterValue(
            f"Time frequency for averaging must be one of {list(freqs.keys())}."
        )

    # Check a time coordinate exists and get its name.
    # NOTE: this uses the roocs_utils 0.5.0 API which returns None when absent;
    # the latest roocs_utils (master) raises KeyError instead — wrap in
    # try/except KeyError when upgrading.
    t = get_coord_by_type(ds, "time", ignore_aux_coords=False)
    if t is None:
        raise Exception("Time dimension could not be found")

    # Resample to the target frequency and average within each period,
    # skipping missing values and retaining original attributes.
    ds_t_avg = ds.resample(indexer={t.name: freqs[freq]}).mean(
        dim=t.name, skipna=True, keep_attrs=True
    )

    # Generate time bounds matching the averaging frequency.
    time_bounds = create_time_bounds(ds_t_avg, freq)

    # Name of the bounds dimension of the original time coordinate
    # (typically "bnds"), reused for the new bounds variable.
    bnds = ds.cf.get_bounds_dim_name("time")

    # Attach the time bounds to the averaged dataset.
    ds_t_avg = ds_t_avg.assign({"time_bnds": ((t.name, bnds), np.asarray(time_bounds))})
    return ds_t_avg