Source code for deepsensor.data.utils

from typing import Union

import numpy as np
import pandas as pd
import scipy
import xarray as xr



[docs]
def construct_x1x2_ds(gridded_ds):
    """Construct an :class:`xarray.Dataset` containing two vars, where each var is
    a 2D gridded channel whose values contain the x_1 and x_2 coordinate
    values, respectively.

    Args:
        gridded_ds (:class:`xarray.Dataset`):
            ...

    Returns:
        :class:`xarray.Dataset`
            ...
    """
    X1, X2 = np.meshgrid(gridded_ds.x1, gridded_ds.x2, indexing="ij")
    ds = xr.Dataset(
        coords={"x1": gridded_ds.x1, "x2": gridded_ds.x2},
        data_vars={"x1_arr": (("x1", "x2"), X1), "x2_arr": (("x1", "x2"), X2)},
    )
    return ds




[docs]
def construct_circ_time_ds(dates, freq):
    """Return an :class:`xarray.Dataset` containing a circular variable for time.

    The ``freq`` entry dictates the frequency of cycling of the circular
    variable. E.g.:

        - ``'H'``: cycles once per day at hourly intervals
        - ``'D'``: cycles once per year at daily intervals
        - ``'M'``: cycles once per year at monthly intervals

    Args:
        dates (...):
            ...
        freq (...):
            ...

    Returns:
        :class:`xarray.Dataset`
            ...
    """
    # Ensure dates are pandas
    dates = pd.DatetimeIndex(dates)
    if freq == "D":
        time_var = dates.dayofyear
        mod = 365.25
    elif freq == "H":
        time_var = dates.hour
        mod = 24
    elif freq == "M":
        time_var = dates.month
        mod = 12
    else:
        raise ValueError("Circular time variable not implemented for this frequency.")

    cos_time = np.cos(2 * np.pi * time_var / mod)
    sin_time = np.sin(2 * np.pi * time_var / mod)

    ds = xr.Dataset(
        coords={"time": dates},
        data_vars={
            f"cos_{freq}": ("time", cos_time),
            f"sin_{freq}": ("time", sin_time),
        },
    )
    return ds




[docs]
def compute_xarray_data_resolution(ds: Union[xr.DataArray, xr.Dataset]) -> float:
    """Computes the resolution of an xarray object with coordinates x1 and x2.

    The data resolution is the finer of the two coordinate resolutions (x1 and
    x2). For example, if x1 has a resolution of 0.1 degrees and x2 has a
    resolution of 0.2 degrees, the data resolution returned will be 0.1
    degrees.

    Args:
        ds (:class:`xarray.DataArray` | :class:`xarray.Dataset`):
            Xarray object with coordinates x1 and x2.

    Returns:
        float: Resolution of the data (in spatial units, e.g. 0.1 degrees).
    """
    x1_res = np.abs(np.mean(np.diff(ds["x1"])))
    x2_res = np.abs(np.mean(np.diff(ds["x2"])))

    # ensure float type, since numpy 2, np.mean returns a numpy float32
    data_resolution = float(np.min([x1_res, x2_res]))
    return data_resolution




[docs]
def compute_pandas_data_resolution(
    df: Union[pd.DataFrame, pd.Series],
    n_times: int = 1000,
    percentile: int = 5,
) -> float:
    """Approximates the resolution of non-gridded pandas data with indexes time,
    x1, and x2.

    The resolution is approximated as the Nth percentile of the distances
    between neighbouring observations, possibly using a subset of the dates in
    the data. The default is to use 1000 dates (or all dates if there are fewer
    than 1000) and to use the 5th percentile. This means that the resolution is
    the distance between the closest 5% of neighbouring observations.

    Args:
        df (:class:`pandas.DataFrame` | :class:`pandas.Series`):
            Dataframe or series with indexes time, x1, and x2.
        n_times (int, optional):
            Number of dates to sample. Defaults to 1000. If "all", all dates
            are used.
        percentile (int, optional):
            Percentile of pairwise distances for computing the resolution.
            Defaults to 5.

    Returns:
        float: Resolution of the data (in spatial units, e.g. 0.1 degrees).
    """
    dates = df.index.get_level_values("time").unique()

    if n_times != "all" and len(dates) > n_times:
        rng = np.random.default_rng(42)
        dates = rng.choice(dates, size=n_times, replace=False)

    closest_distances = []
    df = df.reset_index().set_index("time")
    for time in dates:
        df_t = df.loc[[time]]
        X = df_t[["x1", "x2"]].values  # (N, 2) array of coordinates
        if X.shape[0] < 2:
            # Skip this time if there are fewer than 2 stationS
            continue
        X_unique = np.unique(X, axis=0)  # (N_unique, 2) array of unique coordinates

        pairwise_distances = scipy.spatial.distance.cdist(X_unique, X_unique)
        percentile_distances_without_self = np.ma.masked_equal(pairwise_distances, 0)

        # Compute the closest distance from each station to each other station
        closest_distances_t = np.min(percentile_distances_without_self, axis=1)
        closest_distances.extend(closest_distances_t)

    data_resolution = np.percentile(closest_distances, percentile)
    return data_resolution