import xarray
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.axes import Axes
import pandas as pd
= ["01", "05", "06", "07", "08"]
RUN_VARS = ["tasmax", "tasmin", "pr"]
VARS
def get_root_filepath():
return "https://climrecal.blob.core.windows.net/analysis/cpm-median-time-series"
def add_cols(df):
"day"] = df["time"].apply(lambda x: x.timetuple().tm_yday)
df["month"] = df["time"].apply(lambda x: x.month)
df["year"] = df["time"].apply(lambda x: x.year)
df["day_of_month"] = df["time"].apply(lambda x: x.day)
df["leap_year"] = df["time"].apply(lambda x: x.year % 4 == 0)
df[return df
def get_days(is_leap_year: bool) -> np.array:
if not is_leap_year:
# https://docs.xarray.dev/en/stable/generated/xarray.Dataset.convert_calendar.html
# February 6th (36), April 19th (109), July 2nd (183), September 12th (255), November 25th (329).
# First missing day should be 37, not 36 since February 6th is 37
return np.array([37, 109, 183, 255, 329])
else:
# January 31st (31), March 31st (91), June 1st (153), July 31st (213), September 31st (275) and November 30th (335).
return np.array([31, 91, 153, 213, 275, 335])
def get_data() -> dict[str, dict[str, tuple[pd.DataFrame, pd.DataFrame]]]:
= {}
data for var in VARS:
= {}
data[var] for run_var in RUN_VARS:
= f"{get_root_filepath()}/cpm-raw-medians/median-{var}-{run_var}.nc#mode=bytes"
path_raw = f"{get_root_filepath()}/cpm-converted-linear-medians/median-{var}-{run_var}.nc#mode=bytes"
path_con = f"{get_root_filepath()}/cpm-converted-nearest-medians/median-{var}-{run_var}.nc#mode=bytes"
path_con_near = xarray.load_dataset(path_raw)
x_raw = xarray.load_dataset(path_con)
x_con = xarray.load_dataset(path_con_near)
x_con_near = add_cols(
df_raw "standard", align_on="year")
x_raw.convert_calendar(
.to_pandas()
.reset_index()
)= add_cols(x_con.to_pandas().reset_index())
df_con = add_cols(x_con_near.to_pandas().reset_index())
df_con_near = (df_raw, df_con, df_con_near)
data[var][run_var] return data
# Load all data
= get_data() data
Warning 3: Cannot find header.dxf (GDAL_DATA is not defined)
# Example
= "tasmax", "01"
var, run_var = data[var][run_var] df_raw, df_con, df_con_near
# Missing Day 37
64:70] df_raw.iloc[
time | tasmax | day | month | year | day_of_month | leap_year | |
---|---|---|---|---|---|---|---|
64 | 1981-02-04 12:00:00 | 7.901025 | 35 | 2 | 1981 | 4 | False |
65 | 1981-02-05 12:00:00 | 9.846338 | 36 | 2 | 1981 | 5 | False |
66 | 1981-02-07 12:00:00 | 10.244776 | 38 | 2 | 1981 | 7 | False |
67 | 1981-02-08 12:00:00 | 7.123193 | 39 | 2 | 1981 | 8 | False |
68 | 1981-02-09 12:00:00 | 7.830713 | 40 | 2 | 1981 | 9 | False |
69 | 1981-02-10 12:00:00 | 7.916040 | 41 | 2 | 1981 | 10 | False |
def plot_by_day(
data,str,
var: str,
run_var: int,
year: bool,
leap_year: bool,
plot_diff:
ax: Axes,=0.8,
lw
):= data[var][run_var]
df_raw, df_con, df_con_near if not plot_diff:
for i, df in enumerate([df_raw, df_con, df_con_near]):
if leap_year is not None:
= df[df["leap_year"].eq(leap_year)]
df2 else:
= df
df2 if year is None:
= df2.groupby("day")[var].median()
x else:
= df2[df2["year"].eq(year)].groupby("day")[var].first()
x if i == 0:
if leap_year is not None:
ax.vlines(
get_days(leap_year),min(),
x.max(),
x.=-1,
zorder=0.4,
lw="k",
color=":",
ls
)=lw, alpha=0.8)
ax.plot(x.index, x, lwelse:
=lw, alpha=0.8)
ax.plot(x.index, x, lw# pass
else:
= []
series for i, df in enumerate([df_raw, df_con, df_con_near]):
if leap_year is not None:
= df[df["leap_year"].eq(leap_year)]
df2 else:
= df
df2 if year is None:
= df2.groupby("day")[var].median()
x else:
= df2[df2["year"].eq(year)].groupby("day")[var].first()
x
series.append(x)
# Use absolute difference
= series[1] - series[0]
x2 = series[2] - series[0]
x3
ax.vlines(
get_days(leap_year),min([x2.min(), x3.min()]),
max([x2.max(), x3.max()]),
=-1,
zorder=0.4,
lw="k",
color=":",
ls
)
=lw, alpha=0.8, color="C1", ls="-")
ax.plot(x2.index, x2, lw=lw, alpha=0.8, color="C2", ls="-")
ax.plot(x3.index, x3, lw
f"Year: {year}, Leap year: {leap_year}")
ax.set_title(if not plot_diff:
f"Difference ({var})")
ax.set_ylabel(else:
f"Value ({var})")
ax.set_ylabel(
def plot_by_day_pair(
data,str,
var: str,
run_var: int,
year: bool,
leap_year:
axs: Axes,=0.8,
lwlist[tuple] = [(0, 1), (0, 2)],
series_diff_tuples:
):= axs[0], axs[1]
ax1, ax2 = data[var][run_var]
df_raw, df_con, df_con_near = []
series for i, df in enumerate([df_raw, df_con, df_con_near]):
if leap_year is not None:
= df[df["leap_year"].eq(leap_year)]
df2 else:
= df
df2 if year is None:
= df2.groupby("day")[var].median()
x else:
= df2[df2["year"].eq(year)].groupby("day")[var].first()
x
series.append(x)
# Use difference
= series[series_diff_tuples[0][1]] - series[series_diff_tuples[0][0]]
x2 = series[series_diff_tuples[1][1]] - series[series_diff_tuples[1][0]]
x3
for i, ax in enumerate([ax1, ax2]):
= "linear" if i == 0 else "nearest"
interp
ax.vlines(
get_days(leap_year),min([x2.min(), x3.min()]),
max([x2.max(), x3.max()]),
=-1,
zorder=0.4,
lw="k",
color=":",
ls
)
ax.set_title(f"Year: {year}, Leap year: {leap_year}, Interpolation: {interp}",
="medium",
fontsize
)=lw, alpha=0.8, color="C1", ls="-")
ax1.plot(x2.index, x2, lw=lw, alpha=0.8, color="C2", ls="-")
ax2.plot(x3.index, x3, lwf"Value ({var})") ax1.set_ylabel(
def plot_array(data, year, leap_year=True, plot_diff=False, lw=0.8):
= RUN_VARS
run_vars vars = VARS
= plt.subplots(
fig, axs len(vars),
len(run_vars),
=False,
squeeze=True,
sharex="row",
sharey=(14, 6),
figsize
)for row, var in enumerate(vars):
for col, run_var in enumerate(run_vars):
= axs[row][col]
ax =ax, lw=lw)
plot_by_day(data, var, run_var, year, leap_year, plot_diff, axif col == 0:
="medium")
ax.set_ylabel(var, fontsizeelse:
"")
ax.set_ylabel(if row == 0:
f"Run: {run_var}", fontsize="medium")
ax.set_title(else:
"")
ax.set_title(
def plot_array_pair_diff(
data,
year,=True,
leap_year="01",
run_var=0.8,
lwlist[tuple] = [(0, 1), (0, 2)],
series_diff_tuples:
):= [run_var]
run_vars vars = VARS
= plt.subplots(
fig, axs len(vars),
2,
=False,
squeeze=True,
sharex="row",
sharey=(14, 6),
figsize
)for row, var in enumerate(vars):
for col, run_var in enumerate(run_vars):
= axs[row]
ax_pair
plot_by_day_pair(
data,
var,
run_var,
year,
leap_year,=ax_pair,
axs=lw,
lw=series_diff_tuples,
series_diff_tuples
)0].set_ylabel(var, fontsize="medium") ax_pair[
"tasmax", "01", None, leap_year=True, plot_diff=False, ax=plt.gca()) plot_by_day(data,
# Differences between series:
# (orange) linear - raw
# (green) nearest - raw
plot_by_day_pair(
data,"tasmax",
"01",
None,
=True,
leap_year=plt.subplots(1, 2, sharey="row", figsize=(12, 5))[1],
axs=[(0, 1), (0, 2)],
series_diff_tuples )
# Differences between series:
# (orange) linear - raw
# (green) nearest - linear
# fig, axs = plt.subplots(1, 2, sharey="row", figsize=(12, 5))
plot_by_day_pair(
data,"tasmax",
"01",
None,
=True,
leap_year=plt.subplots(1, 2, sharey="row", figsize=(12, 5))[1],
axs=[(0, 1), (1, 2)],
series_diff_tuples
)# plt.show()
"tasmax", "01", None, leap_year=False, plot_diff=True, ax=plt.gca()) plot_by_day(data,
# Plot arrays
for year in [None, 1984]:
for leap_year in [True, False]:
for plot_diff in [False, True]:
if year is not None:
= year if leap_year else 1983
actual_year else:
= None
actual_year print(
f"Plot diff: {plot_diff}; year: {actual_year}; leap_year: {leap_year}"
)if not plot_diff:
plot_array(=actual_year, leap_year=leap_year, plot_diff=plot_diff
data, year
)else:
# run_var="01" as an example
plot_array_pair_diff(="01", year=actual_year, leap_year=leap_year
data, run_var
) plt.show()
Plot diff: False; year: None; leap_year: True
Plot diff: True; year: None; leap_year: True
Plot diff: False; year: None; leap_year: False
Plot diff: True; year: None; leap_year: False
Plot diff: False; year: 1984; leap_year: True
Plot diff: True; year: 1984; leap_year: True
Plot diff: False; year: 1983; leap_year: False
Plot diff: True; year: 1983; leap_year: False
def explore_year_range_xlim(
bool, var="tasmax", run_var="01", xlim=(320, 340)
year_range, leap_years:
):for year in year_range:
if year % 4 == (1 if leap_years else 0):
continue
print(f"raw (blue), linear (orange), nearest (green) values; year: {year}")
# Explore location of spike in difference in var="tasmax", run_var="08"
plot_by_day(
data,
var,
run_var,=year,
year=False,
leap_year=False,
plot_diff=plt.gca(),
ax
)# Subset days to region where spike is
*xlim)
plt.xlim(
plt.show()print(f"linear - raw (orange), nearest - raw (green); year: {year}")
plot_by_day(
data,
var,
run_var,=year,
year=False,
leap_year=True,
plot_diff=plt.gca(),
ax
)*xlim)
plt.xlim( plt.show()
list(range(1981, 1989)), leap_years=False) explore_year_range_xlim(
raw (blue), linear (orange), nearest (green) values; year: 1981
linear - raw (orange), nearest - raw (green); year: 1981
raw (blue), linear (orange), nearest (green) values; year: 1982
linear - raw (orange), nearest - raw (green); year: 1982
raw (blue), linear (orange), nearest (green) values; year: 1983
linear - raw (orange), nearest - raw (green); year: 1983
raw (blue), linear (orange), nearest (green) values; year: 1985
linear - raw (orange), nearest - raw (green); year: 1985
raw (blue), linear (orange), nearest (green) values; year: 1986
linear - raw (orange), nearest - raw (green); year: 1986
raw (blue), linear (orange), nearest (green) values; year: 1987
linear - raw (orange), nearest - raw (green); year: 1987
list(range(1981, 1989)), leap_years=False) explore_year_range_xlim(
raw (blue), linear (orange), nearest (green) values; year: 1981
linear - raw (orange), nearest - raw (green); year: 1981
raw (blue), linear (orange), nearest (green) values; year: 1982
linear - raw (orange), nearest - raw (green); year: 1982
raw (blue), linear (orange), nearest (green) values; year: 1983
linear - raw (orange), nearest - raw (green); year: 1983
raw (blue), linear (orange), nearest (green) values; year: 1985
linear - raw (orange), nearest - raw (green); year: 1985
raw (blue), linear (orange), nearest (green) values; year: 1986
linear - raw (orange), nearest - raw (green); year: 1986
raw (blue), linear (orange), nearest (green) values; year: 1987
linear - raw (orange), nearest - raw (green); year: 1987