"""Test utilities for clisops."""
import importlib.resources as ilr
import os
import warnings
from pathlib import Path
from shutil import copytree
from sys import platform
from urllib.error import HTTPError, URLError
from urllib.parse import urlparse
from urllib.request import urlretrieve
from filelock import FileLock
from jinja2 import Template
from loguru import logger
# `pooch` is treated as an optional dependency: it is used below to compute the
# default cache directories and by `stratus` to fetch remote testing data.
try:
    import pooch
except ImportError:
    warnings.warn("The `pooch` library is not installed. The default cache directory for testing data will not be set.")
    # Sentinel value; `stratus` checks `pooch is None` to report the missing dependency.
    pooch = None
# Names exported by a star-import of this module.
__all__ = [
    "ESGF_TEST_DATA_CACHE_DIR",
    "ESGF_TEST_DATA_REPO_URL",
    "ESGF_TEST_DATA_VERSION",
    "XCLIM_TEST_DATA_CACHE_DIR",
    "XCLIM_TEST_DATA_REPO_URL",
    "XCLIM_TEST_DATA_VERSION",
    "ContextLogger",
    "default_esgf_test_data_cache",
    "default_xclim_test_data_cache",
    "get_esgf_file_paths",
    "get_esgf_glob_paths",
    "load_registry",
    "stratus",
    "write_roocs_cfg",
]
# Default cache locations come from pooch's per-user OS cache directory.
# When `pooch` could not be imported the name is bound to None, the attribute
# lookup fails, and both defaults are left unset.
try:
    default_esgf_test_data_cache = pooch.os_cache("mini-esgf-data")
    default_xclim_test_data_cache = pooch.os_cache("xclim-testdata")
except (AttributeError, TypeError):
    default_esgf_test_data_cache = None
    default_xclim_test_data_cache = None

# Every setting below can be overridden with an environment variable that has
# the same name as the constant it populates.
# The variable name previously lacked its final "L" ("ESGF_TEST_DATA_REPO_UR"),
# so the documented override was silently ignored.
ESGF_TEST_DATA_REPO_URL = os.getenv("ESGF_TEST_DATA_REPO_URL", "https://raw.githubusercontent.com/roocs/mini-esgf-data")
default_esgf_test_data_version = "v1"
ESGF_TEST_DATA_VERSION = os.getenv("ESGF_TEST_DATA_VERSION", default_esgf_test_data_version)
ESGF_TEST_DATA_CACHE_DIR = os.getenv("ESGF_TEST_DATA_CACHE_DIR", default_esgf_test_data_cache)

XCLIM_TEST_DATA_REPO_URL = os.getenv(
    "XCLIM_TEST_DATA_REPO_URL",
    "https://raw.githubusercontent.com/Ouranosinc/xclim-testdata",
)
default_xclim_test_data_version = "v2024.8.23"
XCLIM_TEST_DATA_VERSION = os.getenv("XCLIM_TEST_DATA_VERSION", default_xclim_test_data_version)
XCLIM_TEST_DATA_CACHE_DIR = os.getenv("XCLIM_TEST_DATA_CACHE_DIR", default_xclim_test_data_cache)
[docs]
def write_roocs_cfg(
    template: str | None = None,
    cache_dir: str | Path = default_esgf_test_data_cache,
) -> str:
    """
    Render a ROOCS configuration file and write it to disk for testing.

    Parameters
    ----------
    template : str, optional
        A custom Jinja2 template for the configuration file.
        When empty or not provided, a built-in template is used.
    cache_dir : str or Path, optional
        The directory in which ``roocs.ini`` is written.
        Defaults to the default ESGF test data cache directory.

    Returns
    -------
    str
        The POSIX-style path of the written configuration file.

    Notes
    -----
    The ``base_dir`` value substituted into the template is built from the
    module-level ``ESGF_TEST_DATA_CACHE_DIR`` and ``ESGF_TEST_DATA_VERSION``
    settings, not from `cache_dir`.
    """
    default_template = """
[project:cmip5]
base_dir = {{ base_dir }}/badc/cmip5/data/cmip5
[project:cmip6]
base_dir = {{ base_dir }}/badc/cmip6/data/CMIP6
[project:cordex]
base_dir = {{ base_dir }}/badc/cordex/data/cordex
[project:c3s-cmip5]
base_dir = {{ base_dir }}/gws/nopw/j04/cp4cds1_vol1/data/c3s-cmip5
[project:c3s-cmip6]
base_dir = {{ base_dir }}/badc/cmip6/data/CMIP6
[project:c3s-cordex]
base_dir = {{ base_dir }}/pool/data/CORDEX/data/cordex
[project:proj_test]
base_dir = /projects/test/proj
fixed_path_modifiers =
variable:rain sun cloud
fixed_path_mappings =
proj_test.my.first.test:first/test/something.nc
proj_test.my.second.test:second/test/data_*.txt
proj_test.another.{variable}.test:good/test/{variable}.nc
"""
    source = template if template else default_template
    target = Path(cache_dir, "roocs.ini")

    # All project base directories are rooted at the versioned ESGF data cache.
    versioned_cache = Path(ESGF_TEST_DATA_CACHE_DIR) / ESGF_TEST_DATA_VERSION
    rendered = Template(source).render(base_dir=versioned_cache.as_posix())

    with open(target, "w") as handle:
        handle.write(rendered)
    return target.as_posix()
[docs]
def get_esgf_file_paths(esgf_cache_dir: str | os.PathLike[str]) -> dict[str, str]:
"""
Get a dictionary of example ESGF file paths for testing purposes.
Parameters
----------
esgf_cache_dir : str or os.PathLike
The base directory where ESGF test data is cached.
Returns
-------
dict[str, str]
A dictionary where keys are descriptive names of datasets and values are their corresponding file paths.
"""
return {
"CMIP5_ZOSTOGA": Path(
esgf_cache_dir,
"badc/cmip5/data/cmip5/output1/INM/inmcm4/rcp45/mon/ocean/Omon/r1i1p1/latest/zostoga/zostoga_Omon_inmcm4_rcp45_r1i1p1_200601-210012.nc",
).as_posix(),
"CMIP6_RLDS": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/historical/r1i1p1f1/Amon/rlds/gr/v20180803/rlds_Amon_IPSL-CM6A-LR_historical_r1i1p1f1_gr_185001-201412.nc",
).as_posix(),
"CMIP6_RLDS_ONE_TIME_STEP": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/historical/r1i1p1f1/Amon/rlds/gr/v20180803/rlds_Amon_IPSL-CM6A-LR_historical_r1i1p1f1_gr_185001.nc",
).as_posix(),
"CMIP6_RLUS_ONE_TIME_STEP": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/historical/r1i1p1f1/Amon/rlus/gr/v20180803/rlus_Amon_IPSL-CM6A-LR_historical_r1i1p1f1_gr_185001.nc",
).as_posix(),
"CMIP6_MRSOFC": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/ScenarioMIP/IPSL/IPSL-CM6A-LR/ssp119/r1i1p1f1/fx/mrsofc/gr/v20190410/mrsofc_fx_IPSL-CM6A-LR_ssp119_r1i1p1f1_gr.nc",
).as_posix(),
"CMIP6_SICONC": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/CCCma/CanESM5/historical/r1i1p1f1/SImon/siconc/gn/latest/siconc_SImon_CanESM5_historical_r1i1p1f1_gn_185001-201412.nc",
).as_posix(),
"CMIP6_SICONC_DAY": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/CCCma/CanESM5/historical/r1i1p1f1/SIday/siconc/gn/v20190429/siconc_SIday_CanESM5_historical_r1i1p1f1_gn_18500101-20141231.nc",
).as_posix(),
"CMIP6_TA": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/ScenarioMIP/MIROC/MIROC6/ssp119/r1i1p1f1/Amon/ta/gn/files/d20190807/ta_Amon_MIROC6_ssp119_r1i1p1f1_gn_201501-202412.nc",
).as_posix(),
"CMIP6_TASMIN": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r1i1p1f1/Amon/tasmin/gn/v20190710/tasmin_Amon_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_201001-201412.nc",
).as_posix(),
"CMIP6_JULIAN": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/CCCR-IITM/IITM-ESM/1pctCO2/r1i1p1f1/Omon/tos/gn/v20191204/tos_Omon_IITM-ESM_1pctCO2_r1i1p1f1_gn_193001-193412.nc",
).as_posix(),
"CMIP6_TOS": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/historical/r1i1p1f1/Omon/tos/gn/v20190710/tos_Omon_MPI-ESM1-2-LR_historical_r1i1p1f1_gn_185001-186912.nc",
).as_posix(),
"CMIP6_AREACELLO": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/historical/r1i1p1f1/Ofx/areacello/gn/v20190726/areacello_Ofx_GFDL-ESM4_historical_r1i1p1f1_gn.nc",
).as_posix(),
"CMIP6_TOS_CNRM": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/ScenarioMIP/CNRM-CERFACS/CNRM-CM6-1/ssp245/r1i1p1f2/Omon/tos/gn/v20190219/tos_Omon_CNRM-CM6-1_ssp245_r1i1p1f2_gn_201501.nc",
).as_posix(),
"CMIP6_TAS_DAY": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/ScenarioMIP/MIROC/MIROC6/ssp119/r1i1p1f1/day/tas/gn/v20191016/tas_day_MIROC6_ssp119_r1i1p1f1_gn_20150101.nc",
).as_posix(),
"CMIP6_SFTOF": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/ScenarioMIP/NCC/NorESM2-MM/ssp126/r1i1p1f1/Ofx/sftof/gn/v20191108/sftof_Ofx_NorESM2-MM_ssp126_r1i1p1f1_gn.nc",
).as_posix(),
"CMIP6_TAS_ONE_TIME_STEP": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/CAS/FGOALS-g3/historical/r1i1p1f1/Amon/tas/gn/v20190818/tas_Amon_FGOALS-g3_historical_r1i1p1f1_gn_185001.nc",
).as_posix(),
"CMIP6_TOS_ONE_TIME_STEP": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r1i1p1f1/Omon/tos/gn/v20190710/tos_Omon_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_185001.nc",
).as_posix(),
# CMIP6 ocean with collapsing cells
"CMIP6_TOS_LR_DEGEN": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/ScenarioMIP/HAMMOZ-Consortium/MPI-ESM-1-2-HAM/ssp370/r1i1p1f1/Omon/tos/gn/v20190628/tos_Omon_MPI-ESM-1-2-HAM_ssp370_r1i1p1f1_gn_201501.nc",
).as_posix(),
# 2nd dataset CMIP6 ocean with collapsing cells
"CMIP6_FX_DEGEN": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/ScenarioMIP/EC-Earth-Consortium/EC-Earth3-Veg/ssp245/r5i1p1f1/Ofx/deptho/gn/v20200312/deptho_Ofx_EC-Earth3-Veg_ssp245_r5i1p1f1_gn.nc",
).as_posix(),
# CMIP6 ocean with collapsing cells, cells extending over 50 degrees, missing_values in lat/lon
"CMIP6_SIMASS_DEGEN": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/ScenarioMIP/NCC/NorESM2-MM/ssp126/r1i1p1f1/SImon/simass/gn/v20191108/simass_SImon_NorESM2-MM_ssp126_r1i1p1f1_gn_201501.nc",
).as_posix(),
# CMIP5 rlat,rlon uncompliant CF units
"CMIP5_WRONG_CF_UNITS": Path(
esgf_cache_dir,
"pool/data/C3SCMIP5/BCC/bcc-csm1-1/rcp85/mon/ocean/Omon/r1i1p1/zos/v20120705/zos_Omon_bcc-csm1-1_rcp85_r1i1p1_200601.nc",
).as_posix(),
# CMIP6 rlat,rlon uncompliant CF units
"CMIP6_WRONG_CF_UNITS": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/AerChemMIP/BCC/BCC-ESM1/ssp370/r1i1p1f1/Omon/pbo/gn/v20190624/pbo_Omon_BCC-ESM1_ssp370_r1i1p1f1_gn_201501.nc",
).as_posix(),
# CMIP6 lat, lon with uncompliant CF units and standard_name
"CMIP6_WRONG_CF_ATTRS": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/HighResMIP/BCC/BCC-CSM2-HR/hist-1950/r1i1p1f1/Omon/tos/gn/v20200922/tos_Omon_BCC-CSM2-HR_hist-1950_r1i1p1f1_gn_198001.nc",
).as_posix(),
"CMIP5_MRSOS_ONE_TIME_STEP": Path(
esgf_cache_dir,
"badc/cmip5/data/cmip5/output1/MOHC/HadGEM2-ES/rcp85/day/land/day/r1i1p1/latest/mrsos/mrsos_day_HadGEM2-ES_rcp85_r1i1p1_20051201.nc",
).as_posix(),
"CMIP6_GFDL_EXTENT": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/historical/r1i1p1f1/Omon/sos/gn/v20180701/sos_Omon_GFDL-CM4_historical_r1i1p1f1_gn_185001.nc",
).as_posix(),
"CMIP6_TAS_PRECISION_A": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/AWI/AWI-ESM-1-1-LR/1pctCO2/r1i1p1f1/Amon/tas/gn/v20200212/tas_Amon_AWI-ESM-1-1-LR_1pctCO2_r1i1p1f1_gn_185501.nc",
).as_posix(),
"CMIP6_TAS_PRECISION_B": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/AWI/AWI-ESM-1-1-LR/1pctCO2/r1i1p1f1/Amon/tas/gn/v20200212/tas_Amon_AWI-ESM-1-1-LR_1pctCO2_r1i1p1f1_gn_209901.nc",
).as_posix(),
"CMIP6_ATM_VERT_ONE_TIMESTEP": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/historical/r1i1p1f1/AERmon/o3/gn/v20190710/o3_AERmon_MPI-ESM1-2-LR_historical_r1i1p1f1_gn_185001.nc",
).as_posix(),
"CMIP6_ATM_VERT_ONE_TIMESTEP_ZONMEAN": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/historical/r1i1p1f1/AERmon/o3/gn/v20190710/o3_AERmon_MPI-ESM1-2-LR_historical_r1i1p1f1_gn_185001_zm.nc",
).as_posix(),
"CMIP6_IITM_EXTENT": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/CCCR-IITM/IITM-ESM/1pctCO2/r1i1p1f1/Omon/tos/gn/v20191204/tos_Omon_IITM-ESM_1pctCO2_r1i1p1f1_gn_193001.nc",
).as_posix(),
# CMIP6 dataset with weird range in its longitude coordinate (-300, 60)
# and unmasked missing values in the latitude and longitude coordinates
"CMIP6_EXTENT_UNMASKED": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/OMIP/NOAA-GFDL/GFDL-OM4p5B/omip1/r1i1p1f1/Omon/volcello/gn/v20180701/volcello_Omon_GFDL-OM4p5B_omip1_r1i1p1f1_gn_176801.nc",
).as_posix(),
"CMIP6_OCE_HALO_CNRM": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/CNRM-CERFACS/CNRM-CM6-1-HR/historical/r1i1p1f2/Omon/tos/gn/v20191021/tos_Omon_CNRM-CM6-1-HR_historical_r1i1p1f2_gn_185001.nc",
).as_posix(),
"CMIP6_UNSTR_FESOM_LR": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/AWI/AWI-ESM-1-1-LR/historical/r1i1p1f1/Omon/tos/gn/v20200212/tos_Omon_AWI-ESM-1-1-LR_historical_r1i1p1f1_gn_185001.nc",
).as_posix(),
"CMIP6_UNSTR_ICON_A": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/MPI-M/ICON-ESM-LR/historical/r1i1p1f1/Amon/tas/gn/v20210215/tas_Amon_ICON-ESM-LR_historical_r1i1p1f1_gn_185001.nc",
).as_posix(),
"CMIP6_UNSTR_VERT_ICON_O": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/MPI-M/ICON-ESM-LR/historical/r1i1p1f1/Omon/thetao/gn/v20210215/thetao_Omon_ICON-ESM-LR_historical_r1i1p1f1_gn_185001.nc",
).as_posix(),
"CMIP6_UNTAGGED_MISSVALS": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/NCAR/CESM2-FV2/historical/r1i1p1f1/Omon/tos/gn/v20191120/tos_Omon_CESM2-FV2_historical_r1i1p1f1_gn_200001.nc",
).as_posix(),
"CMIP6_STAGGERED_UCOMP": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/historical/r1i1p1f1/Omon/tauuo/gn/v20200909/tauuo_Omon_MPI-ESM1-2-LR_historical_r1i1p1f1_gn_185001.nc",
).as_posix(),
"CMIP6_STAGGERED_VCOMP": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/historical/r1i1p1f1/Omon/tauvo/gn/v20190710/tauvo_Omon_MPI-ESM1-2-LR_historical_r1i1p1f1_gn_185001.nc",
).as_posix(),
"CMIP6_FILLVALUE": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/NCAR/CESM2-WACCM/historical/r1i1p1f1/day/tas/gn/v20190227/tas_day_CESM2-WACCM_historical_r1i1p1f1_gn_20000101-20091231.nc",
).as_posix(),
"CMIP6_ZONMEAN_A": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/historical/r1i1p1f1/Omon/msftmz/gn/v20190710/msftmz_Omon_MPI-ESM1-2-HR_historical_r1i1p1f1_gn_191001.nc",
).as_posix(),
"CMIP6_ZONMEAN_B": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/CMIP/NCC/NorCPM1/historical/r22i1p1f1/Omon/msftmz/grz/v20200724/msftmz_Omon_NorCPM1_historical_r22i1p1f1_grz_185001.nc",
).as_posix(),
# CMIP6 dataset without defined bounds on curvilinear grid
"CMIP6_NO_BOUNDS": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/ScenarioMIP/CAS/FGOALS-f3-L/ssp126/r1i1p1f1/Omon/tos/gn/v20191008/tos_Omon_FGOALS-f3-L_ssp126_r1i1p1f1_gn_201501.nc",
).as_posix(),
# CMIP6 dataset with character dimension 'sector'
"CMIP6_CHAR_DIM": Path(
esgf_cache_dir,
"badc/cmip6/data/CMIP6/ScenarioMIP/IPSL/IPSL-CM6A-LR/ssp245/r1i1p1f1/Lmon/landCoverFrac/gr/v20190119/landCoverFrac_Lmon_IPSL-CM6A-LR_ssp245_r1i1p1f1_gr_201501.nc",
).as_posix(),
# CORDEX dataset with maldefined bounds
"CORDEX_ERRONEOUS_BOUNDS": Path(
esgf_cache_dir,
"pool/data/C3SCORDEX/data/c3s-cordex/output/ARC-44/BCCR/ECMWF-ERAINT/evaluation/r1i1p1/BCCR-WRF331/v1/day/tas/v20200915/tas_ARC-44_ECMWF-ERAINT_evaluation_r1i1p1_BCCR-WRF331_v1_day_20010101.nc",
).as_posix(),
"CORDEX_TAS_ONE_TIMESTEP": Path(
esgf_cache_dir,
"pool/data/CORDEX/data/cordex/output/EUR-22/GERICS/MPI-M-MPI-ESM-LR/rcp85/r1i1p1/GERICS-REMO2015/v1/mon/tas/v20191029/tas_EUR-22_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_GERICS-REMO2015_v1_mon_202101.nc",
).as_posix(),
"CORDEX_TAS_ONE_TIMESTEP_ANT": Path(
esgf_cache_dir,
"pool/data/CORDEX/data/cordex/output/ANT-44/KNMI/ECMWF-ERAINT/evaluation/r1i1p1/DMI-HIRHAM5/v1/day/tas/v20201001/tas_ANT-44_ECMWF-ERAINT_evaluation_r1i1p1_DMI-HIRHAM5_v1_day_20060101.nc",
).as_posix(),
"CORDEX_TAS_NO_BOUNDS": Path(
esgf_cache_dir,
"pool/data/CORDEX/data/cordex/output/EUR-11/KNMI/MPI-M-MPI-ESM-LR/rcp85/r1i1p1/KNMI-RACMO22E/v1/mon/tas/v20190625/tas_EUR-11_MPI-M-MPI-ESM-LR_rcp85_r1i1p1_KNMI-RACMO22E_v1_mon_209101.nc",
).as_posix(),
"ATLAS_v1_CMIP5": Path(
esgf_cache_dir,
"pool/data/c3s-cica-atlas/CMIP5/rcp26/pr_CMIP5_rcp26_mon_200601-210012.nc",
).as_posix(),
"ATLAS_v1_EOBS": Path(
esgf_cache_dir,
"pool/data/c3s-cica-atlas/E-OBS/sfcwind_E-OBS_mon_195001-202112.nc",
).as_posix(),
"ATLAS_v1_ERA5": Path(
esgf_cache_dir,
"pool/data/c3s-cica-atlas/ERA5/psl_ERA5_mon_194001-202212.nc",
).as_posix(),
"ATLAS_v1_CORDEX": Path(
esgf_cache_dir,
"pool/data/c3s-cica-atlas/CORDEX-CORE/historical/huss_CORDEX-CORE_historical_mon_197001.nc",
).as_posix(),
"ATLAS_v1_EOBS_GRID": Path(
esgf_cache_dir,
"pool/data/c3s-cica-atlas/E-OBS/t_E-OBS_mon_195001.nc",
).as_posix(),
"ATLAS_v0_CORDEX_NAM": Path(
esgf_cache_dir,
"pool/data/c3s-ipcc-ar6-atlas/CORDEX-NAM/historical/rx1day_CORDEX-NAM_historical_mon_197001-200512.nc",
).as_posix(),
"ATLAS_v0_CMIP6": Path(
esgf_cache_dir,
"pool/data/c3s-ipcc-ar6-atlas/CMIP6/ssp245/sst_CMIP6_ssp245_mon_201501-210012.nc",
).as_posix(),
"ATLAS_v0_CORDEX_ANT": Path(
esgf_cache_dir,
"pool/data/c3s-ipcc-ar6-atlas/CORDEX-ANT/rcp45/tnn_CORDEX-ANT_rcp45_mon_200601.nc",
).as_posix(),
}
def get_kerchunk_datasets():
    """
    Build the mapping of Kerchunk dataset identifiers to URLs used in tests.

    Returns
    -------
    dict[str, str]
        Dataset identifiers mapped to the URL of the Kerchunk JSON index and
        to the same URL with a ``.zst`` suffix appended.
    """
    json_index_url = (
        "https://gws-access.jasmin.ac.uk/public/cmip6_prep/eodh-eocis/kc-indexes-cmip6-http-v1/"
        "CMIP6.CMIP.MOHC.UKESM1-1-LL.1pctCO2.r1i1p1f2.Amon.tasmax.gn.v20220513.json"
    )
    return {
        "CMIP6_KERCHUNK_HTTPS_OPEN_JSON": json_index_url,
        # The ZST entry is derived from the JSON URL so the two cannot drift apart.
        "CMIP6_KERCHUNK_HTTPS_OPEN_ZST": f"{json_index_url}.zst",
    }
[docs]
def get_esgf_glob_paths(esgf_cache_dir: str | os.PathLike[str]) -> dict[str, str]:
"""
Return a dictionary of glob paths for ESGF test data.
Parameters
----------
esgf_cache_dir : str or os.PathLike
The base directory where ESGF test data is cached.
Returns
-------
dict
A dictionary where keys are dataset identifiers and values are glob paths to the datasets.
"""
return {
"CMIP5_TAS": Path(
esgf_cache_dir,
"badc/cmip5/data/cmip5/output1/MOHC/HadGEM2-ES/rcp85/mon/atmos/Amon/r1i1p1/latest/tas/*.nc",
).as_posix(),
"CMIP5_TAS_EC_EARTH": Path(
esgf_cache_dir,
"badc/cmip5/data/cmip5/output1/ICHEC/EC-EARTH/historical/mon/atmos/Amon/r1i1p1/latest/tas/*.nc",
).as_posix(),
"CMIP5_RH": Path(
esgf_cache_dir,
"badc/cmip5/data/cmip5/output1/MOHC/HadGEM2-ES/historical/mon/land/Lmon/r1i1p1/latest/rh/*.nc",
).as_posix(),
"C3S_CMIP5_TSICE": Path(
esgf_cache_dir,
"gws/nopw/j04/cp4cds1_vol1/data/c3s-cmip5/output1/NCC/NorESM1-ME/rcp60/mon/seaIce/OImon/r1i1p1/tsice/v20120614/*.nc",
).as_posix(),
"C3S_CORDEX_AFR_TAS": Path(
esgf_cache_dir,
"pool/data/CORDEX/data/cordex/output/AFR-22/GERICS/MPI-M-MPI-ESM-LR/historical/r1i1p1/GERICS-REMO2015/v1/day/tas/v20201015/*.nc",
).as_posix(),
"C3S_CORDEX_NAM_PR": Path(
esgf_cache_dir,
"pool/data/CORDEX/data/cordex/output/NAM-22/OURANOS/NOAA-GFDL-GFDL-ESM2M/rcp45/r1i1p1/OURANOS-CRCM5/v1/day/pr/v20200831/*.nc",
).as_posix(),
"C3S_CORDEX_EUR_ZG500": Path(
esgf_cache_dir,
"pool/data/CORDEX/data/cordex/output/EUR-11/IPSL/IPSL-IPSL-CM5A-MR/rcp85/r1i1p1/IPSL-WRF381P/v1/day/zg500/v20190919/*.nc",
).as_posix(),
"C3S_CORDEX_ANT_SFC_WIND": Path(
esgf_cache_dir,
"pool/data/CORDEX/data/cordex/output/ANT-44/KNMI/ECMWF-ERAINT/evaluation/r1i1p1/KNMI-RACMO21P/v1/day/sfcWind/v20201001/*.nc",
).as_posix(),
"CMIP5_MRSOS_MULTIPLE_TIME_STEPS": Path(
esgf_cache_dir,
"badc/cmip5/data/cmip5/output1/MOHC/HadGEM2-ES/rcp45/day/land/day/r1i1p1/latest/mrsos/*.nc",
).as_posix(),
"C3S_CMIP5_TAS": Path(
esgf_cache_dir,
"gws/nopw/j04/cp4cds1_vol1/data/c3s-cmip5/output1/ICHEC/EC-EARTH/historical/day/atmos/day/r1i1p1/tas/v20131231/*.nc",
).as_posix(),
}
[docs]
class ContextLogger:
    """
    Context manager for safe loguru logging management in pytest tests.

    Entering the context enables the loguru logger for a package and returns
    the logger; leaving it disables the logger for that package again. Unless
    the instance was created with a truthy `caplog`, leaving the context also
    removes the logger's handlers.

    Parameters
    ----------
    caplog : CaplogFixture, optional
        The pytest caplog fixture, if log capturing is managed by pytest.
        Only its truthiness is recorded.
    """

    def __init__(self, caplog=False):
        """Bind the loguru logger and record whether caplog is in use."""
        from loguru import logger

        self.logger = logger
        self.using_caplog = bool(caplog)

    def __enter__(self, package_name: str = "clisops"):
        """Enable logging for `package_name` and return the logger."""
        self.logger.enable(package_name)
        # Remembered so that `__exit__` disables the same package.
        self._package = package_name
        return self.logger

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Disable logging for the package; drop handlers unless caplog is in use."""
        self.logger.disable(self._package)
        if self.using_caplog:
            # Handler teardown is left to pytest in this case.
            return
        try:
            self.logger.remove()
        except ValueError:  # noqa: S110
            # A failed handler removal is deliberately ignored.
            pass
[docs]
def load_registry(branch: str, repo: str) -> dict[str, str]:
    """
    Load the registry file for the test data.

    For the default data version, the registry file shipped inside the
    ``clisops`` package is read. For any other branch, the registry is first
    downloaded from the remote repository into a branch-specific folder inside
    the package, and that downloaded copy is read.

    Parameters
    ----------
    branch : str
        The branch of the repository to use for the registry.
    repo : str
        The URL of the repository to use for the registry. Must be equal to
        either ``ESGF_TEST_DATA_REPO_URL`` or ``XCLIM_TEST_DATA_REPO_URL``.

    Returns
    -------
    dict
        Dictionary of filenames and hashes.

    Raises
    ------
    ValueError
        If `repo` is not one of the recognised repository URLs.
    FileNotFoundError
        If the registry file does not exist locally.
    """
    if repo == ESGF_TEST_DATA_REPO_URL:
        project = "mini-esgf-data"
        default_testdata_version = ESGF_TEST_DATA_VERSION
    elif repo == XCLIM_TEST_DATA_REPO_URL:
        project = "xclim-testdata"
        default_testdata_version = XCLIM_TEST_DATA_VERSION
    else:
        raise ValueError(
            f"Repository URL {repo} not recognized. "
            f"Please use one of {ESGF_TEST_DATA_REPO_URL} or {XCLIM_TEST_DATA_REPO_URL}"
        )

    registry_name = f"{project}_registry.txt"
    # The repository and branch must be separated by "/" (same layout as the
    # base URL built in `stratus`); the URL is validated in every case.
    remote_registry = audit_url(f"{repo}/{branch}/data/{registry_name}")

    # NOTE: a former `elif repo != default_repo_url` branch was dropped here; it
    # could never run because `repo` was just matched against that same URL.
    if branch != default_testdata_version:
        # Non-default branch: fetch its registry and read the downloaded copy,
        # not the packaged one (which describes the default version).
        custom_registry_folder = Path(str(ilr.files("clisops").joinpath(f"utils/registries/{branch}")))
        custom_registry_folder.mkdir(parents=True, exist_ok=True)
        registry_file = custom_registry_folder.joinpath(registry_name)
        urlretrieve(remote_registry, registry_file)  # noqa: S310
    else:
        registry_file = Path(str(ilr.files("clisops").joinpath(f"utils/{registry_name}")))

    if not registry_file.exists():
        raise FileNotFoundError(f"Registry file not found: {registry_file}")

    # Each meaningful line holds "<filename> <hash>" separated by whitespace.
    registry: dict[str, str] = {}
    with registry_file.open() as f:
        for line in f:
            fields = line.split()
            # Blank lines and comment lines carry no entry.
            if not fields or fields[0].startswith("#"):
                continue
            registry[fields[0]] = fields[1]
    return registry
[docs]
def stratus(
    repo: str,
    branch: str,
    cache_dir: str | Path,
    data_updates: bool = True,
):
    """
    Create the Pooch instance used to fetch testing data.

    Parameters
    ----------
    repo : str
        URL of the repository to use when fetching testing datasets.
        Must end with ``xclim-testdata`` or ``mini-esgf-data``.
    branch : str
        Branch of repository to use when fetching testing datasets.
    cache_dir : str or Path
        The path to the directory where the data files are stored.
    data_updates : bool
        If True, allow updates to the data files. Default is True.

    Returns
    -------
    pooch.Pooch
        The Pooch instance for accessing the testing data.

    Raises
    ------
    ImportError
        If the `pooch` package is not installed.
    ValueError
        If the repository URL is not recognised.

    Examples
    --------
    Using the registry to download a file:

    .. code-block:: python

        import xarray as xr
        from clisops.utils.testing import stratus

        s = stratus(cache_dir=..., repo=..., branch=...)
        example_file = s.fetch("example.nc")
        data = xr.open_dataset(example_file)
    """
    if pooch is None:
        raise ImportError(
            "The `pooch` package is required to fetch the remote testing data. "
            "You can install it with `pip install pooch` or `pip install roocs-utils[dev]`."
        )

    # Select the configured and the default data version for the repository.
    if repo.endswith("xclim-testdata"):
        _version, _default_version = XCLIM_TEST_DATA_VERSION, default_xclim_test_data_version
    elif repo.endswith("mini-esgf-data"):
        _version, _default_version = ESGF_TEST_DATA_VERSION, default_esgf_test_data_version
    else:
        raise ValueError(
            f"Repository URL {repo} not recognized. "
            f"Please use one of {ESGF_TEST_DATA_REPO_URL} or {XCLIM_TEST_DATA_REPO_URL}"
        )

    # The base URL is validated before the registry is loaded.
    remote = audit_url(f"{repo}/{branch}/data")
    registry = load_registry(branch=branch, repo=repo)
    return pooch.create(
        path=cache_dir,
        base_url=remote,
        version=_default_version,
        version_dev=_version,
        allow_updates=data_updates,
        registry=registry,
    )
def populate_testing_data(
    repo: str,
    branch: str,
    cache_dir: Path,
):
    """
    Populate the local cache with the testing data.

    Every file listed in the registry is fetched. Files that cannot be
    retrieved because of an HTTP error are logged and skipped, so one missing
    file does not stop the remaining downloads.

    Parameters
    ----------
    repo : str
        URL of the repository to use when fetching testing datasets.
    branch : str
        Branch of repository to use when fetching testing datasets.
    cache_dir : Path
        The path to the local cache.
        The testing data will be downloaded to this local cache.
    """
    # Create the Pooch instance
    n = stratus(cache_dir=cache_dir, repo=repo, branch=branch)

    # Download the files
    errored_files = []
    for file in load_registry(branch=branch, repo=repo):
        try:
            n.fetch(file)
        except HTTPError:
            msg = f"File `{file}` not accessible in remote repository."
            logger.error(msg)
            errored_files.append(file)

    if errored_files:
        # loguru formats messages with `str.format`, so the placeholder must be
        # "{}"; a "%s" placeholder is emitted verbatim and the list is lost.
        logger.error(
            "The following files were unable to be downloaded: {}",
            errored_files,
        )
    else:
        # Success is reported once, and only when nothing failed.
        logger.info("Files were downloaded successfully.")
def gather_testing_data(
    worker_cache_dir: str | os.PathLike[str] | Path,
    worker_id: str,
    branch: str,
    repo: str,
    cache_dir: str | os.PathLike[str] | Path,
):
    """
    Gather testing data across workers.

    The ``master`` worker only populates the shared cache. Any other worker
    makes sure the shared cache is populated (guarded by a file lock on
    non-Windows platforms) and then copies the versioned data folder into its
    own cache directory.

    Parameters
    ----------
    worker_cache_dir : str or Path
        The path to the worker's cache directory where the testing data will be copied.
    worker_id : str
        The ID of the worker. If 'master', the testing data will be populated.
    branch : str
        The branch of the repository to use when fetching testing datasets.
    repo : str
        The URL of the repository to use when fetching testing datasets.
    cache_dir : str or Path
        The path to the local cache where the testing data is stored.

    Raises
    ------
    ValueError
        If the repository URL is not recognised.
    FileNotFoundError
        If the testing data is not found and UNIX-style file-locking is not supported on Windows.
    """
    cache_dir = Path(cache_dir)

    # The data folder inside the cache is named after the default data version.
    if repo.endswith("xclim-testdata"):
        version = default_xclim_test_data_version
    elif repo.endswith("mini-esgf-data"):
        version = default_esgf_test_data_version
    else:
        raise ValueError(
            f"Repository URL {repo} not recognized. "
            f"Please use one of {ESGF_TEST_DATA_REPO_URL} or {XCLIM_TEST_DATA_REPO_URL}"
        )
    versioned_data = cache_dir.joinpath(version)

    if worker_id == "master":
        populate_testing_data(branch=branch, repo=repo, cache_dir=cache_dir)
        return

    if platform == "win32":
        # Check the folder that is copied below. The previous check looked for
        # a folder named after `branch`, which differs from the versioned data
        # folder whenever a non-default branch is requested.
        if not versioned_data.exists():
            raise FileNotFoundError(
                "Testing data not found and UNIX-style file-locking is not supported on Windows. "
                "Consider running `populate_testing_data()` to download testing data beforehand."
            )
    else:
        cache_dir.mkdir(exist_ok=True, parents=True)
        lockfile = cache_dir.joinpath(".lock")
        test_data_being_written = FileLock(lockfile)
        with test_data_being_written:
            # This flag prevents multiple calls from re-attempting to download testing data in the same pytest run
            # NOTE(review): the flag is written here but never read in this function - confirm where it is checked.
            populate_testing_data(branch=branch, repo=repo, cache_dir=cache_dir)
            cache_dir.joinpath(".data_written").touch()
        # Re-acquire the lock so the lock file is only removed while it is held.
        with test_data_being_written.acquire():
            if lockfile.exists():
                lockfile.unlink()

    copytree(versioned_data, worker_cache_dir)
def audit_url(url: str, context: str | None = None) -> str:
"""
Check if the URL is well-formed.
Parameters
----------
url : str
The URL to check.
context : str, optional
Context for the error message, if the URL is not well-formed.
Returns
-------
str
The original URL if it is well-formed and uses secure HTTP (https).
Raises
------
URLError
If the URL is not well-formed.
"""
msg = ""
result = urlparse(url)
if result.scheme == "http":
msg = f"{context if context else ''} URL is not using secure HTTP: '{url}'".strip()
if not all([result.scheme, result.netloc]):
msg = f"{context if context else ''} URL is not well-formed: '{url}'".strip()
if msg:
logger.error(msg)
raise URLError(msg)
return url