Source code for clisops.project_utils

"""Project utilities for CLISOPS."""

import glob
import os

import xarray as xr
from loguru import logger

from clisops import CONFIG
from clisops.exceptions import InvalidProject
from clisops.utils.file_utils import FileMapper


[docs] class DatasetMapper: # noqa: E501 r""" Class to map to data path, dataset ID and files from any dataset input. Dset must be a string and can be input as: - A dataset ID: e.g. "cmip5.output1.INM.inmcm4.rcp45.mon.ocean.Omon.r1i1p1.latest.zostoga". - A file path: e.g. "/badc/cmip5/data/cmip5/output1/MOHC/HadGEM2-ES/rcp85/mon/atmos/Amon/r1i1p1/latest/tas/tas_Amon_HadGEM2-ES_rcp85_r1i1p1_200512-203011.nc". - A path to a group of files: e.g. "/badc/cmip5/data/cmip5/output1/MOHC/HadGEM2-ES/rcp85/mon/atmos/Amon/r1i1p1/latest/tas/\*.nc". - A directory e.g. "/badc/cmip5/data/cmip5/output1/MOHC/HadGEM2-ES/rcp85/mon/atmos/Amon/r1i1p1/latest/tas". - An instance of the FileMapper class (that represents a set of files within a single directory). When force=True, if the project cannot be identified, any attempt to use the base_dir of a project to resolve the data path will be ignored. Any of data_path, ds_id, and files that can be set will be set. Parameters ---------- dset : str or FileMapper The dataset input, which can be a Dataset/DataArray, a string representing a dataset ID or file path, or an instance of FileMapper. project : str, optional The project name to use for mapping the dataset. If not provided, it will be deduced from the dataset input. force : bool, optional If True, the function will attempt to find files even if the project of the input dataset cannot be identified. Default is False. """ SUPPORTED_EXTENSIONS = (".nc", ".gz") def __init__(self, dset, project=None, force=False): self._project = project self.dset = dset self._base_dir = None self._ds_id = None self._data_path = None self._files = [] self._parse(force)
[docs] @staticmethod def _get_base_dirs_dict(): projects = get_projects() base_dirs = {project: CONFIG[f"project:{project}"]["base_dir"] for project in projects} return base_dirs
[docs] @staticmethod def _is_ds_id(dset): return dset.count(".") > 1
[docs] def _deduce_project(self, dset): if isinstance(dset, str): if dset.startswith("/"): # by default this returns c3s-cmip6 not cmip6 (as they have the same base_dir) base_dirs_dict = self._get_base_dirs_dict() for project, base_dir in base_dirs_dict.items(): if dset.startswith(base_dir) and CONFIG[f"project:{project}"].get("is_default_for_path") is True: return project elif self._is_ds_id(dset): return dset.split(".")[0].lower() # this will not return c3s project names elif dset.endswith(".nc") or os.path.isfile(dset): dset = xr.open_dataset(dset, decode_times=xr.coders.CFDatetimeCoder(use_cftime=True)) return get_project_from_ds(dset) else: raise InvalidProject(f"The format of {dset} is not known and the project name could not be found.")
[docs] def _parse(self, force): # if instance of FileMapper if isinstance(self.dset, FileMapper): dset = self.dset.dirpath else: dset = self.dset # set project and base_dir if not self._project: try: self._project = self._deduce_project(dset) self._base_dir = get_project_base_dir(self._project) except InvalidProject: logger.info("The project could not be identified") if not force: raise InvalidProject("The project could not be identified and force was set to false") # get base_dir in the case where project has been supplied if not self._base_dir and self._project: self._base_dir = get_project_base_dir(self._project) # if a file, group of files or directory to files - find files if dset.startswith("/") or dset.endswith(".nc"): # if instance of FileMapper if isinstance(self.dset, FileMapper): self._files = self.dset.file_paths self._data_path = self.dset.dirpath if os.path.splitext(dset)[-1] in self.SUPPORTED_EXTENSIONS: if "*" in dset: self._files = sorted(glob.glob(dset)) else: self._files.append(dset) # remove file extension to create data_path self._data_path = "/".join(dset.split("/")[:-1]) # if base_dir identified, insert into data_path if self._base_dir: self._ds_id = ".".join(self._data_path.replace(self._base_dir, self._project).strip("/").split("/")) # test if dataset id elif self._is_ds_id(dset): self._ds_id = dset mappings = CONFIG.get(f"project:{self.project}", {}).get("fixed_path_mappings", {}) # If the dataset uses a fixed path mapping (from the config file) then use it if self._ds_id in mappings: data_path = mappings[self._ds_id] self._data_path = os.path.join(self._base_dir, data_path) # Use pattern of fixed file mapping as glob pattern self._files = sorted(glob.glob(self._data_path)) # Default mapping is done by converting '.' characters to '/' separators in path else: self._data_path = os.path.join(self._base_dir, "/".join(dset.split(".")[1:])) # use to data_path to find files if not set already if len(self._files) < 1: self._files = sorted(glob.glob(os.path.join(self._data_path, "*.nc"))) print(self._data_path) print(self._files)
@property def raw(self): """ Raw dataset input. Returns ------- xarray.Dataset or xarray.DataArray or str or FileMapper The original dataset input provided to the DatasetMapper. """ return self.dset @property def data_path(self): """ Dataset input converted to a data path. Returns ------- str The data path derived from the input dataset, which can be a file path or a directory path. """ return self._data_path @property def ds_id(self): """ Dataset input converted to a ds id. Returns ------- str The dataset ID derived from the input dataset, which is typically in the format "project.dataset_id". """ return self._ds_id @property def base_dir(self): """ The base directory of the input dataset. Returns ------- str The base directory where the dataset files are located, derived from the project configuration. """ return self._base_dir @property def files(self): """ The files found from the input dataset. Returns ------- list A list of file paths deduced from the input dataset. If the dataset is a directory or a file pattern, it will return all matching files. If the dataset is a dataset ID, it will return files based on the dataset ID mapping. """ return self._files @property def project(self): """ The project of the dataset input. Returns ------- str The project name derived from the input dataset. If the project cannot be identified, it will return None. """ return self._project
[docs] def derive_dset(dset: xr.Dataset | xr.DataArray | str | FileMapper) -> str: """ Derive the dataset path of the provided dset. Parameters ---------- dset : xarray.Dataset or xarray.DataArray or str or FileMapper The dataset input, which can be a Dataset/DataArray, a string representing a dataset ID or file path, or an instance of FileMapper. Returns ------- str The dataset path derived from the input dataset. """ return DatasetMapper(dset).data_path
[docs] def derive_ds_id(dset: xr.Dataset | xr.DataArray | str | FileMapper) -> str: """ Derive the dataset id of the provided dset. Parameters ---------- dset : xarray.Dataset or xarray.DataArray or str or FileMapper The dataset input, which can be a Dataset/DataArray, a string representing a dataset ID or file path, or an instance of FileMapper. Returns ------- str The dataset id derived from the input dataset. """ return DatasetMapper(dset).ds_id
[docs] def datapath_to_dsid(datapath: xr.Dataset | xr.DataArray | str | FileMapper) -> str: """ Switch from dataset path to ds id. Parameters ---------- datapath : xarray.Dataset or xarray.DataArray or str or FileMapper The dataset input, which can be a Dataset/DataArray, a string representing a dataset ID or file path, or an instance of FileMapper. Returns ------- str The dataset id derived from the input dataset path. """ return DatasetMapper(datapath).ds_id
[docs] def dsid_to_datapath(dsid: str) -> str: """ Switch from ds id to dataset path. Parameters ---------- dsid : str The dataset ID, which should be in the format "project.dataset_id". Returns ------- str The dataset path derived from the input dataset ID. """ return DatasetMapper(dsid).data_path
[docs] def dset_to_filepaths(dset, force=False): """ Get filepaths deduced from input dset. Parameters ---------- dset : xarray.Dataset or xarray.DataArray or str or FileMapper The dataset input, which can be a Dataset/DataArray, a string representing a dataset ID or file path, or an instance of FileMapper. force : bool, optional If True, the function will attempt to find files even if the project of the input dset cannot be identified. Default is False. Returns ------- list A list of file paths deduced from the input dataset. """ mapper = DatasetMapper(dset, force=force) return mapper.files
[docs] def switch_dset(dset: xr.Dataset | xr.DataArray | str | FileMapper) -> str: """ Switch between dataset path and ds id. Parameters ---------- dset : xarray.Dataset or xarray.DataArray or str or FileMapper The dataset input, which can be a Dataset/DataArray, a string representing a dataset ID or file path, or an instance of FileMapper. Returns ------- str The dataset path or dataset ID derived from the input dataset, switched from the input. """ if dset.startswith("/"): return datapath_to_dsid(dset) else: return dsid_to_datapath(dset)
[docs] def get_projects() -> list[str]: """ Get all the projects available in the config. Returns ------- list of str A list of project names derived from the configuration. """ return [_.split(":")[1] for _ in CONFIG.keys() if _.startswith("project:")]
[docs] def get_project_from_ds(ds: xr.Dataset | xr.DataArray) -> str | None: """ Get the project from an xarray Dataset/DataArray. Parameters ---------- ds : xarray.Dataset or xarray.DataArray The xarray Dataset or DataArray from which to derive the project. Returns ------- str | None The project derived from the input dataset. """ for project in get_projects(): key = map_facet("project", project) if ds.attrs.get(key, "").lower() == project: return project
[docs] def get_project_name(dset: xr.Dataset | xr.DataArray | str | FileMapper) -> str | None: """ Get the project from an input dset. Parameters ---------- dset : xarray.Dataset or xarray.DataArray or str or FileMapper The dataset input, which can be a Dataset/DataArray, a string representing a dataset ID or file path, or an instance of FileMapper. Returns ------- str | None The project name derived from the input dataset, or None if the project cannot be identified. """ if type(dset) in (xr.core.dataarray.DataArray, xr.core.dataset.Dataset): return get_project_from_ds(dset) # will not return c3s dataset else: return DatasetMapper(dset).project
[docs] def map_facet(facet: str, project: str) -> str: """ Return mapped facet value from config or facet name if not found. Parameters ---------- facet : str The facet name to map. project : str The project name to use for mapping. Returns ------- str The mapped facet value or the original facet name if no mapping is found. """ # Return mapped value or the same facet name proj_mappings = CONFIG[f"project:{project}"]["mappings"] return proj_mappings.get(facet, facet)
[docs] def get_facet(facet_name: str, facets: dict, project: str) -> str: """ Get facet from project config. Parameters ---------- facet_name : str The name of the facet to retrieve. facets : dict A dictionary of facets from the project configuration. project : str The project name to use for mapping the facet. Returns ------- str The mapped facet value from the project configuration. """ return facets[map_facet(facet_name, project)]
[docs] def get_project_base_dir(project: str) -> str: """ Get the base directory of a project from the config. Parameters ---------- project : str The name of the project for which to retrieve the base directory. Returns ------- str The base directory of the specified project. """ try: return CONFIG[f"project:{project}"]["base_dir"] except KeyError: raise InvalidProject("The project supplied is not known.")
[docs] def get_data_node_dirs_dict() -> dict[str, str]: """ Get a dictionary of the data node roots used for retrieving original files. Returns ------- dict A dictionary where keys are project names and values are the data node root directories. """ projects = get_projects() data_node_dirs = { project: CONFIG[f"project:{project}"].get("data_node_root") for project in projects if CONFIG[f"project:{project}"].get("data_node_root") } return data_node_dirs
[docs] def get_project_from_data_node_root(url: str) -> str: """ Identify the project from data node root by identifying the data node root in the input url. Parameters ---------- url : str The URL of the original file, which should contain the data node root. Returns ------- str The project name derived from the data node root in the input URL. Raises ------ InvalidProject If the project cannot be identified from the URL. """ data_node_dict = get_data_node_dirs_dict() project = None for proj, data_node_root in data_node_dict.items(): if data_node_root in url: project = proj if not project: raise InvalidProject( f"The project could not be identified from the URL {url} so it could not be mapped to a file path." ) return project
[docs] def url_to_file_path(url: str) -> str: """ Convert the input url of an original file to a file path. Parameters ---------- url : str The URL of the original file, which should contain the data node root. Returns ------- str The file path derived from the input URL, based on the project's base directory and data node root. """ project = get_project_from_data_node_root(url) data_node_root = CONFIG.get(f"project:{project}", {}).get("data_node_root") base_dir = CONFIG.get(f"project:{project}", {}).get("base_dir") file_path = os.path.join(base_dir, url.partition(data_node_root)[2]) return file_path