Source code for finch.processes.utils

# noqa: D100
import json
import logging
import os
import zipfile
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from itertools import chain
from multiprocessing.pool import ThreadPool
from pathlib import Path
from typing import (
    Callable,
    Deque,
    Dict,
    Generator,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)

import cftime
import numpy as np
import pandas as pd
import requests
import sentry_sdk
import xarray as xr
import xclim
import yaml
from netCDF4 import num2date
from pandas.api.types import is_numeric_dtype
from pywps import (
    FORMATS,
    BoundingBoxInput,
    BoundingBoxOutput,
    ComplexInput,
    ComplexOutput,
    LiteralInput,
    LiteralOutput,
    Process,
    configuration,
)
from pywps.configuration import get_config_value
from pywps.inout.outputs import MetaFile, MetaLink4
from requests.exceptions import ConnectionError, InvalidSchema, MissingSchema
from slugify import slugify
from xclim.core.indicator import build_indicator_module_from_yaml
from xclim.core.utils import InputKind

LOGGER = logging.getLogger("PYWPS")

PywpsInput = Union[LiteralInput, ComplexInput, BoundingBoxInput]
PywpsOutput = Union[LiteralOutput, ComplexOutput, BoundingBoxOutput]
RequestInputs = Dict[str, Deque[PywpsInput]]

# These are parameters that set options. They are not `compute` arguments.
INDICATOR_OPTIONS = [
    "check_missing",
    "missing_options",
    "cf_compliance",
    "data_validation",
]
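
# For illustration, mirroring how `compute_indices` (defined below) consumes these
# keys: they are popped from the computed keyword arguments and forwarded to xclim's
# global options rather than to the indicator itself, e.g. (hypothetical indicator
# and arguments):
#
#   with xclim.core.options.set_options(check_missing="pct", cf_compliance="log"):
#       out = indicator(tas=tas, freq="YS")
#
# The values shown ("pct", "log") are examples of valid xclim settings, not finch
# defaults.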


def get_virtual_modules():
    """Load virtual modules."""
    modules = {}
    if modfiles := get_config_value("finch", "xclim_modules"):
        for modfile in modfiles.split(","):
            if os.path.isabs(modfile):
                mod = build_indicator_module_from_yaml(Path(modfile))
            else:
                mod = build_indicator_module_from_yaml(
                    Path(__file__).parent.parent.joinpath(modfile)
                )
            indicators = []
            for indname, ind in mod.iter_indicators():
                indicators.append(ind.get_instance())
            modules[Path(modfile).name] = dict(indicators=indicators)
    return modules
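
# Illustrative sketch (not shipped configuration): the "xclim_modules" option read
# above is expected to be a comma-separated list of xclim YAML module files, either
# absolute or relative to the finch package, e.g. in the server configuration:
#
#   [finch]
#   xclim_modules = configs/extra_indicators.yml,/opt/finch/local_indicators.yml
#
# Both paths here are hypothetical; each file must follow the YAML module format
# accepted by `build_indicator_module_from_yaml`.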


@dataclass
class DatasetConfiguration:
    """Dataset configuration class.

    Attributes
    ----------
    path : str
        The path (or url) to the root directory where to search for the data.
    pattern : str
        The pattern of the filenames. Must include at least "variable", "scenario" and
        "model". Patterns must be understandable by :py:func:`parse.parse`.
    local : bool
        Whether the path points to a local directory or a remote THREDDS catalog.
    depth : int
        The depth to which to search for files below the directory. < 0 searches recursively.
    suffix : str
        When the files are local, this is the suffix of the files.
    allowed_values : dict
        Mapping from field name to a list of allowed values. Must include "scenario",
        "model" and "variable"; the latter defines which variables are available and
        thus which indicators can be used.
    model_lists : dict
        A mapping from list name to a list of model names to provide special sub-lists.
        The values can also be tuples of (model name, realization number), in which case
        the pattern must include a "realization" field.
    """

    path: str
    pattern: str
    local: bool
    allowed_values: dict
    depth: int = 0
    suffix: str = "*nc"
    model_lists: dict = field(default_factory=dict)
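
# Illustrative sketch of a "datasets_config" YAML file, as consumed by
# `get_datasets_config` below. Only the field names come from `DatasetConfiguration`;
# the dataset name, URL, pattern and allowed values are hypothetical placeholders.
#
#   my_dataset:
#     path: https://example.org/thredds/catalog/datasets/catalog.xml
#     pattern: "{variable}_{frequency}_{model}_{scenario}_{realization}.nc"
#     local: false
#     depth: 1
#     allowed_values:
#       variable: [tasmin, tasmax, pr]
#       scenario: [rcp45, rcp85]
#       model: [MODEL-A, MODEL-B]
#     model_lists:
#       small_ensemble: [MODEL-A]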


def get_datasets_config():  # noqa: D103
    p = get_config_value("finch", "datasets_config")
    if not p:
        # No config given.
        return {}
    if not Path(p).is_absolute():
        p = Path(__file__).parent.parent / p
    with open(p) as f:
        conf = yaml.safe_load(f)
    return {ds: DatasetConfiguration(**dsconf) for ds, dsconf in conf.items()}


def get_available_variables():  # noqa: D103
    conf = get_datasets_config()
    return set(chain(*(d.allowed_values["variable"] for d in conf.values())))


def iter_xc_variables(indicator: xclim.core.indicator.Indicator):  # noqa: D103
    for n, p in indicator.parameters.items():
        if p.kind in [InputKind.VARIABLE, InputKind.OPTIONAL_VARIABLE]:
            yield n


def log_file_path(process: Process) -> Path:
    """Return the filepath to write the process logfile."""
    return Path(process.workdir) / "log.txt"


def write_log(
    process: Process,
    message: str,
    level=logging.INFO,
    *,
    process_step: str = None,
    subtask_percentage: int = None,
):
    """Log the process status.

    - With the logging module
    - To a log file stored in the process working directory
    - Update the response document with the message and the status percentage

    subtask_percentage is not the percentage of the whole process, but the percent
    done in the current processing step (see `process.status_percentage_steps`).
    """
    LOGGER.log(level, message)

    status_percentage = process.response.status_percentage

    # if a process_step is given, set this as the status percentage
    if process_step:
        status_percentage = process.status_percentage_steps.get(
            process_step, status_percentage
        )

    # if a subtask percentage is given, add this value to the status_percentage
    if subtask_percentage is not None:
        steps_percentages = list(process.status_percentage_steps.values())
        for n, percent in enumerate(steps_percentages):
            if status_percentage < percent:
                next_step_percentage = percent
                current_step_percentage = steps_percentages[n - 1]
                break
        else:
            current_step_percentage, next_step_percentage = 1, 100
            if steps_percentages:
                current_step_percentage = steps_percentages[-1]
        step_delta = next_step_percentage - current_step_percentage
        sub_percentage = subtask_percentage / 100 * step_delta
        status_percentage = current_step_percentage + int(sub_percentage)

    if level >= logging.INFO:
        with log_file_path(process).open("a", encoding="utf8") as f:
            f.write(message + "\n")

    try:
        process.response.update_status(message, status_percentage=status_percentage)
    except AttributeError:
        pass


def get_attributes_from_config():
    """Get all explicitly passed metadata attributes from the config in section finch:metadata."""
    # Remove all "defaults", only keep explicitly-passed options.
    # This works because we didn't define any defaults for this section,
    # but it will do strange things if any of the defaults have the same name as a passed field.
    # This is especially risky, since ALL environment variables are listed in the defaults...
    names = set(configuration.CONFIG["finch:metadata"].keys()) - set(
        configuration.CONFIG._defaults.keys()
    )
    return {
        name: configuration.get_config_value("finch:metadata", name) for name in names
    }


def compute_indices(
    process: Process, func: Callable, inputs: RequestInputs
) -> xr.Dataset:  # noqa: D103
    kwds = {}
    global_attributes = {}
    for name, input_queue in inputs.items():
        if isinstance(input_queue[0], LiteralInput):
            value = [inp.data for inp in input_queue]
            if len(input_queue) == 1:
                value = value[0]
            kwds[name] = value

    variable = kwds.pop("variable", None)

    for name, input_queue in inputs.items():
        input = input_queue[0]

        if isinstance(input, ComplexInput):
            if input.supported_formats[0] == FORMATS.JSON:
                kwds[name] = json.loads(input.data)

            elif input.supported_formats[0] in [FORMATS.NETCDF, FORMATS.DODS]:
                ds = try_opendap(
                    input, logging_function=lambda msg: write_log(process, msg)
                )
                global_attributes = global_attributes or ds.attrs
                vars = list(ds.data_vars.values())

                if variable:
                    if variable in ds.data_vars:
                        kwds[name] = ds.data_vars[variable]
                    else:
                        raise KeyError(
                            f"Variable name '{variable}' not in data_vars {list(ds.data_vars)}"
                        )
                else:
                    # Get variable matching input parameter name.
                    if name in ds.data_vars:
                        kwds[name] = ds.data_vars[name]
                    # If only one variable in dataset, use it.
                    elif len(vars) == 1:
                        kwds[name] = vars[0]

    user_attrs = get_attributes_from_config()
    global_attributes.update(
        {
            "climateindex_package_id": "https://github.com/Ouranosinc/xclim",
            "product": "derived climate index",
        },
        **user_attrs,
    )

    options = {name: kwds.pop(name) for name in INDICATOR_OPTIONS if name in kwds}

    with xclim.core.options.set_options(**options):
        out = func(**kwds)

    output_dataset = xr.Dataset(
        data_vars=None, coords=out.coords, attrs=global_attributes
    )

    # fix frequency of computed output (xclim should handle this)
    if output_dataset.attrs.get("frequency") == "day" and "freq" in kwds:
        conversions = {
            "YS": "yr",
            "MS": "mon",
            "QS-DEC": "seasonal",
            "AS-JUL": "seasonal",
        }
        output_dataset.attrs["frequency"] = conversions.get(kwds["freq"], "day")

    output_dataset[out.name] = out
    return output_dataset


def drs_filename(ds: xr.Dataset, variable: str = None):
    """Generate filename according to the data reference syntax (DRS).

    Parameters
    ----------
    ds : xr.Dataset
        Dataset from which the filename is generated.
    variable : str
        Appropriate variable for the filename. If not set (default), the variable is
        determined from the dataset variables.

    Returns
    -------
    str
        DRS filename.

    Raises
    ------
    KeyError
        When the dataset doesn't have the required attributes.

    Notes
    -----
    Copied and modified from https://github.com/bird-house/eggshell,
    which doesn't have a release usable by finch.

    Based on the metadata in the resource:
    http://cmip-pcmdi.llnl.gov/cmip5/docs/cmip5_data_reference_syntax.pdf
    https://pypi.python.org/pypi/drslib
    """
    if len(ds.data_vars) == 1:
        variable = list(ds.data_vars)[0]
    if variable is None:
        variable = [k for k, v in ds.variables.items() if len(v.dims) >= 3][0]
    variable = variable.replace("_", "-")

    # CORDEX example: tas_EUR-11_ICHEC-EC-EARTH_historical_r3i1p1_DMI-HIRHAM5_v1_day
    cordex_pattern = "{variable}_{domain}_{driving_model}_{experiment}_{ensemble}_{model}_{version}_{frequency}"

    # CMIP5 example: tas_MPI-ESM-LR_historical_r1i1p1
    cmip5_pattern = "{variable}_{model}_{experiment}_{ensemble}"

    if ds.attrs["project_id"] in ("CORDEX", "EOBS"):
        filename = cordex_pattern.format(
            variable=variable,
            domain=ds.attrs["CORDEX_domain"],
            driving_model=ds.attrs["driving_model_id"],
            experiment=ds.attrs["experiment_id"],
            ensemble=ds.attrs["driving_model_ensemble_member"],
            model=ds.attrs["model_id"],
            version=ds.attrs["rcm_version_id"],
            frequency=ds.attrs["frequency"],
        )
    elif ds.attrs["project_id"] == "CMIP5":
        ensemble = "r{}i{}p{}".format(
            ds.attrs["driving_realization"],
            ds.attrs["driving_initialization_method"],
            ds.attrs["driving_physics_version"],
        )
        filename = cmip5_pattern.format(
            variable=variable,
            model=ds.attrs["driving_model_id"],
            experiment=ds.attrs["driving_experiment_id"].replace(",", "+"),
            ensemble=ensemble,
        )
    else:
        params = [
            variable,
            ds.attrs.get("frequency"),
            ds.attrs.get("model_id"),
            ds.attrs.get("driving_model_id"),
            ds.attrs.get("experiment_id", "").replace(",", "+"),
            ds.attrs.get("driving_experiment_id", "").replace(",", "+"),
        ]
        params = [k for k in params if k]
        filename = "_".join(params)

    if "time" in ds:
        date_from = ds.time[0].values
        date_to = ds.time[-1].values
        if "units" in ds.time.attrs:
            # times are encoded
            units = ds.time.units
            calendar = ds.time.attrs.get("calendar", "standard")
            date_from = num2date(date_from, units, calendar)
            date_to = num2date(date_to, units, calendar)

        date_from = pd.to_datetime(str(date_from))
        date_to = pd.to_datetime(str(date_to))
        filename += f"_{date_from:%Y%m%d}-{date_to:%Y%m%d}"

    # sanitize any spaces that came from the source input's metadata
    filename = filename.replace(" ", "-")
    filename += ".nc"
    return filename


def try_opendap(
    input: ComplexInput,
    *,
    chunks="auto",
    decode_times=True,
    chunk_dims=None,
    logging_function=lambda message: None,
) -> xr.Dataset:
    """Try to open the file as an OPeNDAP url and chunk it.

    By default, chunks are determined by xarray/dask. If `chunks=None` or `chunk_dims`
    is given, finch rechunks the dataset according to the logic of `chunk_dataset`.
    Pass `chunks=False` to disable dask entirely on this dataset.
    """
    url = input.url
    logging_function(f"Try opening DAP link {url}")

    if is_opendap_url(url):
        path = url
        logging_function(f"Opened dataset as an OPeNDAP url: {url}")
    else:
        if url.startswith("http"):
            # Accessing the file property writes it to disk if it's a url
            logging_function(f"Downloading dataset for url: {url}")
        else:
            logging_function(f"Opening as local file: {input.file}")
        path = input.file

    try:
        # Try to open the dataset
        ds = xr.open_dataset(path, chunks=chunks or None, decode_times=decode_times)
    except NotImplementedError:
        if chunks == "auto":
            # Some dtypes are not compatible with auto chunking (object, so unbounded strings)
            logging_function(
                "xarray auto-chunking failed, opening with no chunks and inferring chunks ourselves."
            )
            chunks = None
            ds = xr.open_dataset(path, chunks=None, decode_times=decode_times)
        else:
            raise

    # To handle large number of grid cells (50+) in subsetted data
    if "region" in ds.dims and "time" in ds.dims:
        chunks = dict(time=-1, region=5)
        ds = ds.chunk(chunks)
    elif chunks is None or chunk_dims is not None:
        ds = ds.chunk(chunk_dataset(ds, max_size=1000000, chunk_dims=chunk_dims))
    return ds


def process_threaded(function: Callable, inputs: Iterable):
    """Based on the current configuration, process a list threaded or not."""
    threads = int(configuration.get_config_value("finch", "subset_threads"))

    if threads > 1:
        pool = ThreadPool(processes=threads)
        outputs = list(pool.imap_unordered(function, inputs))
        pool.close()
        pool.join()
    else:
        outputs = [function(r) for r in inputs]

    return outputs


def chunk_dataset(ds, max_size=1000000, chunk_dims=None):
    """Ensure the chunked size of a xarray.Dataset is below a certain size.

    Cycle through the dimensions, divide the chunk size by 2 until criteria is met.
    If chunk_dims is given, limits the chunking to those dimensions, if they are
    found in the dataset.
    """
    from functools import reduce
    from itertools import cycle
    from operator import mul

    chunks = dict(ds.sizes)
    dims = set(ds.dims).intersection(chunk_dims or ds.dims)
    if not dims:
        LOGGER.warning(
            f"Provided dimension names for chunking ({chunk_dims}) were "
            f"not found in dataset dims ({ds.dims}). No chunking was done."
        )
        return chunks

    def chunk_size():
        return reduce(mul, chunks.values())

    for dim in cycle(dims):
        if chunk_size() < max_size:
            break
        chunks[dim] = max(chunks[dim] // 2, 1)

    return chunks


def make_metalink_output(
    process: Process, files: List[Path], description: str = None
) -> MetaLink4:
    """Make a MetaLink output from a list of files."""
    metalink = MetaLink4(
        identity=process.identifier,
        description=description,
        publisher="Finch",
        workdir=process.workdir,
    )

    for f in files:
        mf = MetaFile(identity=f.stem, fmt=FORMATS.NETCDF)
        mf.file = str(f)
        metalink.append(mf)

    return metalink


def is_opendap_url(url):
    """Check if a provided url is an OpenDAP url.

    The DAP Standard specifies that a specific tag must be included in the
    Content-Description header of every request. This tag is one of:
    "dods-dds" | "dods-das" | "dods-data" | "dods-error"
    So we can check if the header starts with `dods`.

    Even then, some OpenDAP servers seem to not include the specified header...
    So we need to let the netCDF4 library actually open the file.
    """
    try:
        content_description = requests.head(url, timeout=5).headers.get(
            "Content-Description"
        )
    except (ConnectionError, MissingSchema, InvalidSchema):
        return False

    if content_description:
        return content_description.lower().startswith("dods")
    else:
        return False
        # try:
        #     # For a non-DAP URL, this just hangs python.
        #     dataset = netCDF4.Dataset(url)
        # except OSError:
        #     return False
        # return dataset.disk_format in ("DAP2", "DAP4")


def single_input_or_none(inputs, identifier) -> Optional[str]:
    """Return first input item."""
    try:
        return inputs[identifier][0].data
    except KeyError:
        return None


def netcdf_file_list_to_csv(
    netcdf_files: Union[List[Path], List[str]],
    output_folder,
    filename_prefix,
    csv_precision: Optional[int] = None,
) -> Tuple[List[str], str]:
    """Write csv files for a list of netcdf files.

    Produces one csv file per calendar type, along with a metadata folder in the
    output_folder.
    """
    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    def get_attrs_fallback(ds, *args):
        for key in args:
            try:
                return ds.attrs[key]
            except KeyError:
                continue
        raise KeyError(f"Couldn't find any attribute in [{', '.join(args)}]")

    metadata = {}
    concat_by_calendar = {}
    for file in netcdf_files:
        ds = xr.open_dataset(str(file), decode_times=False)
        coords = ds.coords
        calendar = ds.time.calendar
        ds["time"] = xr.decode_cf(ds).time

        for variable in ds.data_vars:
            # for a specific dataset the keys are different:
            # BCCAQv2+ANUSPLIN300_BNU-ESM_historical+rcp85_r1i1p1_19500101-21001231
            model = get_attrs_fallback(ds, "driving_model_id", "GCM__model_id")
            experiment = get_attrs_fallback(
                ds, "driving_experiment_id", "GCM__experiment"
            )
            experiment = experiment.replace(",", "_")

            output_variable = f"{variable}_{model}_{experiment}"

            units = ds[variable].units
            if units:
                output_variable += f"_({units})"

            ds = ds.rename({variable: output_variable})
            if csv_precision and csv_precision < 0:
                ds = ds.round(csv_precision)
                csv_precision = 0
            df = dataset_to_dataframe(ds)

            if calendar not in concat_by_calendar:
                # TODO: Why was this there? When we have a time axis, this makes the concat fail.
                # if "lat" in df.index.names and "lon" in df.index.names:
                #     df = df.reset_index(["lat", "lon"])
                concat_by_calendar[calendar] = [df]
            else:
                concat_by_calendar[calendar].append(df[output_variable])

            metadata[output_variable] = format_metadata(ds)

    output_csv_list = []
    for calendar_type, data in concat_by_calendar.items():
        output_csv = output_folder / f"{filename_prefix}_{calendar_type}.csv"
        concat = pd.concat(data, axis=1)
        if "region" in concat.reset_index().columns:
            concat = (
                concat.reset_index()
                .sort_values(["region", "time"])
                .set_index(["lat", "lon", "time"])
                .drop(columns="region")
            )
        else:
            concat = (
                concat.reset_index()
                .sort_values(["lat", "lon", "time"])
                .set_index(["lat", "lon", "time"])
            )

        dropna_threshold = 1  # at least one value
        concat.dropna(thresh=dropna_threshold, inplace=True)

        if csv_precision is not None:
            for v in concat:
                if v not in coords and is_numeric_dtype(concat[v]):
                    concat[v] = concat[v].map(
                        lambda x: f"{x:.{csv_precision}f}" if not pd.isna(x) else ""
                    )

        concat.to_csv(output_csv)
        output_csv_list.append(output_csv)

    metadata_folder = output_folder / "metadata"
    metadata_folder.mkdir(parents=True, exist_ok=True)
    for output_variable, info in metadata.items():
        metadata_file = metadata_folder / f"{output_variable}.csv"
        metadata_file.write_text(info)

    return output_csv_list, str(metadata_folder)


def dataset_to_dataframe(ds: xr.Dataset) -> pd.DataFrame:
    """Convert a Dataset while keeping the hour of the day uniform at hour=12."""
    if not np.all(ds.time.dt.hour == 12):
        attrs = ds.time.attrs
        # np.datetime64 doesn't have the 'replace' method
        time_values = ds.time.values
        if not hasattr(time_values[0], "replace"):
            time_values = pd.to_datetime(time_values)
        ds["time"] = [y.replace(hour=12) for y in time_values]
        ds.time.attrs = attrs

    df = ds.to_dataframe().reset_index()
    if "realization" not in ds.dims:
        new_cols = [ll for ll in ["lat", "lon", "time"] if ll in df.columns]
    else:
        new_cols = [
            ll
            for ll in ["lat", "lon", "time", "scenario", "region"]
            if ll in df.columns
        ]
        values = [c for c in df.columns if c not in new_cols and c != "realization"]
        df = df.pivot(
            index=new_cols,
            columns="realization",
            values=values,
        ).reset_index()
        # pivot table columns are multi-indexes : flatten
        df.columns = [":".join(d) if d[1] else d[0] for d in df.columns]

    df = df.sort_values(new_cols).set_index(new_cols)
    new_cols.extend([ll for ll in df.columns if ll not in new_cols])
    return df


def format_metadata(ds) -> str:
    """For an xarray dataset, return its formatted metadata."""

    def _fmt_attrs(obj, name="", comment="# ", tab=" "):
        """Return a string of an object's attributes."""
        lines = ["", name]
        for key, val in obj.attrs.items():
            lines.append(
                tab + key + ":: " + str(val).replace("\n", "\n" + comment + tab + " ")
            )
        out = ("\n" + comment + tab).join(lines)
        return out

    objs = [
        ({"": ds}, "Global attributes"),
        (ds.coords, "Coordinates"),
        (ds.data_vars, "Data variables"),
    ]

    out = ""
    for obj, name in objs:
        out += "# " + name
        tab = "" if name == "Global attributes" else " "
        for key, val in obj.items():
            out += _fmt_attrs(val, key, tab=tab)
        out += "\n#\n"
    return out


def zip_files(
    output_filename, files: Iterable, log_function: Callable[[str, int], None] = None
):
    """Create a zipfile from a list of files or folders.

    log_function is a function that receives a message and a percentage.
    """
    log_function = log_function or (lambda *a: None)
    with zipfile.ZipFile(
        output_filename, mode="w", compression=zipfile.ZIP_DEFLATED
    ) as z:
        all_files = []
        for file in files:
            file = Path(file)
            if file.is_dir():
                all_files += list(file.rglob("*.*"))
            else:
                all_files.append(file)

        common_folder = None
        all_parents = [list(reversed(file.parents)) for file in all_files]
        for parents in zip(*all_parents):
            if len(set(parents)) == 1:
                common_folder = parents[0]
            else:
                break

        n_files = len(all_files)
        for n, filename in enumerate(all_files):
            percentage = int(n / n_files * 100)
            message = f"Zipping file {n + 1} of {n_files}"
            log_function(message, percentage)
            arcname = filename.relative_to(common_folder) if common_folder else None
            z.write(filename, arcname=arcname)


def make_tasmin_tasmax_pairs(
    filenames: List[Path],
) -> Generator[Tuple[Path, Path], None, None]:
    """Return pairs of corresponding tasmin-tasmax files based on their filename."""
    tasmin_files = [f for f in filenames if "tasmin" in f.name.lower()]
    tasmax_files = [f for f in filenames if "tasmax" in f.name.lower()]
    for tasmin in tasmin_files[:]:
        for tasmax in tasmax_files[:]:
            if tasmin.name.lower() == tasmax.name.lower().replace("tasmax", "tasmin"):
                yield tasmin, tasmax
                tasmax_files.remove(tasmax)
                tasmin_files.remove(tasmin)
                break
    for f in tasmin_files + tasmax_files:
        sentry_sdk.capture_message(
            f"Couldn't find matching tasmin or tasmax for: {f}", level="error"
        )


def fix_broken_time_index(ds: xr.Dataset):
    """Fix for a single broken index in a specific file."""
    if "time" not in ds.dims:
        return
    time_dim = ds.time.values
    times_are_encoded = "units" in ds.time.attrs

    if times_are_encoded:
        wrong_id = np.argwhere(np.isclose(time_dim, 0))
    else:
        if ds.time.dt.calendar != "noleap":
            return
        wrong_id = np.argwhere(
            time_dim == cftime.DatetimeNoLeap(year=1850, month=1, day=1, hour=0)
        )
    if wrong_id.size == 0:
        return
    wrong_id = wrong_id[0, 0]

    if wrong_id == 0 or wrong_id == len(ds.time) - 1:
        return

    daily_gap = 1.0 if times_are_encoded else timedelta(days=1)
    is_daily = time_dim[wrong_id + 1] - time_dim[wrong_id - 1] == daily_gap * 2

    if is_daily:
        fixed_time = time_dim
        fixed_time[wrong_id] = time_dim[wrong_id - 1] + daily_gap
        attrs = ds.time.attrs
        ds["time"] = fixed_time
        ds.time.attrs = attrs


def dataset_to_netcdf(
    ds: xr.Dataset, output_path: Union[Path, str], compression_level=0
) -> None:
    """Write an :py:class:`xarray.Dataset` to disk, optionally using compression."""
    encoding = {}
    if "time" in ds.dims:
        encoding["time"] = {
            "dtype": "single",  # better compatibility with OpenDAP in thredds
        }
        fix_broken_time_index(ds)

    if compression_level:
        for v in ds.data_vars:
            encoding[v] = {"zlib": True, "complevel": compression_level}

    # Perform computations
    ds.load()  # This is necessary when running with gunicorn to avoid lock-ups
    ds.to_netcdf(str(output_path), format="NETCDF4", encoding=encoding)


def update_history(
    hist_str: str,
    *inputs_list: Union[xr.DataArray, xr.Dataset],
    new_name: Optional[str] = None,
    **inputs_kws: Union[xr.DataArray, xr.Dataset],
):
    r"""Return a history string with the timestamped message and the combination of the history of all inputs.

    The new history entry is formatted as
    "[<timestamp>] <new_name>: <hist_str> - finch version: <finch version>."

    Parameters
    ----------
    hist_str : str
        The string describing what has been done on the data.
    new_name : Optional[str]
        The name of the newly created variable or dataset to prefix hist_msg.
    \*inputs_list : Union[xr.DataArray, xr.Dataset]
        The datasets or variables that were used to produce the new object.
        Inputs given that way will be prefixed by their "name" attribute if available.
    \*\*inputs_kws : Union[xr.DataArray, xr.Dataset]
        Mapping from names to the datasets or variables that were used to produce the new object.
        Inputs given that way will be prefixed by the passed name.

    Returns
    -------
    str
        The combined history of all inputs starting with `hist_str`.

    See Also
    --------
    merge_attributes
    """
    from finch import __version__  # pylint: disable=cyclic-import

    merged_history = xclim.core.formatting.merge_attributes(
        "history",
        *inputs_list,
        new_line="\n",
        missing_str="",
        **inputs_kws,
    )
    if len(merged_history) > 0 and not merged_history.endswith("\n"):
        merged_history += "\n"
    merged_history += (
        f"[{datetime.now():%Y-%m-%d %H:%M:%S}] {new_name or ''}: "
        f"{hist_str} - finch version: {__version__}."
    )
    return merged_history


def valid_filename(name: Union[Path, str]) -> Union[Path, str]:
    """Remove unsupported characters from a filename.

    Returns
    -------
    str or Path
        The sanitized filename, of the same type as the input.

    Examples
    --------
    >>> valid_filename("summer's tasmin.nc")
    'summers_tasmin.nc'
    """
    p = Path(name)
    s = slugify(p.stem, separator="_")
    if not s:
        raise ValueError(f"Filename not valid. Got {name}.")
    out = p.parent / (s + p.suffix)
    if isinstance(name, str):
        return str(out)
    return out
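

# Illustrative usage sketch of a few helpers above (hypothetical file name and
# values, kept in comments so nothing runs at import time):
#
#   ds = xr.open_dataset("tasmin_MODEL-A_rcp85_r1i1p1.nc")  # hypothetical file
#   # Halve dimensions in turn until the product of chunk sizes is below max_size;
#   # the exact split depends on dimension iteration order.
#   ds = ds.chunk(chunk_dataset(ds, max_size=1_000_000))
#   ds.attrs["history"] = update_history("computed tx_max", ds, new_name="tx_max")
#   dataset_to_netcdf(ds, valid_filename("summer's tasmin.nc"), compression_level=4)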