from typing import Any, cast
from collections.abc import Hashable
import logging
from xarray import Dataset
from openghg.types import optionalPathType
__all__ = [
"assign_attributes",
"get_attributes",
"define_species_label",
"assign_flux_attributes",
"get_flux_attributes",
"dataset_formatter",
"data_variable_formatter",
]
logger = logging.getLogger("openghg.standardise")
logger.setLevel(logging.DEBUG) # Have to set level for logger as well as handler
def assign_attributes(
data: dict,
site: str | None = None,
network: str | None = None,
sampling_period: str | float | int | None = None,
update_mismatch: str = "never",
site_filepath: optionalPathType = None,
species_filepath: optionalPathType = None,
) -> dict:
"""Assign attributes to each site and species dataset. This ensures that the xarray Datasets produced
are CF 1.7 compliant. Some of the attributes written to the Dataset are saved as metadata
to the Datasource allowing more detailed searching of data.
If accessing underlying stored site or species definitions, this will
be accessed from the openghg/openghg_defs repository by default.
Args:
data: Dictionary containing data, metadata and attributes
site: Site code
        sampling_period: Number of seconds for which the air sample is collected.
            Only used for the time variable attribute.
network: Network name
update_mismatch: This determines how mismatches between the internal data
"attributes" and the supplied / derived "metadata" are handled.
This includes the options:
- "never" - don't update mismatches and raise an AttrMismatchError
- "from_source" / "attributes" - update mismatches based on input data (e.g. data attributes)
- "from_definition" / "metadata" - update mismatches based on associated data (e.g. site_info.json)
site_filepath: Alternative site info file
species_filepath: Alternative species info file
Returns:
dict: Dictionary of combined data with correct attributes assigned to Datasets
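
    Example:
        A minimal sketch of the expected input structure (illustrative; assumes
        ds is an xarray Dataset with a "time" dimension and a "ch4" variable,
        and "ch4_tac" is a key of your choosing):

        >>> data = {
        ...     "ch4_tac": {
        ...         "data": ds,
        ...         "metadata": {"species": "ch4", "site": "tac", "network": "decc"},
        ...         "attributes": {},
        ...     }
        ... }
        >>> data = assign_attributes(data, site="tac", network="decc")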
"""
    for gas_data in data.values():
site_attributes = gas_data.get("attributes", {})
species = gas_data["metadata"]["species"]
if site is None:
site = gas_data.get("metadata", {}).get("site")
if network is None:
network = gas_data.get("metadata", {}).get("network")
units = gas_data.get("metadata", {}).get("units")
scale = gas_data.get("metadata", {}).get("calibration_scale")
if sampling_period is None:
sampling_period = str(gas_data.get("metadata", {}).get("sampling_period", "NOT_SET"))
gas_data["data"] = get_attributes(
ds=gas_data["data"],
species=species,
site=site,
network=network,
units=units,
scale=scale,
global_attributes=site_attributes,
sampling_period=sampling_period,
site_filepath=site_filepath,
species_filepath=species_filepath,
)
return data
def get_attributes(
ds: Dataset,
species: str,
site: str,
network: str | None = None,
global_attributes: dict[str, str] | None = None,
units: str | None = None,
scale: str | None = None,
sampling_period: str | float | int | None = None,
date_range: list[str] | None = None,
site_filepath: optionalPathType = None,
species_filepath: optionalPathType = None,
) -> Dataset:
"""
This function writes attributes to an xarray.Dataset so that they conform with
the CF Convention v1.6
Attributes of the xarray DataSet are modified, and variable names are changed
If accessing underlying stored site or species definitions, this will
be accessed from the openghg/openghg_defs repository by default.
Variable naming related to species name will be defined using
define_species_label() function.
Args:
ds: Should contain variables such as "ch4", "ch4 repeatability".
Must have a "time" dimension.
species: Species name. e.g. "CH4", "HFC-134a", "dCH4C13"
site: Three-letter site code
network: Network site is associated with
        global_attributes: Dictionary containing any info you want to
            add to the file header (e.g. {"Contact": "Contact_Name"})
        units: This routine will try to guess the units
            unless this is specified. Options are in unit_interpret
scale: Calibration scale for species.
        sampling_period: Number of seconds for which the air sample is collected.
            Only used for the time variable attribute.
date_range: Start and end date for output
If you only want an end date, just put a very early start date
(e.g. ["1900-01-01", "2010-01-01"])
site_filepath: Alternative site info file
species_filepath: Alternative species info file
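
    Returns:
        Dataset: Input dataset with attributes assigned

    Example:
        A minimal sketch (illustrative; assumes ds is an xarray Dataset with a
        "time" dimension and a "ch4" data variable):

        >>> ds = get_attributes(ds=ds, species="ch4", site="TAC", network="DECC", sampling_period=60)
        >>> ds.attrs["species"]
        'ch4'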
"""
from openghg.util import load_internal_json, timestamp_now, get_species_info
from pandas import Timestamp as pd_Timestamp
if not isinstance(ds, Dataset):
raise TypeError("This function only accepts xarray Datasets")
# Load attributes files
species_attrs = get_species_info()
attributes_data = load_internal_json(filename="attributes.json")
unit_interpret = attributes_data["unit_interpret"]
unit_mol_fraction = attributes_data["unit_mol_fraction"]
unit_non_standard = attributes_data["unit_non_standard"]
# Extract both label to use for species and key for attributes
# Typically species_label will be the lower case version of species_key
species_label, species_key = define_species_label(species, species_filepath)
# Global attributes
global_attributes_default = {
"conditions_of_use": "Ensure that you contact the data owner at the outset of your project.",
"source": "In situ measurements of air",
"Conventions": "CF-1.8",
}
if global_attributes is not None:
# TODO - for some reason mypy doesn't see a Dict[str,str] as a valid Mapping[Hashable, Any] type
global_attributes.update(global_attributes_default) # type: ignore
else:
global_attributes = global_attributes_default
global_attributes["file_created"] = str(timestamp_now())
global_attributes["processed_by"] = "OpenGHG_Cloud"
global_attributes["species"] = species_label
if scale is None:
global_attributes["calibration_scale"] = "unknown"
else:
global_attributes["calibration_scale"] = scale
if sampling_period is None:
global_attributes["sampling_period"] = "NOT_SET"
else:
global_attributes["sampling_period"] = str(sampling_period)
global_attributes["sampling_period_unit"] = "s"
    # 04/2023: Switched around global and site attributes so
    # global attributes now supersede site attributes.
# Add some site attributes
site_attributes = _site_info_attributes(site.upper(), network, site_filepath)
ds.attrs.update(site_attributes)
# Update the Dataset attributes
ds.attrs.update(global_attributes) # type: ignore
# Species-specific attributes
# Extract long name
try:
name = species_attrs[species_key]["name"]
except KeyError:
name = species_label
# Extract units if not defined
if units is None:
try:
units = species_attrs[species_key]["units"]
except KeyError:
units = ""
# Define label based on units
if units in unit_mol_fraction:
sp_long = f"mole_fraction_of_{name}_in_air"
else:
sp_long = name
ancillary_variables = []
variable_names = cast(dict[str, Any], ds.variables)
# Write units as attributes to variables containing any of these
match_words = ["variability", "repeatability", "stdev", "count"]
for key in variable_names:
key = key.lower()
if species_label in key:
# Standard name attribute
# ds[key].attrs["standard_name"]=key.replace(species_label, sp_long)
ds[key].attrs["long_name"] = key.replace(species_label, sp_long)
# If units are required for variable, add attribute
if key == species_label or any(word in key for word in match_words):
if units in unit_interpret:
ds[key].attrs["units"] = unit_interpret[units]
# If units are non-standard, add details
if units in unit_non_standard:
ds[key].attrs["units_description"] = units
elif units == "":
ds[key].attrs["units"] = unit_interpret["else"]
else:
ds[key].attrs["units"] = units
            # Add to list of ancillary variables
if key != species_label:
ancillary_variables.append(key)
    # TODO - for the moment skip this step - check status of ancillary variables in standard
    # Write ancillary variable list
    # ds[species_label_lower].attrs["ancillary_variables"] = ", ".join(ancillary_variables)
# Add quality flag attributes
# NOTE - I've removed the whitespace before status_flag and integration_flag here
variable_names = cast(dict[str, Any], ds.variables)
quality_flags = [key for key in variable_names if "status_flag" in key]
for key in quality_flags:
ds[key] = ds[key].astype(int)
        try:
            long_name = ds[species_label].attrs["long_name"]
        except KeyError:
            raise KeyError(
                f"Unable to read 'long_name' attribute for '{species_label}' while "
                f"processing status flag variable '{key}' (flags found: {quality_flags})"
            )
ds[key].attrs = {
"flag_meaning": "0 = unflagged, 1 = flagged",
"long_name": f"{long_name} status_flag",
}
variable_names = cast(dict[str, Any], ds.variables)
# Add integration flag attributes
integration_flags = [key for key in variable_names if "integration_flag" in key]
for key in integration_flags:
ds[key] = ds[key].astype(int)
long_name = ds[species_label].attrs["long_name"]
ds[key].attrs = {
"flag_meaning": "0 = area, 1 = height",
"standard_name": f"{long_name} integration_flag",
"comment": "GC peak integration method (by height or by area). Does not indicate data quality",
}
    # Encode time as seconds since the start of the first year present in the data
    first_year = pd_Timestamp(str(ds.time[0].values)).year
    ds.time.encoding = {"units": f"seconds since {first_year}-01-01 00:00:00"}
time_attributes: dict[str, str] = {}
time_attributes["label"] = "left"
time_attributes["standard_name"] = "time"
time_attributes["comment"] = (
"Time stamp corresponds to beginning of sampling period. "
+ "Time since midnight UTC of reference date. "
+ "Note that sampling periods are approximate."
)
if sampling_period is not None:
time_attributes["sampling_period_seconds"] = str(sampling_period)
ds.time.attrs.update(time_attributes)
# If a date range is specified, slice dataset
if date_range:
ds = ds.loc[dict(time=slice(*date_range))]
return ds
def define_species_label(species: str, species_filepath: optionalPathType = None) -> tuple[str, str]:
"""Define standardised label to use for observation datasets.
This uses the data stored within openghg_defs/data/site_info JSON file
by default with alternative names ('alt') defined within.
Formatting:
- species label will be all lower case
- any spaces will be replaced with underscores
- if species or synonym cannot be found, species name will used
but with any hyphens taken out (see also openghg.util.clean_string function)
Note: Suggested naming for isotopologues should be d<species><isotope>, e.g.
dCH4C13, or dCO2C14
Args:
species : Species name.
species_filepath : Alternative species info file.
    Returns:
        str, str: Both the species label to be used exactly and the original attribute
            key needed to extract additional data from the species info
            attributes file.
Example:
>>> define_species_label("methane")
("ch4", "CH4")
>>> define_species_label("radon")
("rn", "Rn")
>>> define_species_label("cfc-11")
("cfc11", "CFC11")
>>> define_species_label("CH4C13")
("dch4c13", "DCH4C13")
"""
from openghg.util import clean_string, synonyms
# Extract species label using synonyms function
try:
species_label = synonyms(
species, lower=False, allow_new_species=False, species_filepath=species_filepath
)
except ValueError:
species_underscore = species.replace(" ", "_")
species_remove_dash = species_underscore.replace("-", "")
species_label = clean_string(species_remove_dash)
species_label_lower = species_label.lower()
return species_label_lower, species_label
def _site_info_attributes(
site: str, network: str | None = None, site_filepath: optionalPathType = None
) -> dict:
"""Reads site attributes from JSON
This uses the data stored within openghg_defs/data/site_info JSON file by default.
Args:
site: Site code
network: Network name
site_filepath: Alternative site info file
Returns:
dict: Dictionary of site attributes
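
    Example:
        An illustrative returned dictionary (values depend on the stored site
        definitions):

        {"station_longitude": 1.139, "station_latitude": 52.518,
         "station_long_name": "Tacolneston Tower, UK", "station_height_masl": 56.0}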
"""
from openghg.util import get_site_info
site = site.upper()
# Read site info file
site_data = get_site_info(site_filepath)
if network is None:
network = next(iter(site_data[site]))
else:
network = network.upper()
attributes_dict = {
"longitude": "station_longitude",
"latitude": "station_latitude",
"long_name": "station_long_name",
"height_station_masl": "station_height_masl",
}
attributes = {}
if site in site_data:
for attr in attributes_dict:
try:
if attr in site_data[site][network]:
attr_key = attributes_dict[attr]
attributes[attr_key] = site_data[site][network][attr]
except KeyError:
pass
else:
logger.info(
f"We haven't seen site {site} before, please let us know so we can update our records."
+ "\nYou can help us by opening an issue on GitHub for our supplementary data: https://github.com/openghg/openghg_defs"
)
# TODO - log not seen site message here
# raise ValueError(f"Invalid site {site} passed. Please use a valid site code such as BSD for Bilsdale")
return attributes
def assign_flux_attributes(
data: dict,
species: str | None = None,
source: str | None = None,
domain: str | None = None,
units: str = "mol/m2/s",
prior_info_dict: dict | None = None,
) -> dict:
"""
Assign attributes for the input flux dataset within dictionary based on
metadata and passed arguments.
Args:
data: Dictionary containing data, metadata and attributes
species: Species name
source: Source name
domain: Domain name
units: Unit values for the "flux" variable. Default = "mol/m2/s"
        prior_info_dict: Dictionary of additional 'prior' information
            about the emissions sources. Expect this to be of the form e.g.
            {"EDGAR": {"version": "v4.3.2",
                       "raw_resolution": "0.1 degree x 0.1 degree",
                       "reference": "http://edgar.jrc.ec.europa.eu/overview.php?v=432_GHG"
                       ...},
             ...}
    Returns:
        dict: Same format as the input but with an updated "data" component (Dataset)
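
    Example:
        A minimal sketch of the expected input structure (illustrative; assumes
        flux_ds is an xarray Dataset containing a "flux" variable with "lat" and
        "lon" coordinates):

        >>> data = {
        ...     "ch4_edgar": {
        ...         "data": flux_ds,
        ...         "metadata": {"species": "ch4", "source": "edgar", "domain": "europe"},
        ...     }
        ... }
        >>> data = assign_flux_attributes(data)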
"""
for flux_dict in data.values():
flux_attributes = flux_dict.get("attributes", {})
# Ensure values for these attributes have been specified either manually
# or within metadata.
attribute_values = {"species": species, "source": source, "domain": domain}
metadata = flux_dict["metadata"]
for attr, value in attribute_values.items():
if value is None:
try:
attribute_values[attr] = metadata[attr]
except KeyError:
raise ValueError(f"Attribute {attr} must be specified.")
input_attributes = cast(dict[str, str], attribute_values)
flux_dict["data"] = get_flux_attributes(
ds=flux_dict["data"],
units=units,
prior_info_dict=prior_info_dict,
global_attributes=flux_attributes,
**input_attributes,
)
return data
def get_flux_attributes(
ds: Dataset,
species: str,
source: str,
domain: str,
units: str = "mol/m2/s",
prior_info_dict: dict | None = None,
global_attributes: dict[Hashable, Any] | None = None,
) -> Dataset:
"""
Assign additional attributes for the flux dataset.
Args:
ds: Should contain "flux" variable
species: Species name
source: Source name
domain: Domain name
units: Unit values for the "flux" variable. Default = "mol/m2/s"
        prior_info_dict: Dictionary of additional 'prior' information
            about the emissions sources. Expect this to be of the form e.g.
            {"EDGAR": {"version": "v4.3.2",
                       "raw_resolution": "0.1 degree x 0.1 degree",
                       "reference": "http://edgar.jrc.ec.europa.eu/overview.php?v=432_GHG"
                       ...},
             ...}
global_attributes: Additional global attributes to write to dataset.
Returns:
Dataset: Input dataset with updated variable/coordinate and global attributes
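
    Example:
        A minimal sketch using a dummy flux Dataset (illustrative values):

        >>> import numpy as np
        >>> import xarray as xr
        >>> flux_ds = xr.Dataset(
        ...     {"flux": (("lat", "lon"), np.zeros((2, 2)))},
        ...     coords={"lat": [50.0, 51.0], "lon": [0.0, 1.0]},
        ... )
        >>> flux_ds = get_flux_attributes(flux_ds, species="ch4", source="edgar", domain="europe")
        >>> flux_ds.attrs["title"]
        'edgar emissions/flux of ch4 for europe domain'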
"""
# Example flux attributes (from files)
# :title = "EDGAR 4.3.2 year 2004" ;
# :author = "ag12733" ;
# :date_created = "2018-07-16 13:10:57.346915" ;
# :number_of_prior_files_used = 1L ;
# :prior_file_1 = "EDGAR" ;
# :prior_file_1_version = "/data/shared/Gridded_fluxes/N2O/EDGAR_v4.3.2/v432_N2O_TOTALS_nc/v432_N2O_2004.0.1x0.1.nc" ;
# :prior_file_1_raw_resolution = "0.1 degree x 0.1 degree" ;
# :prior_file_1_reference = "http://edgar.jrc.ec.europa.eu/overview.php?v=432_GHG" ;
# :regridder_used = "acrg_grid.regrid.regrid_3D" ;
from openghg.util import timestamp_now
# Define species variable/coordinate attributes and assign
flux_attrs = {"source": source, "units": units, "species": species}
lat_attrs = {"long_name": "latitude", "units": "degrees_north", "notes": "centre of cell"}
lon_attrs = {"long_name": "longitude", "units": "degrees_east", "notes": "centre of cell"}
ds["flux"].attrs = flux_attrs
ds["lat"].attrs = lat_attrs
ds["lon"].attrs = lon_attrs
# Define default values for global attributes
global_attributes_default: dict[Hashable, Any] = {
"conditions_of_use": "Ensure that you contact the data owner at the outset of your project.",
"Conventions": "CF-1.8",
}
if global_attributes is None:
global_attributes = global_attributes_default
else:
global_attributes.update(global_attributes_default)
# Extract any current attributes from the Dataset
current_attributes = ds.attrs
# Extract "title" from current attributes or define this.
if "title" in current_attributes and "title" not in global_attributes:
global_attributes["title"] = current_attributes["title"]
else:
global_attributes["title"] = f"{source} emissions/flux of {species} for {domain} domain"
if "file_created" not in global_attributes:
global_attributes["file_created"] = str(timestamp_now())
if "process_by" not in global_attributes:
global_attributes["processed_by"] = "OpenGHG_Cloud"
species_label = define_species_label(species)
global_attributes["species"] = species_label
global_attributes["source"] = source
global_attributes["domain"] = domain
# Add any 'prior' information for flux / emissions databases.
if prior_info_dict is not None:
# For composite flux / emissions files this may contain > 1 prior input
global_attributes["number_of_prior_files_used"] = len(prior_info_dict.keys())
for i, source_key in enumerate(prior_info_dict.keys()):
prior_number = i + 1
label_start = f"prior_file_{prior_number}"
global_attributes[label_start] = source_key
for key, value in prior_info_dict[source_key].items():
attr_key = f"{label_start}_{key}"
global_attributes[attr_key] = value
# Ensure keys which have been updated by OpenGHG are not overwritten
# by current attributes.
updated_keys = ["Conventions", "title", "file_created", "processed_by"]
for key in updated_keys:
if key in current_attributes:
current_attributes.pop(key)
global_attributes.update(current_attributes)
ds.attrs = global_attributes
return ds
def dataset_formatter(
data: dict,
) -> dict:
"""
Formats species/variables from the dataset by removing the whitespaces
with underscores and species to lower case
Args:
data: Dict containing dataset information(gas_data)
Returns:
Dict: Dictionary of source_name : data, metadata, attributes
"""
    for gas_data in data.values():
species = gas_data["metadata"]["species"]
        species_label, _ = define_species_label(species)
gas_data["data"] = data_variable_formatter(
ds=gas_data["data"], species=species, species_label=species_label
)
return data
def data_variable_formatter(ds: Dataset, species: str, species_label: str) -> Dataset:
"""
Formats variables from the dataset by removing the whitespaces
with underscores and species data var to lower case
Args:
ds: Should contain variables such as "ch4", "ch4 repeatability".
Must have a "time" dimension.
species: Species name
species_label: Species label
Returns:
ds: xarray dataset
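
    Example:
        An illustrative rename (assumes ds has a "time" dimension and contains
        "CH4" and "CH4 repeatability" variables):

        >>> ds = data_variable_formatter(ds, species="CH4", species_label="ch4")
        >>> list(ds.data_vars)
        ['ch4', 'ch4_repeatability']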
"""
variable_names = cast(dict[str, Any], ds.variables)
to_underscores = {var: var.lower().replace(" ", "_") for var in variable_names}
to_underscores.pop("time") # Added to remove warning around resetting time index.
ds = ds.rename(to_underscores) # type: ignore
species_lower = species.lower()
species_search = species_lower.replace(" ", "_")
variable_names = cast(dict[str, Any], ds.variables)
matched_keys = [var for var in variable_names if species_search in var]
# If we don't have any variables to rename, raise an error
if not matched_keys:
raise NameError(f"Cannot find species {species_search} in Dataset variables")
    species_rename = {var: var.replace(species_search, species_label) for var in matched_keys}
ds = ds.rename(species_rename)
return ds