# Source code for openghg.standardise.meta._metadata

import logging
import math
from copy import deepcopy
from openghg.types import AttrMismatchError
from openghg.util import is_number

logger = logging.getLogger("openghg.standardise.metadata")
logger.setLevel(logging.DEBUG)  # Have to set level for logger as well as handler



[docs]
def attributes_default_keys() -> list:
    """
    Return the default keys expected within ObsSurface metadata.

    A new list is built on every call, so callers may modify the result
    without affecting later calls.

    Returns:
        list: keys required in metadata
    """
    measurement_keys = (
        "site",
        "species",
        "inlet",
        "inlet_height_magl",
        "network",
        "instrument",
        "sampling_period",
        "calibration_scale",
    )
    ownership_keys = ("data_owner", "data_owner_email")
    station_keys = (
        "station_longitude",
        "station_latitude",
        "station_long_name",
        "station_height_masl",
    )

    # Order matters to callers iterating over the result: measurement details
    # first, then ownership, then station details.
    return [*measurement_keys, *ownership_keys, *station_keys]



def metadata_keys_as_floats() -> list:
    """
    Return the metadata keys whose values should consistently be stored as
    numbers (even if they are not numbers within the attributes).

    Returns:
        list: keys required to be floats in metadata
    """
    # NOTE: "inlet_height_magl" is currently not part of this list (it is
    # commented out in the original definition).
    float_keys = ("station_longitude", "station_latitude", "station_height_masl")

    return list(float_keys)



[docs]
def sync_surface_metadata(
    metadata: dict,
    attributes: dict,
    keys_to_add: list | None = None,
    update_mismatch: str = "never",
) -> tuple[dict, dict]:
    """
    Makes sure any duplicated keys between the metadata and attributes
    dictionaries match and that certain keys are present in the metadata.

    The input dictionaries are not modified: deep copies are updated and
    returned, and any value copied from one dictionary to the other is
    deep-copied as well so the results do not share objects with the inputs.

    Args:
        metadata: Dictionary of metadata
        attributes: Attributes
        keys_to_add: Add these keys to the metadata, if not present, based on
            the attribute values. Keys missing from the attributes are skipped
            with a warning. Defaults to attributes_default_keys(). Added keys
            listed in metadata_keys_as_floats() are converted with float();
            conversion errors are not caught here.
        update_mismatch: If case insensitive mismatch is found between an
            attribute and a metadata value, this determines the function behaviour.
            This includes the options:
              - "never" - don't update mismatches and raise an AttrMismatchError
              - "from_source" / "attributes" - update mismatches based on input attributes
              - "from_definition" / "metadata" - update mismatches based on input metadata
    Returns:
        dict, dict: Aligned metadata, attributes
    Raises:
        ValueError: If update_mismatch is not a string or is not one of the
            recognised options.
        AttrMismatchError: If update_mismatch is "never" and at least one shared
            key differs (all mismatched keys are reported together).
    """
    meta_copy = deepcopy(metadata)
    attrs_copy = deepcopy(attributes)

    # Canonical option name -> accepted aliases
    mismatch_keys = {
        "never": ["never"],
        "attributes": ["attributes", "from_source"],
        "metadata": ["metadata", "from_definition"],
    }

    # Normalise the requested option once, rather than on every loop iteration
    try:
        requested_option = update_mismatch.lower()
    except AttributeError as err:
        raise ValueError(
            f"Input for 'update_mismatch' must be a string and should be one of {mismatch_keys}"
        ) from err

    for canonical_name, aliases in mismatch_keys.items():
        if requested_option in aliases:
            update_mismatch = canonical_name
            break
    else:
        raise ValueError(f"Input for 'update_mismatch' should be one of {mismatch_keys}")

    # This should mainly be used for lat/long
    relative_tolerance = 1e-3

    # Collected as key -> (metadata value, attribute value) so that every
    # mismatch can be reported in a single error.
    attr_mismatches = {}

    # Check if we have differences
    for key, meta_value in metadata.items():
        # Only the lookup is guarded, so a KeyError raised anywhere else is not
        # silently swallowed.
        try:
            attr_value = attributes[key]
        except KeyError:
            # Key wasn't in attributes for comparison
            continue

        if is_number(attr_value) and is_number(meta_value):
            if math.isclose(float(attr_value), float(meta_value), rel_tol=relative_tolerance):
                continue

            err_warn_num = (
                f"Value of {key} not within tolerance, metadata: {meta_value} - attributes: {attr_value}"
            )
            if update_mismatch == "never":
                attr_mismatches[key] = (meta_value, attr_value)
            elif update_mismatch == "attributes":
                logger.warning(
                    f"{err_warn_num}\nUpdating metadata to use attribute value of {key} = {attr_value}"
                )
                # Numeric replacements are stored in string form
                meta_copy[key] = str(attr_value)
            elif update_mismatch == "metadata":
                logger.warning(
                    f"{err_warn_num}\nUpdating attributes to use metadata value of {key} = {meta_value}"
                )
                attrs_copy[key] = str(meta_value)
        else:
            # Here we don't care about case. Within the Datasource we'll store the
            # metadata as all lowercase, within the attributes we'll keep the case.
            if str(meta_value).lower() == str(attr_value).lower():
                continue

            err_warn_str = (
                f"Metadata mismatch for '{key}', metadata: {meta_value} - attributes: {attr_value}"
            )
            if update_mismatch == "never":
                attr_mismatches[key] = (meta_value, attr_value)
            elif update_mismatch == "attributes":
                logger.warning(
                    f"{err_warn_str}\nUpdating metadata to use attribute value of {key} = {attr_value}"
                )
                # Copy so the returned metadata doesn't alias the caller's attributes
                meta_copy[key] = deepcopy(attr_value)
            elif update_mismatch == "metadata":
                logger.warning(
                    f"{err_warn_str}\nUpdating attributes to use metadata value: {key} = {meta_value}"
                )
                # Copy so the returned attributes don't alias the caller's metadata
                attrs_copy[key] = deepcopy(meta_value)

    if attr_mismatches:
        mismatch_str = "\n".join(
            f" - '{key}', metadata: {meta_value}, attributes: {attr_value}"
            for key, (meta_value, attr_value) in attr_mismatches.items()
        )
        raise AttrMismatchError(
            f"Metadata mismatch / value not within tolerance for the following keys:\n{mismatch_str}"
        )

    if keys_to_add is None:
        keys_to_add = attributes_default_keys()

    keys_as_floats = metadata_keys_as_floats()

    # Check set of keys which should be in metadata and add if not present
    for key in keys_to_add:
        if key in meta_copy:
            continue

        try:
            value = attributes[key]
        except KeyError:
            logger.warning(f"{key} key not in attributes or metadata")
            continue

        # Keys in keys_as_floats are stored as numbers; everything else is
        # copied so the result stays independent of the input attributes.
        meta_copy[key] = float(value) if key in keys_as_floats else deepcopy(value)

    return meta_copy, attrs_copy



def align_metadata_attributes(data: dict, update_mismatch: str) -> None:
    """
    Synchronise the metadata and attributes of every entry in ``data``, in place.

    This function currently applies to all surface-level data. Future enhancements
    will extend its functionality to handle column-level data as well.

    Since remote retrievals bypass the traditional `read_file` method, this function
    should be invoked before producing the final standardised output in the retrieval process.

    For each entry, the "metadata" value and the ``attrs`` of the "data" value are
    passed to sync_surface_metadata. The aligned results are written back to the
    entry's "metadata" and "attributes" keys, and the aligned attributes also
    replace the ``attrs`` of the "data" object.

    Args:
        data: Dictionary of source_name data, metadata, attributes
        update_mismatch: This determines how mismatches between the internal data
            "attributes" and the supplied / derived "metadata" are handled.
            This includes the options:
                - "never" - don't update mismatches and raise an AttrMismatchError
                - "from_source" / "attributes" - update mismatches based on input data (e.g. data attributes)
                - "from_definition" / "metadata" - update mismatches based on associated data (e.g. site_info.json)
    Returns:
        None
    """
    # The source names (dictionary keys) are not needed, only the entries
    for entry in data.values():
        dataset = entry["data"]

        synced_metadata, synced_attrs = sync_surface_metadata(
            metadata=entry["metadata"],
            attributes=dataset.attrs,
            update_mismatch=update_mismatch,
        )

        entry["metadata"] = synced_metadata
        entry["attributes"] = synced_attrs
        # Keep the data object's own attributes identical to the stored ones
        dataset.attrs = entry["attributes"]