Source code for openghg.standardise.meta._metadata

import logging
import math
from copy import deepcopy
from openghg.types import AttrMismatchError
from openghg.util import is_number

logger = logging.getLogger("openghg.standardise.metadata")
logger.setLevel(logging.DEBUG)  # Have to set level for logger as well as handler


[docs] def attributes_default_keys() -> list: """ Defines default values expected within ObsSurface metadata. Returns: list: keys required in metadata """ default_keys = [ "site", "species", "inlet", "inlet_height_magl", "network", "instrument", "sampling_period", "calibration_scale", "data_owner", "data_owner_email", "station_longitude", "station_latitude", "station_long_name", "station_height_masl", ] return default_keys
def metadata_keys_as_floats() -> list: """ Defines which keys should be consistently stored as numbers in the metadata (even if they are not numbers within the attributes). Returns: list: keys required to be floats in metadata """ values_as_floats = [ # "inlet_height_magl", "station_longitude", "station_latitude", "station_height_masl", ] return values_as_floats
[docs] def sync_surface_metadata( metadata: dict, attributes: dict, keys_to_add: list | None = None, update_mismatch: str = "never", ) -> tuple[dict, dict]: """ Makes sure any duplicated keys between the metadata and attributes dictionaries match and that certain keys are present in the metadata. Args: metadata: Dictionary of metadata attributes: Attributes keys_to_add: Add these keys to the metadata, if not present, based on the attribute values. Note: this skips any keys which can't be copied from the attribute values. update_mismatch: If case insensitive mismatch is found between an attribute and a metadata value, this determines the function behaviour. This includes the options: - "never" - don't update mismatches and raise an AttrMismatchError - "from_source" / "attributes" - update mismatches based on input attributes - "from_definition" / "metadata" - update mismatches based on input metadata Returns: dict, dict: Aligned metadata, attributes """ meta_copy = deepcopy(metadata) attrs_copy = deepcopy(attributes) mismatch_keys = { "never": ["never"], "attributes": ["attributes", "from_source"], "metadata": ["metadata", "from_definition"], } for key, options in mismatch_keys.items(): try: if update_mismatch.lower() in options: update_mismatch = key.lower() break except AttributeError: raise ValueError( f"Input for 'update_mismatch' must be a string and should be one of {mismatch_keys}" ) else: raise ValueError(f"Input for 'update_mismatch' should be one of {mismatch_keys}") attr_mismatches = {} # Check if we have differences for key, meta_value in metadata.items(): try: attr_value = attributes[key] # This should mainly be used for lat/long relative_tolerance = 1e-3 if is_number(attr_value) and is_number(meta_value): if not math.isclose(float(attr_value), float(meta_value), rel_tol=relative_tolerance): err_warn_num = f"Value of {key} not within tolerance, metadata: {meta_value} - attributes: {attr_value}" if update_mismatch == "never": attr_mismatches[key] = (meta_value, attr_value) elif update_mismatch == "attributes": logger.warning( f"{err_warn_num}\nUpdating metadata to use attribute value of {key} = {attr_value}" ) meta_copy[key] = str(attr_value) elif update_mismatch == "metadata": logger.warning( f"{err_warn_num}\nUpdating attributes to use metadata value of {key} = {meta_value}" ) attrs_copy[key] = str(meta_value) else: # Here we don't care about case. Within the Datasource we'll store the # metadata as all lowercase, within the attributes we'll keep the case. err_warn_str = f"Metadata mismatch for '{key}', metadata: {meta_value} - attributes: {attr_value}" err_warn_str = ( f"Metadata mismatch for '{key}', metadata: {meta_value} - attributes: {attr_value}" ) if str(meta_value).lower() != str(attr_value).lower(): if update_mismatch == "never": attr_mismatches[key] = (meta_value, attr_value) elif update_mismatch == "attributes": logger.warning( f"{err_warn_str}\nUpdating metadata to use attribute value of {key} = {attr_value}" ) meta_copy[key] = attr_value elif update_mismatch == "metadata": logger.warning( f"{err_warn_str}\nUpdating attributes to use metadata value: {key} = {meta_value}" ) attrs_copy[key] = meta_value except KeyError: # Key wasn't in attributes for comparison pass if attr_mismatches: mismatch_details = [ f" - '{key}', metadata: {values[0]}, attributes: {values[1]}" for key, values in attr_mismatches.items() ] mismatch_str = "\n".join(mismatch_details) raise AttrMismatchError( f"Metadata mismatch / value not within tolerance for the following keys:\n{mismatch_str}" ) default_keys_to_add = attributes_default_keys() keys_as_floats = metadata_keys_as_floats() if keys_to_add is None: keys_to_add = default_keys_to_add # Check set of keys which should be in metadata and add if not present for key in keys_to_add: if key not in meta_copy.keys(): try: meta_copy[key] = attributes[key] except KeyError: logger.warning(f"{key} key not in attributes or metadata") else: if key in keys_as_floats: meta_copy[key] = float(meta_copy[key]) return meta_copy, attrs_copy
def align_metadata_attributes(data: dict, update_mismatch: str) -> None: """ Synchronize metadata and attributes in case of mismatches. This function currently applies to all surface-level data. Future enhancements will extend its functionality to handle column-level data as well. Since remote retrievals bypass the traditional `read_file` method, this function should be invoked before producing the final standardised output in the retrieval process. Args: data: Dictionary of source_name data, metadata, attributes update_mismatch: This determines how mismatches between the internal data "attributes" and the supplied / derived "metadata" are handled. This includes the options: - "never" - don't update mismatches and raise an AttrMismatchError - "from_source" / "attributes" - update mismatches based on input data (e.g. data attributes) - "from_definition" / "metadata" - update mismatches based on associated data (e.g. site_info.json) Returns: None """ for _, gas_data in data.items(): measurement_data = gas_data["data"] metadata = gas_data["metadata"] attrs = measurement_data.attrs metadata_aligned, attrs_aligned = sync_surface_metadata( metadata=metadata, attributes=attrs, update_mismatch=update_mismatch ) gas_data["metadata"] = metadata_aligned gas_data["attributes"] = attrs_aligned measurement_data.attrs = gas_data["attributes"]