from pathlib import Path
from typing import Dict, Optional, Union
from openghg.standardise.meta import dataset_formatter
def parse_eurocom(
filepath: Union[str, Path],
site: str,
sampling_period: str,
network: Optional[str] = None,
inlet: Optional[str] = None,
instrument: Optional[str] = None,
update_mismatch: str = "never",
) -> Dict:
"""Parses EUROCOM data files into a format expected by OpenGHG
Args:
filepath: Path of file to read
site: Site code
sampling_period: Sampling period in seconds
network: Network name
Inlet: Inlet height in metres
Instrument: Instrument name
update_mismatch: This determines how mismatches between the internal data
"attributes" and the supplied / derived "metadata" are handled.
This includes the options:
- "never" - don't update mismatches and raise an AttrMismatchError
- "from_source" / "attributes" - update mismatches based on input data (e.g. data attributes)
- "from_definition" / "metadata" - update mismatches based on associated data (e.g. site_info.json)
Returns:
dict: Dictionary of measurement data
"""
from openghg.standardise.meta import assign_attributes, get_attributes
from openghg.util import load_internal_json, read_header, format_inlet
from pandas import read_csv
filepath = Path(filepath)
if site is None:
site = filepath.stem.split("_")[0]
if sampling_period is None:
sampling_period = "NOT_SET"
filename = filepath.name
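    # EUROCOM filenames encode the inlet height as the second underscore-separated
    # field (a value containing "m"); otherwise we fall back to the attributes file below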
inlet_height = filename.split("_")[1]
if "m" not in inlet_height:
inlet_height = "NA"
# This dictionary is used to store the gas data and its associated metadata
combined_data = {}
# Read the header as lines starting with #
header = read_header(filepath, comment_char="#")
n_skip = len(header) - 1
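    # This parser handles CO2 data only, so the species is fixed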
species = "co2"
datetime_columns = {"time": ["Year", "Month", "Day", "Hour", "Minute"]}
use_cols = [
"Day",
"Month",
"Year",
"Hour",
"Minute",
str(species.lower()),
"SamplingHeight",
"Stdev",
"NbPoints",
]
dtypes = {
species.lower(): float,
"Stdev": float,
"SamplingHeight": float,
"NbPoints": int,
}
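    # Read the semicolon-separated file, combining the date columns into a single
    # "time" index and treating -999.99 as a missing value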
data = read_csv(
filepath,
skiprows=n_skip,
parse_dates=datetime_columns,
date_format="%Y %m %d %H %M",
index_col="time",
sep=";",
usecols=use_cols,
dtype=dtypes,
na_values="-999.99",
)
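    # Remove rows with negative mole fractions or missing values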
data = data[data[species.lower()] >= 0.0]
data = data.dropna(axis="rows", how="any")
# Drop duplicate indices
data = data.loc[~data.index.duplicated(keep="first")]
# Convert to xarray Dataset
data = data.to_xarray()
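    # Load the EUROCOM-specific global attributes from OpenGHG's internal attributes.json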
attributes_data = load_internal_json(filename="attributes.json")
eurocom_attributes = attributes_data["EUROCOM"]
global_attributes = eurocom_attributes["global_attributes"]
if inlet_height == "NA":
try:
inlet = eurocom_attributes["intake_height"][site]
global_attributes["inlet_height_m"] = format_inlet(inlet, key_name="inlet_height_m")
calibration_scale = eurocom_attributes["calibration"][site]
except KeyError:
calibration_scale = {}
raise ValueError(f"Unable to find inlet from filename or attributes file for {site}")
gas_data = get_attributes(
ds=data,
species=species,
site=site,
global_attributes=global_attributes,
units="ppm",
)
    # Build the metadata dictionary for this species
metadata = {}
metadata["site"] = site
metadata["species"] = species
metadata["inlet"] = format_inlet(global_attributes["inlet_height_m"], key_name="inlet")
metadata["calibration_scale"] = calibration_scale
metadata["network"] = "EUROCOM"
metadata["sampling_period"] = str(sampling_period)
metadata["data_type"] = "surface"
combined_data[species] = {
"metadata": metadata,
"data": gas_data,
"attributes": global_attributes,
}
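    # Standardise the Dataset formatting and reconcile attributes with the metadata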
combined_data = dataset_formatter(data=combined_data)
combined_data = assign_attributes(
data=combined_data, site=site, sampling_period=sampling_period, update_mismatch=update_mismatch
)
return combined_data
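

# A minimal usage sketch (not part of the parser itself): the file path, site code
# and sampling period below are illustrative placeholders, not real data shipped
# with OpenGHG. The filename follows the SITE_HEIGHT_... convention expected above.
if __name__ == "__main__":
    parsed = parse_eurocom(
        filepath="HEI_10m_co2_example.csv",  # placeholder path
        site="HEI",  # placeholder site code
        sampling_period="3600",
    )
    for species_key, entry in parsed.items():
        # Each entry holds the xarray Dataset plus its metadata and attributes
        print(species_key, entry["metadata"])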