Source code for openghg.standardise.surface._eurocom

from pathlib import Path
from typing import Dict, Optional, Union

from openghg.standardise.meta import dataset_formatter


def parse_eurocom(
    filepath: Union[str, Path],
    site: str,
    sampling_period: str,
    network: Optional[str] = None,
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    update_mismatch: str = "never",
) -> Dict:
    """Parses EUROCOM data files into a format expected by OpenGHG

    Args:
        filepath: Path of file to read
        site: Site code
        sampling_period: Sampling period in seconds
        network: Network name
        inlet: Inlet height in metres
        instrument: Instrument name
        update_mismatch: This determines how mismatches between the internal data
            "attributes" and the supplied / derived "metadata" are handled.
            This includes the options:
              - "never" - don't update mismatches and raise an AttrMismatchError
              - "from_source" / "attributes" - update mismatches based on input data (e.g. data attributes)
              - "from_definition" / "metadata" - update mismatches based on associated data (e.g. site_info.json)
    Returns:
        dict: Dictionary of measurement data
    """
    from openghg.standardise.meta import assign_attributes, get_attributes
    from openghg.util import load_internal_json, read_header, format_inlet
    from pandas import read_csv

    filepath = Path(filepath)

    if site is None:
        site = filepath.stem.split("_")[0]

    if sampling_period is None:
        sampling_period = "NOT_SET"

    filename = filepath.name
    inlet_height = filename.split("_")[1]

    if "m" not in inlet_height:
        inlet_height = "NA"

    # This dictionary is used to store the gas data and its associated metadata
    combined_data = {}

    # Read the header as lines starting with #
    header = read_header(filepath, comment_char="#")
    n_skip = len(header) - 1

    species = "co2"

    datetime_columns = {"time": ["Year", "Month", "Day", "Hour", "Minute"]}

    use_cols = [
        "Day",
        "Month",
        "Year",
        "Hour",
        "Minute",
        str(species.lower()),
        "SamplingHeight",
        "Stdev",
        "NbPoints",
    ]

    dtypes = {
        species.lower(): float,
        "Stdev": float,
        "SamplingHeight": float,
        "NbPoints": int,
    }

    data = read_csv(
        filepath,
        skiprows=n_skip,
        parse_dates=datetime_columns,
        date_format="%Y %m %d %H %M",
        index_col="time",
        sep=";",
        usecols=use_cols,
        dtype=dtypes,
        na_values="-999.99",
    )

    # Keep only valid (non-negative, non-missing) mole fraction values
    data = data[data[species.lower()] >= 0.0]
    data = data.dropna(axis="rows", how="any")

    # Drop duplicate indices
    data = data.loc[~data.index.duplicated(keep="first")]

    # Convert to xarray Dataset
    data = data.to_xarray()

    attributes_data = load_internal_json(filename="attributes.json")
    eurocom_attributes = attributes_data["EUROCOM"]
    global_attributes = eurocom_attributes["global_attributes"]

    # If the inlet height couldn't be read from the filename, fall back to the
    # values stored in the internal attributes file
    if inlet_height == "NA":
        try:
            inlet = eurocom_attributes["intake_height"][site]
            global_attributes["inlet_height_m"] = format_inlet(inlet, key_name="inlet_height_m")
            calibration_scale = eurocom_attributes["calibration"][site]
        except KeyError:
            calibration_scale = {}
            raise ValueError(f"Unable to find inlet from filename or attributes file for {site}")

    gas_data = get_attributes(
        ds=data,
        species=species,
        site=site,
        global_attributes=global_attributes,
        units="ppm",
    )

    # Create a copy of the metadata dict
    metadata = {}
    metadata["site"] = site
    metadata["species"] = species
    metadata["inlet"] = format_inlet(global_attributes["inlet_height_m"], key_name="inlet")
    metadata["calibration_scale"] = calibration_scale
    metadata["network"] = "EUROCOM"
    metadata["sampling_period"] = str(sampling_period)
    metadata["data_type"] = "surface"

    combined_data[species] = {
        "metadata": metadata,
        "data": gas_data,
        "attributes": global_attributes,
    }

    combined_data = dataset_formatter(data=combined_data)

    combined_data = assign_attributes(
        data=combined_data,
        site=site,
        sampling_period=sampling_period,
        update_mismatch=update_mismatch,
    )

    return combined_data
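
A minimal usage sketch, not part of the module source: the file path, site code and sampling period below are hypothetical placeholders, and the filename is only illustrative of the `SITE_INLET_...` pattern the parser expects.

# Hypothetical example call against the module shown above.
from openghg.standardise.surface._eurocom import parse_eurocom

results = parse_eurocom(
    filepath="/data/eurocom/TRN_180m_co2.csv",  # placeholder path; real EUROCOM files encode site and inlet in the name
    site="TRN",
    sampling_period="3600",
)

# The returned dictionary is keyed by species; each entry holds the measurement
# data (an xarray Dataset), the derived metadata and the global attributes.
co2 = results["co2"]
print(co2["metadata"]["inlet"])
print(co2["data"])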