# Source code for openghg.standardise.surface._thamesbarrier

from pathlib import Path
from typing import Dict, Optional

from openghg.types import pathType, optionalPathType


def parse_tmb(
    filepath: pathType,
    site: str = "TMB",
    network: str = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
    update_mismatch: str = "never",
    site_filepath: optionalPathType = None,
    **kwargs: Dict,
) -> Dict:
    """Reads THAMESBARRIER data files and returns the UUIDs of the Datasources
    the processed data has been assigned to.

    Args:
        filepath: Path of file to load
        site: Site name
        network: Network, defaults to LGHG
        inlet: Inlet height. Will be inferred if not specified
        instrument: Instrument name (currently unused; accepted for interface
            consistency with other parse functions)
        sampling_period: Sampling period
        measurement_type: Type of measurement taken e.g. "flask", "insitu"
            (currently unused; accepted for interface consistency)
        update_mismatch: This determines how mismatches between the internal data
            "attributes" and the supplied / derived "metadata" are handled.
            This includes the options:
              - "never" - don't update mismatches and raise an AttrMismatchError
              - "from_source" / "attributes" - update mismatches based on input
                data (e.g. data attributes)
              - "from_definition" / "metadata" - update mismatches based on
                associated data (e.g. site_info.json)
        site_filepath: Alternative site info file (see openghg/openghg_defs
            repository for format). Otherwise will use the data stored within
            openghg_defs/data/site_info JSON file by default.
    Returns:
        dict: Dictionary keyed by species, each entry holding "metadata",
        "data" (an xarray Dataset) and "attributes".
    Raises:
        ValueError: If no inlet is supplied and no inlet height can be found
            in the site info for this site/network.
    """
    from openghg.standardise.meta import assign_attributes, dataset_formatter
    from openghg.util import (
        clean_string,
        get_site_info,
        format_inlet,
        synonyms,
        load_internal_json,
    )
    from pandas import read_csv as pd_read_csv

    if sampling_period is None:
        sampling_period = "NOT_SET"

    filepath = Path(filepath)

    # First column holds the timestamps; parse it into a "time" index.
    data = pd_read_csv(filepath, parse_dates={"time": [0]}, index_col=0)

    # Drop NaNs from the data
    data = data.dropna(axis="rows", how="all")
    # Drop a column if it's all NaNs
    data = data.dropna(axis="columns", how="all")

    # Normalise the methane column name before species processing below.
    rename_dict = {}
    if "Methane" in data.columns:
        rename_dict["Methane"] = "CH4"

    data = data.rename(columns=rename_dict)

    site_upper = site.upper()
    network_upper = network.upper()

    attributes_data = load_internal_json(filename="attributes.json")
    tb_params = attributes_data[site_upper]

    site_data = get_site_info()
    site_info = site_data[site_upper][network_upper]

    try:
        site_inlet_values = site_info["height"]
    except KeyError:
        raise ValueError(
            f"Unable to extract inlet height details for site '{site}'. Please input inlet value."
        )

    inlet = format_inlet(inlet)
    if inlet is None:
        inlet = site_inlet_values[0]  # Use first entry
        inlet = format_inlet(inlet)
    elif inlet not in site_inlet_values:
        print(f"WARNING: inlet value of '{inlet}' does not match to known inlet values")

    gas_data = {}
    for species_column in data.columns:
        processed_data = data.loc[:, [species_column]].sort_index().to_xarray()

        # Convert methane to ppb
        if species_column == "CH4":
            processed_data[species_column] *= 1000

        species = clean_string(species_column)
        species = synonyms(species, allow_new_species=True)

        # No averaging applied to raw obs, set variability to 0 to allow get_obs to calculate
        # when averaging
        processed_data["{} variability".format(species)] = processed_data[species_column] * 0.0

        # Copy so each species gets its own attributes dict. Without the copy,
        # every gas_data entry aliases the same dict (and mutates the JSON-backed
        # tb_params data), so all species would end up with the last species'
        # "species" value.
        site_attributes = dict(tb_params["global_attributes"])
        site_attributes["inlet"] = inlet
        site_attributes["inlet_height_magl"] = format_inlet(inlet, key_name="inlet_height_magl")
        site_attributes["instrument"] = clean_string(tb_params["instrument"])
        # site_attributes["unit_species"] = tb_params["unit_species"]
        # site_attributes["calibration_scale"] = tb_params["scale"]

        attributes = site_attributes
        attributes["species"] = species

        metadata = {
            "species": species,
            "sampling_period": str(sampling_period),
            "site": site,
            # Use the caller-supplied network rather than hard-coding "LGHG";
            # identical behaviour at the default.
            "network": network,
            "inlet": inlet,
            "data_type": "surface",
            "source_format": "tmb",
        }

        # TODO: All attributes stored in the metadata?
        # metadata.update(attributes)

        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": attributes,
        }

    gas_data = dataset_formatter(data=gas_data)

    gas_data = assign_attributes(
        data=gas_data, site=site, update_mismatch=update_mismatch, site_filepath=site_filepath
    )

    return gas_data