# Source code for openghg.standardise.surface._thamesbarrier

from pathlib import Path
from typing import Dict, Optional

from openghg.types import pathType, optionalPathType


def parse_tmb(
    filepath: pathType,
    site: str = "TMB",
    network: str = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    sampling_period: Optional[str] = None,
    measurement_type: Optional[str] = None,
    update_mismatch: str = "never",
    site_filepath: optionalPathType = None,
    **kwargs: Dict,
) -> Dict:
    """Reads THAMESBARRIER data files and returns the UUIDs of the Datasources
    the processed data has been assigned to.

    Args:
        filepath: Path of file to load
        site: Site name
        network: Network, defaults to LGHG
        inlet: Inlet height. Will be inferred if not specified
        instrument: Instrument name (currently unused; accepted for interface
            consistency with other parse functions)
        sampling_period: Sampling period
        measurement_type: Type of measurement taken e.g. "flask", "insitu"
            (currently unused; accepted for interface consistency)
        update_mismatch: This determines how mismatches between the internal data
            "attributes" and the supplied / derived "metadata" are handled.
            This includes the options:
              - "never" - don't update mismatches and raise an AttrMismatchError
              - "from_source" / "attributes" - update mismatches based on input
                data (e.g. data attributes)
              - "from_definition" / "metadata" - update mismatches based on
                associated data (e.g. site_info.json)
        site_filepath: Alternative site info file (see openghg/openghg_defs
            repository for format). Otherwise will use the data stored within
            openghg_defs/data/site_info JSON file by default.
    Returns:
        dict: Dictionary keyed by species, each entry holding "metadata",
        "data" (an xarray Dataset) and "attributes".
    Raises:
        ValueError: If no inlet is supplied and no inlet height can be found
            in the site info for this site/network.
    """
    from openghg.standardise.meta import assign_attributes, dataset_formatter
    from openghg.util import (
        clean_string,
        get_site_info,
        format_inlet,
        synonyms,
        load_internal_json,
    )
    from pandas import read_csv as pd_read_csv

    if sampling_period is None:
        sampling_period = "NOT_SET"

    filepath = Path(filepath)

    # First column holds the timestamps; parse it into a "time" index.
    data = pd_read_csv(filepath, parse_dates={"time": [0]}, index_col=0)

    # Drop NaNs from the data
    data = data.dropna(axis="rows", how="all")
    # Drop a column if it's all NaNs
    data = data.dropna(axis="columns", how="all")

    # Normalise the methane column name before species processing below.
    rename_dict = {}
    if "Methane" in data.columns:
        rename_dict["Methane"] = "CH4"

    data = data.rename(columns=rename_dict)

    site_upper = site.upper()
    network_upper = network.upper()

    attributes_data = load_internal_json(filename="attributes.json")
    tb_params = attributes_data[site_upper]

    site_data = get_site_info()
    site_info = site_data[site_upper][network_upper]

    try:
        site_inlet_values = site_info["height"]
    except KeyError:
        raise ValueError(
            f"Unable to extract inlet height details for site '{site}'. Please input inlet value."
        )

    inlet = format_inlet(inlet)
    if inlet is None:
        inlet = site_inlet_values[0]  # Use first entry
        inlet = format_inlet(inlet)
    elif inlet not in site_inlet_values:
        print(f"WARNING: inlet value of '{inlet}' does not match to known inlet values")

    gas_data = {}
    for species_column in data.columns:
        processed_data = data.loc[:, [species_column]].sort_index().to_xarray()

        # Convert methane to ppb
        if species_column == "CH4":
            processed_data[species_column] *= 1000

        species = clean_string(species_column)
        species = synonyms(species, allow_new_species=True)

        # No averaging applied to raw obs, set variability to 0 to allow get_obs to calculate
        # when averaging
        processed_data["{} variability".format(species)] = processed_data[species_column] * 0.0

        # Copy so each species gets its own attributes dict. Without the copy,
        # every gas_data entry aliases the same dict (and mutates the JSON-backed
        # tb_params data), so all species would end up with the last species'
        # "species" value.
        site_attributes = dict(tb_params["global_attributes"])
        site_attributes["inlet"] = inlet
        site_attributes["inlet_height_magl"] = format_inlet(inlet, key_name="inlet_height_magl")
        site_attributes["instrument"] = clean_string(tb_params["instrument"])
        # site_attributes["unit_species"] = tb_params["unit_species"]
        # site_attributes["calibration_scale"] = tb_params["scale"]

        attributes = site_attributes
        attributes["species"] = species

        metadata = {
            "species": species,
            "sampling_period": str(sampling_period),
            "site": site,
            # Use the caller-supplied network rather than hard-coding "LGHG";
            # identical behaviour at the default.
            "network": network,
            "inlet": inlet,
            "data_type": "surface",
            "source_format": "tmb",
        }

        # TODO: All attributes stored in the metadata?
        # metadata.update(attributes)

        gas_data[species] = {
            "metadata": metadata,
            "data": processed_data,
            "attributes": attributes,
        }

    gas_data = dataset_formatter(data=gas_data)

    gas_data = assign_attributes(
        data=gas_data, site=site, update_mismatch=update_mismatch, site_filepath=site_filepath
    )

    return gas_data