Source code for openghg.standardise.surface._btt

from pathlib import Path
from typing import Dict, Optional, Union
import warnings

from openghg.standardise.meta import dataset_formatter


def parse_btt(
    filepath: Union[str, Path],
    site: Optional[str] = "BTT",
    network: Optional[str] = "LGHG",
    inlet: Optional[str] = None,
    instrument: Optional[str] = None,
    update_mismatch: str = "never",
) -> Dict:
    """Reads BTT data files and returns a dictionary of gas data.

    Args:
        filepath: Path of file to load
        site: Site name
        network: Network name
        inlet: Inlet height. Unused at present.
        instrument: Instrument name. Unused at present.
        update_mismatch: This determines how mismatches between the internal data
            "attributes" and the supplied / derived "metadata" are handled.
            This includes the options:
                - "never" - don't update mismatches and raise an AttrMismatchError
                - "from_source" / "attributes" - update mismatches based on input data (e.g. data attributes)
                - "from_definition" / "metadata" - update mismatches based on associated data (e.g. site_info.json)
    Returns:
        dict: Dictionary of gas data
    """
    from numpy import nan as np_nan
    from openghg.standardise.meta import assign_attributes
    from openghg.util import clean_string, load_internal_json, get_site_info, format_inlet
    from pandas import Timestamp, isnull, read_csv, to_timedelta

    # TODO: Decide what to do about inputs which aren't used anywhere
    # at present - inlet, instrument, sampling_period, measurement_type
    filepath = Path(filepath)

    warnings.warn("This function will be removed in a future release", DeprecationWarning)

    site = "BTT"

    # Rename these columns
    rename_dict = {"co2.cal": "CO2", "ch4.cal.ppb": "CH4"}
    # We only want these species
    species_extract = ["CO2", "CH4"]
    # Take std-dev measurements from these columns for these species
    species_sd = {"CO2": "co2.sd.ppm", "CH4": "ch4.sd.ppb"}

    site_data = get_site_info()
    site_info = site_data[site][network]

    param_data = load_internal_json(filename="attributes.json")
    network_params = param_data["BTT"]

    site_attributes = network_params["global_attributes"]

    sampling_period = int(network_params["sampling_period"])
    sampling_period_seconds = str(sampling_period) + "s"

    metadata = {}
    metadata["site"] = site
    metadata["inlet"] = format_inlet(network_params["inlet"], key_name="inlet")
    metadata["instrument"] = network_params["instrument"]
    metadata["sampling_period"] = str(sampling_period)
    metadata["station_longitude"] = site_info["longitude"]
    metadata["station_latitude"] = site_info["latitude"]
    metadata["station_long_name"] = site_info["long_name"]
    metadata["data_type"] = "surface"
    metadata["source_format"] = "btt"

    attributes = network_params["global_attributes"]
    attributes["inlet_height_magl"] = format_inlet(network_params["inlet"], key_name="inlet_height_magl")
    attributes.update(metadata)

    data = read_csv(filepath)
    # Convert the fractional day-of-year (DOY) column to timestamps anchored at
    # 2019-01-01, then round to the sampling period
    data["time"] = Timestamp("2019-01-01 00:00") + to_timedelta(data["DOY"] - 1, unit="D")
    data["time"] = data["time"].dt.round(sampling_period_seconds)
    data = data[~isnull(data.time)]
    data = data.rename(columns=rename_dict)
    data = data.set_index("time")

    gas_data = {}
    for species in species_extract:
        processed_data = data.loc[:, [species]].sort_index()

        # Create a variability column
        species_stddev_label = species_sd[species]
        processed_data[f"{species} variability"] = data[species_stddev_label]

        # Replace any values below zero with NaNs
        processed_data[processed_data < 0] = np_nan
        # Drop NaNs
        processed_data = processed_data.dropna()

        # Convert to a Dataset
        processed_data = processed_data.to_xarray()

        species_attributes = attributes.copy()
        species_attributes["species"] = clean_string(species)

        species_metadata = metadata.copy()
        species_metadata["species"] = clean_string(species)

        gas_data[species] = {
            "metadata": species_metadata,
            "data": processed_data,
            "attributes": site_attributes,
        }

    gas_data = dataset_formatter(data=gas_data)

    gas_data = assign_attributes(data=gas_data, site=site, network=network, update_mismatch=update_mismatch)

    return gas_data