Source code for openghg.util._util

"""Utility functions that are used by multiple modules."""

from collections.abc import Iterable
from pathlib import Path
from typing import Any
from collections.abc import Iterator
import logging

from openghg.types import multiPathType

# Module-level logger used by the helper functions in this file to emit warnings.
logger = logging.getLogger("openghg.util")
logger.setLevel(logging.DEBUG)  # Have to set level for logger as well as handler


def unanimous(seq: dict) -> bool:
    """Report whether every value stored in a dictionary is equal.

    Args:
        seq: Dictionary whose values are compared
    Returns:
        bool: True if all values compare equal to the first value, or if
        the dictionary is empty
    """
    _empty = object()
    remaining = iter(seq.values())
    reference = next(remaining, _empty)

    # Nothing can disagree in an empty dictionary
    if reference is _empty:
        return True

    for value in remaining:
        if not value == reference:
            return False

    return True
def pairwise(iterable: Iterable) -> Iterator[tuple[Any, Any]]:
    """Pair each item of an iterable with the item that follows it.

    Args:
        iterable: Any iterable type
    Returns:
        Iterator: Iterator of (item, next item) tuples; empty when the
        iterable has fewer than two items
    """
    import itertools

    leading, trailing = itertools.tee(iterable, 2)
    # Drop the first item of the second copy so it runs one step ahead
    next(trailing, None)

    return zip(leading, trailing)
def site_code_finder(site_name: str) -> str | None:
    """Look up the site code that belongs to a site's long name.

    The name is stripped of punctuation and fuzzy matched against the names
    returned by _create_site_lookup_dict. A code is only returned when
    exactly one candidate scores 90 or above.

    Args:
        site_name: Site long name
    Returns:
        str or None: Lower case site code if a single good match is found,
        otherwise None
    """
    from openghg.util import remove_punctuation
    from rapidfuzz import process  # type: ignore

    min_score = 90

    cleaned_name = remove_punctuation(site_name)
    name_to_code = _create_site_lookup_dict()

    # rapidfuzz 3.9.0 seemed to stop giving type details - ignoring for now.
    candidates = process.extract(query=cleaned_name, choices=name_to_code.keys())  # type:ignore

    best_name, best_score = candidates[0][0], candidates[0][1]
    if best_score < min_score:
        return None

    # More than one candidate at or above the threshold is ambiguous
    strong_candidates = [candidate for candidate in candidates if candidate[1] >= min_score]
    if len(strong_candidates) > 1:
        logger.warning("Please provide more site information, more than one site found.")
        return None

    code: str = name_to_code[best_name]
    return code.lower()
def find_matching_site(site_name: str, possible_sites: dict) -> str:
    """Try and find a similar name to site_name in possible_sites and
    return a suggestion or error string.

    Args:
        site_name: Name of site
        possible_sites: Dictionary keyed by site name. The value stored for
            a name is reported as its code in the suggestion.
    Returns:
        str: Suggestion / error message
    """
    from rapidfuzz import process

    # rapidfuzz 3.9.0 seemed to stop giving type details - ignoring for now.
    matches = process.extract(site_name, possible_sites.keys())  # type:ignore

    # With no candidates there is nothing to score, so nothing to suggest
    if not matches:
        return f"No suggestion for {site_name}."

    # This seems like a decent cutoff score for a decent find
    cutoff_score = 85

    best_match, best_score = matches[0][0], matches[0][1]
    # A single candidate has no runner-up to beat or tie with
    runner_up_score = matches[1][1] if len(matches) > 1 else None

    if best_score < cutoff_score:
        return f"No suggestion for {site_name}."
    elif best_score > cutoff_score and (runner_up_score is None or best_score > runner_up_score):
        return f"Did you mean {best_match.upper()}, code: {possible_sites[best_match]} ?"
    elif best_score == runner_up_score:
        suggestions = [f"{match.title()}, code: {possible_sites[match]}" for match, _, _ in matches]
        nl_char = "\n"
        return f"Did you mean one of : \n {nl_char.join(suggestions)}"
    else:
        return f"Unknown site: {site_name}"
def _create_site_lookup_dict() -> dict:
    """Create a dictionary of site name: site code values

    The site info file is loaded and, for each site, the first network
    entry that has a "long_name" supplies the name. Only the text before the
    first comma of the long name is kept, and it is passed through
    remove_punctuation before being used as the key.

    Returns:
        dict: Dictionary of site_name: site_code values
    """
    from openghg_defs import site_info_file
    from openghg.util import load_json, remove_punctuation

    site_info = load_json(path=site_info_file)

    inverted = {}
    for site, site_data in site_info.items():
        for network_data in site_data.values():
            try:
                long_name = network_data["long_name"]
            except KeyError:
                # This network has no long name, try the next one
                continue

            # Remove the country from the name. str.split always returns at
            # least one element, so taking [0] needs no IndexError handling.
            no_country = remove_punctuation(long_name.split(",")[0])
            inverted[no_country] = site
            break

    return inverted
def verify_site(site: str) -> str | None:
    """Return the lower case site code for a site code or site name.

    If the upper cased value is a key of the site info file it is returned
    lower cased. Otherwise the value is treated as a site name and passed to
    site_code_finder, and a warning is logged if no code is found.

    Args:
        site: Three letter site code or site name
    Returns:
        str or None: Verified site code if valid site, otherwise None
    """
    from openghg.util import load_json
    from openghg_defs import site_info_file

    known_sites = load_json(path=site_info_file)

    # Already a recognised site code
    if site.upper() in known_sites:
        return site.lower()

    found_code = site_code_finder(site_name=site)
    if found_code is None:
        logger.warning(f"Unable to find site code for {site}, please provide additional metadata.")

    return found_code
def multiple_inlets(site: str) -> bool:
    """Check if the passed site has more than one inlet

    Only the first network listed for the site is inspected. Its "height"
    entry is used if present, otherwise its "height_name" entry.

    Args:
        site: Three letter site code
    Returns:
        bool: True if multiple inlets. True is also returned when the
        network has neither a "height" nor a "height_name" entry.
    """
    from openghg.util import get_site_info

    site_data = get_site_info()

    site_networks = site_data[site.upper()]
    network = next(iter(site_networks))
    # The network key comes from the site's own entry, so the heights have to
    # be looked up beneath the site rather than at the top level of site_data
    network_data = site_networks[network]

    for key in ("height", "height_name"):
        try:
            heights = set(network_data[key])
        except KeyError:
            continue
        return len(heights) > 1

    # No height information available for this network
    return True
def sort_by_filenames(filepath: multiPathType | Any) -> list[Path]: """ Sorting time on filename basis Args: filepath: Path to the file Returns: list[Path]: List of sorted paths """ # This code is to stop mypy complaints regarding file types if isinstance(filepath, str): filepath = [Path(filepath)] elif isinstance(filepath, Path): filepath = [filepath] elif isinstance(filepath, (tuple, list)): filepath = [Path(f) for f in filepath] else: raise TypeError(f"Unsupported type for filepath: {type(filepath)}") return sorted(filepath)