Source code for openghg.util._util

""" Utility functions that are used by multiple modules

"""

from collections.abc import Iterable
from pathlib import Path
from typing import Any, Dict, Iterator, Optional, Tuple, Union
import logging

from openghg.types import multiPathType

# Logger used by the utility functions in this module to emit warnings
logger = logging.getLogger("openghg.util")
logger.setLevel(logging.DEBUG)  # Have to set level for logger as well as handler



[docs]
def running_in_cloud() -> bool:
    """Report whether OpenGHG is running in the cloud.

    Reads the OPENGHG_CLOUD environment variable (treated as "0" when it is
    not set) and interprets its integer value as a flag.

    Returns:
        bool: True if OPENGHG_CLOUD holds a non-zero integer
    """
    from os import environ

    # int() is applied to the raw string, so a non-numeric value raises ValueError
    flag = int(environ.get("OPENGHG_CLOUD", "0"))

    return flag != 0




[docs]
def running_on_hub() -> bool:
    """Report whether OpenGHG is running on the OpenGHG Hub.

    Reads the OPENGHG_HUB environment variable (treated as "0" when it is
    not set) and interprets its integer value as a flag.

    Returns:
        bool: True if OPENGHG_HUB holds a non-zero integer
    """
    from os import environ

    # int() is applied to the raw string, so a non-numeric value raises ValueError
    flag = int(environ.get("OPENGHG_HUB", "0"))

    return flag != 0




[docs]
def running_locally() -> bool:
    """Report whether OpenGHG is running locally.

    Local means neither the Hub check nor the cloud check reports True.

    Returns:
        bool: True if running locally
    """
    # The Hub check runs first; the cloud check is skipped when it is True
    return not running_on_hub() and not running_in_cloud()




[docs]
def unanimous(seq: Dict) -> bool:
    """Check that every value stored in a dictionary is the same.

    Args:
        seq: Dictionary whose values are compared
    Returns:
        bool: True if all values are equal (or there are no values)
    """
    remaining = iter(seq.values())

    # A private sentinel distinguishes "no values" from a stored None
    missing = object()
    reference = next(remaining, missing)
    if reference is missing:
        return True

    return all(value == reference for value in remaining)




[docs]
def pairwise(iterable: Iterable) -> Iterator[Tuple[Any, Any]]:
    """Pair each item of an iterable with the item that follows it.

    For s0, s1, s2, ... this yields (s0, s1), (s1, s2), ...

    Args:
        iterable: Any iterable type
    Returns:
        Iterator: Iterator over tuples of consecutive items
    """
    from itertools import tee

    current, following = tee(iterable)
    # Step the second copy forward by one; the default avoids StopIteration
    # when the iterable is empty
    next(following, None)

    return zip(current, following)




[docs]
def site_code_finder(site_name: str) -> Optional[str]:
    """Look up the site code that belongs to a site's long name.

    The name is stripped of punctuation and fuzzy matched against the known
    site names. A code is only returned when exactly one name scores 90 or more.

    Args:
        site_name: Site long name
    Returns:
        str or None: Lower case site code if a single confident match is found
    """
    from openghg.util import remove_punctuation
    from rapidfuzz import process  # type: ignore

    site_name = remove_punctuation(site_name)

    name_to_code = _create_site_lookup_dict()

    # rapidfuzz 3.9.0 seemed to stop giving type details - ignoring for now.
    matches = process.extract(query=site_name, choices=name_to_code.keys())  # type:ignore

    min_score = 90
    best_name, best_score = matches[0][0], matches[0][1]

    if best_score < min_score:
        return None

    # More than one confident match is ambiguous, so no code is returned
    confident = [match for match in matches if match[1] >= min_score]
    if len(confident) > 1:
        logger.warning("Please provide more site information, more than one site found.")
        return None

    site_code: str = name_to_code[best_name]

    return site_code.lower()




[docs]
def find_matching_site(site_name: str, possible_sites: Dict) -> str:
    """Try and find a similar name to site_name in possible_sites and return a
    suggestion or error string.

    The keys of possible_sites are fuzzy matched against site_name; the values
    are reported as the "code" in the returned message.

    Args:
        site_name: Name of site
        possible_sites: Dictionary keyed by the site names to check
    Returns:
        str: Suggestion / error message
    """
    from rapidfuzz import process

    site_list = possible_sites.keys()

    # rapidfuzz 3.9.0 seemed to stop giving type details - ignoring for now.
    matches = process.extract(site_name, site_list)  # type:ignore

    # No candidates at all (e.g. possible_sites is empty)
    if not matches:
        return f"No suggestion for {site_name}."

    scores = [score for _, score, _ in matches]

    # This seems like a decent cutoff score for a decent find
    cutoff_score = 85
    best_score = scores[0]

    if best_score < cutoff_score:
        return f"No suggestion for {site_name}."

    # A lone candidate has no runner-up to tie with. A score equal to the
    # cutoff counts as good enough, matching the "< cutoff" rejection above.
    if len(scores) == 1 or best_score > scores[1]:
        best_match = matches[0][0]
        return f"Did you mean {best_match.upper()}, code: {possible_sites[best_match]} ?"

    if best_score == scores[1]:
        # Tie for first place: every returned match is listed
        suggestions = [f"{match.title()}, code: {possible_sites[match]}" for match, _, _ in matches]
        nl_char = "\n"
        return f"Did you mean one of : \n {nl_char.join(suggestions)}"

    # Only reachable if the matches are not ordered best first
    return f"Unknown site: {site_name}"



def _create_site_lookup_dict() -> Dict:
    """Create a dictionary of site name: site code values

    The site info file is loaded and, for each site, the "long_name" of the
    first network entry is used. Anything after the first comma in the long
    name (the country) is dropped and punctuation is removed.

    Returns:
        dict: Dictionary of site_name: site_code values
    """
    from openghg_defs import site_info_file
    from openghg.util import load_json, remove_punctuation

    site_info = load_json(path=site_info_file)

    inverted = {}
    for site, site_data in site_info.items():
        # Only the first network entry of each site is consulted; the loop
        # always stops after it, whether or not it carried a long name.
        for network_data in site_data.values():
            try:
                long_name = network_data["long_name"]
            except KeyError:
                break

            # Remove the country from the name. split() always returns at
            # least one element, so index 0 is safe even without a comma.
            no_country = remove_punctuation(long_name.split(",")[0])
            inverted[no_country] = site
            break

    return inverted



[docs]
def verify_site(site: str) -> Optional[str]:
    """Validate a site and return its lower case site code.

    If the upper-cased input is a key of the site info file it is returned
    lower-cased. Otherwise the input is treated as a site name and passed to
    site_code_finder; a warning is logged when that lookup finds nothing.

    Args:
        site: Three letter site code or site name
    Returns:
        str or None: Verified site code if valid site, otherwise None
    """
    from openghg.util import load_json
    from openghg_defs import site_info_file

    known_sites = load_json(path=site_info_file)

    # Direct hit on a known site code
    if site.upper() in known_sites:
        return site.lower()

    # Fall back to resolving the input as a site name
    found_code = site_code_finder(site_name=site)
    if found_code is None:
        logger.warning(f"Unable to find site code for {site}, please provide additional metadata.")

    return found_code




[docs]
def multiple_inlets(site: str) -> bool:
    """Check if the passed site has more than one inlet

    The first network listed for the site is inspected. Its "height" entry is
    used, falling back to "height_name"; if neither is present the site is
    assumed to have multiple inlets.

    Args:
        site: Three letter site code (case insensitive)
    Returns:
        bool: True if multiple inlets
    """
    from openghg.util import get_site_info

    site_data = get_site_info()

    site = site.upper()
    site_networks = site_data[site]
    network = next(iter(site_networks))

    # The network entry lives under the site, not at the top level of site_data
    network_data = site_networks[network]

    try:
        heights = set(network_data["height"])
    except KeyError:
        try:
            heights = set(network_data["height_name"])
        except KeyError:
            # No inlet information available, assume multiple inlets
            return True

    return len(heights) > 1



def sort_by_filenames(filepath: Union[multiPathType, Any]) -> list[Path]:
    """
    Normalise the input to a list of Path objects and return it sorted.

    Args:
        filepath: A single path (str or Path) or a list / tuple of paths

    Returns:
        list[Path]: List of sorted paths

    Raises:
        TypeError: If filepath is not a str, Path, list or tuple
    """

    # Explicit type branches also keep mypy happy about the file types
    if isinstance(filepath, (tuple, list)):
        candidates = [Path(entry) for entry in filepath]
    elif isinstance(filepath, Path):
        candidates = [filepath]
    elif isinstance(filepath, str):
        candidates = [Path(filepath)]
    else:
        raise TypeError(f"Unsupported type for filepath: {type(filepath)}")

    candidates.sort()
    return candidates