# Source code for openghg.util._strings
import re
from typing import Any, overload
from collections.abc import Iterable
from openghg.util import not_set_metadata_values, null_metadata_values
# Names exported by a star-import of this module. Only these two helpers are listed;
# any other function defined in this module has to be imported by name.
__all__ = ["clean_string", "to_lowercase"]
# Typing-only overloads for clean_string: a str argument yields a str,
# and None is passed straight through as None.
@overload
def clean_string(to_clean: str) -> str: ...
@overload
def clean_string(to_clean: None) -> None: ...
[docs]
def clean_string(to_clean: str | None) -> str | None:
"""Returns a lowercase string with only alphanumeric
characters and underscores.
Args:
to_clean: String to clean
Returns:
str or None: Clean string
"""
import re
if to_clean is None:
return None
if isinstance(to_clean, bool):
return str(to_clean).lower()
try:
# This might be used with numbers
if is_number(to_clean):
return str(to_clean)
# Removes all whitespace
cleaner = re.sub(r"\s+", "", to_clean, flags=re.UNICODE).lower()
# Removes non-alphanumeric characters but keep underscores
# cleanest = re.sub(r"\W+", "", cleaner)
cleanest = re.sub(r"[^\w-]+", "", cleaner)
except TypeError:
return to_clean
return cleanest
# Typing-only overloads for to_lowercase: the declared return type matches the
# type of the object passed in (dict, list, tuple, set or str).
@overload
def to_lowercase(d: dict, skip_keys: list | None = None) -> dict: ...
@overload
def to_lowercase(d: list, skip_keys: list | None = None) -> list: ...
@overload
def to_lowercase(d: tuple, skip_keys: list | None = None) -> tuple: ...
@overload
def to_lowercase(d: set, skip_keys: list | None = None) -> set: ...
@overload
def to_lowercase(d: str, skip_keys: list | None = None) -> str: ...
[docs]
def to_lowercase(
d: dict | list | tuple | set | str, skip_keys: list | None = None
) -> dict | list | tuple | set | str:
"""Convert an object to lowercase. All keys and values in a dictionary will be converted
to lowercase as will all objects in a list, tuple or set. You can optionally pass in a list of keys to
skip when lowercasing a dictionary.
Based on the answer https://stackoverflow.com/a/40789531/1303032
Args:
d: Object to lower case
skip_keys: List of keys to skip when lowercasing.
Returns:
dict: Dictionary of lower case keys and values
"""
if skip_keys is None:
skip_keys = []
if isinstance(d, dict):
lowercased = {k.lower(): to_lowercase(v) for k, v in d.items() if k not in skip_keys}
if skip_keys:
missing = {k: v for k, v in d.items() if k not in lowercased}
lowercased.update(missing)
return lowercased
elif isinstance(d, (list, set, tuple)):
t = type(d)
return t(to_lowercase(o) for o in d)
elif isinstance(d, str):
return d.lower()
else:
return d
[docs]
def is_number(s: Any) -> bool:
    """Is it a number?

    A value counts as a number if ``float`` accepts it. Booleans are
    explicitly excluded.

    https://stackoverflow.com/q/354038

    Args:
        s: String (or other object) which may be a number
    Returns:
        bool: True if ``s`` can be interpreted as a number, otherwise False
    """
    # float(True) would succeed, so rule booleans out first.
    if isinstance(s, bool):
        return False

    try:
        float(s)
    except OverflowError:
        # Raised for numeric values too large for a float (e.g. 10**400);
        # the value is still a number.
        return True
    except (ValueError, TypeError):
        return False

    return True
[docs]
def remove_punctuation(s: str) -> str:
    """Lowercase the passed string and strip its punctuation.

    Every character that is neither a word character (letter, digit or
    underscore) nor whitespace is removed.

    Args:
        s: String to convert
    Returns:
        str: Unpunctuated, lowercased string
    """
    return re.sub(r"[^\w\s]", "", s.lower())
def extract_float(s: str) -> float:
    """Extract float from string.

    This extends the built-in `float` function to find floats within a larger string.

    Args:
        s: string to extract float from
    Returns:
        first float value found in given string
    Raises:
        ValueError if no floats found
    """
    # construct regex for float following Python's float grammar:
    # https://docs.python.org/3/library/functions.html#grammar-token-float-floatvalue
    sign = r"[+-]?"  # optional sign

    # "inf" / "nan" must not be part of a longer word (e.g. "info", "banana")
    no_letter_before = r"(?<![a-zA-Z])"
    no_letter_after = r"(?![a-zA-Z])"
    infinity = no_letter_before + r"(Infinity|inf)" + no_letter_after
    nan = no_letter_before + "nan" + no_letter_after

    digit_part = r"(\d(_?\d)*)"  # digits, optionally separated by single underscores
    # at least 1 digit before or after the decimal point
    number = rf"({digit_part}?\.{digit_part}|{digit_part}\.?)"
    # Optional exponent. The "e" is mandatory inside it: with an optional "e",
    # text such as "10-20" is matched as a whole and float() then rejects it.
    exponent = rf"([eE][+-]?{digit_part})?"
    float_number = number + exponent

    abs_float_value = "(" + "|".join([float_number, infinity, nan]) + ")"
    float_pat = re.compile(sign + abs_float_value, re.IGNORECASE)

    if m := float_pat.search(s):
        return float(m.group(0))

    raise ValueError(f"No float values found in '{s}'")
def check_and_set_null_variable(
    param: str | None,
    not_set_value: str | None = None,
    null_values: Iterable = null_metadata_values(),
) -> str | None:
    """
    Replace a null-like value (e.g. None) with a placeholder string marking the
    variable as not set. Values that are not null-like are returned untouched.

    The placeholder is typically: "not_set"

    Args:
        param: variable to check
        not_set_value: Optional value to use as the replacement. When None, the first
            entry of openghg.util.not_set_metadata_values() is used.
        null_values: Values to identify as null. By default the values from
            openghg.util.null_metadata_values() are used.
    Returns:
        str: Original string or value to indicate this is not set
        None: Only returned if value is None and None is not included as one of the null_values.
    """
    # Resolve the placeholder first, falling back to the library default.
    placeholder = not_set_value if not_set_value is not None else not_set_metadata_values()[0]
    return placeholder if param in null_values else param