Source code for openghg.util._hashing
"""
Some functions for hashing data or strings for identification of sources
"""
import hashlib
from pathlib import Path
[docs]
def hash_string(to_hash: str) -> str:
    """Compute the SHA-1 digest of a string.

    The input is passed through ``str()`` before hashing, so a non-string
    argument is hashed via its string representation.

    Args:
        to_hash: String to hash
    Returns:
        str: Hexadecimal SHA-1 digest of the UTF-8 encoded string
    """
    encoded = str(to_hash).encode("utf-8")
    digest = hashlib.sha1(encoded)
    return digest.hexdigest()
[docs]
def hash_file(filepath: Path) -> str:
    """Compute the SHA-1 digest of a file's contents.

    The file is read in binary mode in fixed-size chunks, so the whole
    file is never held in memory at once.
    Chunked-read approach taken from https://stackoverflow.com/a/22058673

    Args:
        filepath (pathlib.Path): Path to file. ``~`` is expanded and the
            path is resolved before opening.
    Returns:
        str: Hexadecimal SHA-1 digest of the file contents
    """
    # 64 kB per read
    chunk_size = 65536
    digest = hashlib.sha1()
    resolved = Path(filepath).expanduser().resolve()

    with resolved.open("rb") as handle:
        # iter() with a sentinel keeps calling read() until it returns b"" (EOF)
        for chunk in iter(lambda: handle.read(chunk_size), b""):
            digest.update(chunk)

    return digest.hexdigest()
[docs]
def hash_bytes(data: bytes) -> str:
    """Compute the SHA-1 digest of in-memory binary data.

    Args:
        data: Binary data
    Returns:
        str: Hexadecimal SHA-1 digest of the data
    """
    digest = hashlib.sha1(data)
    return digest.hexdigest()
[docs]
def hash_retrieved_data(to_hash: dict[str, dict]) -> dict:
    """Hash data retrieved from a data platform.

    For every entry, a SHA1 of the metadata (ignoring the "file_created" key)
    is combined with a SHA1 of the start date, end date and number of
    timestamps in the Dataset, and the combination is hashed once more.

    Args:
        to_hash: Dictionary to hash
            We expect this to be a dictionary such as
            {species_key: {"data": xr.Dataset, "metadata": {...}}}
    Returns:
        dict: Dictionary of the form {combined_hash: {species_key: timestamp}},
            where timestamp is the string form of the time this function was
            called (the same value for every entry).
    """
    from json import dumps

    from openghg.util import timestamp_now

    def _digest(text: str) -> str:
        # SHA1 hex digest of UTF-8 encoded text
        return hashlib.sha1(text.encode("utf8")).hexdigest()

    # Taken once so every entry in the result shares the same timestamp
    retrieved_at = str(timestamp_now())
    hashes: dict[str, dict] = {}

    for species_key, entry in to_hash.items():
        # Work on a copy so the caller's metadata is left untouched;
        # "file_created" is excluded from the hash.
        metadata = entry["metadata"].copy()
        metadata.pop("file_created", None)
        metadata_hash = _digest(dumps(metadata, sort_keys=True))

        dataset = entry["data"]
        first = str(dataset.time.min().values)
        last = str(dataset.time.max().values)
        count = str(dataset.time.size)
        time_hash = _digest(f"{first}_{last}_{count}")

        combined_hash = _digest(metadata_hash + time_hash)
        # NOTE: entries that produce the same combined hash overwrite each other
        hashes[combined_hash] = {species_key: retrieved_at}

    return hashes