Modifying and deleting data#

Sometimes you may want to modify metadata after running data through the standardisation scripts. The metadata associated with data in the object store can still be edited after standardisation, which saves time when the standardisation process is slow. Data can also be deleted from the object store.

from openghg.store import data_handler_lookup
from openghg.tutorial import populate_footprint_inert

We’ll first add some footprint data to the object store.

populate_footprint_inert()
Downloading tac_footprint_inert_201607.tar.gz: 100%|██████████| 67.0M/67.0M [00:00<00:00, 202MB/s]

Standardising footprint data...
Now let’s look up the footprint data we just added. The data_handler_lookup function returns a DataHandler object whose metadata attribute is a dictionary keyed by Datasource UUID.

result = data_handler_lookup(data_type="footprints", site="TAC", height="100m")
result.metadata

We want to update the model name, so we’ll use the update_metadata method of the DataHandler object. To do this we need the UUID of the Datasource returned by data_handler_lookup; this is the key of the metadata dictionary.

NOTE: Each time an object is added to the object store it is assigned a unique id using the Python uuid4 function. This means any UUIDs you see in the documentation won’t match those created when you run these tutorials.
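Purely for illustration, this is how a version 4 UUID is generated with Python’s standard library; every call returns a different value:

from uuid import uuid4

print(uuid4())  # a random UUID, different every time this runs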

For the purposes of this tutorial we take the first key from the metadata dictionary. We can do this only because we’ve checked the dictionary and seen that it contains a single key; it also means you can run through this notebook without having to modify it. But be careful: if the dictionary contains more than one key, the cell below might not give you the UUID you want. Whenever you modify real data, copy and paste the UUID and double check it.

uuid = next(iter(result.metadata))
updated = {"model": "new_model"}

result.update_metadata(uuid=uuid, to_update=updated)
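If you are working with your own data and want to guard against grabbing the wrong Datasource, a minimal sketch of a more defensive pattern, using only the lookup result from above, might be:

# Fail loudly if the lookup matched anything other than a single Datasource
if len(result.metadata) != 1:
    raise ValueError(f"Expected one Datasource, found {len(result.metadata)}")

uuid = next(iter(result.metadata))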

When you run update_metadata, the internal store of metadata for each Datasource is updated. If you want to be really sure that the metadata in the object store has been updated, you can run refresh.

result.refresh()
metadata = result.metadata[uuid]

And check that the model has been changed:

metadata["model"]

Deleting keys#

Let’s accidentally add too much metadata for the footprint and then delete it.

excess_metadata = {"useless_key": "useless_value"}
result.update_metadata(uuid=uuid, to_update=excess_metadata)
result.metadata[uuid]["useless_key"]

Oh no! We’ve added some useless metadata; let’s remove it.

to_delete = ["useless_key"]
result.update_metadata(uuid=uuid, to_delete=to_delete)

And check if the key is in the metadata:

"useless_key" in result.metadata[uuid]

Restore from backup#

If you’ve accidentally pushed some bad metadata, you can fix it easily by restoring from a backup. Each DataHandler object stores a backup of the current metadata every time you run update_metadata. Let’s add some bad metadata, have a quick look at the backup and then restore it. We’ll start with a fresh DataHandler object.

result = data_handler_lookup(data_type="footprints", site="TAC", height="100m")
bad_metadata = {"domain": "neptune"}
result.update_metadata(uuid=uuid, to_update=bad_metadata)

Let’s check the domain:

result.metadata[uuid]["domain"]

Using view_backup we can check the different versions of metadata we have backed up for each Datasource.

result.view_backup()

To restore the metadata to the previous version we use the restore function. This takes the UUID of the Datasource and optionally a version string. The default version string is "latest", which is the most recently backed up version. We’ll use the default here.

result.restore(uuid=uuid)

Now we can check the domain again:

result.metadata[uuid]["domain"]

To be really sure, we can force a refresh of all the metadata from the object store and the Datasource.

result.refresh()

Then check again:

result.metadata[uuid]["domain"]

Multiple backups#

The DataHandler object stores a backup each time you run update_metadata, so you can restore any version of the metadata since you started editing. Do note that, currently, the backups only exist in the memory of the DataHandler object.
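As the backups only live in memory, if you want a copy that survives your Python session you could write the current metadata to disk yourself before editing. A minimal sketch, with a hypothetical filename, might look like this:

import json
from pathlib import Path

# Save a copy of the current metadata for this Datasource to a JSON file
backup_file = Path("tac_footprint_metadata_backup.json")  # example filename
backup_file.write_text(json.dumps(result.metadata[uuid], indent=2, default=str))

With that aside, let’s make another update so there is a further backup to look at.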

more_metadata = {"time_period": "1m"}
result.update_metadata(uuid=uuid, to_update=more_metadata)

We can view a specific metadata backup using the version argument. The first backup is version 1; here we take a look at the backup made just before the update above.

backup_2 = result.view_backup(uuid=uuid, version=2)
backup_2["time_period"]

Say we want to keep some of the changes we’ve made to the metadata but undo the last one; we can restore the most recent backup by passing "latest" to the version argument of restore.

result.restore(uuid=uuid, version="latest")
result.metadata[uuid]["time_period"]

We’re now back to where we want to be.

Deleting data#

To remove data from the object store we use data_handler_lookup again:

result = data_handler_lookup(data_type="footprints", site="TAC", height="100m")
result.metadata

Each key of the metadata dictionary is a Datasource UUID. Please make sure that you double check the UUID of the Datasource you want to delete; this operation cannot be undone! Also remember to change the UUID below to the one in your version of the metadata.
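One way to double check is to print a short summary of each Datasource returned by the lookup before copying the UUID; a quick sketch:

# Print each UUID alongside a few identifying pieces of metadata
for uid, meta in result.metadata.items():
    print(uid, meta.get("site"), meta.get("domain"), meta.get("model"))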

uuid = "13fd70dd-e549-4b06-afdb-9ed495552eed"
result.delete_datasource(uuid=uuid)

To make sure it’s gone, let’s run the search again:

result = data_handler_lookup(data_type="footprints", site="TAC", height="100m")
result.metadata

An empty dictionary means no results; the deletion worked.
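If you run these steps in a script rather than interactively, a simple final check (just a sketch) would be:

assert not result.metadata, "expected no remaining TAC footprint Datasources"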